
Commit 2801627

yangzhou23 committed
feat(in-batch-sample): add in-batch negative sampling support
- add in_batch_negative_sampling helper
- cover in-batch sampling with unit tests
- ensure Matching tutorial runs with the new sampler
1 parent 222d87d commit 2801627
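
For quick orientation, a minimal usage sketch of the new trainer flags (it mirrors the unit test added below; `model` and `train_dl` stand in for a two-tower model and training dataloader built as in tests/test_inbatch_sampling.py):

from torch_rechub.trainers import MatchTrainer

trainer = MatchTrainer(
    model,                 # a two-tower model such as DSSM
    mode=0,                # point-wise mode; switches to CrossEntropyLoss when in_batch_neg=True
    in_batch_neg=True,     # score each user against the other items in its batch
    in_batch_neg_ratio=3,  # 3 negatives per positive (defaults to batch_size - 1)
    hard_negative=False,   # True picks the top-k scored in-batch items instead
    sampler_seed=42,       # illustrative seed; any int makes sampling reproducible
    n_epoch=1,
    device="cpu",
)
trainer.train_one_epoch(train_dl, log_interval=100)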

File tree

5 files changed: +283 −133 lines changed


tests/test_inbatch_sampling.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
+import numpy as np
+import pandas as pd
+import torch
+
+from torch_rechub.basic.features import SequenceFeature, SparseFeature
+from torch_rechub.models.matching import DSSM
+from torch_rechub.trainers import MatchTrainer
+from torch_rechub.utils.data import MatchDataGenerator, df_to_dict
+from torch_rechub.utils.match import gather_inbatch_logits, generate_seq_feature_match, gen_model_input, inbatch_negative_sampling
+
+
+def test_inbatch_negative_sampling_random_and_uniform():
+    scores = torch.zeros((4, 4))
+    neg_idx = inbatch_negative_sampling(scores, neg_ratio=2, generator=torch.Generator().manual_seed(0))
+    logits = gather_inbatch_logits(scores, neg_idx)
+    assert logits.shape == (4, 3)
+    assert neg_idx.shape == (4, 2)
+    for row, sampled in enumerate(neg_idx):
+        assert row not in sampled.tolist()
+
+    # Different seed should give different permutations to ensure randomness
+    neg_idx_second = inbatch_negative_sampling(scores, neg_ratio=2, generator=torch.Generator().manual_seed(1))
+    assert not torch.equal(neg_idx, neg_idx_second)
+
+
+def test_inbatch_negative_sampling_hard_negative():
+    scores = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 0.0]])
+    neg_idx = inbatch_negative_sampling(scores, neg_ratio=1, hard_negative=True)
+    # highest non-diagonal scores for each row
+    assert torch.equal(neg_idx.squeeze(1), torch.tensor([2, 2, 1]))
+
+
+def _build_small_match_dataloader():
+    n_users, n_items, n_samples = 12, 24, 80
+    data = pd.DataFrame(
+        {
+            "user_id": np.random.randint(0, n_users, n_samples),
+            "item_id": np.random.randint(0, n_items, n_samples),
+            "time": np.arange(n_samples),
+        }
+    )
+    user_profile = pd.DataFrame({"user_id": np.arange(n_users)})
+    item_profile = pd.DataFrame({"item_id": np.arange(n_items)})
+
+    df_train, _ = generate_seq_feature_match(data, "user_id", "item_id", "time", mode=0, neg_ratio=0)
+    x_train = gen_model_input(df_train, user_profile, "user_id", item_profile, "item_id", seq_max_len=8)
+    # labels are unused in in-batch mode; keep zero array for shape alignment
+    y_train = np.zeros(len(df_train))
+
+    user_features = [
+        SparseFeature("user_id", n_users, embed_dim=8),
+        SequenceFeature("hist_item_id", n_items, embed_dim=8, pooling="mean", shared_with="item_id"),
+    ]
+    item_features = [SparseFeature("item_id", n_items, embed_dim=8)]
+
+    dg = MatchDataGenerator(x_train, y_train)
+    train_dl, _, _ = dg.generate_dataloader(x_train, df_to_dict(item_profile), batch_size=8, num_workers=0)
+
+    model = DSSM(user_features, item_features, user_params={"dims": [16]}, item_params={"dims": [16]})
+    return train_dl, model
+
+
+def test_match_trainer_inbatch_flow_runs_and_updates():
+    train_dl, model = _build_small_match_dataloader()
+
+    trainer = MatchTrainer(model, mode=0, in_batch_neg=True, in_batch_neg_ratio=3, sampler_seed=2, n_epoch=1, device="cpu")
+    trainer.train_one_epoch(train_dl, log_interval=100)
+
+    grads = [p.grad for p in model.parameters() if p.requires_grad]
+    assert any(g is not None for g in grads)

torch_rechub/basic/loss_func.py

Lines changed: 10 additions & 4 deletions
@@ -68,7 +68,8 @@ def __init__(self, margin=2, num_items=None):
         self.margin = margin
         self.n_items = num_items

-    def forward(self, pos_score, neg_score):
+    def forward(self, pos_score, neg_score, in_batch_neg=False):
+        pos_score = pos_score.view(-1)
         loss = torch.maximum(torch.max(neg_score, dim=-1).values - pos_score + self.margin, torch.tensor([0]).type_as(pos_score))
         if self.n_items is not None:
             impostors = neg_score - pos_score.view(-1, 1) + self.margin > 0
@@ -83,9 +84,14 @@ class BPRLoss(torch.nn.Module):
     def __init__(self):
         super().__init__()

-    def forward(self, pos_score, neg_score):
-        loss = torch.mean(-(pos_score - neg_score).sigmoid().log(), dim=-1)
-        return loss
+    def forward(self, pos_score, neg_score, in_batch_neg=False):
+        pos_score = pos_score.view(-1)
+        if neg_score.dim() == 1:
+            diff = pos_score - neg_score
+        else:
+            diff = pos_score.view(-1, 1) - neg_score
+        loss = -diff.sigmoid().log()
+        return loss.mean()


     # loss = -torch.mean(F.logsigmoid(pos_score - torch.max(neg_score,
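
For context, the reworked BPRLoss.forward now accepts either a 1-D vector of global negatives or a 2-D matrix of in-batch negatives, broadcasting the positive scores in the latter case. A small illustrative sketch (not part of the commit):

import torch

from torch_rechub.basic.loss_func import BPRLoss

loss_fn = BPRLoss()
pos = torch.tensor([0.9, 0.8])            # (B,) one positive score per sample
neg_global = torch.tensor([0.1, 0.2])     # (B,) one global negative each -> diff is (B,)
neg_inbatch = torch.tensor([[0.1, 0.3],
                            [0.2, 0.4]])  # (B, K) in-batch negatives -> pos broadcast to (B, 1)

print(loss_fn(pos, neg_global))           # scalar: mean over the B pairs
print(loss_fn(pos, neg_inbatch))          # scalar: mean over all B * K pairs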

torch_rechub/trainers/match_trainer.py

Lines changed: 45 additions & 6 deletions
@@ -6,6 +6,7 @@

 from ..basic.callback import EarlyStopper
 from ..basic.loss_func import BPRLoss, RegularizationLoss
+from ..utils.match import gather_inbatch_logits, inbatch_negative_sampling


 class MatchTrainer(object):
@@ -23,12 +24,20 @@ class MatchTrainer(object):
         device (str): `"cpu"` or `"cuda:0"`
         gpus (list): id of multi gpu (default=[]). If the length >=1, then the model will wrapped by nn.DataParallel.
         model_path (str): the path you want to save the model (default="./"). Note only save the best weight in the validation data.
+        in_batch_neg (bool): whether to use in-batch negative sampling instead of global negatives.
+        in_batch_neg_ratio (int): number of negatives to draw from the batch per positive sample when in_batch_neg is True.
+        hard_negative (bool): whether to choose the hardest negatives within the batch (top-k by score) instead of sampling uniformly at random.
+        sampler_seed (int): optional random seed for the in-batch sampler, for reproducibility/testing.
     """

     def __init__(
         self,
         model,
         mode=0,
+        in_batch_neg=False,
+        in_batch_neg_ratio=None,
+        hard_negative=False,
+        sampler_seed=None,
         optimizer_fn=torch.optim.Adam,
         optimizer_params=None,
         regularization_params=None,
@@ -50,13 +59,21 @@ def __init__(
         # torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
         self.device = torch.device(device)
         self.model.to(self.device)
+        self.in_batch_neg = in_batch_neg
+        self.in_batch_neg_ratio = in_batch_neg_ratio
+        self.hard_negative = hard_negative
+        self._sampler_generator = None
+        if sampler_seed is not None:
+            self._sampler_generator = torch.Generator(device=self.device)
+            self._sampler_generator.manual_seed(sampler_seed)
         if optimizer_params is None:
             optimizer_params = {"lr": 1e-3, "weight_decay": 1e-5}
         if regularization_params is None:
             regularization_params = {"embedding_l1": 0.0, "embedding_l2": 0.0, "dense_l1": 0.0, "dense_l2": 0.0}
         self.mode = mode
         if mode == 0:  # point-wise loss, binary cross_entropy
-            self.criterion = torch.nn.BCELoss()  # default loss binary cross_entropy
+            # With in-batch negatives we treat it as list-wise classification over sampled negatives
+            self.criterion = torch.nn.CrossEntropyLoss() if in_batch_neg else torch.nn.BCELoss()
         elif mode == 1:  # pair-wise loss
             self.criterion = BPRLoss()
         elif mode == 2:  # list-wise loss, softmax
@@ -85,12 +102,34 @@ def train_one_epoch(self, data_loader, log_interval=10):
                 y = y.float()  # torch._C._nn.binary_cross_entropy expected Float
             else:
                 y = y.long()  #
-            if self.mode == 1:  # pair_wise
-                pos_score, neg_score = self.model(x_dict)
-                loss = self.criterion(pos_score, neg_score)
+            if self.in_batch_neg:
+                base_model = self.model.module if isinstance(self.model, torch.nn.DataParallel) else self.model
+                user_embedding = base_model.user_tower(x_dict)
+                item_embedding = base_model.item_tower(x_dict)
+                if user_embedding is None or item_embedding is None:
+                    raise ValueError("Model must return user/item embeddings when in_batch_neg is True.")
+                if user_embedding.dim() > 2 and user_embedding.size(1) == 1:
+                    user_embedding = user_embedding.squeeze(1)
+                if item_embedding.dim() > 2 and item_embedding.size(1) == 1:
+                    item_embedding = item_embedding.squeeze(1)
+                if user_embedding.dim() != 2 or item_embedding.dim() != 2:
+                    raise ValueError(f"In-batch negative sampling requires 2D embeddings, got shapes {user_embedding.shape} and {item_embedding.shape}")
+
+                scores = torch.matmul(user_embedding, item_embedding.t())  # bs x bs
+                neg_indices = inbatch_negative_sampling(scores, neg_ratio=self.in_batch_neg_ratio, hard_negative=self.hard_negative, generator=self._sampler_generator)
+                logits = gather_inbatch_logits(scores, neg_indices)
+                if self.mode == 1:  # pair_wise
+                    loss = self.criterion(logits[:, 0], logits[:, 1:], in_batch_neg=True)
+                else:  # point-wise/list-wise -> cross entropy on sampled logits
+                    targets = torch.zeros(logits.size(0), dtype=torch.long, device=self.device)
+                    loss = self.criterion(logits, targets)
             else:
-                y_pred = self.model(x_dict)
-                loss = self.criterion(y_pred, y)
+                if self.mode == 1:  # pair_wise
+                    pos_score, neg_score = self.model(x_dict)
+                    loss = self.criterion(pos_score, neg_score)
+                else:
+                    y_pred = self.model(x_dict)
+                    loss = self.criterion(y_pred, y)

             # Add regularization loss
             reg_loss = self.reg_loss_fn(self.model)
torch_rechub/utils/match.py

Lines changed: 62 additions & 1 deletion
@@ -4,6 +4,7 @@

 import numpy as np
 import pandas as pd
+import torch
 import tqdm

 from .data import df_to_dict, pad_sequences
@@ -16,7 +17,6 @@
 ANNOY_AVAILABLE = False

 try:
-    import torch
     from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, connections, utility
     MILVUS_AVAILABLE = True
 except ImportError:
@@ -101,6 +101,67 @@ def negative_sample(items_cnt_order, ratio, method_id=0):
     return neg_items


+def inbatch_negative_sampling(scores, neg_ratio=None, hard_negative=False, generator=None):
+    """Generate in-batch negative indices from a similarity matrix.
+
+    This mirrors the offline ``negative_sample`` API by only returning sampled
+    indices; score gathering is handled separately to keep responsibilities clear.
+
+    Args:
+        scores (torch.Tensor): similarity matrix with shape (batch_size, batch_size).
+        neg_ratio (int, optional): number of negatives for each positive sample.
+            Defaults to batch_size-1 when omitted or out of range.
+        hard_negative (bool, optional): whether to pick the top-k highest scores as negatives
+            instead of uniform random sampling. Defaults to False.
+        generator (torch.Generator, optional): generator to control randomness for tests/reproducibility.
+
+    Returns:
+        torch.Tensor: sampled negative indices with shape (batch_size, neg_ratio).
+    """
+    if scores.dim() != 2:  # must be batch_size x batch_size
+        raise ValueError(f"inbatch_negative_sampling expects 2D scores, got shape {tuple(scores.shape)}")
+    batch_size = scores.size(0)
+    if batch_size <= 1:
+        raise ValueError("In-batch negative sampling requires batch_size > 1")
+
+    max_neg = batch_size - 1  # each row can provide at most batch_size-1 negatives
+    if neg_ratio is None or neg_ratio <= 0 or neg_ratio > max_neg:
+        neg_ratio = max_neg
+
+    device = scores.device
+    index_range = torch.arange(batch_size, device=device)
+    neg_indices = torch.empty((batch_size, neg_ratio), dtype=torch.long, device=device)
+
+    # for each sample, pick neg_ratio negatives
+    for i in range(batch_size):
+        if hard_negative:
+            row_scores = scores[i].clone()
+            row_scores[i] = float("-inf")  # mask the positive
+            topk = torch.topk(row_scores, k=neg_ratio).indices
+            neg_indices[i] = topk
+        else:
+            candidates = torch.cat([index_range[:i], index_range[i + 1:]])  # all indices except i
+            perm = torch.randperm(candidates.size(0), device=device, generator=generator)  # uniform random sampling
+            neg_indices[i] = candidates[perm[:neg_ratio]]
+
+    return neg_indices
+
+
+def gather_inbatch_logits(scores, neg_indices):
+    """
+    scores: (B, B)
+        scores[i][j] = user_i ⋅ item_j
+    neg_indices: (B, K)
+        neg_indices[i] = the K negative items for user_i
+    """
+    # positive: scores[i][i]
+    positive_logits = torch.diagonal(scores).reshape(-1, 1)  # (B, 1)
+    # negatives: scores[i][neg_indices[i, j]]
+    negative_logits = scores[torch.arange(scores.size(0)).unsqueeze(1),
+                             neg_indices]  # (B, K)
+    return torch.cat([positive_logits, negative_logits], dim=1)
+
+
 def generate_seq_feature_match(data, user_col, item_col, time_col, item_attribute_cols=None, sample_method=0, mode=0, neg_ratio=0, min_item=0):
     """Generate sequence feature and negative sample for match.
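
Taken together, the two helpers turn a batch similarity matrix into (B, 1 + K) training logits. A minimal end-to-end sketch (not part of the commit):

import torch

from torch_rechub.utils.match import gather_inbatch_logits, inbatch_negative_sampling

torch.manual_seed(0)
user_emb = torch.randn(4, 8)      # (B, D) user tower output
item_emb = torch.randn(4, 8)      # (B, D) item tower output
scores = user_emb @ item_emb.t()  # (B, B): scores[i][j] = user_i . item_j

neg_idx = inbatch_negative_sampling(scores, neg_ratio=2)  # (B, 2) indices, never the diagonal
logits = gather_inbatch_logits(scores, neg_idx)           # (B, 3): [positive, neg_1, neg_2]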

0 commit comments
