[GKD] interpolate in prob. space (#2204)
* interpolate in prob. space

* better var names

* use logsumexp

* set beta dtype

* beta tensor
kashif authored and qgallouedec committed Oct 10, 2024
1 parent fdbcaae commit 1661bc2
Showing 1 changed file with 9 additions and 4 deletions.
13 changes: 9 additions & 4 deletions trl/trainer/gkd_trainer.py
@@ -124,13 +124,18 @@ def generalized_jsd_loss(
     student_log_probs = F.log_softmax(student_logits, dim=-1)
     teacher_log_probs = F.log_softmax(teacher_logits, dim=-1)
 
-    # Compute the interpolated log probabilities
-    interpolated_log_probs = beta * student_log_probs + (1 - beta) * teacher_log_probs
+    # Compute the log of the mixture distribution
+    # log(a + b) = log(exp(log(a)) + exp(log(b))) -> for mixture
+    beta = torch.tensor(beta, dtype=student_log_probs.dtype)
+    mixture_log_probs = torch.logsumexp(
+        torch.stack([student_log_probs + torch.log(beta), teacher_log_probs + torch.log(1 - beta)]),
+        dim=0,
+    )
 
     # Compute KL divergences using F.kl_div
     # PyTorch differs from the standard mathematical definition, so the order of the probability distributions is swapped compared to that defined in the paper.
-    kl_teacher = F.kl_div(interpolated_log_probs, teacher_log_probs, reduction="none", log_target=True)
-    kl_student = F.kl_div(interpolated_log_probs, student_log_probs, reduction="none", log_target=True)
+    kl_teacher = F.kl_div(mixture_log_probs, teacher_log_probs, reduction="none", log_target=True)
+    kl_student = F.kl_div(mixture_log_probs, student_log_probs, reduction="none", log_target=True)
 
     # Compute the Generalized Jensen-Shannon Divergence
     jsd = beta * kl_teacher + (1 - beta) * kl_student
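For context, a minimal standalone sketch (not the TRL code itself) of what the new hunk computes: the mixture log(beta * p_student + (1 - beta) * p_teacher) evaluated stably via logsumexp, then the two KL terms and the generalized JSD. The tensor shapes and beta = 0.5 below are illustrative assumptions.

import torch
import torch.nn.functional as F

beta = 0.5
student_logits = torch.randn(2, 3, 11)  # hypothetical (batch, seq_len, vocab_size)
teacher_logits = torch.randn(2, 3, 11)

student_log_probs = F.log_softmax(student_logits, dim=-1)
teacher_log_probs = F.log_softmax(teacher_logits, dim=-1)

beta_t = torch.tensor(beta, dtype=student_log_probs.dtype)

# log(beta * p_student + (1 - beta) * p_teacher), evaluated in log space
mixture_log_probs = torch.logsumexp(
    torch.stack([student_log_probs + torch.log(beta_t), teacher_log_probs + torch.log(1 - beta_t)]),
    dim=0,
)

# Sanity check against the naive probability-space mixture
naive = torch.log(beta * student_log_probs.exp() + (1 - beta) * teacher_log_probs.exp())
assert torch.allclose(mixture_log_probs, naive, atol=1e-5)

# F.kl_div(input, target) computes KL(target || input), hence the "swapped" argument order
kl_teacher = F.kl_div(mixture_log_probs, teacher_log_probs, reduction="none", log_target=True)
kl_student = F.kl_div(mixture_log_probs, student_log_probs, reduction="none", log_target=True)

# Per-element generalized JSD terms (before any reduction)
jsd = beta * kl_teacher + (1 - beta) * kl_student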
