4 changes: 3 additions & 1 deletion DeBERTa/deberta/disentangled_attention.py
@@ -174,7 +174,9 @@ def linear(w,b,x):
if self.talking_head:
attention_scores = self.head_logits_proj(attention_scores.permute(0,2,3,1)).permute(0,3,1,2)

attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
# Replace the custom XSoftmax op with a plain softmax.
# Note: unlike XSoftmax, this does not mask out padded positions via attention_mask.
#attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
attention_probs = torch.nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
if self.talking_head:
attention_probs = self.head_weights_proj(attention_probs.permute(0,2,3,1)).permute(0,3,1,2)
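Note: XSoftmax is the mask-aware softmax used throughout DeBERTa, so replacing it with a plain softmax also drops the attention_mask handling. If masking still matters on this code path, a plain-PyTorch equivalent might look like the sketch below (masked_softmax is a hypothetical helper written for illustration, not part of this change):

import torch

def masked_softmax(scores, mask, dim=-1):
    # Positions where mask == 0 are pushed to the dtype minimum before the softmax
    # and zeroed afterwards, approximating what XSoftmax.apply(scores, mask, dim) computes.
    rmask = ~mask.bool()
    scores = scores.masked_fill(rmask, torch.finfo(scores.dtype).min)
    probs = torch.softmax(scores, dim=dim)
    return probs.masked_fill(rmask, 0.0)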
6 changes: 5 additions & 1 deletion DeBERTa/deberta/ops.py
@@ -115,7 +115,11 @@ def backward(ctx, grad_output):
else:
return grad_output, None

class StableDropout(torch.nn.Module):
class StableDropout(torch.nn.Dropout):
def __init__(self, drop_prob):
# Forward the configured probability; otherwise torch.nn.Dropout's default p=0.5 is silently used.
super().__init__(drop_prob)

class StableDropout1(torch.nn.Module):
""" Optimized dropout module for stabilizing the training

Args:
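With the simplified class above, StableDropout becomes a thin wrapper around torch.nn.Dropout. A minimal sanity check, assuming the patched DeBERTa/deberta/ops.py is importable, might look like:

import torch
from DeBERTa.deberta.ops import StableDropout

drop = StableDropout(0.1)           # behaves like torch.nn.Dropout(p=0.1)
x = torch.randn(2, 4)
drop.train()
y = drop(x)                         # some elements zeroed, survivors scaled by 1/(1-0.1)
drop.eval()
assert torch.equal(drop(x), x)      # dropout is a no-op at inference time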