1 change: 1 addition & 0 deletions .gitignore
@@ -127,3 +127,4 @@ dmypy.json

# Pyre type checker
.pyre/
tmp/
2 changes: 1 addition & 1 deletion DeBERTa/apps/multi_choice.py
@@ -27,7 +27,7 @@ def __init__(self, config, num_labels = 2, drop_out=None, **kwargs):
self.num_labels = num_labels
self.classifier = nn.Linear(config.hidden_size, 1)
drop_out = config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = StableDropout(drop_out)
self.dropout = StableDropout(drop_out) if config.use_xdropout else nn.Dropout(drop_out)
self.apply(self.init_weights)

def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, position_ids=None, **kwargs):
2 changes: 1 addition & 1 deletion DeBERTa/apps/ner.py
@@ -27,7 +27,7 @@ def __init__(self, config, num_labels = 2, drop_out=None, **kwargs):
self.proj = nn.Linear(config.hidden_size, config.hidden_size)
self.classifier = nn.Linear(config.hidden_size, self.num_labels)
drop_out = config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = StableDropout(drop_out)
self.dropout = StableDropout(drop_out) if config.use_xdropout else nn.Dropout(drop_out)
self.apply(self.init_weights)

def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, position_ids=None, **kwargs):
195 changes: 195 additions & 0 deletions DeBERTa/apps/orttrain.py
@@ -0,0 +1,195 @@
# Copyright (c) Microsoft, Inc. 2020
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

import os
import argparse
import random

import numpy as np
import torch
from ..deberta import GPT2Tokenizer, DebertaPreTrainedTokenizer
from ..onnx import ORTGlueTest
from ..utils import *
from .task_registry import tasks
from onnxruntime.capi._pybind_state import get_mpi_context_local_rank, get_mpi_context_local_size, get_mpi_context_world_rank, get_mpi_context_world_size

def create_model(args, num_labels, model_class_fn):
# Prepare model
rank = getattr(args, 'rank', 0)
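# Only rank 0 passes the initial checkpoint path; other ranks build the model with init_model=None.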
init_model = args.init_model if rank<1 else None
model = model_class_fn(init_model, args.model_config, num_labels=num_labels, \
drop_out=args.cls_drop_out, \
pre_trained = args.pre_trained)
if args.fp16:
model = model.half()
return model

def main(args):
os.makedirs(args.output_dir, exist_ok=True)
logger.info("Using seed " + str(args.seed))
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# load model based on task
tokenizer = GPT2Tokenizer()
processor = tasks[args.task_name.lower()](tokenizer = tokenizer, max_seq_len = args.max_seq_length, data_dir = args.data_dir)
label_list = processor.get_labels()
model_class_fn = processor.get_model_class_fn()
model = create_model(args, len(label_list), model_class_fn)
logger.info("Model config {}".format(model.config))

# train with ORT
test = ORTGlueTest()
test.setUp(args)
test.local_rank = get_mpi_context_local_rank()
test.world_size = get_mpi_context_world_size()
print("mpirun launch, local_rank / world_size: ", test.local_rank, test.world_size)
os.environ['RANK'] = str(test.local_rank)
os.environ['WORLD_SIZE'] = str(test.world_size)
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '29501'
test.model = model
test.tokenizer = DebertaPreTrainedTokenizer()
test.run_glue(task_name=args.task_name, fp16=False, use_new_api=True)

def build_argument_parser():
parser = argparse.ArgumentParser()

## Required parameters
parser.add_argument("--data_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
parser.add_argument("--task_name",
default=None,
type=str,
required=True,
help="The name of the task to train.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints will be written.")
parser.add_argument("--cache_dir",
default=None,
type=str,
required=True,
help="The directory to store the pretrained models downloaded from s3.")
## Other parameters
parser.add_argument("--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--eval_batch_size",
default=32,
type=int,
help="Total batch size for eval.")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument('--seed',
type=int,
default=random.randint(0, 2**32 - 1),
help="random seed for initialization")
parser.add_argument('--fp16',
default=False,
type=boolean_string,
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--init_model',
type=str,
help="The model state file used to initialize the model weights.")
parser.add_argument('--pre_trained',
default=None,
type=str,
help="The path of pre-trained RoBERTa model")

## TBD: review params below
parser.add_argument("--max_grad_norm",
default=1,
type=float,
help="The clip threshold of global gradient norm")
parser.add_argument("--epsilon",
default=1e-6,
type=float,
help="epsilon setting for Adam.")
parser.add_argument("--adam_beta1",
default=0.9,
type=float,
help="The beta1 parameter for Adam.")
parser.add_argument("--adam_beta2",
default=0.999,
type=float,
help="The beta2 parameter for Adam.")
parser.add_argument("--warmup_proportion",
default=0.1,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--lr_schedule_ends",
default=0,
type=float,
help="The ended learning rate scale for learning rate scheduling")
parser.add_argument("--lr_schedule",
default='warmup_linear',
type=str,
help="The learning rate scheduler used for traning. "
"E.g. warmup_linear, warmup_linear_shift, warmup_cosine, warmup_constant. Default, warmup_linear")
parser.add_argument('--accumulative_update',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument('--loss_scale',
type=float, default=256,
help='Loss scaling; positive powers of 2 can improve fp16 convergence.')
parser.add_argument('--scale_steps',
type=int, default=1000,
help='The steps to wait to increase the loss scale.')
parser.add_argument('--model_config',
type=str,
help="The config file of bert model.")
parser.add_argument('--cls_drop_out',
type=float,
default=None,
help="The config file model initialization and fine tuning.")
parser.add_argument('--weight_decay',
type=float,
default=0.01,
help="The weight decay rate")
parser.add_argument('--opt_type',
type=str.lower,
default='adam',
choices=['adam', 'admax'],
help="The optimizer to be used.")
return parser

if __name__ == "__main__":
parser = build_argument_parser()
args = parser.parse_args()
logger = set_logger(args.task_name, os.path.join(args.output_dir, 'training_{}.log'.format(args.task_name)))
logger.info(args)
try:
main(args)
except Exception as ex:
try:
logger.exception('Uncaught exception during execution.')
import atexit
atexit._run_exitfuncs()
except:
pass
os._exit(-1)
6 changes: 3 additions & 3 deletions DeBERTa/apps/sequence_classification.py
@@ -35,7 +35,7 @@ def __init__(self, config, num_labels=2, drop_out=None, pre_trained=None):

self.classifier = torch.nn.Linear(output_dim, num_labels)
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = StableDropout(drop_out)
self.dropout = StableDropout(drop_out) if config.use_xdropout else torch.nn.Dropout(drop_out)
self.apply(self.init_weights)
self.bert.apply_state()

@@ -46,7 +46,7 @@ def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, positi
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)

loss = 0
loss = torch.tensor(0).to(logits)
if labels is not None:
if self.num_labels ==1:
# regression task
@@ -68,4 +68,4 @@ def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, positi
label_confidence = 1
loss = -((log_softmax(logits)*labels).sum(-1)*label_confidence).mean()

return (logits,loss)
return (loss, logits)
6 changes: 3 additions & 3 deletions DeBERTa/apps/train.py
@@ -55,7 +55,7 @@ def eval_fn(trainer, model, device, tag):
return eval_metric

def loss_fn(trainer, model, data):
_, loss = model(**data)
loss, _ = model(**data)
return loss.mean(), data['input_ids'].size(0)

trainer = DistributedTrainer(args, model, device, data_fn, loss_fn = loss_fn, eval_fn = eval_fn, dump_interval = args.dump_interval)
@@ -160,7 +160,7 @@ def run_eval(args, model, device, eval_data, prefix=None, tag=None, steps=None):
for batch in tqdm(AsyncDataLoader(eval_dataloader), ncols=80, desc='Evaluating: {}'.format(prefix), disable=no_tqdm):
batch = batch_to(batch, device)
with torch.no_grad():
logits, tmp_eval_loss = model(**batch)
tmp_eval_loss, logits = model(**batch)
label_ids = batch['labels'].to(device)
predicts.append(logits)
labels.append(label_ids)
@@ -195,7 +195,7 @@ def run_predict(args, model, device, eval_data, prefix=None):
for batch in tqdm(AsyncDataLoader(eval_dataloader), ncols=80, desc='Evaluating: {}'.format(prefix), disable=args.rank>0):
batch = batch_to(batch, device)
with torch.no_grad():
logits, _ = model(**batch)
_, logits = model(**batch)
if args.world_size>1:
logits_all = [torch.zeros_like(logits) for _ in range(args.world_size)]
torch.distributed.all_gather(logits_all, logits)
2 changes: 1 addition & 1 deletion DeBERTa/deberta/__init__.py
@@ -17,5 +17,5 @@
from .disentangled_attention import *
from .ops import *
from .bert import *
from .gpt2_tokenizer import GPT2Tokenizer
from .gpt2_tokenizer import GPT2Tokenizer, DebertaPreTrainedTokenizer
from .config import *
9 changes: 5 additions & 4 deletions DeBERTa/deberta/bert.py
@@ -63,7 +63,7 @@ def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps)
self.dropout = StableDropout(config.hidden_dropout_prob)
self.dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else nn.Dropout(config.hidden_dropout_prob)
self.config = config

def forward(self, hidden_states, input_states, mask=None):
@@ -110,7 +110,7 @@ def __init__(self, config):
super(BertOutput, self).__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps)
self.dropout = StableDropout(config.hidden_dropout_prob)
self.dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else nn.Dropout(config.hidden_dropout_prob)
self.config = config

def forward(self, hidden_states, input_states, mask=None):
@@ -145,6 +145,7 @@ class BertEncoder(nn.Module):
def __init__(self, config):
super().__init__()
layer = BertLayer(config)
# The number of stacked transformer layers is set here via config.num_hidden_layers
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
self.relative_attention = getattr(config, 'relative_attention', False)
if self.relative_attention:
@@ -160,7 +161,7 @@ def get_attention_mask(self, attention_mask):
if attention_mask.dim()<=2:
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
attention_mask = extended_attention_mask*extended_attention_mask.squeeze(-2).unsqueeze(-1)
attention_mask = attention_mask.byte()
attention_mask = attention_mask.int()
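# Note: the mask is now cast to int rather than byte, presumably for better operator support when training/exporting with ORT.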
elif attention_mask.dim()==3:
attention_mask = attention_mask.unsqueeze(1)

@@ -229,7 +230,7 @@ def __init__(self, config):
if self.embedding_size != config.hidden_size:
self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps)
self.dropout = StableDropout(config.hidden_dropout_prob)
self.dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else nn.Dropout(config.hidden_dropout_prob)
self.output_to_half = False
self.config = config

2 changes: 2 additions & 0 deletions DeBERTa/deberta/config.py
@@ -15,6 +15,8 @@ def from_dict(cls, json_object):
if isinstance(value, dict):
value = AbsModelConfig.from_dict(value)
config.__dict__[key] = value
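# New flags, defaulting to the standard PyTorch dropout and softmax paths; set to True to enable the custom XDropout/XSoftmax ops.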
config.use_xdropout = False
config.use_xsoftmax = False
return config

@classmethod
11 changes: 8 additions & 3 deletions DeBERTa/deberta/disentangled_attention.py
@@ -92,14 +92,15 @@ def __init__(self, config):
self.max_relative_positions = getattr(config, 'max_relative_positions', -1)
if self.max_relative_positions <1:
self.max_relative_positions = config.max_position_embeddings
self.pos_dropout = StableDropout(config.hidden_dropout_prob)
self.pos_dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else torch.nn.Dropout(config.hidden_dropout_prob)

if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type:
self.pos_proj = torch.nn.Linear(config.hidden_size, self.all_head_size, bias=False)
if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type:
self.pos_q_proj = torch.nn.Linear(config.hidden_size, self.all_head_size)

self.dropout = StableDropout(config.attention_probs_dropout_prob)
self.dropout = StableDropout(config.attention_probs_dropout_prob) if config.use_xdropout else torch.nn.Dropout(config.attention_probs_dropout_prob)
self.use_xsoftmax = config.use_xsoftmax

def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1)
@@ -174,7 +175,11 @@ def linear(w,b,x):
if self.talking_head:
attention_scores = self.head_logits_proj(attention_scores.permute(0,2,3,1)).permute(0,3,1,2)

attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
if self.use_xsoftmax:
attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
else:
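# Without XSoftmax, emulate masking by adding a large negative bias (-10000) to positions where the mask is 0 before a standard softmax.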
softmax = torch.nn.Softmax(dim=-1)
attention_probs = softmax(attention_scores + 10000.0 * (attention_mask - 1))
attention_probs = self.dropout(attention_probs)
if self.talking_head:
attention_probs = self.head_weights_proj(attention_probs.permute(0,2,3,1)).permute(0,3,1,2)