1 change: 1 addition & 0 deletions .gitignore
@@ -127,3 +127,4 @@ dmypy.json

# Pyre type checker
.pyre/
tmp/
2 changes: 1 addition & 1 deletion DeBERTa/apps/multi_choice.py
@@ -27,7 +27,7 @@ def __init__(self, config, num_labels = 2, drop_out=None, **kwargs):
self.num_labels = num_labels
self.classifier = nn.Linear(config.hidden_size, 1)
drop_out = config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = StableDropout(drop_out)
self.dropout = StableDropout(drop_out) if config.use_xdropout else nn.Dropout(drop_out)
self.apply(self.init_weights)

def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, position_ids=None, **kwargs):
2 changes: 1 addition & 1 deletion DeBERTa/apps/ner.py
@@ -27,7 +27,7 @@ def __init__(self, config, num_labels = 2, drop_out=None, **kwargs):
self.proj = nn.Linear(config.hidden_size, config.hidden_size)
self.classifier = nn.Linear(config.hidden_size, self.num_labels)
drop_out = config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = StableDropout(drop_out)
self.dropout = StableDropout(drop_out) if config.use_xdropout else nn.Dropout(drop_out)
self.apply(self.init_weights)

def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, position_ids=None, **kwargs):
195 changes: 195 additions & 0 deletions DeBERTa/apps/orttrain.py
@@ -0,0 +1,195 @@
# Copyright (c) Microsoft, Inc. 2020
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

import os
import argparse
import random

import numpy as np
import torch
from ..deberta import GPT2Tokenizer, DebertaPreTrainedTokenizer
from ..onnx import ORTGlueTest
from ..utils import *
from .task_registry import tasks
from onnxruntime.capi._pybind_state import get_mpi_context_local_rank, get_mpi_context_local_size, get_mpi_context_world_rank, get_mpi_context_world_size

def create_model(args, num_labels, model_class_fn):
# Prepare model
rank = getattr(args, 'rank', 0)
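# Only rank 0 passes the initial checkpoint path; other ranks build the model with init_model=None.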
init_model = args.init_model if rank<1 else None
model = model_class_fn(init_model, args.model_config, num_labels=num_labels, \
drop_out=args.cls_drop_out, \
pre_trained = args.pre_trained)
if args.fp16:
model = model.half()
return model

def main(args):
os.makedirs(args.output_dir, exist_ok=True)
logger.info("Using seed " + str(args.seed))
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# load model based on task
tokenizer = GPT2Tokenizer()
processor = tasks[args.task_name.lower()](tokenizer = tokenizer, max_seq_len = args.max_seq_length, data_dir = args.data_dir)
label_list = processor.get_labels()
model_class_fn = processor.get_model_class_fn()
model = create_model(args, len(label_list), model_class_fn)
logger.info("Model config {}".format(model.config))

# train with ORT
test = ORTGlueTest()
test.setUp(args)
test.local_rank = get_mpi_context_local_rank()
test.world_size = get_mpi_context_world_size()
print("mpirun launch, local_rank / world_size: ", test.local_rank, test.world_size)
os.environ['RANK'] = str(test.local_rank)
os.environ['WORLD_SIZE'] = str(test.world_size)
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '29501'
test.model = model
test.tokenizer = DebertaPreTrainedTokenizer()
test.run_glue(task_name=args.task_name, fp16=False, use_new_api=True)

def build_argument_parser():
parser = argparse.ArgumentParser()

## Required parameters
parser.add_argument("--data_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
parser.add_argument("--task_name",
default=None,
type=str,
required=True,
help="The name of the task to train.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints will be written.")
parser.add_argument("--cache_dir",
default=None,
type=str,
required=True,
help="The directory to store the pretrained models downloaded from s3.")
## Other parameters
parser.add_argument("--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--eval_batch_size",
default=32,
type=int,
help="Total batch size for eval.")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument('--seed',
type=int,
default=random.randint(0, 2**32 - 1),
help="random seed for initialization")
parser.add_argument('--fp16',
default=False,
type=boolean_string,
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--init_model',
type=str,
help="The model state file used to initialize the model weights.")
parser.add_argument('--pre_trained',
default=None,
type=str,
help="The path of pre-trained RoBERTa model")

## TBD: review params below
parser.add_argument("--max_grad_norm",
default=1,
type=float,
help="The clip threshold of global gradient norm")
parser.add_argument("--epsilon",
default=1e-6,
type=float,
help="epsilon setting for Adam.")
parser.add_argument("--adam_beta1",
default=0.9,
type=float,
help="The beta1 parameter for Adam.")
parser.add_argument("--adam_beta2",
default=0.999,
type=float,
help="The beta2 parameter for Adam.")
parser.add_argument("--warmup_proportion",
default=0.1,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--lr_schedule_ends",
default=0,
type=float,
help="The ended learning rate scale for learning rate scheduling")
parser.add_argument("--lr_schedule",
default='warmup_linear',
type=str,
help="The learning rate scheduler used for traning. "
"E.g. warmup_linear, warmup_linear_shift, warmup_cosine, warmup_constant. Default, warmup_linear")
parser.add_argument('--accumulative_update',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument('--loss_scale',
type=float, default=256,
help='Loss scaling; positive powers of 2 can improve fp16 convergence.')
parser.add_argument('--scale_steps',
type=int, default=1000,
help='The steps to wait to increase the loss scale.')
parser.add_argument('--model_config',
type=str,
help="The config file of bert model.")
parser.add_argument('--cls_drop_out',
type=float,
default=None,
help="The config file model initialization and fine tuning.")
parser.add_argument('--weight_decay',
type=float,
default=0.01,
help="The weight decay rate")
parser.add_argument('--opt_type',
type=str.lower,
default='adam',
choices=['adam', 'admax'],
help="The optimizer to be used.")
return parser

if __name__ == "__main__":
parser = build_argument_parser()
args = parser.parse_args()
logger = set_logger(args.task_name, os.path.join(args.output_dir, 'training_{}.log'.format(args.task_name)))
logger.info(args)
try:
main(args)
except Exception as ex:
try:
logger.exception('Uncaught exception during execution.')
import atexit
atexit._run_exitfuncs()
except:
pass
os._exit(-1)
6 changes: 3 additions & 3 deletions DeBERTa/apps/sequence_classification.py
@@ -35,7 +35,7 @@ def __init__(self, config, num_labels=2, drop_out=None, pre_trained=None):

self.classifier = torch.nn.Linear(output_dim, num_labels)
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = StableDropout(drop_out)
self.dropout = StableDropout(drop_out) if config.use_xdropout else torch.nn.Dropout(drop_out)
self.apply(self.init_weights)
self.bert.apply_state()

@@ -46,7 +46,7 @@ def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, positi
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)

loss = 0
loss = torch.tensor(0).to(logits)
if labels is not None:
if self.num_labels ==1:
# regression task
@@ -68,4 +68,4 @@ def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, positi
label_confidence = 1
loss = -((log_softmax(logits)*labels).sum(-1)*label_confidence).mean()

return (logits,loss)
return (loss, logits)
6 changes: 3 additions & 3 deletions DeBERTa/apps/train.py
@@ -55,7 +55,7 @@ def eval_fn(trainer, model, device, tag):
return eval_metric

def loss_fn(trainer, model, data):
_, loss = model(**data)
loss, _ = model(**data)
return loss.mean(), data['input_ids'].size(0)

trainer = DistributedTrainer(args, model, device, data_fn, loss_fn = loss_fn, eval_fn = eval_fn, dump_interval = args.dump_interval)
@@ -160,7 +160,7 @@ def run_eval(args, model, device, eval_data, prefix=None, tag=None, steps=None):
for batch in tqdm(AsyncDataLoader(eval_dataloader), ncols=80, desc='Evaluating: {}'.format(prefix), disable=no_tqdm):
batch = batch_to(batch, device)
with torch.no_grad():
logits, tmp_eval_loss = model(**batch)
tmp_eval_loss, logits = model(**batch)
label_ids = batch['labels'].to(device)
predicts.append(logits)
labels.append(label_ids)
@@ -195,7 +195,7 @@ def run_predict(args, model, device, eval_data, prefix=None):
for batch in tqdm(AsyncDataLoader(eval_dataloader), ncols=80, desc='Evaluating: {}'.format(prefix), disable=args.rank>0):
batch = batch_to(batch, device)
with torch.no_grad():
logits, _ = model(**batch)
_, logits = model(**batch)
if args.world_size>1:
logits_all = [torch.zeros_like(logits) for _ in range(args.world_size)]
torch.distributed.all_gather(logits_all, logits)
2 changes: 1 addition & 1 deletion DeBERTa/deberta/__init__.py
@@ -17,5 +17,5 @@
from .disentangled_attention import *
from .ops import *
from .bert import *
from .gpt2_tokenizer import GPT2Tokenizer
from .gpt2_tokenizer import GPT2Tokenizer, DebertaPreTrainedTokenizer
from .config import *
9 changes: 5 additions & 4 deletions DeBERTa/deberta/bert.py
@@ -63,7 +63,7 @@ def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps)
self.dropout = StableDropout(config.hidden_dropout_prob)
self.dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else nn.Dropout(config.hidden_dropout_prob)
self.config = config

def forward(self, hidden_states, input_states, mask=None):
@@ -110,7 +110,7 @@ def __init__(self, config):
super(BertOutput, self).__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps)
self.dropout = StableDropout(config.hidden_dropout_prob)
self.dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else nn.Dropout(config.hidden_dropout_prob)
self.config = config

def forward(self, hidden_states, input_states, mask=None):
@@ -145,6 +145,7 @@ class BertEncoder(nn.Module):
def __init__(self, config):
super().__init__()
layer = BertLayer(config)
# The number of stacked transformer layers is set here via config.num_hidden_layers
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
self.relative_attention = getattr(config, 'relative_attention', False)
if self.relative_attention:
@@ -160,7 +161,7 @@ def get_attention_mask(self, attention_mask):
if attention_mask.dim()<=2:
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
attention_mask = extended_attention_mask*extended_attention_mask.squeeze(-2).unsqueeze(-1)
attention_mask = attention_mask.byte()
attention_mask = attention_mask.int()
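# Note: the mask is now cast to int rather than byte, presumably for better operator support when training/exporting with ORT.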
elif attention_mask.dim()==3:
attention_mask = attention_mask.unsqueeze(1)

@@ -229,7 +230,7 @@ def __init__(self, config):
if self.embedding_size != config.hidden_size:
self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps)
self.dropout = StableDropout(config.hidden_dropout_prob)
self.dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else nn.Dropout(config.hidden_dropout_prob)
self.output_to_half = False
self.config = config

2 changes: 2 additions & 0 deletions DeBERTa/deberta/config.py
@@ -15,6 +15,8 @@ def from_dict(cls, json_object):
if isinstance(value, dict):
value = AbsModelConfig.from_dict(value)
config.__dict__[key] = value
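# New flags, defaulting to the standard PyTorch dropout and softmax paths; set to True to enable the custom XDropout/XSoftmax ops.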
config.use_xdropout = False
config.use_xsoftmax = False
return config

@classmethod
11 changes: 8 additions & 3 deletions DeBERTa/deberta/disentangled_attention.py
@@ -92,14 +92,15 @@ def __init__(self, config):
self.max_relative_positions = getattr(config, 'max_relative_positions', -1)
if self.max_relative_positions <1:
self.max_relative_positions = config.max_position_embeddings
self.pos_dropout = StableDropout(config.hidden_dropout_prob)
self.pos_dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else torch.nn.Dropout(config.hidden_dropout_prob)

if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type:
self.pos_proj = torch.nn.Linear(config.hidden_size, self.all_head_size, bias=False)
if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type:
self.pos_q_proj = torch.nn.Linear(config.hidden_size, self.all_head_size)

self.dropout = StableDropout(config.attention_probs_dropout_prob)
self.dropout = StableDropout(config.attention_probs_dropout_prob) if config.use_xdropout else torch.nn.Dropout(config.attention_probs_dropout_prob)
self.use_xsoftmax = config.use_xsoftmax

def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1)
@@ -174,7 +175,11 @@ def linear(w,b,x):
if self.talking_head:
attention_scores = self.head_logits_proj(attention_scores.permute(0,2,3,1)).permute(0,3,1,2)

attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
if self.use_xsoftmax:
attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
else:
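# Without XSoftmax, emulate masking by adding a large negative bias (-10000) to positions where the mask is 0 before a standard softmax.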
softmax = torch.nn.Softmax(dim=-1)
attention_probs = softmax(attention_scores + 10000.0 * (attention_mask - 1))
attention_probs = self.dropout(attention_probs)
if self.talking_head:
attention_probs = self.head_weights_proj(attention_probs.permute(0,2,3,1)).permute(0,3,1,2)