From 3bac78541e90bea845450ad3886e7f2ee1eab741 Mon Sep 17 00:00:00 2001 From: NoBugsHere Date: Thu, 11 Nov 2021 12:44:21 +0300 Subject: [PATCH 1/5] add roberta support --- .../squad/refactor_squad_torch_bert.json | 175 ++++++++++++++++++ .../preprocessors/squad_preprocessor.py | 5 +- .../torch_bert/torch_transformers_squad.py | 6 +- 3 files changed, 183 insertions(+), 3 deletions(-) create mode 100644 deeppavlov/configs/squad/refactor_squad_torch_bert.json diff --git a/deeppavlov/configs/squad/refactor_squad_torch_bert.json b/deeppavlov/configs/squad/refactor_squad_torch_bert.json new file mode 100644 index 0000000000..76634c056f --- /dev/null +++ b/deeppavlov/configs/squad/refactor_squad_torch_bert.json @@ -0,0 +1,175 @@ +{ + "dataset_reader": { + "class_name": "squad_dataset_reader", + "data_path": "{DOWNLOADS_PATH}/squad/" + }, + "dataset_iterator": { + "class_name": "squad_iterator", + "seed": 1337, + "shuffle": true + }, + "chainer": { + "in": [ + "context_raw", + "question_raw" + ], + "in_y": [ + "ans_raw", + "ans_raw_start" + ], + "pipe": [ + { + "class_name": "torch_squad_transformers_preprocessor", + "vocab_file": "{TRANSFORMER}", + "do_lower_case": "{LOWERCASE}", + "max_seq_length": 768, + "return_tokens": true, + "in": [ + "question_raw", + "context_raw" + ], + "out": [ + "bert_features", + "subtokens" + ] + }, + { + "class_name": "squad_bert_mapping", + "do_lower_case": "{LOWERCASE}", + "in": [ + "context_raw", + "bert_features", + "subtokens" + ], + "out": [ + "subtok2chars", + "char2subtoks" + ] + }, + { + "class_name": "squad_bert_ans_preprocessor", + "do_lower_case": "{LOWERCASE}", + "in": [ + "ans_raw", + "ans_raw_start", + "char2subtoks" + ], + "out": [ + "ans", + "ans_start", + "ans_end" + ] + }, + { + "class_name": "torch_transformers_squad", + "pretrained_bert": "{TRANSFORMER}", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 2e-05, + "weight_decay": 0.01, + "betas": [ + 0.9, + 0.999 + ], + "eps": 1e-06 + }, + "learning_rate_drop_patience": 2, + "learning_rate_drop_div": 2.0, + "in": [ + "bert_features" + ], + "in_y": [ + "ans_start", + "ans_end" + ], + "out": [ + "ans_start_predicted", + "ans_end_predicted", + "logits" + ] + }, + { + "class_name": "squad_bert_ans_postprocessor", + "in": [ + "ans_start_predicted", + "ans_end_predicted", + "context_raw", + "bert_features", + "subtok2chars", + "subtokens" + ], + "out": [ + "ans_predicted", + "ans_start_predicted", + "ans_end_predicted" + ] + } + ], + "out": [ + "ans_predicted", + "ans_start_predicted", + "logits" + ] + }, + "train": { + "show_examples": false, + "evaluation_targets": [ + "valid" + ], + "log_every_n_batches": 250, + "val_every_n_batches": 500, + "batch_size": 10, + "pytest_max_batches": 2, + "pytest_batch_size": 5, + "validation_patience": 10, + "metrics": [ + { + "name": "squad_v1_f1", + "inputs": [ + "ans", + "ans_predicted" + ] + }, + { + "name": "squad_v1_em", + "inputs": [ + "ans", + "ans_predicted" + ] + }, + { + "name": "squad_v2_f1", + "inputs": [ + "ans", + "ans_predicted" + ] + }, + { + "name": "squad_v2_em", + "inputs": [ + "ans", + "ans_predicted" + ] + } + ], + "class_name": "torch_trainer" + }, + "metadata": { + "variables": { + "LOWERCASE": true, + "TRANSFORMER": "roberta-base", + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/squad_torch_bert/{TRANSFORMER}" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/v1/squad/squad_torch_bert.tar.gz", + "subdir": "{ROOT_PATH}/models" + } + ] + } +} diff --git a/deeppavlov/models/preprocessors/squad_preprocessor.py b/deeppavlov/models/preprocessors/squad_preprocessor.py index c342902d4f..1dc65519cc 100644 --- a/deeppavlov/models/preprocessors/squad_preprocessor.py +++ b/deeppavlov/models/preprocessors/squad_preprocessor.py @@ -404,7 +404,10 @@ def __call__(self, contexts, bert_features, *args, **kwargs): subtokens = args[0][batch_counter] else: subtokens = features.tokens - context_start = subtokens.index('[SEP]') + 1 + if 'SEP' in subtokens: + context_start = subtokens.index('[SEP]') + 1 + else: + context_start = subtokens.index('') + 1 idx = 0 subtok2char: Dict[int, int] = {} char2subtok: Dict[int, int] = {} diff --git a/deeppavlov/models/torch_bert/torch_transformers_squad.py b/deeppavlov/models/torch_bert/torch_transformers_squad.py index 9506ce924e..9579fa6e02 100644 --- a/deeppavlov/models/torch_bert/torch_transformers_squad.py +++ b/deeppavlov/models/torch_bert/torch_transformers_squad.py @@ -128,7 +128,7 @@ def train_on_batch(self, features: List[InputFeatures], y_st: List[List[int]], y y_end = [x[0] for x in y_end] b_y_st = torch.from_numpy(np.array(y_st)).to(self.device) b_y_end = torch.from_numpy(np.array(y_end)).to(self.device) - + input_ = { 'input_ids': b_input_ids, 'attention_mask': b_input_masks, @@ -184,7 +184,9 @@ def __call__(self, features: List[InputFeatures]) -> Tuple[List[int], List[int], b_input_ids = torch.cat(input_ids, dim=0).to(self.device) b_input_masks = torch.cat(input_masks, dim=0).to(self.device) b_input_type_ids = torch.cat(input_type_ids, dim=0).to(self.device) - + if self.pretrained_bert == 'roberta-base': + b_input_type_ids = b_input_type_ids.unsqueeze(1).expand(-1, b_input_ids.shape[-1]) + input_ = { 'input_ids': b_input_ids, 'attention_mask': b_input_masks, From 268425373ab6fb27dd1f3c0f219e08f984f32d7e Mon Sep 17 00:00:00 2001 From: NoBugsHere Date: Fri, 19 Nov 2021 14:41:53 +0300 Subject: [PATCH 2/5] Update .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index c021e7d41d..659aa1078c 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,6 @@ download/ /Dockerfile /entrypoint.sh /.dockerignore + +# zhenya test +/zhenya_test From 978509bd87e3c50b9c2fe4a9cf07b152e6dec518 Mon Sep 17 00:00:00 2001 From: NoBugsHere Date: Fri, 19 Nov 2021 15:08:50 +0300 Subject: [PATCH 3/5] Revert "Update .gitignore" This reverts commit 268425373ab6fb27dd1f3c0f219e08f984f32d7e. --- .gitignore | 3 --- 1 file changed, 3 deletions(-) diff --git a/.gitignore b/.gitignore index 659aa1078c..c021e7d41d 100644 --- a/.gitignore +++ b/.gitignore @@ -130,6 +130,3 @@ download/ /Dockerfile /entrypoint.sh /.dockerignore - -# zhenya test -/zhenya_test From aa39a0aa5979c4d380394b06695a3ddb77b4fb3c Mon Sep 17 00:00:00 2001 From: NoBugsHere Date: Fri, 19 Nov 2021 15:39:42 +0300 Subject: [PATCH 4/5] add other encoders support --- deeppavlov/configs/squad/refactor_squad_torch_bert.json | 4 ++-- deeppavlov/models/preprocessors/squad_preprocessor.py | 2 +- deeppavlov/models/torch_bert/torch_transformers_squad.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/deeppavlov/configs/squad/refactor_squad_torch_bert.json b/deeppavlov/configs/squad/refactor_squad_torch_bert.json index 76634c056f..a1be8baa09 100644 --- a/deeppavlov/configs/squad/refactor_squad_torch_bert.json +++ b/deeppavlov/configs/squad/refactor_squad_torch_bert.json @@ -22,7 +22,7 @@ "class_name": "torch_squad_transformers_preprocessor", "vocab_file": "{TRANSFORMER}", "do_lower_case": "{LOWERCASE}", - "max_seq_length": 768, + "max_seq_length": 384, "return_tokens": true, "in": [ "question_raw", @@ -159,7 +159,7 @@ "metadata": { "variables": { "LOWERCASE": true, - "TRANSFORMER": "roberta-base", + "TRANSFORMER": "allenai/longformer-base-4096", "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", diff --git a/deeppavlov/models/preprocessors/squad_preprocessor.py b/deeppavlov/models/preprocessors/squad_preprocessor.py index 1dc65519cc..628f30d9d9 100644 --- a/deeppavlov/models/preprocessors/squad_preprocessor.py +++ b/deeppavlov/models/preprocessors/squad_preprocessor.py @@ -404,7 +404,7 @@ def __call__(self, contexts, bert_features, *args, **kwargs): subtokens = args[0][batch_counter] else: subtokens = features.tokens - if 'SEP' in subtokens: + if '[SEP]' in subtokens: context_start = subtokens.index('[SEP]') + 1 else: context_start = subtokens.index('') + 1 diff --git a/deeppavlov/models/torch_bert/torch_transformers_squad.py b/deeppavlov/models/torch_bert/torch_transformers_squad.py index 9579fa6e02..aa5d4c2636 100644 --- a/deeppavlov/models/torch_bert/torch_transformers_squad.py +++ b/deeppavlov/models/torch_bert/torch_transformers_squad.py @@ -184,7 +184,7 @@ def __call__(self, features: List[InputFeatures]) -> Tuple[List[int], List[int], b_input_ids = torch.cat(input_ids, dim=0).to(self.device) b_input_masks = torch.cat(input_masks, dim=0).to(self.device) b_input_type_ids = torch.cat(input_type_ids, dim=0).to(self.device) - if self.pretrained_bert == 'roberta-base': + if any(x in self.pretrained_bert for x in ['roberta', 'distilbert', 'bart', 'longformer']): b_input_type_ids = b_input_type_ids.unsqueeze(1).expand(-1, b_input_ids.shape[-1]) input_ = { From e4ad3baddca3688ddb1e11218ba59a982f4c1f89 Mon Sep 17 00:00:00 2001 From: NoBugsHere Date: Thu, 25 Nov 2021 23:17:18 +0300 Subject: [PATCH 5/5] add other models support for training --- deeppavlov/models/torch_bert/torch_transformers_squad.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deeppavlov/models/torch_bert/torch_transformers_squad.py b/deeppavlov/models/torch_bert/torch_transformers_squad.py index aa5d4c2636..af4a67ab20 100644 --- a/deeppavlov/models/torch_bert/torch_transformers_squad.py +++ b/deeppavlov/models/torch_bert/torch_transformers_squad.py @@ -123,6 +123,8 @@ def train_on_batch(self, features: List[InputFeatures], y_st: List[List[int]], y b_input_ids = torch.cat(input_ids, dim=0).to(self.device) b_input_masks = torch.cat(input_masks, dim=0).to(self.device) b_input_type_ids = torch.cat(input_type_ids, dim=0).to(self.device) + if any(x in self.pretrained_bert for x in ['roberta', 'distilbert', 'bart', 'longformer']): + b_input_type_ids = b_input_type_ids.unsqueeze(1).expand(-1, b_input_ids.shape[-1]) y_st = [x[0] for x in y_st] y_end = [x[0] for x in y_end]