Skip to content
This repository was archived by the owner on Apr 8, 2025. It is now read-only.

Commit 18e7fc7

Browse files
Timoeller, bogdankostic, brandenchan, and tholor
authored
WIP: Simplify processors - add Fasttokenizers (#649)
* increase transformers version
* Make fast tokenizers possible
* refactor QA processing
* Move all fcts into dataset from dicts for QA
* refactor doc classification
* refactor bert_style_lm
* refactor inference_processor

Co-authored-by: Bogdan Kostić <[email protected]>
Co-authored-by: brandenchan <[email protected]>
Co-authored-by: Malte Pietsch <[email protected]>
1 parent fa08f9d commit 18e7fc7

30 files changed

+2516
-1527
lines changed

azure-pipelines.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,7 @@ trigger:
1010
pr:
1111
branches:
1212
include:
13-
- '*'
14-
13+
- '*'
1514
jobs:
1615
- job: 'Test'
1716
pool:

examples/lm_finetuning.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,22 +19,22 @@ def lm_finetuning():
1919
datefmt="%m/%d/%Y %H:%M:%S",
2020
level=logging.INFO,
2121
)
22-
22+
next_sent_pred_style = "bert-style"
23+
next_sent_pred=True
2324
set_all_seeds(seed=42)
2425
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
2526
ml_logger.init_experiment(
26-
experiment_name="Public_FARM", run_name="Run_minimal_example_lm"
27+
experiment_name="LM_refactoring", run_name=f"new, nsp: {next_sent_pred}, {next_sent_pred_style}"
2728
)
2829
##########################
2930
########## Settings
3031
##########################
31-
device, n_gpu = initialize_device_settings(use_cuda=False)
32+
device, n_gpu = initialize_device_settings(use_cuda=True)
3233
n_epochs = 1
3334
batch_size = 32
34-
evaluate_every = 30
35+
evaluate_every = 1000
3536
lang_model = "bert-base-cased"
3637
do_lower_case = False
37-
next_sent_pred_style = "bert-style"
3838

3939
# 1.Create a tokenizer
4040
tokenizer = Tokenizer.load(
@@ -46,7 +46,7 @@ def lm_finetuning():
4646
data_dir=Path("../data/lm_finetune_nips"),
4747
tokenizer=tokenizer,
4848
max_seq_len=128,
49-
max_docs=20, # We have set max_docs to 20 to speed up data processing
49+
max_docs=None, # You can have set max_docs here to limit the number of docs in the dataset and speed up this example
5050
next_sent_pred_style=next_sent_pred_style
5151
)
5252

@@ -74,7 +74,7 @@ def lm_finetuning():
7474
learning_rate=2e-5,
7575
device=device,
7676
n_batches=len(data_silo.loaders["train"]),
77-
n_epochs=n_epochs,
77+
n_epochs=n_epochs
7878
)
7979

8080
# 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
@@ -87,6 +87,7 @@ def lm_finetuning():
8787
lr_schedule=lr_schedule,
8888
evaluate_every=evaluate_every,
8989
device=device,
90+
eval_report=False
9091
)
9192

9293
# 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai

examples/natural_questions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def question_answering():
4242

4343
# 1.Create a tokenizer
4444
tokenizer = Tokenizer.load(
45-
pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case
45+
pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case, use_fast=False,
4646
)
4747

4848
# Add HTML tag tokens to the tokenizer vocabulary, so they do not get split apart

farm/data_handler/data_silo.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def _dataset_from_chunk(cls, chunk, processor):
129129
"""
130130
dicts = [d[1] for d in chunk]
131131
indices = [x[0] for x in chunk]
132-
dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts=dicts, indices=indices, return_problematic=True)
132+
dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts=dicts, indices=indices)
133133
return dataset, tensor_names, problematic_sample_ids
134134

135135
def _get_dataset(self, filename, dicts=None):
@@ -176,6 +176,7 @@ def _get_dataset(self, filename, dicts=None):
176176
results = map(partial(self._dataset_from_chunk, processor=self.processor), grouper(dicts, num_dicts))
177177

178178
datasets = []
179+
problematic_ids_all = set()
179180

180181
desc = f"Preprocessing Dataset"
181182
if filename:
@@ -185,8 +186,9 @@ def _get_dataset(self, filename, dicts=None):
185186
datasets.append(dataset)
186187
# update progress bar (last step can have less dicts than actual chunk_size)
187188
pbar.update(min(multiprocessing_chunk_size, pbar.total-pbar.n))
188-
self.processor.problematic_sample_ids.update(problematic_samples)
189-
self.processor.log_problematic()
189+
problematic_ids_all.update(problematic_samples)
190+
191+
self.processor.log_problematic(problematic_ids_all)
190192
# _dataset_from_chunk can return a None in cases where downsampling has occurred
191193
datasets = [d for d in datasets if d]
192194
concat_datasets = ConcatDataset(datasets)
@@ -221,7 +223,6 @@ def _load_data(self, train_dicts=None, dev_dicts=None, test_dicts=None):
221223
else:
222224
logger.info("No train set is being loaded")
223225
self.data["train"] = None
224-
self.processor.log_problematic()
225226

226227
# dev data
227228
logger.info("")
@@ -243,7 +244,6 @@ def _load_data(self, train_dicts=None, dev_dicts=None, test_dicts=None):
243244
else:
244245
logger.info("No dev set is being loaded")
245246
self.data["dev"] = None
246-
self.processor.log_problematic()
247247

248248
logger.info("")
249249
logger.info("LOADING TEST DATA")
@@ -264,7 +264,6 @@ def _load_data(self, train_dicts=None, dev_dicts=None, test_dicts=None):
264264
else:
265265
logger.info("No test set is being loaded")
266266
self.data["test"] = None
267-
self.processor.log_problematic()
268267

269268
if self.caching:
270269
self._save_dataset_to_cache()
@@ -724,7 +723,7 @@ def _dataset_from_chunk(self, chunk):
724723
logger.info("Skipping a dict chunk as it contains less than 2 documents ...")
725724
return None, None
726725
indices = [x[0] for x in chunk]
727-
datasets, tensor_names = self.processor.dataset_from_dicts(dicts=dicts, indices=indices)
726+
datasets, tensor_names, _ = self.processor.dataset_from_dicts(dicts=dicts, indices=indices)
728727
return datasets, tensor_names
729728

730729
def shuffle_files(self, files, seed=None):

0 commit comments

Comments (0)