Skip to content
This repository was archived by the owner on Apr 8, 2025. It is now read-only.

Commit 18e7fc7

Browse files
Timoeller, bogdankostic, brandenchan, and tholor
authored
WIP: Simplify processors - add Fasttokenizers (#649)
* increase transformers version
* Make fast tokenizers possible
* refactor QA processing
* Move all fcts into dataset from dicts for QA
* refactor doc classification
* refactor bert_style_lm
* refactor inference_processor

Co-authored-by: Bogdan Kostić <[email protected]>
Co-authored-by: brandenchan <[email protected]>
Co-authored-by: Malte Pietsch <[email protected]>
1 parent fa08f9d commit 18e7fc7

30 files changed

+2516
-1527
lines changed

azure-pipelines.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,7 @@ trigger:
1010
pr:
1111
branches:
1212
include:
13-
- '*'
14-
13+
- '*'
1514
jobs:
1615
- job: 'Test'
1716
pool:

examples/lm_finetuning.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,22 +19,22 @@ def lm_finetuning():
1919
datefmt="%m/%d/%Y %H:%M:%S",
2020
level=logging.INFO,
2121
)
22-
22+
next_sent_pred_style = "bert-style"
23+
next_sent_pred=True
2324
set_all_seeds(seed=42)
2425
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
2526
ml_logger.init_experiment(
26-
experiment_name="Public_FARM", run_name="Run_minimal_example_lm"
27+
experiment_name="LM_refactoring", run_name=f"new, nsp: {next_sent_pred}, {next_sent_pred_style}"
2728
)
2829
##########################
2930
########## Settings
3031
##########################
31-
device, n_gpu = initialize_device_settings(use_cuda=False)
32+
device, n_gpu = initialize_device_settings(use_cuda=True)
3233
n_epochs = 1
3334
batch_size = 32
34-
evaluate_every = 30
35+
evaluate_every = 1000
3536
lang_model = "bert-base-cased"
3637
do_lower_case = False
37-
next_sent_pred_style = "bert-style"
3838

3939
# 1.Create a tokenizer
4040
tokenizer = Tokenizer.load(
@@ -46,7 +46,7 @@ def lm_finetuning():
4646
data_dir=Path("../data/lm_finetune_nips"),
4747
tokenizer=tokenizer,
4848
max_seq_len=128,
49-
max_docs=20, # We have set max_docs to 20 to speed up data processing
49+
max_docs=None, # You can have set max_docs here to limit the number of docs in the dataset and speed up this example
5050
next_sent_pred_style=next_sent_pred_style
5151
)
5252

@@ -74,7 +74,7 @@ def lm_finetuning():
7474
learning_rate=2e-5,
7575
device=device,
7676
n_batches=len(data_silo.loaders["train"]),
77-
n_epochs=n_epochs,
77+
n_epochs=n_epochs
7878
)
7979

8080
# 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
@@ -87,6 +87,7 @@ def lm_finetuning():
8787
lr_schedule=lr_schedule,
8888
evaluate_every=evaluate_every,
8989
device=device,
90+
eval_report=False
9091
)
9192

9293
# 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai

examples/natural_questions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def question_answering():
4242

4343
# 1.Create a tokenizer
4444
tokenizer = Tokenizer.load(
45-
pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case
45+
pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case, use_fast=False,
4646
)
4747

4848
# Add HTML tag tokens to the tokenizer vocabulary, so they do not get split apart

farm/data_handler/data_silo.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def _dataset_from_chunk(cls, chunk, processor):
129129
"""
130130
dicts = [d[1] for d in chunk]
131131
indices = [x[0] for x in chunk]
132-
dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts=dicts, indices=indices, return_problematic=True)
132+
dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts=dicts, indices=indices)
133133
return dataset, tensor_names, problematic_sample_ids
134134

135135
def _get_dataset(self, filename, dicts=None):
@@ -176,6 +176,7 @@ def _get_dataset(self, filename, dicts=None):
176176
results = map(partial(self._dataset_from_chunk, processor=self.processor), grouper(dicts, num_dicts))
177177

178178
datasets = []
179+
problematic_ids_all = set()
179180

180181
desc = f"Preprocessing Dataset"
181182
if filename:
@@ -185,8 +186,9 @@ def _get_dataset(self, filename, dicts=None):
185186
datasets.append(dataset)
186187
# update progress bar (last step can have less dicts than actual chunk_size)
187188
pbar.update(min(multiprocessing_chunk_size, pbar.total-pbar.n))
188-
self.processor.problematic_sample_ids.update(problematic_samples)
189-
self.processor.log_problematic()
189+
problematic_ids_all.update(problematic_samples)
190+
191+
self.processor.log_problematic(problematic_ids_all)
190192
# _dataset_from_chunk can return a None in cases where downsampling has occurred
191193
datasets = [d for d in datasets if d]
192194
concat_datasets = ConcatDataset(datasets)
@@ -221,7 +223,6 @@ def _load_data(self, train_dicts=None, dev_dicts=None, test_dicts=None):
221223
else:
222224
logger.info("No train set is being loaded")
223225
self.data["train"] = None
224-
self.processor.log_problematic()
225226

226227
# dev data
227228
logger.info("")
@@ -243,7 +244,6 @@ def _load_data(self, train_dicts=None, dev_dicts=None, test_dicts=None):
243244
else:
244245
logger.info("No dev set is being loaded")
245246
self.data["dev"] = None
246-
self.processor.log_problematic()
247247

248248
logger.info("")
249249
logger.info("LOADING TEST DATA")
@@ -264,7 +264,6 @@ def _load_data(self, train_dicts=None, dev_dicts=None, test_dicts=None):
264264
else:
265265
logger.info("No test set is being loaded")
266266
self.data["test"] = None
267-
self.processor.log_problematic()
268267

269268
if self.caching:
270269
self._save_dataset_to_cache()
@@ -724,7 +723,7 @@ def _dataset_from_chunk(self, chunk):
724723
logger.info("Skipping a dict chunk as it contains less than 2 documents ...")
725724
return None, None
726725
indices = [x[0] for x in chunk]
727-
datasets, tensor_names = self.processor.dataset_from_dicts(dicts=dicts, indices=indices)
726+
datasets, tensor_names, _ = self.processor.dataset_from_dicts(dicts=dicts, indices=indices)
728727
return datasets, tensor_names
729728

730729
def shuffle_files(self, files, seed=None):

0 commit comments

Comments (0)