@@ -1,5 +1,6 @@
 # fmt: off
 import logging
+from pathlib import Path
 
 from farm.data_handler.data_silo import DataSilo
 from farm.data_handler.processor import TextClassificationProcessor
@@ -12,100 +13,106 @@
 from farm.train import Trainer
 from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings
 
-logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-    datefmt="%m/%d/%Y %H:%M:%S",
-    level=logging.INFO)
-
-ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
-ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")
-
-##########################
-########## Settings
-##########################
-set_all_seeds(seed=42)
-n_epochs = 1
-batch_size = 32
-evaluate_every = 100
-lang_model = "bert-base-german-cased"
-use_amp = None
-
-device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)
-
-# 1.Create a tokenizer
-tokenizer = Tokenizer.load(
-    pretrained_model_name_or_path=lang_model,
-    do_lower_case=False)
-
-# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
-
-label_list = ["OTHER", "OFFENSE"]
-metric = "f1_macro"
-
-processor = TextClassificationProcessor(tokenizer=tokenizer,
-                                        max_seq_len=128,
-                                        data_dir="../data/germeval18",
-                                        label_list=label_list,
-                                        metric=metric,
-                                        label_column_name="coarse_label"
-                                        )
-
-# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
-data_silo = DataSilo(
-    processor=processor,
-    batch_size=batch_size)
-
-# 4. Create an AdaptiveModel
-# a) which consists of a pretrained language model as a basis
-language_model = LanguageModel.load(lang_model)
-# b) and a prediction head on top that is suited for our task => Text classification
-prediction_head = TextClassificationHead(layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])],
-                                         class_weights=data_silo.calculate_class_weights(task_name="text_classification"))
-
-
-
-model = AdaptiveModel(
-    language_model=language_model,
-    prediction_heads=[prediction_head],
-    embeds_dropout_prob=0.1,
-    lm_output_types=["per_sequence"],
-    device=device)
-
-# 5. Create an optimizer
-model, optimizer, lr_schedule = initialize_optimizer(
-    model=model,
-    learning_rate=2e-5,
-    device=device,
-    n_batches=len(data_silo.loaders["train"]),
-    n_epochs=n_epochs,
-    use_amp=use_amp)
-
-# 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
-trainer = Trainer(
-    optimizer=optimizer,
-    data_silo=data_silo,
-    epochs=n_epochs,
-    n_gpu=n_gpu,
-    lr_schedule=lr_schedule,
-    evaluate_every=evaluate_every,
-    device=device,)
-
-# 7. Let it grow
-model = trainer.train(model)
-
-# 8. Hooray! You have a model. Store it:
-save_dir = "saved_models/bert-german-doc-tutorial"
-model.save(save_dir)
-processor.save(save_dir)
-
-# 9. Load it & harvest your fruits (Inference)
-basic_texts = [
-    {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
-    {"text": "Martin Müller spielt Handball in Berlin"},
-]
-model = Inferencer.load(save_dir)
-result = model.inference_from_dicts(dicts=basic_texts)
-print(result)
+
+def doc_classification():
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO)
+
+    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
+    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")
+
+    ##########################
+    ########## Settings
+    ##########################
+    set_all_seeds(seed=42)
+    n_epochs = 1
+    batch_size = 32
+    evaluate_every = 100
+    lang_model = "bert-base-german-cased"
+    use_amp = None
+
+    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)
+
+    # 1. Create a tokenizer
+    tokenizer = Tokenizer.load(
+        pretrained_model_name_or_path=lang_model,
+        do_lower_case=False)
+
+    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
+    # Here we load GermEval 2018 Data.
+
+    label_list = ["OTHER", "OFFENSE"]
+    metric = "f1_macro"
+
+    processor = TextClassificationProcessor(tokenizer=tokenizer,
+                                            max_seq_len=128,
+                                            data_dir=Path("../data/germeval18"),
+                                            label_list=label_list,
+                                            metric=metric,
+                                            label_column_name="coarse_label"
+                                            )
+
+    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
+    # few descriptive statistics of our datasets
+    data_silo = DataSilo(
+        processor=processor,
+        batch_size=batch_size)
+
+    # 4. Create an AdaptiveModel
+    # a) which consists of a pretrained language model as a basis
+    language_model = LanguageModel.load(lang_model)
+    # b) and a prediction head on top that is suited for our task => Text classification
+    prediction_head = TextClassificationHead(
+        layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])],
+        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))
+
+    model = AdaptiveModel(
+        language_model=language_model,
+        prediction_heads=[prediction_head],
+        embeds_dropout_prob=0.1,
+        lm_output_types=["per_sequence"],
+        device=device)
+
+    # 5. Create an optimizer
+    model, optimizer, lr_schedule = initialize_optimizer(
+        model=model,
+        learning_rate=2e-5,
+        device=device,
+        n_batches=len(data_silo.loaders["train"]),
+        n_epochs=n_epochs,
+        use_amp=use_amp)
+
+    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
+    trainer = Trainer(
+        optimizer=optimizer,
+        data_silo=data_silo,
+        epochs=n_epochs,
+        n_gpu=n_gpu,
+        lr_schedule=lr_schedule,
+        evaluate_every=evaluate_every,
+        device=device)
+
+    # 7. Let it grow
+    model = trainer.train(model)
+
+    # 8. Hooray! You have a model. Store it:
+    save_dir = Path("saved_models/bert-german-doc-tutorial")
+    model.save(save_dir)
+    processor.save(save_dir)
+
+    # 9. Load it & harvest your fruits (Inference)
+    basic_texts = [
+        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
+        {"text": "Martin Müller spielt Handball in Berlin"},
+    ]
+    model = Inferencer.load(save_dir)
+    result = model.inference_from_dicts(dicts=basic_texts)
+    print(result)
+
+
+if __name__ == "__main__":
+    doc_classification()
 
 # fmt: on
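
For readers who want to use the stored model outside the training script, here is a minimal standalone sketch of step 9, assuming the saved_models/bert-german-doc-tutorial directory produced by step 8 exists. The result keys accessed below ("predictions", "label", "probability") are an assumption based on typical FARM text-classification output and may differ between versions:

from farm.infer import Inferencer

# Load the model and processor saved by the training run above (step 8).
model = Inferencer.load("saved_models/bert-german-doc-tutorial")

# Classify new texts; inference_from_dicts returns one entry per task.
result = model.inference_from_dicts(dicts=[
    {"text": "Martin Müller spielt Handball in Berlin"},
])

# NOTE: the exact result schema is assumed here (typical FARM output);
# print(result) as in the script shows the raw structure if keys differ.
for task_output in result:
    for pred in task_output.get("predictions", []):
        print(pred.get("label"), pred.get("probability"))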
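The TextClassificationProcessor in the diff reads GermEval 2018-style TSV files from ../data/germeval18. The commit does not spell out the layout, so the snippet below is only an illustrative guess: it assumes tab-separated train.tsv/test.tsv files with a header row whose coarse_label column matches label_column_name and holds the OTHER/OFFENSE labels:

from pathlib import Path

# Hypothetical miniature dataset, for illustration only; the real
# GermEval 2018 data should be downloaded separately.
rows = [
    "text\tcoarse_label",
    "Martin Müller spielt Handball in Berlin\tOTHER",
    "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei\tOFFENSE",
]
data_dir = Path("../data/germeval18")
data_dir.mkdir(parents=True, exist_ok=True)
for split in ("train.tsv", "test.tsv"):
    (data_dir / split).write_text("\n".join(rows) + "\n", encoding="utf-8")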