This repository was archived by the owner on Apr 8, 2025. It is now read-only.

Commit a2e32fc

Support for Windows OS (#172)
1 parent fa4f420 commit a2e32fc

32 files changed: +1457 / -1347 lines

examples/doc_classification.py

Lines changed: 103 additions & 95 deletions
@@ -1,5 +1,6 @@
 # fmt: off
 import logging
+from pathlib import Path
 
 from farm.data_handler.data_silo import DataSilo
 from farm.data_handler.processor import TextClassificationProcessor
@@ -12,100 +13,107 @@
 from farm.train import Trainer
 from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings
 
-logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-    datefmt="%m/%d/%Y %H:%M:%S",
-    level=logging.INFO)
-
-ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
-ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")
-
-##########################
-########## Settings
-##########################
-set_all_seeds(seed=42)
-n_epochs = 1
-batch_size = 32
-evaluate_every = 100
-lang_model = "bert-base-german-cased"
-use_amp = None
-
-device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)
-
-# 1.Create a tokenizer
-tokenizer = Tokenizer.load(
-    pretrained_model_name_or_path=lang_model,
-    do_lower_case=False)
-
-# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
-
-label_list = ["OTHER", "OFFENSE"]
-metric = "f1_macro"
-
-processor = TextClassificationProcessor(tokenizer=tokenizer,
-                                        max_seq_len=128,
-                                        data_dir="../data/germeval18",
-                                        label_list=label_list,
-                                        metric=metric,
-                                        label_column_name="coarse_label"
-                                        )
-
-# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
-data_silo = DataSilo(
-    processor=processor,
-    batch_size=batch_size)
-
-# 4. Create an AdaptiveModel
-# a) which consists of a pretrained language model as a basis
-language_model = LanguageModel.load(lang_model)
-# b) and a prediction head on top that is suited for our task => Text classification
-prediction_head = TextClassificationHead(layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])],
-                                         class_weights=data_silo.calculate_class_weights(task_name="text_classification"))
-
-
-
-model = AdaptiveModel(
-    language_model=language_model,
-    prediction_heads=[prediction_head],
-    embeds_dropout_prob=0.1,
-    lm_output_types=["per_sequence"],
-    device=device)
-
-# 5. Create an optimizer
-model, optimizer, lr_schedule = initialize_optimizer(
-    model=model,
-    learning_rate=2e-5,
-    device=device,
-    n_batches=len(data_silo.loaders["train"]),
-    n_epochs=n_epochs,
-    use_amp=use_amp)
-
-# 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
-trainer = Trainer(
-    optimizer=optimizer,
-    data_silo=data_silo,
-    epochs=n_epochs,
-    n_gpu=n_gpu,
-    lr_schedule=lr_schedule,
-    evaluate_every=evaluate_every,
-    device=device,)
-
-# 7. Let it grow
-model = trainer.train(model)
-
-# 8. Hooray! You have a model. Store it:
-save_dir = "saved_models/bert-german-doc-tutorial"
-model.save(save_dir)
-processor.save(save_dir)
-
-# 9. Load it & harvest your fruits (Inference)
-basic_texts = [
-    {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
-    {"text": "Martin Müller spielt Handball in Berlin"},
-]
-model = Inferencer.load(save_dir)
-result = model.inference_from_dicts(dicts=basic_texts)
-print(result)
+
+def doc_classifcation():
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO)
+
+    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
+    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")
+
+    ##########################
+    ########## Settings
+    ##########################
+    set_all_seeds(seed=42)
+    device, n_gpu = initialize_device_settings(use_cuda=True)
+    n_epochs = 1
+    batch_size = 32
+    evaluate_every = 100
+    lang_model = "bert-base-german-cased"
+    use_amp = None
+
+    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)
+
+    # 1.Create a tokenizer
+    tokenizer = Tokenizer.load(
+        pretrained_model_name_or_path=lang_model,
+        do_lower_case=False)
+
+    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
+    # Here we load GermEval 2018 Data.
+
+    label_list = ["OTHER", "OFFENSE"]
+    metric = "f1_macro"
+
+    processor = TextClassificationProcessor(tokenizer=tokenizer,
+                                            max_seq_len=128,
+                                            data_dir=Path("../data/germeval18"),
+                                            label_list=label_list,
+                                            metric=metric,
+                                            label_column_name="coarse_label"
+                                            )
+
+    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
+    # few descriptive statistics of our datasets
+    data_silo = DataSilo(
+        processor=processor,
+        batch_size=batch_size)
+
+    # 4. Create an AdaptiveModel
+    # a) which consists of a pretrained language model as a basis
+    language_model = LanguageModel.load(lang_model)
+    # b) and a prediction head on top that is suited for our task => Text classification
+    prediction_head = TextClassificationHead(
+        layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])],
+        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))
+
+    model = AdaptiveModel(
+        language_model=language_model,
+        prediction_heads=[prediction_head],
+        embeds_dropout_prob=0.1,
+        lm_output_types=["per_sequence"],
+        device=device)
+
+    # 5. Create an optimizer
+    model, optimizer, lr_schedule = initialize_optimizer(
+        model=model,
+        learning_rate=2e-5,
+        device=device,
+        n_batches=len(data_silo.loaders["train"]),
+        n_epochs=n_epochs,
+        use_amp=use_amp)
+
+    # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
+    trainer = Trainer(
+        optimizer=optimizer,
+        data_silo=data_silo,
+        epochs=n_epochs,
+        n_gpu=n_gpu,
+        lr_schedule=lr_schedule,
+        evaluate_every=evaluate_every,
+        device=device)
+
+    # 7. Let it grow
+    model = trainer.train(model)
+
+    # 8. Hooray! You have a model. Store it:
+    save_dir = Path("saved_models/bert-german-doc-tutorial")
+    model.save(save_dir)
+    processor.save(save_dir)
+
+    # 9. Load it & harvest your fruits (Inference)
+    basic_texts = [
+        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
+        {"text": "Martin Müller spielt Handball in Berlin"},
+    ]
+    model = Inferencer.load(save_dir)
+    result = model.inference_from_dicts(dicts=basic_texts)
+    print(result)
+
+
+if __name__ == "__main__":
+    doc_classifcation()
 
 # fmt: on
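
Most of the per-line changes in this file swap plain path strings for pathlib.Path objects (data_dir, save_dir). Path composes and renders paths with the native separator, so the same code works on Windows and POSIX without hard-coded "/" or "\\". A minimal standalone illustration (not FARM code; the variable names are just for demonstration):

from pathlib import Path

# Join segments with "/" instead of embedding a separator in the
# string; Path renders "\" on Windows and "/" elsewhere.
data_dir = Path("..") / "data" / "germeval18"
save_dir = Path("saved_models") / "bert-german-doc-tutorial"

# Path objects implement os.PathLike, so they can be passed to
# open() and most stdlib/file APIs directly, or converted explicitly:
print(str(data_dir))   # ../data/germeval18 on Linux/macOS
print(save_dir.name)   # bert-german-doc-tutorial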
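The other structural change moves the script body into doc_classifcation() and calls it behind an if __name__ == "__main__" guard. This is the standard fix for spawn-based multiprocessing on Windows: there is no fork, so worker processes start fresh interpreters and re-import the module, and any unguarded module-level code (such as the multiprocess data preprocessing FARM's DataSilo triggers) would run again in every worker. A minimal sketch of the pattern the commit adopts (standalone, not FARM code; the square worker is illustrative):

from multiprocessing import Pool

def square(x):
    # Illustrative worker; defined at module level so spawned
    # processes can pickle a reference to it and import it.
    return x * x

def main():
    # On Windows, each Pool worker re-imports this module. With the
    # guard below, that re-import stops at the function definitions;
    # without it, every worker would try to create another Pool,
    # raising a RuntimeError.
    with Pool(2) as pool:
        print(pool.map(square, range(4)))  # [0, 1, 4, 9]

if __name__ == "__main__":
    main()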

Comments (0)