Coverage for mindsdb / integrations / handlers / huggingface_handler / finetune.py: 0%
106 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-21 00:36 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-21 00:36 +0000
1import evaluate
2import nltk
3import numpy as np
4from datasets import Dataset
5from transformers import (
6 AutoConfig,
7 AutoModelForSeq2SeqLM,
8 AutoModelForSequenceClassification,
9 AutoTokenizer,
10 DataCollatorForSeq2Seq,
11 Seq2SeqTrainingArguments,
12 Trainer,
13 TrainingArguments,
14)
16# todo add support for question answering task
17# todo add support for fill mask
18# todo add support for text_generation (causal language model)
19# todo add support for text_2_text generation
def _finetune_cls(df, args):
    """Build a tokenizer and a ``Trainer`` to fine-tune a sequence-classification model.

    Args:
        df: DataFrame holding the target column (class labels) and the input text column.
        args: handler arguments. Requires "target", "input_column", "model_name",
            "model_folder" and "labels_map"; optional "using" dict may provide
            "tokenizer_from" and "trainer_args"; optional "eval_size" (default 0.1)
            sets the evaluation split fraction.

    Returns:
        Tuple of (tokenizer, trainer); the caller is expected to invoke ``trainer.train()``.

    Raises:
        ValueError: if the number of distinct labels in ``df`` does not match
            the size of ``args["labels_map"]``.
    """
    df = df.rename(columns={args["target"]: "labels", args["input_column"]: "text"})
    tokenizer_from = args.get("using", {}).get("tokenizer_from", args["model_name"])
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_from)
    dataset = Dataset.from_pandas(df)

    def _tokenize_text_cls_fn(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenized_datasets = dataset.map(_tokenize_text_cls_fn, batched=True)
    ds = tokenized_datasets.shuffle(seed=42).train_test_split(test_size=args.get("eval_size", 0.1))
    train_ds = ds["train"]
    eval_ds = ds["test"]

    # Copy so we never mutate the caller's "trainer_args" dict in place.
    ft_args = dict(args.get("using", {}).get("trainer_args", {}))
    ft_args["output_dir"] = args["model_folder"]

    n_labels = len(args["labels_map"])
    n_found = df["labels"].nunique()
    # Explicit raise instead of `assert`: asserts are stripped under `python -O`.
    if n_labels != n_found:
        raise ValueError(
            f"Label mismatch! Ensure labels match what the model was originally trained on. Found {n_found} classes, expected {n_labels}."
        )
    # TODO: ideally check that labels are a subset of the original ones, too.
    config = AutoConfig.from_pretrained(args["model_name"])
    model = AutoModelForSequenceClassification.from_pretrained(args["model_name"], config=config)
    metric = evaluate.load("accuracy")
    training_args = TrainingArguments(**ft_args)

    def _compute_metrics(eval_pred):
        # Accuracy over argmax of the logits against the reference labels.
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    # generate trainer and finetune
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        compute_metrics=_compute_metrics,
    )

    return tokenizer, trainer
67# TODO: merge with summarization?
def _finetune_translate(df, args):
    """Build a tokenizer and a ``Trainer`` to fine-tune a seq2seq translation model.

    Args:
        df: DataFrame holding the target column (translations) and the input text column.
        args: handler arguments. Requires "target", "input_column", "model_name",
            "model_folder", "lang_input" and "lang_output"; optional "using" dict may
            provide "tokenizer_from" and "trainer_args"; optional "eval_size"
            (default 0.1) sets the evaluation split fraction.

    Returns:
        Tuple of (tokenizer, trainer); the caller is expected to invoke ``trainer.train()``.
    """
    config = AutoConfig.from_pretrained(args["model_name"])
    df = df.rename(columns={args["target"]: "translation", args["input_column"]: "text"})
    tokenizer_from = args.get("using", {}).get("tokenizer_from", args["model_name"])
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_from)
    dataset = Dataset.from_pandas(df)

    def _tokenize_translate_fn(examples):
        source_lang = args["lang_input"]
        target_lang = args["lang_output"]
        # NOTE(review): reuses the *summarization* task params as the target length cap —
        # works for T5-style configs, but confirm this is intended for other architectures.
        max_target_length = config.task_specific_params["summarization"]["max_length"]
        # T5-style task prefix, e.g. "translate English to French: ".
        prefix = f"translate {source_lang} to {target_lang}: "
        inputs = [prefix + ex for ex in examples["text"]]
        targets = [ex for ex in examples["translation"]]
        model_inputs = tokenizer(inputs, max_length=config.n_positions, truncation=True)

        # Tokenize targets via `text_target` (same approach as _finetune_summarization);
        # this replaces the deprecated `as_target_tokenizer()` context manager.
        labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset.map(_tokenize_translate_fn, batched=True)
    ds = tokenized_datasets.shuffle(seed=42).train_test_split(test_size=args.get("eval_size", 0.1))
    train_ds = ds["train"]
    eval_ds = ds["test"]
    # Copy so we never mutate the caller's "trainer_args" dict in place.
    ft_args = dict(args.get("using", {}).get("trainer_args", {}))
    ft_args["output_dir"] = args["model_folder"]
    ft_args["predict_with_generate"] = True

    model = AutoModelForSeq2SeqLM.from_pretrained(args["model_name"], config=config)
    model.resize_token_embeddings(len(tokenizer))
    training_args = Seq2SeqTrainingArguments(**ft_args)
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # generate trainer and finetune
    # NOTE(review): `predict_with_generate` only takes effect with `Seq2SeqTrainer`;
    # plain `Trainer` ignores it — confirm whether Seq2SeqTrainer should be used here.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        data_collator=data_collator,
        # compute_metrics=_compute_metrics,
    )

    return tokenizer, trainer
def _finetune_summarization(df, args):
    """Build a tokenizer and a ``Trainer`` to fine-tune a seq2seq summarization model.

    Args:
        df: DataFrame holding the target column (summaries) and the input text column.
        args: handler arguments. Requires "target", "input_column", "model_name" and
            "model_folder"; optional "using" dict may provide "tokenizer_from" and
            "trainer_args"; optional "eval_size" (default 0.1) sets the evaluation
            split fraction.

    Returns:
        Tuple of (tokenizer, trainer); the caller is expected to invoke ``trainer.train()``.
    """
    df = df.rename(columns={args["target"]: "summary", args["input_column"]: "text"})
    tokenizer_from = args.get("using", {}).get("tokenizer_from", args["model_name"])
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_from)
    dataset = Dataset.from_pandas(df)
    config = AutoConfig.from_pretrained(args["model_name"])

    def _tokenize_summarize_fn(examples):
        # T5 models expect a task prefix; other architectures don't.
        prefix = "summarize: " if "t5" in args["model_name"] else ""
        inputs = [prefix + doc for doc in examples["text"]]
        # `padding="max_length"` already pads to `max_length`; the deprecated and
        # redundant `pad_to_max_length=True` kwarg has been dropped.
        model_inputs = tokenizer(
            inputs,
            padding="max_length",
            truncation=True,
            max_length=config.max_position_embeddings,
        )
        labels = tokenizer(
            text_target=examples["summary"],
            max_length=config.max_position_embeddings,
            truncation=True,
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset.map(_tokenize_summarize_fn, batched=True)
    ds = tokenized_datasets.shuffle(seed=42).train_test_split(test_size=args.get("eval_size", 0.1))
    train_ds = ds["train"]
    eval_ds = ds["test"]

    # Copy so we never mutate the caller's "trainer_args" dict in place.
    ft_args = dict(args.get("using", {}).get("trainer_args", {}))
    ft_args["output_dir"] = args["model_folder"]
    ft_args["predict_with_generate"] = True

    model = AutoModelForSeq2SeqLM.from_pretrained(args["model_name"], config=config)
    metric = evaluate.load("rouge")
    training_args = Seq2SeqTrainingArguments(**ft_args)
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    def _compute_metrics(eval_pred):
        # ref: github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb
        predictions, labels = eval_pred
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # ROUGE expects a newline after each sentence
        decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
        decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

        result = metric.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            use_stemmer=True,
            use_aggregator=True,
        )
        # Express scores as percentages.
        result = {key: value * 100 for key, value in result.items()}
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
        result["gen_len"] = np.mean(prediction_lens)  # todo: remove?
        return {k: round(v, 4) for k, v in result.items()}

    # generate trainer and finetune
    # NOTE(review): `predict_with_generate` only takes effect with `Seq2SeqTrainer`;
    # plain `Trainer` ignores it — confirm whether Seq2SeqTrainer should be used here.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        data_collator=data_collator,
        compute_metrics=_compute_metrics,
    )

    return tokenizer, trainer
190def _finetune_fill_mask(df, args):
191 raise NotImplementedError("Finetuning fill-mask models is not yet supported.")
194def _finetune_text_generation(df, args):
195 raise NotImplementedError("Finetuning text-generation models is not yet supported.")
198def _finetune_question_answering(df, args):
199 raise NotImplementedError("Finetuning question-answering models is not yet supported.")
202def _finetune_text_2_text_generation(df, args):
203 raise NotImplementedError("Finetuning text-2-text generation models is not yet supported.")