Coverage for mindsdb / integrations / handlers / huggingface_handler / finetune.py: 0%

106 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-21 00:36 +0000

1import evaluate 

2import nltk 

3import numpy as np 

4from datasets import Dataset 

5from transformers import ( 

6 AutoConfig, 

7 AutoModelForSeq2SeqLM, 

8 AutoModelForSequenceClassification, 

9 AutoTokenizer, 

10 DataCollatorForSeq2Seq, 

11 Seq2SeqTrainingArguments, 

12 Trainer, 

13 TrainingArguments, 

14) 

15 

16# todo add support for question answering task 

17# todo add support for fill mask 

18# todo add support for text_generation (causal language model) 

19# todo add support for text_2_text generation 

20 

21 

22def _finetune_cls(df, args): 

23 df = df.rename(columns={args["target"]: "labels", args["input_column"]: "text"}) 

24 tokenizer_from = args.get("using", {}).get("tokenizer_from", args["model_name"]) 

25 tokenizer = AutoTokenizer.from_pretrained(tokenizer_from) 

26 dataset = Dataset.from_pandas(df) 

27 

28 def _tokenize_text_cls_fn(examples): 

29 return tokenizer(examples["text"], padding="max_length", truncation=True) 

30 

31 tokenized_datasets = dataset.map(_tokenize_text_cls_fn, batched=True) 

32 ds = tokenized_datasets.shuffle(seed=42).train_test_split(test_size=args.get("eval_size", 0.1)) 

33 train_ds = ds["train"] 

34 eval_ds = ds["test"] 

35 

36 ft_args = args.get("using", {}).get("trainer_args", {}) 

37 ft_args["output_dir"] = args["model_folder"] 

38 

39 n_labels = len(args["labels_map"]) 

40 # todo replace for prod 

41 assert n_labels == df["labels"].nunique(), ( 

42 f"Label mismatch! Ensure labels match what the model was originally trained on. Found {df['labels'].nunique()} classes, expected {n_labels}." 

43 ) # noqa 

44 # TODO: ideally check that labels are a subset of the original ones, too. 

45 config = AutoConfig.from_pretrained(args["model_name"]) 

46 model = AutoModelForSequenceClassification.from_pretrained(args["model_name"], config=config) 

47 metric = evaluate.load("accuracy") 

48 training_args = TrainingArguments(**ft_args) 

49 

50 def _compute_metrics(eval_pred): 

51 logits, labels = eval_pred 

52 predictions = np.argmax(logits, axis=-1) 

53 return metric.compute(predictions=predictions, references=labels) 

54 

55 # generate trainer and finetune 

56 trainer = Trainer( 

57 model=model, 

58 args=training_args, 

59 train_dataset=train_ds, 

60 eval_dataset=eval_ds, 

61 compute_metrics=_compute_metrics, 

62 ) 

63 

64 return tokenizer, trainer 

65 

66 

67# TODO: merge with summarization? 

def _finetune_translate(df, args):
    """Build a tokenizer and ``Trainer`` for fine-tuning a translation model.

    Args:
        df: pandas DataFrame with the training data. The columns named by
            ``args["target"]`` and ``args["input_column"]`` are renamed to
            ``translation`` and ``text``.
        args: dict of fine-tuning settings. Reads ``target``,
            ``input_column``, ``model_name``, ``model_folder``,
            ``lang_input`` and ``lang_output``; optionally ``eval_size``
            (default ``0.1``) and ``using`` (with optional ``tokenizer_from``
            and ``trainer_args``).

    Returns:
        A ``(tokenizer, trainer)`` tuple. The trainer is configured but
        training is not started here.
    """
    config = AutoConfig.from_pretrained(args["model_name"])
    df = df.rename(columns={args["target"]: "translation", args["input_column"]: "text"})
    using = args.get("using", {})
    tokenizer_from = using.get("tokenizer_from", args["model_name"])
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_from)
    dataset = Dataset.from_pandas(df)

    def _tokenize_translate_fn(examples):
        source_lang = args["lang_input"]
        target_lang = args["lang_output"]
        # NOTE(review): the target length is read from the model's
        # "summarization" task parameters, not translation-specific ones;
        # kept as-is — confirm this is intended.
        max_target_length = config.task_specific_params["summarization"]["max_length"]
        prefix = f"translate {source_lang} to {target_lang}: "
        inputs = [prefix + ex for ex in examples["text"]]
        targets = list(examples["translation"])
        model_inputs = tokenizer(inputs, max_length=config.n_positions, truncation=True)

        # Tokenize the targets through `text_target` (the same call style
        # _finetune_summarization uses) instead of the deprecated
        # `as_target_tokenizer()` context manager.
        labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset.map(_tokenize_translate_fn, batched=True)
    ds = tokenized_datasets.shuffle(seed=42).train_test_split(test_size=args.get("eval_size", 0.1))
    train_ds = ds["train"]
    eval_ds = ds["test"]

    # Copy the user-supplied trainer arguments: writing `output_dir` and
    # `predict_with_generate` into the nested dict would mutate the caller's
    # `args`.
    ft_args = dict(using.get("trainer_args", {}))
    ft_args["output_dir"] = args["model_folder"]
    ft_args["predict_with_generate"] = True

    model = AutoModelForSeq2SeqLM.from_pretrained(args["model_name"], config=config)
    # The tokenizer may come from a different checkpoint (`tokenizer_from`),
    # so the embedding matrix is resized to the tokenizer's length.
    model.resize_token_embeddings(len(tokenizer))
    training_args = Seq2SeqTrainingArguments(**ft_args)
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # generate trainer and finetune
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        data_collator=data_collator,
        # compute_metrics=_compute_metrics,
    )

    return tokenizer, trainer

115 

116 

def _finetune_summarization(df, args):
    """Build a tokenizer and ``Trainer`` for fine-tuning a summarization model.

    Args:
        df: pandas DataFrame with the training data. The columns named by
            ``args["target"]`` and ``args["input_column"]`` are renamed to
            ``summary`` and ``text``.
        args: dict of fine-tuning settings. Reads ``target``,
            ``input_column``, ``model_name`` and ``model_folder``; optionally
            ``eval_size`` (default ``0.1``) and ``using`` (with optional
            ``tokenizer_from`` and ``trainer_args``).

    Returns:
        A ``(tokenizer, trainer)`` tuple. The trainer is configured but
        training is not started here.
    """
    df = df.rename(columns={args["target"]: "summary", args["input_column"]: "text"})
    using = args.get("using", {})
    tokenizer_from = using.get("tokenizer_from", args["model_name"])
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_from)
    dataset = Dataset.from_pandas(df)
    config = AutoConfig.from_pretrained(args["model_name"])

    # Both values are the same for every batch, so compute them once.
    prefix = "summarize: " if "t5" in args["model_name"] else ""
    try:
        max_length = config.max_position_embeddings
    except AttributeError:
        # Fallback for configs without `max_position_embeddings`; presumably
        # T5-style configs, which expose `n_positions` instead (the attribute
        # _finetune_translate relies on) — TODO confirm.
        max_length = config.n_positions

    def _tokenize_summarize_fn(examples):
        inputs = [prefix + doc for doc in examples["text"]]
        # `padding="max_length"` already pads to `max_length`; the deprecated
        # `pad_to_max_length` flag was redundant and has been dropped.
        model_inputs = tokenizer(
            inputs,
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )
        labels = tokenizer(
            text_target=examples["summary"],
            max_length=max_length,
            truncation=True,
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset.map(_tokenize_summarize_fn, batched=True)
    ds = tokenized_datasets.shuffle(seed=42).train_test_split(test_size=args.get("eval_size", 0.1))
    train_ds = ds["train"]
    eval_ds = ds["test"]

    # Copy the user-supplied trainer arguments: writing `output_dir` and
    # `predict_with_generate` into the nested dict would mutate the caller's
    # `args`.
    ft_args = dict(using.get("trainer_args", {}))
    ft_args["output_dir"] = args["model_folder"]
    ft_args["predict_with_generate"] = True

    model = AutoModelForSeq2SeqLM.from_pretrained(args["model_name"], config=config)
    metric = evaluate.load("rouge")
    training_args = Seq2SeqTrainingArguments(**ft_args)
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    def _compute_metrics(eval_pred):
        # ref: github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb
        predictions, labels = eval_pred

        # The seq2seq collator pads labels with -100 (the loss ignore index),
        # which is not a valid token id; map it back to the pad token before
        # decoding, as the referenced notebook does.
        pad_id = tokenizer.pad_token_id
        predictions = np.where(predictions != -100, predictions, pad_id)
        labels = np.where(labels != -100, labels, pad_id)

        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # ROUGE expects a newline after each sentence
        decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
        decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

        result = metric.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            use_stemmer=True,
            use_aggregator=True,
        )
        # Report the scores as percentages.
        result = {key: value * 100 for key, value in result.items()}
        prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
        result["gen_len"] = np.mean(prediction_lens)  # todo: remove?
        return {k: round(v, 4) for k, v in result.items()}

    # generate trainer and finetune
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        data_collator=data_collator,
        compute_metrics=_compute_metrics,
    )

    return tokenizer, trainer

188 

189 

190def _finetune_fill_mask(df, args): 

191 raise NotImplementedError("Finetuning fill-mask models is not yet supported.") 

192 

193 

194def _finetune_text_generation(df, args): 

195 raise NotImplementedError("Finetuning text-generation models is not yet supported.") 

196 

197 

198def _finetune_question_answering(df, args): 

199 raise NotImplementedError("Finetuning question-answering models is not yet supported.") 

200 

201 

202def _finetune_text_2_text_generation(df, args): 

203 raise NotImplementedError("Finetuning text-2-text generation models is not yet supported.")