Coverage for mindsdb / integrations / handlers / huggingface_api_handler / huggingface_api_handler.py: 0%

130 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-21 00:36 +0000

1import json 

2from typing import Optional, Dict 

3import pandas as pd 

4from huggingface_hub import HfApi 

5from huggingface_hub import hf_hub_download 

6from hugging_py_face import NLP, ComputerVision, AudioProcessing, get_in_df_supported_tasks 

7 

8from mindsdb.integrations.libs.base import BaseMLEngine 

9from mindsdb.integrations.utilities.handler_utils import get_api_key 

10from .exceptions import UnsupportedTaskException, InsufficientParametersException 

11 

12 

class HuggingFaceInferenceAPIHandler(BaseMLEngine):
    """
    Integration with the Hugging Face Inference API.

    Delegates the actual API calls to the `hugging_py_face` client
    (NLP, ComputerVision, AudioProcessing) and stores/retrieves model
    arguments through MindsDB's model storage.
    """

    name = 'huggingface_api'

    @staticmethod
    def create_validation(target, args=None, **kwargs):
        """
        Validate the USING clause before a model is created.

        Requires `input_column` plus either `model_name` or `task`; when only
        `model_name` is given, the task is inferred from the Hub metadata
        (`pipeline_tag`). Task-specific required arguments are also checked.

        Raises:
            InsufficientParametersException: a required argument is missing.
            UnsupportedTaskException: the task is not supported by the engine.
        """
        # Guard the same way `create` does instead of raising a bare KeyError.
        if args is None or 'using' not in args:
            raise InsufficientParametersException("Hugging Face Inference engine requires a USING clause! Refer to its documentation for more details.")
        args = args['using']

        if 'input_column' not in args:
            raise InsufficientParametersException('input_column has to be specified')

        if 'model_name' not in args:
            # No model given: a task is mandatory so the API can pick a default model.
            task = args.get('task')
            if task is None:
                raise InsufficientParametersException('model_name or task have to be specified')
            args['model_name'] = None
        else:
            # Model given: infer the task from the model's Hub metadata if absent.
            hf_api = HfApi()
            metadata = hf_api.model_info(args['model_name'])
            if 'task' not in args:
                args['task'] = metadata.pipeline_tag

        if args['task'] not in get_in_df_supported_tasks():
            raise UnsupportedTaskException(f'The task {args["task"]} is not supported by the Hugging Face Inference API engine.')

        # Consistency fix: use the engine's own exception type (still an
        # Exception subclass, so existing broad handlers keep working).
        if args['task'] == 'zero-shot-classification' and 'candidate_labels' not in args:
            raise InsufficientParametersException('"candidate_labels" is required for zero-shot-classification')

        if args['task'] == 'sentence-similarity' and 'input_column2' not in args:
            raise InsufficientParametersException('input_column2 has to be specified')

    def create(self, target: str, df: Optional[pd.DataFrame] = None, args: Optional[Dict] = None) -> None:
        """
        Persist the model arguments, enriched with metadata from the model's
        `config.json` on the Hugging Face Hub (max length, label map,
        task-specific params) when a concrete model was specified.
        """
        if 'using' not in args:
            raise InsufficientParametersException("Hugging Face Inference engine requires a USING clause! Refer to its documentation for more details.")

        args = args['using']
        args['target'] = target
        args.setdefault('options', {})
        args.setdefault('parameters', {})

        if args['model_name'] is not None:
            # Best effort: not every model publishes a config.json.
            config = {}
            try:
                config_path = hf_hub_download(args['model_name'], 'config.json')
                # Fix: close the file handle (the original leaked it via bare open()).
                with open(config_path) as f:
                    config = json.load(f)
            except Exception:
                pass

            # Precedence: explicit arg > model's position limit > model's max_length.
            if 'max_length' in args:
                args['options']['max_length'] = args['max_length']
            elif 'max_position_embeddings' in config:
                args['options']['max_length'] = config['max_position_embeddings']
            elif 'max_length' in config:
                args['options']['max_length'] = config['max_length']

            # Map the model's default label names (id2label) to user-provided ones.
            labels_default = config.get('id2label', {})
            labels_map = {}
            if 'labels' in args:
                for num, value in labels_default.items():
                    # JSON keys are strings; convert numeric ids for list indexing.
                    if num.isdigit():
                        num = int(num)
                    labels_map[value] = args['labels'][num]
            args['labels_map'] = labels_map

            if 'task_specific_params' in config:
                args['task_specific_params'] = config['task_specific_params']

        # Summarization-only knobs.
        if 'min_output_length' in args:
            args['options']['min_output_length'] = args['min_output_length']
        if 'max_output_length' in args:
            args['options']['max_output_length'] = args['max_output_length']

        self.model_storage.json_set('args', args)

    def predict(self, df: Optional[pd.DataFrame] = None, args: Optional[Dict] = None) -> pd.DataFrame:
        """
        Run inference on `df` for the stored task and return the result frame
        with the prediction column renamed to the model target.

        Raises:
            UnsupportedTaskException: the stored task has no dispatch branch
                (fix: the original fell through to a NameError on result_df).
        """
        args = self.model_storage.json_get('args')
        api_key = get_api_key('huggingface_api', args, self.engine_storage, strict=False)

        input_column = args['input_column']
        model_name = args['model_name']
        endpoint = args.get('endpoint')
        options = args.get('options')
        parameters = args.get('parameters')
        task = args['task']

        if task == 'text-classification':
            nlp = NLP(api_key, endpoint)
            result_df = nlp.text_classification_in_df(
                df,
                input_column,
                options,
                model_name,
            )
            # Fix: labels_map is absent when the model was created from a task
            # only (create() never set it) — fall back to an empty map instead
            # of crashing on None.get().
            labels_map = args.get('labels_map') or {}
            result_df['predictions'] = result_df['predictions'].apply(lambda x: labels_map.get(x, x))

        elif task == 'fill-mask':
            nlp = NLP(api_key, endpoint)
            result_df = nlp.fill_mask_in_df(
                df,
                input_column,
                options,
                model_name
            )

        elif task == 'summarization':
            nlp = NLP(api_key, endpoint)
            result_df = nlp.summarization_in_df(
                df,
                input_column,
                parameters,
                options,
                model_name
            )

        elif task == 'text-generation':
            nlp = NLP(api_key, endpoint)
            result_df = nlp.text_generation_in_df(
                df,
                input_column,
                parameters,
                options,
                model_name
            )

        elif task == 'question-answering':
            nlp = NLP(api_key, endpoint)
            result_df = nlp.question_answering_in_df(
                df,
                input_column,
                args['context_column'],
                model_name
            )

        elif task == 'sentence-similarity':
            nlp = NLP(api_key, endpoint)
            result_df = nlp.sentence_similarity_in_df(
                df,
                input_column,
                args['input_column2'],
                options,
                model_name
            )

        elif task == 'zero-shot-classification':
            nlp = NLP(api_key, endpoint)
            result_df = nlp.zero_shot_classification_in_df(
                df,
                input_column,
                args['candidate_labels'],
                parameters,
                options,
                model_name
            )

        elif task == 'translation':
            lang_in = args['lang_input']
            lang_out = args['lang_output']

            input_origin = None
            if 'task_specific_params' in args:
                pair_task = f"translation_{lang_in}_to_{lang_out}"
                if pair_task in args['task_specific_params'] and 'prefix' in args['task_specific_params'][pair_task]:
                    # Inject the model's required prefix into the input column,
                    # remembering the original values so we can restore them.
                    prefix = args['task_specific_params'][pair_task]['prefix']
                    input_origin = df[input_column]
                    df[input_column] = prefix + input_origin
                    # Don't let hugging_py_face pick a model from the language pair.
                    lang_in = lang_out = None

            nlp = NLP(api_key, endpoint)
            result_df = nlp.translation_in_df(
                df,
                input_column,
                lang_in,
                lang_out,
                options,
                model_name
            )
            if input_origin is not None:
                # Undo the in-place prefix mutation of the caller's DataFrame.
                df[input_column] = input_origin

        elif task == 'image-classification':
            cp = ComputerVision(api_key, endpoint)
            result_df = cp.image_classification_in_df(
                df,
                input_column,
                model_name
            )

        elif task == 'object-detection':
            cp = ComputerVision(api_key, endpoint)
            result_df = cp.object_detection_in_df(
                df,
                input_column,
                model_name
            )

        elif task == 'automatic-speech-recognition':
            ap = AudioProcessing(api_key, endpoint)
            result_df = ap.automatic_speech_recognition_in_df(
                df,
                input_column,
                model_name
            )

        elif task == 'audio-classification':
            ap = AudioProcessing(api_key, endpoint)
            result_df = ap.audio_classification_in_df(
                df,
                input_column,
                model_name
            )

        else:
            # Fix: without this branch, an unexpected task caused a NameError
            # on result_df below instead of a meaningful error.
            raise UnsupportedTaskException(f'The task {task} is not supported by the Hugging Face Inference API engine.')

        result_df = result_df.rename(columns={'predictions': args['target']})
        return result_df