Coverage for mindsdb / integrations / handlers / spacy_handler / spacy_handler.py: 0%

100 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-21 00:36 +0000

1import spacy 

2import pandas as pd 

3from mindsdb.integrations.libs.base import BaseMLEngine 

4from typing import Optional 

5 

6 

def named_entity_recognition(doc):
    """Extract named entities from a spaCy ``doc``.

    Returns a tuple of:
      * a set of ``(start_char, end_char, label_)`` triples, one per entity
      * a columnar dict of per-entity attributes suitable for a DataFrame
    """
    # NOTE(review): "star_char" looks like a typo for "start_char" — kept
    # byte-for-byte because downstream consumers may rely on this key.
    attributes = {"entity": [], "star_char": [], "end_char": [], "label_": []}
    spans = set()
    for entity in doc.ents:
        attributes["entity"].append(str(entity))
        attributes["star_char"].append(entity.start_char)
        attributes["end_char"].append(entity.end_char)
        attributes["label_"].append(entity.label_)
        spans.add((entity.start_char, entity.end_char, entity.label_))
    return spans, attributes

17 

18 

def lemmatization(doc):
    """Collect the lemma of every token in ``doc``.

    Returns the set of distinct lemmas and a single-column dict listing the
    lemma of each token in document order.
    """
    lemmas = [token.lemma_ for token in doc]
    return set(lemmas), {"lemma_": lemmas}

26 

27 

def dependency_parsing(doc):
    """Extract dependency-parse information from ``doc``.

    For each token, records its text, dependency relation, head text, head
    part-of-speech, and the stringified list of its syntactic children.
    Returns the set of those 5-tuples plus a columnar dict of the same
    attributes in document order.
    """
    columns = ("text", "dep_", "head.text", "head.pos_", "children")
    output = {name: [] for name in columns}
    relations = set()
    for token in doc:
        # Children are stringified as a Python list, matching the stored
        # columnar representation.
        kids = str(list(token.children))
        row = (token.text, token.dep_, token.head.text, token.head.pos_, kids)
        relations.add(row)
        for name, value in zip(columns, row):
            output[name].append(value)
    return relations, output

54 

55 

def pos_tagging(doc):
    """Run part-of-speech tagging extraction over ``doc``.

    Records eight attributes per token (text, lemma, coarse POS, fine tag,
    dependency relation, orthographic shape, is-alpha and is-stop flags).
    Returns the set of those 8-tuples plus a columnar dict of the same
    attributes in document order.
    """
    fields = (
        "text",
        "lemma_",
        "pos_",
        "tag_",
        "dep_",
        "shape_",
        "is_alpha",
        "is_stop",
    )
    output = {field: [] for field in fields}
    seen = set()
    for token in doc:
        values = (
            token.text,
            token.lemma_,
            token.pos_,
            token.tag_,
            token.dep_,
            token.shape_,
            token.is_alpha,
            token.is_stop,
        )
        seen.add(values)
        for field, value in zip(fields, values):
            output[field].append(value)
    return seen, output

90 

91 

def morphology(doc):
    """Extract morphological features for every token in ``doc``.

    Returns the set of ``(token_text, morph_text)`` pairs plus a two-column
    dict with the same values in document order.
    """
    pairs = [(str(token), str(token.morph)) for token in doc]
    output = {
        "token": [text for text, _ in pairs],
        "token.morph": [morph for _, morph in pairs],
    }
    return set(pairs), output

100 

101 

# Dispatch table mapping the `linguistic_feature` value given in the USING
# clause to the extraction function that implements it.
# NOTE(review): "lingustic" is a misspelling of "linguistic" — the name is
# kept as-is because it is referenced elsewhere in this module.
lingustic_features = {
    "ner": named_entity_recognition,
    "lemmatization": lemmatization,
    "dependency-parsing": dependency_parsing,
    "pos-tag": pos_tagging,
    "morphology": morphology,
}

109 

110 

class SpacyHandler(BaseMLEngine):
    """
    Integration with the spaCy NLP library.

    At CREATE time a pretrained spaCy pipeline is loaded, serialized and
    stored in the model storage together with its config and the user's
    arguments. At PREDICT time the pipeline is deserialized and the
    configured linguistic-feature extractor is applied to the target text
    column of the input DataFrame.
    """

    name = "spacy"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Feature used when the USING clause does not specify one.
        self.default_linguistic_feature = "ner"
        # Supported features (keys of the module-level `lingustic_features`).
        self.linguistic_features = [
            "ner",
            "lemmatization",
            "dependency-parsing",
            "pos-tag",
            "morphology",
        ]

    @staticmethod
    def create_validation(target, args=None, **kwargs):
        """Validate CREATE MODEL arguments before `create` runs.

        Raises:
            Exception: if no USING clause was provided, or it contains
                neither `linguistic_feature` nor `target_column`.
        """
        # Guard against a missing args dict: the original tested membership
        # on `args` directly, which raised TypeError when args was None.
        if not args or "using" not in args:
            raise Exception(
                "spaCy engine requires a USING clause! Refer to its documentation for more details."
            )
        args = args["using"]

        if len(set(args.keys()) & {"linguistic_feature", "target_column"}) == 0:
            raise Exception(
                "`linguistic_feature` and `target_column` are required for this engine."
            )

    def create(
        self,
        target: str,
        df: Optional[pd.DataFrame] = None,
        args: Optional[dict] = None,
    ) -> None:
        """Load, serialize and persist the spaCy pipeline and model args.

        Args:
            target: name of the output column predictions are written to.
            df: unused by this engine (the pipeline is pretrained).
            args: CREATE arguments; `args["using"]` carries the user options.
        """
        model_args = args["using"]
        model_args["target"] = target

        # Persist the default feature into model_args (the dict that is
        # actually stored) so `predict` can find it. The original wrote the
        # default into the transient top-level `args`, which is never saved,
        # leaving the stored linguistic_feature unset.
        if not model_args.get("linguistic_feature"):
            model_args["linguistic_feature"] = self.default_linguistic_feature

        # Load the pretrained English pipeline.
        nlp = spacy.load("en_core_web_sm")

        # Serialize the pipeline and persist it with its config and args.
        self.model_storage.file_set("model_data", nlp.to_bytes())
        self.model_storage.json_set("model_args", model_args)
        self.model_storage.json_set("config", nlp.config)

    def predict(self, df, args=None):
        """Apply the stored pipeline's feature extractor to `df`.

        Returns the input DataFrame extended with one column per extracted
        attribute plus the target column holding the per-row result set.

        Raises:
            Exception: if the stored linguistic feature is not supported.
        """
        config = self.model_storage.json_get("config")
        model_args = self.model_storage.json_get("model_args")
        bytes_data = self.model_storage.file_get("model_data")

        # Rebuild the pipeline from its config, then load the saved weights.
        lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"])
        nlp = lang_cls.from_config(config)
        nlp.from_bytes(bytes_data)

        column_name = model_args["target_column"]
        # Fall back to the default for models created before the default was
        # persisted (see `create`).
        linguistic_feature = (
            model_args.get("linguistic_feature") or self.default_linguistic_feature
        )
        if linguistic_feature not in lingustic_features:
            # Fail loudly: the original skipped the lookup and then hit a
            # NameError on `pred_whole` (or reused stale results).
            raise Exception(
                f"Unsupported linguistic feature: {linguistic_feature}. "
                f"Supported features: {list(lingustic_features)}"
            )
        extractor = lingustic_features[linguistic_feature]

        predictions_whole = []
        predictions_attributes = []
        for _, row in df.iterrows():
            doc = nlp(row[column_name])
            pred_whole, pred_attributes = extractor(doc)
            predictions_whole.append(pred_whole)
            predictions_attributes.append(pred_attributes)

        predictions_df = pd.DataFrame(predictions_attributes)
        df = pd.concat([df, predictions_df], axis=1)
        df[model_args["target"]] = predictions_whole

        return df