Coverage for mindsdb / integrations / handlers / spacy_handler / spacy_handler.py: 0%
100 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-21 00:36 +0000
1import spacy
2import pandas as pd
3from mindsdb.integrations.libs.base import BaseMLEngine
4from typing import Optional
def named_entity_recognition(doc):
    """Extract named entities from a spaCy ``Doc``.

    Returns a pair ``(spans, columns)``: ``spans`` is a set of
    ``(start_char, end_char, label)`` tuples for the whole document, and
    ``columns`` maps output column names to parallel per-entity value lists.
    """
    pred_whole = set()
    # Bug fix: the column was previously emitted with the typo "star_char".
    output = {"entity": [], "start_char": [], "end_char": [], "label_": []}
    for ent in doc.ents:
        output["entity"].append(str(ent))
        output["start_char"].append(ent.start_char)
        output["end_char"].append(ent.end_char)
        output["label_"].append(ent.label_)
        pred_whole.add((ent.start_char, ent.end_char, ent.label_))
    return pred_whole, output
def lemmatization(doc):
    """Collect the lemma of every token in *doc*.

    Returns a pair ``(lemmas, columns)``: ``lemmas`` is the set of distinct
    lemma strings, and ``columns`` maps ``"lemma_"`` to the per-token list.
    """
    per_token = [token.lemma_ for token in doc]
    return set(per_token), {"lemma_": per_token}
def dependency_parsing(doc):
    """Collect dependency-parse attributes for every token in *doc*.

    Returns a pair ``(rows, columns)``: ``rows`` is a set of per-token
    ``(text, dep, head text, head pos, children)`` tuples, and ``columns``
    maps each attribute name to its parallel per-token value list.
    """
    fields = ("text", "dep_", "head.text", "head.pos_", "children")
    columns = {name: [] for name in fields}
    rows = set()
    for tok in doc:
        # Children are rendered as the string form of a list, matching the
        # tabular output convention used by the other feature extractors.
        row = (
            tok.text,
            tok.dep_,
            tok.head.text,
            tok.head.pos_,
            str(list(tok.children)),
        )
        for name, value in zip(fields, row):
            columns[name].append(value)
        rows.add(row)
    return rows, columns
def pos_tagging(doc):
    """Collect part-of-speech attributes for every token in *doc*.

    Returns a pair ``(rows, columns)``: ``rows`` is a set of per-token
    attribute tuples, and ``columns`` maps each attribute name to its
    parallel per-token value list.
    """
    fields = (
        "text",
        "lemma_",
        "pos_",
        "tag_",
        "dep_",
        "shape_",
        "is_alpha",
        "is_stop",
    )
    columns = {name: [] for name in fields}
    rows = set()
    for tok in doc:
        row = (
            tok.text,
            tok.lemma_,
            tok.pos_,
            tok.tag_,
            tok.dep_,
            tok.shape_,
            tok.is_alpha,
            tok.is_stop,
        )
        for name, value in zip(fields, row):
            columns[name].append(value)
        rows.add(row)
    return rows, columns
def morphology(doc):
    """Collect the morphological analysis of every token in *doc*.

    Returns a pair ``(pairs, columns)``: ``pairs`` is a set of
    ``(token text, morph text)`` tuples, and ``columns`` maps the two
    output column names to parallel per-token value lists.
    """
    pairs = [(str(tok), str(tok.morph)) for tok in doc]
    columns = {
        "token": [text for text, _ in pairs],
        "token.morph": [morph for _, morph in pairs],
    }
    return set(pairs), columns
# Dispatch table mapping a `linguistic_feature` name (as supplied in the
# USING clause) to its extraction function above.
# NOTE(review): the name is misspelled ("lingustic" vs "linguistic"), but it
# is referenced elsewhere in this file, so it is kept as-is.
lingustic_features = {
    "ner": named_entity_recognition,
    "lemmatization": lemmatization,
    "dependency-parsing": dependency_parsing,
    "pos-tag": pos_tagging,
    "morphology": morphology,
}
class SpacyHandler(BaseMLEngine):
    """
    Integration with the spaCy NLP library.

    At CREATE time the English pipeline is loaded, serialized and stored in
    the model storage together with its config and the model arguments. At
    predict time the pipeline is deserialized and the configured linguistic
    feature (NER, lemmatization, dependency parsing, POS tagging or
    morphology) is applied to the target text column.
    """

    name = "spacy"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Feature applied when the USING clause does not specify one.
        self.default_linguistic_feature = "ner"
        # Features supported by this engine; must stay in sync with the
        # module-level `lingustic_features` dispatch table.
        self.linguistic_features = [
            "ner",
            "lemmatization",
            "dependency-parsing",
            "pos-tag",
            "morphology",
        ]

    @staticmethod
    def create_validation(target, args=None, **kwargs):
        """Validate the CREATE MODEL arguments.

        Raises:
            Exception: when no USING clause is given, or when neither
                `linguistic_feature` nor `target_column` is provided.
        """
        # Bug fix: `args` defaults to None, which previously raised a
        # TypeError on the membership test instead of the intended message.
        if args is None or "using" not in args:
            raise Exception(
                "spaCy engine requires a USING clause! Refer to its documentation for more details."
            )
        args = args["using"]

        # NOTE(review): this accepts either key being present, although
        # predict() strictly requires `target_column` — confirm whether the
        # check should demand both.
        if len(set(args.keys()) & {"linguistic_feature", "target_column"}) == 0:
            raise Exception(
                "`linguistic_feature` and `target_column` are required for this engine."
            )

    def create(
        self,
        target: str,
        df: Optional[pd.DataFrame] = None,
        args: Optional[dict] = None,
    ) -> None:
        """Load, serialize and persist the spaCy pipeline and model args."""
        model_args = args["using"]
        model_args["target"] = target

        # Bug fix: the default feature must be stored in `model_args` (which
        # predict() reads back from storage), not in the outer `args` dict —
        # otherwise the default was silently lost.
        if not model_args.get("linguistic_feature"):
            model_args["linguistic_feature"] = self.default_linguistic_feature

        # Loading the model
        nlp = spacy.load("en_core_web_sm")

        # Serialize the model
        bytes_data = nlp.to_bytes()

        # Save the serialized model and its config and model_args to the model storage
        self.model_storage.file_set("model_data", bytes_data)
        self.model_storage.json_set("model_args", model_args)
        self.model_storage.json_set("config", nlp.config)

    def predict(self, df, args=None):
        """Apply the stored pipeline's linguistic feature to each row of *df*.

        Returns the input dataframe extended with one column per extracted
        attribute plus the target column holding the whole-prediction sets.
        """
        config = self.model_storage.json_get("config")
        model_args = self.model_storage.json_get("model_args")
        bytes_data = self.model_storage.file_get("model_data")

        # Deserialize the model using the provided config
        lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"])
        nlp = lang_cls.from_config(config)
        nlp.from_bytes(bytes_data)

        column_name = model_args["target_column"]
        # Fall back to the default for models created before the default was
        # persisted in model_args.
        linguistic_feature = (
            model_args.get("linguistic_feature") or self.default_linguistic_feature
        )
        if linguistic_feature not in lingustic_features:
            # Fail fast with a clear message instead of the length-mismatch
            # error the silent skip used to produce when assigning below.
            raise Exception(
                f"Unsupported linguistic_feature '{linguistic_feature}'. "
                f"Supported features: {list(lingustic_features)}"
            )

        predictions_whole = []
        predictions_attributes = []
        for _, row in df.iterrows():
            doc = nlp(row[column_name])
            pred_whole, pred_attributes = lingustic_features[linguistic_feature](
                doc
            )
            predictions_whole.append(pred_whole)
            predictions_attributes.append(pred_attributes)

        predictions_df = pd.DataFrame(predictions_attributes)
        df = pd.concat([df, predictions_df], axis=1)
        df[model_args["target"]] = predictions_whole

        return df