Coverage for mindsdb / integrations / handlers / spacy_handler / spacy_handler.py: 0%

100 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-21 00:36 +0000

1import spacy 

2import pandas as pd 

3from mindsdb.integrations.libs.base import BaseMLEngine 

4from typing import Optional 

5 

6 

def named_entity_recognition(doc):
    """Extract named entities from a spaCy ``doc``.

    Returns a tuple of:
      * a set of ``(start_char, end_char, label_)`` triples, one per entity
      * a columnar dict of per-entity attributes suitable for a DataFrame
    """
    # NOTE(review): "star_char" looks like a typo for "start_char" — kept
    # byte-for-byte because downstream consumers may rely on this key.
    attributes = {"entity": [], "star_char": [], "end_char": [], "label_": []}
    spans = set()
    for entity in doc.ents:
        attributes["entity"].append(str(entity))
        attributes["star_char"].append(entity.start_char)
        attributes["end_char"].append(entity.end_char)
        attributes["label_"].append(entity.label_)
        spans.add((entity.start_char, entity.end_char, entity.label_))
    return spans, attributes

17 

18 

def lemmatization(doc):
    """Collect the lemma of every token in ``doc``.

    Returns the set of distinct lemmas and a single-column dict listing the
    lemma of each token in document order.
    """
    lemmas = [token.lemma_ for token in doc]
    return set(lemmas), {"lemma_": lemmas}

26 

27 

def dependency_parsing(doc):
    """Extract dependency-parse information from ``doc``.

    For each token, records its text, dependency relation, head text, head
    part-of-speech, and the stringified list of its syntactic children.
    Returns the set of those 5-tuples plus a columnar dict of the same
    attributes in document order.
    """
    columns = ("text", "dep_", "head.text", "head.pos_", "children")
    output = {name: [] for name in columns}
    relations = set()
    for token in doc:
        # Children are stringified as a Python list, matching the stored
        # columnar representation.
        kids = str(list(token.children))
        row = (token.text, token.dep_, token.head.text, token.head.pos_, kids)
        relations.add(row)
        for name, value in zip(columns, row):
            output[name].append(value)
    return relations, output

54 

55 

def pos_tagging(doc):
    """Run part-of-speech tagging extraction over ``doc``.

    Records eight attributes per token (text, lemma, coarse POS, fine tag,
    dependency relation, orthographic shape, is-alpha and is-stop flags).
    Returns the set of those 8-tuples plus a columnar dict of the same
    attributes in document order.
    """
    fields = (
        "text",
        "lemma_",
        "pos_",
        "tag_",
        "dep_",
        "shape_",
        "is_alpha",
        "is_stop",
    )
    output = {field: [] for field in fields}
    seen = set()
    for token in doc:
        values = (
            token.text,
            token.lemma_,
            token.pos_,
            token.tag_,
            token.dep_,
            token.shape_,
            token.is_alpha,
            token.is_stop,
        )
        seen.add(values)
        for field, value in zip(fields, values):
            output[field].append(value)
    return seen, output

90 

91 

def morphology(doc):
    """Extract morphological features for every token in ``doc``.

    Returns the set of ``(token_text, morph_text)`` pairs plus a two-column
    dict with the same values in document order.
    """
    pairs = [(str(token), str(token.morph)) for token in doc]
    output = {
        "token": [text for text, _ in pairs],
        "token.morph": [morph for _, morph in pairs],
    }
    return set(pairs), output

100 

101 

# Dispatch table mapping the `linguistic_feature` value given in the USING
# clause to the extraction function that implements it.
# NOTE(review): "lingustic" is a misspelling of "linguistic" — the name is
# kept as-is because it is referenced elsewhere in this module.
lingustic_features = {
    "ner": named_entity_recognition,
    "lemmatization": lemmatization,
    "dependency-parsing": dependency_parsing,
    "pos-tag": pos_tagging,
    "morphology": morphology,
}

109 

110 

class SpacyHandler(BaseMLEngine):
    """
    Integration with the spaCy NLP library.

    At CREATE time a pretrained spaCy pipeline is loaded, serialized and
    stored in the model storage together with its config and the user's
    arguments. At PREDICT time the pipeline is deserialized and the
    configured linguistic-feature extractor is applied to the target text
    column of the input DataFrame.
    """

    name = "spacy"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Feature used when the USING clause does not specify one.
        self.default_linguistic_feature = "ner"
        # Supported features (keys of the module-level `lingustic_features`).
        self.linguistic_features = [
            "ner",
            "lemmatization",
            "dependency-parsing",
            "pos-tag",
            "morphology",
        ]

    @staticmethod
    def create_validation(target, args=None, **kwargs):
        """Validate CREATE MODEL arguments before `create` runs.

        Raises:
            Exception: if no USING clause was provided, or it contains
                neither `linguistic_feature` nor `target_column`.
        """
        # Guard against a missing args dict: the original tested membership
        # on `args` directly, which raised TypeError when args was None.
        if not args or "using" not in args:
            raise Exception(
                "spaCy engine requires a USING clause! Refer to its documentation for more details."
            )
        args = args["using"]

        if len(set(args.keys()) & {"linguistic_feature", "target_column"}) == 0:
            raise Exception(
                "`linguistic_feature` and `target_column` are required for this engine."
            )

    def create(
        self,
        target: str,
        df: Optional[pd.DataFrame] = None,
        args: Optional[dict] = None,
    ) -> None:
        """Load, serialize and persist the spaCy pipeline and model args.

        Args:
            target: name of the output column predictions are written to.
            df: unused by this engine (the pipeline is pretrained).
            args: CREATE arguments; `args["using"]` carries the user options.
        """
        model_args = args["using"]
        model_args["target"] = target

        # Persist the default feature into model_args (the dict that is
        # actually stored) so `predict` can find it. The original wrote the
        # default into the transient top-level `args`, which is never saved,
        # leaving the stored linguistic_feature unset.
        if not model_args.get("linguistic_feature"):
            model_args["linguistic_feature"] = self.default_linguistic_feature

        # Load the pretrained English pipeline.
        nlp = spacy.load("en_core_web_sm")

        # Serialize the pipeline and persist it with its config and args.
        self.model_storage.file_set("model_data", nlp.to_bytes())
        self.model_storage.json_set("model_args", model_args)
        self.model_storage.json_set("config", nlp.config)

    def predict(self, df, args=None):
        """Apply the stored pipeline's feature extractor to `df`.

        Returns the input DataFrame extended with one column per extracted
        attribute plus the target column holding the per-row result set.

        Raises:
            Exception: if the stored linguistic feature is not supported.
        """
        config = self.model_storage.json_get("config")
        model_args = self.model_storage.json_get("model_args")
        bytes_data = self.model_storage.file_get("model_data")

        # Rebuild the pipeline from its config, then load the saved weights.
        lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"])
        nlp = lang_cls.from_config(config)
        nlp.from_bytes(bytes_data)

        column_name = model_args["target_column"]
        # Fall back to the default for models created before the default was
        # persisted (see `create`).
        linguistic_feature = (
            model_args.get("linguistic_feature") or self.default_linguistic_feature
        )
        if linguistic_feature not in lingustic_features:
            # Fail loudly: the original skipped the lookup and then hit a
            # NameError on `pred_whole` (or reused stale results).
            raise Exception(
                f"Unsupported linguistic feature: {linguistic_feature}. "
                f"Supported features: {list(lingustic_features)}"
            )
        extractor = lingustic_features[linguistic_feature]

        predictions_whole = []
        predictions_attributes = []
        for _, row in df.iterrows():
            doc = nlp(row[column_name])
            pred_whole, pred_attributes = extractor(doc)
            predictions_whole.append(pred_whole)
            predictions_attributes.append(pred_attributes)

        predictions_df = pd.DataFrame(predictions_attributes)
        df = pd.concat([df, predictions_df], axis=1)
        df[model_args["target"]] = predictions_whole

        return df