Coverage for mindsdb/integrations/handlers/webz_handler/webz_tables.py

1from typing import List

3import pandas as pd

4from mindsdb_sql_parser import ast

6from mindsdb.integrations.libs.api_handler import APITable

7from mindsdb.integrations.utilities.sql_utils import extract_comparison_conditions

class WebzBaseAPITable(APITable):
    """Base table that translates a SQL SELECT into a Webz.IO API call.

    Subclasses configure the table through the class attributes below.
    """

    # Endpoint name for the table; not read by any code in this class.
    ENDPOINT = None
    # Dotted field names exposed as columns (see ``get_columns``).
    OUTPUT_COLUMNS = []
    # Field names accepted in an ORDER BY clause.
    SORTABLE_COLUMNS = []
    # Passed to the handler as ``method_name`` when the API is called.
    TABLE_NAME = None

    def select(self, query: ast.Select) -> pd.DataFrame:
        """Select data from the API and return it as a pandas DataFrame.

        Args:
            query (ast.Select): SQL SELECT query. The WHERE clause may only
                contain ``query = <value>``; ORDER BY accepts a single column
                listed in ``SORTABLE_COLUMNS``; LIMIT is forwarded as the
                ``size`` parameter.

        Returns:
            DataFrame restricted to the requested (lower-cased) columns, with
            columns absent from the API result filled with ``None`` and
            aliased columns renamed to their alias.

        Raises:
            NotImplementedError: for an operator other than ``=``, a WHERE
                column other than ``query``, or an unsupported query target.
            ValueError: when ordering by several fields or by a column that
                is not sortable.
        """
        params = {}

        for op, arg1, arg2 in extract_comparison_conditions(query.where):
            if op != "=":
                raise NotImplementedError(f"Unsupported Operator: {op}")
            if arg1 != "query":
                raise NotImplementedError(f"Unknown clause: {arg1}")
            params["q"] = arg2

        if query.order_by:
            if len(query.order_by) > 1:
                raise ValueError("Unsupported to order by multiple fields")
            order_item = query.order_by[0]
            # The first identifier part is dropped before matching.
            # NOTE(review): presumably that part is the table qualifier; an
            # unqualified ORDER BY column yields an empty name here and is
            # rejected below - confirm this is intended.
            sort_column = ".".join(order_item.field.parts[1:])
            # make sure that column is sortable
            if sort_column not in type(self).SORTABLE_COLUMNS:
                raise ValueError(f"Order by unknown column {sort_column}")
            params.update(
                {"sort": sort_column, "order": order_item.direction.lower()}
            )

        if query.limit is not None:
            params["size"] = query.limit.value

        result = self.handler.call_webz_api(
            method_name=type(self).TABLE_NAME, params=params
        )

        # filter targets
        columns = []
        for target in query.targets:
            if isinstance(target, ast.Star):
                columns = self.get_columns()
                break
            elif isinstance(target, ast.Identifier):
                columns.append(target.parts[-1])
            else:
                raise NotImplementedError(f"Unknown query target {type(target)}")

        # columns to lower case
        columns = [name.lower() for name in columns]

        # len() counts rows; DataFrame truthiness is ambiguous and raises.
        if len(result) == 0:
            return pd.DataFrame([], columns=columns)

        # add absent columns
        for col in set(columns) - set(result.columns):
            result[col] = None

        # filter by columns
        result = result[columns]

        # Rename aliased columns. The key must be lower-cased because the
        # frame's columns were lower-cased above; keying on the original case
        # silently dropped the alias for e.g. ``SELECT Title AS t``.
        for target in query.targets:
            if isinstance(target, ast.Identifier) and target.alias:
                result = result.rename(
                    columns={target.parts[-1].lower(): str(target.alias)}
                )
        return result

    def get_columns(self) -> List[str]:
        """Get all columns to be returned in pandas DataFrame responses.

        Returns:
            ``OUTPUT_COLUMNS`` of the concrete class with every ``.`` replaced
            by ``__``.
        """
        return [column.replace(".", "__") for column in type(self).OUTPUT_COLUMNS]

class WebzPostsTable(WebzBaseAPITable):
    """Table of structured posts data (news articles, blog posts and online
    discussions) provided through the Webz.IO API.
    """

    TABLE_NAME = "posts"
    ENDPOINT = "filterWebContent"

    OUTPUT_COLUMNS = [
        # Fields nested under "thread".
        *(
            f"thread.{field}"
            for field in (
                "uuid",
                "url",
                "site_full",
                "site",
                "site_section",
                "section_title",
                "title",
                "title_full",
                "published",
                "replies_count",
                "participants_count",
                "site_type",
                "main_image",
                "country",
                "site_categories",
                "social.facebook.likes",
                "social.facebook.shares",
                "social.facebook.comments",
                "social.gplus.shares",
                "social.pinterest.shares",
                "social.linkedin.shares",
                "social.stumbledupon.shares",
                "social.vk.shares",
                "performance_score",
                "domain_rank",
                "domain_rank_updated",
                "reach.per_million",
                "reach.page_views",
                "reach.updated",
            )
        ),
        # Top-level fields.
        "uuid",
        "url",
        "ord_in_thread",
        "parent_url",
        "author",
        "published",
        "title",
        "text",
        "language",
        "external_links",
        "external_images",
        "rating",
        "entities.persons",
        "entities.organizations",
        "entities.locations",
        "crawled",
    ]

    SORTABLE_COLUMNS = [
        "crawled",
        "relevancy",
        # Fields under the "social." prefix.
        *(
            f"social.{metric}"
            for metric in (
                "facebook.likes",
                "facebook.shares",
                "facebook.comments",
                "gplus.shares",
                "pinterest.shares",
                "linkedin.shares",
                "stumbledupon.shares",
                "vk.shares",
            )
        ),
        "replies_count",
        "participants_count",
        "performance_score",
        "published",
        "thread.published",
        "domain_rank",
        "ord_in_thread",
        "rating",
    ]

170

171

class WebzReviewsTable(WebzBaseAPITable):
    """Table of structured reviews data from hundreds of review sites,
    provided through the Webz.IO API.
    """

    TABLE_NAME = "reviews"
    ENDPOINT = "reviewFilter"

    OUTPUT_COLUMNS = [
        # Fields nested under "item".
        *(
            f"item.{field}"
            for field in (
                "uuid",
                "url",
                "site_full",
                "site",
                "site_section",
                "section_title",
                "title",
                "title_full",
                "published",
                "reviews_count",
                "reviewers_count",
                "main_image",
                "country",
                "site_categories",
                "domain_rank",
                "domain_rank_updated",
            )
        ),
        # Top-level fields.
        "uuid",
        "url",
        "ord_in_thread",
        "author",
        "published",
        "title",
        "text",
        "language",
        "external_links",
        "rating",
        "crawled",
    ]

    SORTABLE_COLUMNS = [
        "crawled",
        "relevancy",
        "reviews_count",
        "reviewers_count",
        "spam_score",
        "domain_rank",
        "ord_in_thread",
        "rating",
    ]

Coverage for mindsdb / integrations / handlers / webz_handler / webz_tables.py: 0%

60 statements