Coverage for mindsdb / integrations / handlers / webz_handler / webz_tables.py: 0%
60 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-21 00:36 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-21 00:36 +0000
1from typing import List
3import pandas as pd
4from mindsdb_sql_parser import ast
6from mindsdb.integrations.libs.api_handler import APITable
7from mindsdb.integrations.utilities.sql_utils import extract_comparison_conditions
class WebzBaseAPITable(APITable):
    """Base table shared by the Webz.IO tables defined in this module.

    Subclasses configure the table through the class attributes below.
    ``select`` turns a SQL ``SELECT`` into one call to
    ``self.handler.call_webz_api`` and shapes the returned DataFrame to the
    requested columns.
    """

    # Endpoint identifier; declared for subclasses, not read by this class.
    ENDPOINT = None
    # Dotted field names exposed as columns ("." becomes "__" in get_columns).
    OUTPUT_COLUMNS = []
    # Column names accepted in ORDER BY.
    SORTABLE_COLUMNS = []
    # Passed as ``method_name`` to ``handler.call_webz_api``.
    TABLE_NAME = None

    def select(self, query: ast.Select) -> pd.DataFrame:
        """Select data from the API and return it as a pandas DataFrame.

        Supported SQL:
          * ``WHERE query = <value>`` -- forwarded as the ``q`` parameter.
          * ``ORDER BY <column>`` on a single sortable column -- forwarded as
            ``sort`` / ``order``. The column may be table-qualified
            (``posts.crawled``) or unqualified (``crawled``).
          * ``LIMIT n`` -- forwarded as ``size``.

        Args:
            query (ast.Select): SQL SELECT query

        Returns:
            DataFrame holding exactly the requested columns, in the requested
            order, with aliases applied. Requested columns missing from the
            API result are filled with ``None``.

        Raises:
            NotImplementedError: on a comparison operator other than ``=``, a
                WHERE column other than ``query``, or a target that is neither
                ``*`` nor a plain identifier.
            ValueError: on ORDER BY with several fields or with a column that
                is not listed in ``SORTABLE_COLUMNS``.
        """
        params = {}

        for op, arg1, arg2 in extract_comparison_conditions(query.where):
            if op != "=":
                raise NotImplementedError(f"Unsupported Operator: {op}")
            if arg1 != "query":
                raise NotImplementedError(f"Unknown clause: {arg1}")
            params["q"] = arg2

        if query.order_by:
            if len(query.order_by) > 1:
                raise ValueError("Unsupported to order by multiple fields")
            order_item = query.order_by[0]
            sortable = type(self).SORTABLE_COLUMNS
            parts = order_item.field.parts
            # The first part is treated as the table qualifier and dropped.
            sort_column = ".".join(parts[1:])
            if sort_column not in sortable:
                # Fall back to the full name so that an unqualified column
                # (e.g. ``ORDER BY crawled``) is accepted instead of being
                # reduced to an empty string.
                full_name = ".".join(str(part) for part in parts)
                if full_name not in sortable:
                    raise ValueError(
                        f"Order by unknown column {sort_column or full_name}"
                    )
                sort_column = full_name
            params.update(
                {"sort": sort_column, "order": order_item.direction.lower()}
            )

        if query.limit is not None:
            params["size"] = query.limit.value

        result = self.handler.call_webz_api(
            method_name=type(self).TABLE_NAME, params=params
        )

        # Resolve the requested columns; ``*`` expands to every known column
        # and ends the scan, as any other target is already covered by it.
        columns = []
        for target in query.targets:
            if isinstance(target, ast.Star):
                columns = self.get_columns()
                break
            if isinstance(target, ast.Identifier):
                columns.append(target.parts[-1])
            else:
                raise NotImplementedError(f"Unknown query target {type(target)}")

        # Column names are matched case-insensitively by lowercasing them.
        columns = [name.lower() for name in columns]

        # Aliases are keyed by the lowercased column name so they match the
        # lowercased ``columns`` above. Only identifiers carry ``parts``.
        renames = {
            target.parts[-1].lower(): str(target.alias)
            for target in query.targets
            if isinstance(target, ast.Identifier) and target.alias
        }

        if len(result) == 0:
            # Same schema (including aliases) as the non-empty path.
            return pd.DataFrame([], columns=columns).rename(columns=renames)

        # Add requested columns that the API result does not contain.
        for col in set(columns) - set(result.columns):
            result[col] = None

        # Keep only the requested columns, in the requested order.
        result = result[columns]

        if renames:
            result = result.rename(columns=renames)
        return result

    def get_columns(self) -> List[str]:
        """Get all columns to be returned in pandas DataFrame responses.

        Returns:
            The entries of ``OUTPUT_COLUMNS`` with every "." replaced by "__".
        """
        return [column.replace(".", "__") for column in type(self).OUTPUT_COLUMNS]
class WebzPostsTable(WebzBaseAPITable):
    """Table of structured posts served by the Webz.IO API.

    Covers posts from news articles, blog posts and online discussions.
    The class only supplies configuration; all query handling is inherited
    from ``WebzBaseAPITable``.
    """

    # NOTE(review): ENDPOINT is not read anywhere in this module; presumably
    # the handler uses it to build the request -- confirm.
    ENDPOINT = "filterWebContent"
    TABLE_NAME = "posts"

    OUTPUT_COLUMNS = [
        # Fields under the "thread." prefix.
        "thread.uuid",
        "thread.url",
        "thread.site_full",
        "thread.site",
        "thread.site_section",
        "thread.section_title",
        "thread.title",
        "thread.title_full",
        "thread.published",
        "thread.replies_count",
        "thread.participants_count",
        "thread.site_type",
        "thread.main_image",
        "thread.country",
        "thread.site_categories",
        "thread.social.facebook.likes",
        "thread.social.facebook.shares",
        "thread.social.facebook.comments",
        "thread.social.gplus.shares",
        "thread.social.pinterest.shares",
        "thread.social.linkedin.shares",
        "thread.social.stumbledupon.shares",
        "thread.social.vk.shares",
        "thread.performance_score",
        "thread.domain_rank",
        "thread.domain_rank_updated",
        "thread.reach.per_million",
        "thread.reach.page_views",
        "thread.reach.updated",
        # Top-level fields.
        "uuid",
        "url",
        "ord_in_thread",
        "parent_url",
        "author",
        "published",
        "title",
        "text",
        "language",
        "external_links",
        "external_images",
        "rating",
        "entities.persons",
        "entities.organizations",
        "entities.locations",
        "crawled",
    ]

    # Names accepted in ORDER BY for this table.
    SORTABLE_COLUMNS = [
        "crawled",
        "relevancy",
        "social.facebook.likes",
        "social.facebook.shares",
        "social.facebook.comments",
        "social.gplus.shares",
        "social.pinterest.shares",
        "social.linkedin.shares",
        "social.stumbledupon.shares",
        "social.vk.shares",
        "replies_count",
        "participants_count",
        "performance_score",
        "published",
        "thread.published",
        "domain_rank",
        "ord_in_thread",
        "rating",
    ]
class WebzReviewsTable(WebzBaseAPITable):
    """Table of structured reviews served by the Webz.IO API.

    Covers reviews collected from review sites. The class only supplies
    configuration; all query handling is inherited from ``WebzBaseAPITable``.
    """

    # NOTE(review): ENDPOINT is not read anywhere in this module; presumably
    # the handler uses it to build the request -- confirm.
    ENDPOINT = "reviewFilter"
    TABLE_NAME = "reviews"

    OUTPUT_COLUMNS = [
        # Fields under the "item." prefix.
        "item.uuid",
        "item.url",
        "item.site_full",
        "item.site",
        "item.site_section",
        "item.section_title",
        "item.title",
        "item.title_full",
        "item.published",
        "item.reviews_count",
        "item.reviewers_count",
        "item.main_image",
        "item.country",
        "item.site_categories",
        "item.domain_rank",
        "item.domain_rank_updated",
        # Top-level fields.
        "uuid",
        "url",
        "ord_in_thread",
        "author",
        "published",
        "title",
        "text",
        "language",
        "external_links",
        "rating",
        "crawled",
    ]

    # Names accepted in ORDER BY for this table.
    SORTABLE_COLUMNS = [
        "crawled",
        "relevancy",
        "reviews_count",
        "reviewers_count",
        "spam_score",
        "domain_rank",
        "ord_in_thread",
        "rating",
    ]