Coverage for mindsdb / integrations / handlers / webz_handler / webz_tables.py: 0%

60 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-21 00:36 +0000

1from typing import List 

2 

3import pandas as pd 

4from mindsdb_sql_parser import ast 

5 

6from mindsdb.integrations.libs.api_handler import APITable 

7from mindsdb.integrations.utilities.sql_utils import extract_comparison_conditions 

8 

9 

class WebzBaseAPITable(APITable):
    """Common SELECT implementation for tables backed by the Webz.IO API.

    Subclasses only provide the class-level constants below; ``select``
    turns a SQL SELECT into API parameters, calls the handler and shapes
    the response into the requested columns.

    """

    # Endpoint name of the table; not read by the methods of this class.
    ENDPOINT = None
    # Dotted field names exposed as columns ("." becomes "__" in get_columns).
    OUTPUT_COLUMNS = []
    # Dotted field names accepted in ORDER BY.
    SORTABLE_COLUMNS = []
    # Passed as ``method_name`` to ``handler.call_webz_api``.
    TABLE_NAME = None

    def select(self, query: ast.Select) -> pd.DataFrame:
        """Selects data from the API and returns it as a pandas DataFrame

        Returns dataframe representing the API results.

        Args:
            query (ast.Select): SQL SELECT query

        Raises:
            NotImplementedError: for a WHERE operator other than "=", a WHERE
                column other than "query", or an unsupported query target
            ValueError: for ORDER BY on several fields or on a column that is
                not listed in SORTABLE_COLUMNS

        """
        params = self._build_api_params(query)
        result = self.handler.call_webz_api(
            method_name=type(self).TABLE_NAME, params=params
        )

        # filter targets
        columns = []
        for target in query.targets:
            if isinstance(target, ast.Star):
                # "*" selects every output column; remaining targets are ignored
                columns = self.get_columns()
                break
            elif isinstance(target, ast.Identifier):
                columns.append(target.parts[-1])
            else:
                raise NotImplementedError(f"Unknown query target {type(target)}")

        # columns to lower case
        columns = [name.lower() for name in columns]

        # Aliases are keyed by the lower-cased column name so that they match
        # the lower-cased ``columns`` above. Only identifiers carry ``parts``;
        # any other target kind is skipped instead of raising AttributeError.
        aliases = {
            target.parts[-1].lower(): str(target.alias)
            for target in query.targets
            if isinstance(target, ast.Identifier) and target.alias
        }

        if len(result) == 0:
            # Same column naming as the non-empty path, just without rows.
            return pd.DataFrame([], columns=columns).rename(columns=aliases)

        # add absent columns
        for col in columns:
            if col not in result.columns:
                result[col] = None

        # filter by columns
        result = result[columns]

        # Rename columns (non-inplace: ``result`` is a column selection of
        # the frame returned by the handler)
        if aliases:
            result = result.rename(columns=aliases)
        return result

    def _build_api_params(self, query: ast.Select) -> dict:
        """Translates WHERE / ORDER BY / LIMIT of the query into API parameters."""
        params = {}

        for op, arg1, arg2 in extract_comparison_conditions(query.where):
            if op != "=":
                raise NotImplementedError(f"Unsupported Operator: {op}")
            elif arg1 == "query":
                params["q"] = arg2
            else:
                raise NotImplementedError(f"Unknown clause: {arg1}")

        if query.order_by:
            if len(query.order_by) > 1:
                raise ValueError("Unsupported to order by multiple fields")
            order_item = query.order_by[0]
            parts = order_item.field.parts
            # A qualified name has its first part dropped (treated as the
            # table qualifier); an unqualified single-part name is used as
            # is instead of collapsing to an empty string.
            sort_column = ".".join(parts[1:] if len(parts) > 1 else parts)
            # make sure that column is sortable
            if sort_column not in type(self).SORTABLE_COLUMNS:
                raise ValueError(f"Order by unknown column {sort_column}")
            # NOTE(review): direction is forwarded verbatim (lower-cased);
            # confirm what the parser yields when ASC/DESC is omitted.
            params.update({"sort": sort_column, "order": order_item.direction.lower()})

        if query.limit is not None:
            params["size"] = query.limit.value

        return params

    def get_columns(self) -> List[str]:
        """Gets all columns to be returned in pandas DataFrame responses

        Returns
            List of columns: OUTPUT_COLUMNS with every "." replaced by "__"

        """
        return [column.replace(".", "__") for column in type(self).OUTPUT_COLUMNS]

93 

94 

class WebzPostsTable(WebzBaseAPITable):
    """Posts table of the Webz.IO API.

    Exposes structured posts data from news articles, blog posts and
    online discussions.

    """

    # Name handed to the handler as ``method_name`` by the base class.
    TABLE_NAME = "posts"
    ENDPOINT = "filterWebContent"

    # Fields accepted in ORDER BY.
    SORTABLE_COLUMNS = [
        "crawled",
        "relevancy",
        "social.facebook.likes",
        "social.facebook.shares",
        "social.facebook.comments",
        "social.gplus.shares",
        "social.pinterest.shares",
        "social.linkedin.shares",
        "social.stumbledupon.shares",
        "social.vk.shares",
        "replies_count",
        "participants_count",
        "performance_score",
        "published",
        "thread.published",
        "domain_rank",
        "ord_in_thread",
        "rating",
    ]

    # Fields exposed as columns: "thread."-prefixed fields first, then the
    # unprefixed fields.
    OUTPUT_COLUMNS = [
        "thread.uuid",
        "thread.url",
        "thread.site_full",
        "thread.site",
        "thread.site_section",
        "thread.section_title",
        "thread.title",
        "thread.title_full",
        "thread.published",
        "thread.replies_count",
        "thread.participants_count",
        "thread.site_type",
        "thread.main_image",
        "thread.country",
        "thread.site_categories",
        "thread.social.facebook.likes",
        "thread.social.facebook.shares",
        "thread.social.facebook.comments",
        "thread.social.gplus.shares",
        "thread.social.pinterest.shares",
        "thread.social.linkedin.shares",
        "thread.social.stumbledupon.shares",
        "thread.social.vk.shares",
        "thread.performance_score",
        "thread.domain_rank",
        "thread.domain_rank_updated",
        "thread.reach.per_million",
        "thread.reach.page_views",
        "thread.reach.updated",
        "uuid",
        "url",
        "ord_in_thread",
        "parent_url",
        "author",
        "published",
        "title",
        "text",
        "language",
        "external_links",
        "external_images",
        "rating",
        "entities.persons",
        "entities.organizations",
        "entities.locations",
        "crawled",
    ]

170 

171 

class WebzReviewsTable(WebzBaseAPITable):
    """Reviews table of the Webz.IO API.

    Exposes structured reviews data collected from hundreds of review sites.

    """

    # Name handed to the handler as ``method_name`` by the base class.
    TABLE_NAME = "reviews"
    ENDPOINT = "reviewFilter"

    # Fields accepted in ORDER BY.
    SORTABLE_COLUMNS = [
        "crawled",
        "relevancy",
        "reviews_count",
        "reviewers_count",
        "spam_score",
        "domain_rank",
        "ord_in_thread",
        "rating",
    ]

    # Fields exposed as columns: "item."-prefixed fields first, then the
    # unprefixed fields.
    OUTPUT_COLUMNS = [
        "item.uuid",
        "item.url",
        "item.site_full",
        "item.site",
        "item.site_section",
        "item.section_title",
        "item.title",
        "item.title_full",
        "item.published",
        "item.reviews_count",
        "item.reviewers_count",
        "item.main_image",
        "item.country",
        "item.site_categories",
        "item.domain_rank",
        "item.domain_rank_updated",
        "uuid",
        "url",
        "ord_in_thread",
        "author",
        "published",
        "title",
        "text",
        "language",
        "external_links",
        "rating",
        "crawled",
    ]