Coverage for mindsdb/integrations/handlers/serpstack_handler/serpstack_tables.py: 0%

103 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-21 00:36 +0000

1import pandas as pd 

2import requests 

3from mindsdb.integrations.libs.api_handler import APITable 

4from mindsdb_sql_parser import ast 

5from mindsdb.integrations.utilities.sql_utils import extract_comparison_conditions 

6 

7 

class BaseResultsTable(APITable):
    """Shared SELECT logic for tables backed by one result list of the API response.

    Subclasses are expected to provide:

    * ``results_key`` - key of the JSON response that holds the list of results,
    * ``extract_data(item)`` - turns one raw result into a row dict,
    * ``get_columns()`` - names of the columns the table exposes,
    * optionally ``default_type`` - sent as the ``type`` request parameter
      when the SQL query does not set one itself.
    """

    # Seconds to wait for the API. Without an explicit timeout ``requests``
    # waits indefinitely, which would block the query forever on a stalled
    # connection. Subclasses may override this value.
    request_timeout = 30

    def select(self, query: ast.Select) -> pd.DataFrame:
        """
        Selects data from the results table and returns it as a pandas DataFrame.

        Every ``column = value`` condition of the WHERE clause is forwarded as a
        request parameter; other comparison operators are ignored.

        Args:
            query (ast.Select): The SQL query to be executed.

        Returns:
            pandas.DataFrame: A pandas DataFrame containing the selected data.
            When the response holds no results, a single row whose every cell
            is the string "No results found" is returned instead.

        Raises:
            ValueError: If the WHERE clause has no ``query = ...`` condition.
            SystemError: If the request fails or the response is not valid JSON.
        """
        base_url = self.handler.connect()

        params = {'access_key': self.handler.access_key}
        conditions = extract_comparison_conditions(query.where)
        params.update({condition[1]: condition[2] for condition in conditions if condition[0] == '='})

        if 'query' not in params:
            raise ValueError('Query is missing in the SQL query')
        if 'type' not in params and hasattr(self, 'default_type'):
            params['type'] = self.default_type

        # The access key travels in the query string, and connection-level
        # error messages may quote the request URL, so mask it before the
        # text is surfaced to the caller.
        secret = params.get('access_key')

        try:
            api_response = requests.get(base_url, params=params, timeout=self.request_timeout)
            api_response.raise_for_status()  # raises HTTPError for bad responses
        except requests.exceptions.HTTPError as e:
            raise SystemError(f"HTTP error occurred: {e.response.status_code} - {e.response.reason}") from e
        except requests.exceptions.Timeout as e:
            # Checked before ConnectionError: ConnectTimeout derives from both
            # and should be reported as a timeout.
            raise SystemError(f"Request timeout: {self._mask_secret(e, secret)}") from e
        except requests.exceptions.ConnectionError as e:
            raise SystemError(f"Connection error occurred: {self._mask_secret(e, secret)}") from e
        except requests.exceptions.RequestException as e:
            raise SystemError(f"Request exception occurred: {self._mask_secret(e, secret)}") from e

        # Parsed in its own try block: the decode error raised by ``.json()``
        # can also be a RequestException, so sharing the block above would
        # report it as a generic request failure.
        try:
            api_result = api_response.json()
        except ValueError as e:
            raise SystemError(f"Failed to parse JSON response: {str(e)}") from e

        # ``or []`` also covers a key that is present but null.
        results = api_result.get(self.results_key) or []
        processed_results = [self.extract_data(result) for result in results]

        if not processed_results:
            # Placeholder row; note that it bypasses column filtering and LIMIT.
            columns = self.get_columns()
            empty_data = {col: ["No results found"] for col in columns}
            return pd.DataFrame(empty_data, columns=columns)

        result_df = pd.DataFrame(processed_results)
        result_df = self.filter_columns(result_df, query)
        return result_df

    @staticmethod
    def _mask_secret(error, secret):
        """Return ``str(error)`` with every literal occurrence of ``secret`` replaced by ``***``."""
        text = str(error)
        if secret:
            # Only the literal form is masked; a percent-encoded variant of the
            # secret would not match.
            text = text.replace(str(secret), '***')
        return text

    def extract_data(self, data):
        """
        Extracts the required data from the result.

        Args:
            data (dict): The result data.

        Returns:
            dict: A dictionary containing the extracted data.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError("Subclasses must implement this method.")

    def filter_columns(self, result: pd.DataFrame, query: ast.Select = None):
        """
        Filters the columns of the result DataFrame.

        Keeps the columns named by the query targets (all table columns for
        ``*`` or when no query is given), lower-cased, in that order. Columns
        that are requested but absent from ``result`` are added, filled with
        ``None``; they are added to the passed-in DataFrame itself. The
        query's LIMIT, if any, is applied last.

        Args:
            result (pandas.DataFrame): The result DataFrame.
            query (ast.Select): The SQL query to be executed.

        Returns:
            pandas.DataFrame: A pandas DataFrame containing the filtered data.

        Raises:
            NotImplementedError: If a target is neither ``*`` nor a plain identifier.
        """
        if query is None:
            columns = self.get_columns()
        else:
            columns = []
            for target in query.targets:
                if isinstance(target, ast.Star):
                    # ``*`` overrides anything collected so far.
                    columns = self.get_columns()
                    break
                elif isinstance(target, ast.Identifier):
                    columns.append(target.parts[-1])
                else:
                    raise NotImplementedError(
                        f"Unsupported select target: {type(target).__name__}"
                    )

        columns = [name.lower() for name in columns]

        if len(result) == 0:
            result = pd.DataFrame([], columns=columns)
        else:
            # Add requested columns the data does not contain.
            for col in columns:
                if col not in result.columns:
                    result[col] = None

            result = result[columns]

        if query is not None and query.limit is not None:
            result = result.head(query.limit.value)

        return result

106 

107 

class OrganicResultsTable(BaseResultsTable):
    """Table built from the ``organic_results`` list of the API response."""

    results_key = 'organic_results'

    def extract_data(self, organic):
        """Build one table row from a raw organic result.

        Args:
            organic (dict): One item of the ``organic_results`` list.

        Returns:
            dict: One entry per column of ``get_columns()``. Keys missing from
            ``organic`` yield ``None``; ``sitelinks`` and ``rich_snippet`` are
            reduced to the compact dicts built by the helpers below.
        """
        return {
            'position': organic.get('position'),
            'title': organic.get('title'),
            'url': organic.get('url'),
            'domain': organic.get('domain'),
            'displayed_url': organic.get('displayed_url'),
            'snippet': organic.get('snippet'),
            'cached_page_url': organic.get('cached_page_url'),
            'related_pages_url': organic.get('related_pages_url'),
            'prerender': organic.get('prerender'),
            'sitelinks': self._extract_sitelinks(organic.get('sitelinks')),
            'rich_snippet': self._extract_rich_snippet(organic.get('rich_snippet'))
        }

    def _extract_sitelinks(self, sitelinks):
        """Reduce a sitelinks object to the title/url pairs of its link lists.

        Args:
            sitelinks (dict | None): Raw ``sitelinks`` value of a result.

        Returns:
            dict | None: ``{'inline': [...], 'expanded': [...]}`` where every
            entry is ``{'title': ..., 'url': ...}``, or ``None`` when
            ``sitelinks`` is empty or missing.
        """
        if not sitelinks:
            return None

        def compact(links):
            # ``.get`` keeps a link that lacks a field (the value becomes
            # None) instead of failing the whole query with KeyError;
            # ``or []`` tolerates a list that is present but null.
            return [{'title': link.get('title'), 'url': link.get('url')} for link in links or []]

        return {
            'inline': compact(sitelinks.get('inline')),
            'expanded': compact(sitelinks.get('expanded'))
        }

    def _extract_rich_snippet(self, rich_snippet):
        """Pick the extension lists out of a rich snippet.

        The ``top`` section is used when that key is present, otherwise the
        ``bottom`` section.

        Args:
            rich_snippet (dict | None): Raw ``rich_snippet`` value of a result.

        Returns:
            dict | None: ``{'detected_extensions': ..., 'extensions': ...}``
            (each defaulting to an empty list), or ``None`` when
            ``rich_snippet`` is empty or missing.
        """
        if not rich_snippet:
            return None
        snippet_type = 'top' if 'top' in rich_snippet else 'bottom'
        # ``or {}`` covers both a missing section and one that is null.
        section = rich_snippet.get(snippet_type) or {}
        return {
            'detected_extensions': section.get('detected_extensions', []),
            'extensions': section.get('extensions', [])
        }

    def get_columns(self):
        """Return the column names of this table, in display order."""
        return [
            'position',
            'title',
            'url',
            'domain',
            'displayed_url',
            'snippet',
            'cached_page_url',
            'related_pages_url',
            'prerender',
            'sitelinks',
            'rich_snippet'
        ]

157 

158 

class ImageResultsTable(BaseResultsTable):
    """Table built from the ``image_results`` list of the API response.

    ``default_type`` is the value the base class sends as the ``type``
    request parameter when the SQL query does not provide one.
    """

    results_key = 'image_results'
    default_type = 'images'

    def extract_data(self, image):
        """Build one table row from a raw image result.

        Args:
            image (dict): One item of the ``image_results`` list.

        Returns:
            dict: The listed fields copied from ``image``; a field missing
            from ``image`` yields ``None``.
        """
        wanted = (
            'position', 'title', 'width', 'height',
            'image_url', 'type', 'url', 'source',
        )
        return {field: image.get(field) for field in wanted}

    def get_columns(self):
        """Return the column names of this table, in display order."""
        names = [
            'position', 'title', 'width', 'height',
            'image_url', 'type', 'url', 'source',
        ]
        return names

186 

187 

class VideoResultsTable(BaseResultsTable):
    """Table built from the ``video_results`` list of the API response.

    ``default_type`` is the value the base class sends as the ``type``
    request parameter when the SQL query does not provide one.
    """

    results_key = 'video_results'
    default_type = 'videos'

    def extract_data(self, video):
        """Build one table row from a raw video result.

        Args:
            video (dict): One item of the ``video_results`` list.

        Returns:
            dict: The listed fields copied from ``video``; a field missing
            from ``video`` yields ``None``.
        """
        wanted = (
            'position', 'title', 'url', 'displayed_url',
            'uploaded', 'snippet', 'length',
        )
        return {field: video.get(field) for field in wanted}

    def get_columns(self):
        """Return the column names of this table, in display order."""
        names = [
            'position', 'title', 'url', 'displayed_url',
            'uploaded', 'snippet', 'length',
        ]
        return names

213 

214 

class NewsResultsTable(BaseResultsTable):
    """Table built from the ``news_results`` list of the API response.

    ``default_type`` is the value the base class sends as the ``type``
    request parameter when the SQL query does not provide one.
    """

    results_key = 'news_results'
    default_type = 'news'

    def extract_data(self, news):
        """Build one table row from a raw news result.

        Args:
            news (dict): One item of the ``news_results`` list.

        Returns:
            dict: The listed fields copied from ``news``; a field missing
            from ``news`` yields ``None``.
        """
        wanted = (
            'position', 'title', 'url', 'source_name',
            'uploaded', 'uploaded_utc', 'snippet', 'thumbnail_url',
        )
        return {field: news.get(field) for field in wanted}

    def get_columns(self):
        """Return the column names of this table, in display order."""
        names = [
            'position', 'title', 'url', 'source_name',
            'uploaded', 'uploaded_utc', 'snippet', 'thumbnail_url',
        ]
        return names

242 

243 

class ShoppingResultsTable(BaseResultsTable):
    """Table built from the ``shopping_results`` list of the API response.

    ``default_type`` is the value the base class sends as the ``type``
    request parameter when the SQL query does not provide one.
    """

    results_key = 'shopping_results'
    default_type = 'shopping'

    def extract_data(self, shopping):
        """Build one table row from a raw shopping result.

        Args:
            shopping (dict): One item of the ``shopping_results`` list.

        Returns:
            dict: ``position``, ``title`` and ``url`` copied from
            ``shopping``; a field missing from it yields ``None``.
        """
        wanted = ('position', 'title', 'url')
        return {field: shopping.get(field) for field in wanted}

    def get_columns(self):
        """Return the column names of this table, in display order."""
        names = ['position', 'title', 'url']
        return names