Coverage for mindsdb / integrations / handlers / hackernews_handler / hn_table.py: 0%

76 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-21 00:36 +0000

1import pandas as pd 

2from mindsdb.integrations.libs.api_handler import APITable 

3from mindsdb_sql_parser import ast 

4from mindsdb.integrations.utilities.sql_utils import extract_comparison_conditions 

5from typing import List 

6 

7 

class StoriesTable(APITable):
    """Table of Hacker News stories read from the ``topstories.json`` endpoint.

    Subclasses override ``json_endpoint`` and ``columns`` to expose other
    story listings through the same select logic.
    """

    # Both attributes are class-level so subclasses can override them; this
    # instance is handed to ``handler.get_df_from_class``, which presumably
    # reads them to decide what to fetch -- TODO confirm against the handler.
    json_endpoint = "topstories.json"
    columns = ['id', 'time', 'title', 'url', 'score', 'descendants']

    def select(self, query: ast.Select) -> pd.DataFrame:
        """Select data from the stories table and return it as a pandas DataFrame.

        Only two WHERE conditions are applied here: ``id = <value>`` and
        ``time > <value>``. Any other condition is ignored by this method.

        Args:
            query (ast.Select): The SQL query to be executed.

        Returns:
            pandas.DataFrame: The matching rows, restricted to the columns
            named in the query targets.
        """
        hn_handler = self.handler

        # Extract the limit value from the SQL query, if it exists.
        limit = query.limit.value if query.limit is not None else None

        # NOTE: the limit is handed to the handler before the WHERE filters
        # below run, so a filtered query can return fewer rows than the limit.
        df = hn_handler.get_df_from_class(self, limit)

        # Apply the supported WHERE clauses to the DataFrame.
        for condition in extract_comparison_conditions(query.where):
            op, column = condition[0], condition[1]
            if op == '=' and column == 'id':
                df = df[df['id'] == int(condition[2])]
            elif op == '>' and column == 'time':
                df = df[df['time'] > int(condition[2])]

        # filter_columns returns a new DataFrame rather than modifying ``df``
        # in place, so the result must be kept for the projection to apply.
        df = self.filter_columns(df, query)

        return df

    def get_columns(self):
        """Get the list of column names for the stories table.

        Returns:
            list: A list of column names for the stories table.
        """
        return self.columns

    def filter_columns(self, df, query):
        """Restrict the DataFrame to the columns named in the SQL query.

        A ``*`` target selects every column from ``get_columns()``. If the
        targets contain neither ``*`` nor any identifier, the DataFrame is
        returned unchanged instead of being reduced to zero columns.

        Args:
            df (pandas.DataFrame): The DataFrame to filter.
            query (ast.Select): The SQL query to apply to the DataFrame.

        Returns:
            pandas.DataFrame: The DataFrame limited to the requested columns.
        """
        columns = []
        for target in query.targets:
            if isinstance(target, ast.Star):
                columns = self.get_columns()
                break
            elif isinstance(target, ast.Identifier):
                columns.append(target.value)

        if not columns:
            # Nothing recognisable to project on; keep the data intact.
            return df
        return df[columns]

64 

65 

class HNStoriesTable(StoriesTable):
    """Stories table that reads from the ``askstories.json`` endpoint."""

    json_endpoint = "askstories.json"
    columns = [
        'id',
        'time',
        'title',
        'text',
        'score',
        'descendants',
    ]

69 

70 

class JobStoriesTable(StoriesTable):
    """Stories table that reads from the ``jobstories.json`` endpoint."""

    json_endpoint = "jobstories.json"
    columns = [
        'id',
        'time',
        'title',
        'url',
        'score',
        'type',
    ]

74 

75 

class ShowStoriesTable(StoriesTable):
    """Stories table that reads from the ``showstories.json`` endpoint."""

    json_endpoint = "showstories.json"
    columns = [
        'id',
        'time',
        'title',
        'text',
        'score',
        'descendants',
    ]

79 

80 

class CommentsTable(APITable):
    """Table of Hacker News comments fetched for a single item.

    Queries must include an ``item_id = <value>`` condition in the WHERE
    clause; the comments are fetched through the handler's
    ``call_hackernews_api('get_comments', ...)`` call.
    """

    def select(self, query: ast.Select) -> pd.DataFrame:
        """Select data from the comments table and return it as a pandas DataFrame.

        Args:
            query (ast.Select): The SQL query to be executed.

        Returns:
            pandas.DataFrame: The comments for the requested item, restricted
            to the columns named in the query and truncated to the query's
            LIMIT when one is given.

        Raises:
            ValueError: If the WHERE clause has no ``item_id = <value>``
                condition.
        """
        hn_handler = self.handler

        # Get the limit value from the SQL query, if it exists.
        limit = query.limit.value if query.limit is not None else None

        # Get the item ID from the SQL query; if several ``item_id = ...``
        # conditions are present, the last one wins.
        item_id = None
        for condition in extract_comparison_conditions(query.where):
            if condition[0] == '=' and condition[1] == 'item_id':
                item_id = condition[2]

        if item_id is None:
            raise ValueError('Item ID is missing in the SQL query')

        # Call the Hacker News API to get the comments for the specified item.
        comments_df = hn_handler.call_hackernews_api('get_comments', params={'item_id': item_id})

        # Fill NaN values with 'deleted'.
        comments_df = comments_df.fillna('deleted')

        # filter_columns returns a new DataFrame rather than modifying its
        # argument, so the result must be kept for the projection to apply.
        comments_df = self.filter_columns(comments_df, query)

        # Limit the number of results if necessary.
        if limit is not None:
            comments_df = comments_df.head(limit)

        return comments_df

    def get_columns(self) -> List[str]:
        """Get the list of column names for the comments table.

        Returns:
            list: A list of column names for the comments table.
        """
        return [
            'id',
            'by',
            'parent',
            'text',
            'time',
            'type',
        ]

    def filter_columns(self, result: pd.DataFrame, query: ast.Select = None) -> pd.DataFrame:
        """Restrict a DataFrame to the columns specified in an SQL query.

        The input is returned unchanged when no query is given, when the
        query selects ``*``, or when its targets contain no identifiers.

        Args:
            result (pandas.DataFrame): The DataFrame to filter.
            query (ast.Select): The SQL query containing the column names to filter on.

        Returns:
            pandas.DataFrame: The DataFrame limited to the requested columns.
        """
        if query is None:
            return result

        columns = []
        for target in query.targets:
            if isinstance(target, ast.Star):
                return result
            elif isinstance(target, ast.Identifier):
                columns.append(target.value)

        if not columns:
            return result
        return result[columns]