Coverage for mindsdb / integrations / handlers / web_handler / web_handler.py: 87%

56 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-21 00:36 +0000

1from typing import List 

2 

3import pandas as pd 

4from mindsdb.integrations.libs.response import HandlerStatusResponse 

5from mindsdb.utilities.config import config 

6from mindsdb.utilities.security import validate_urls 

7from .urlcrawl_helpers import get_all_websites 

8 

9from mindsdb.integrations.libs.api_handler import APIResource, APIHandler 

10from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator 

11 

12 

class CrawlerTable(APIResource):
    def list(self, conditions: List[FilterCondition] = None, limit: int = None, **kwargs) -> pd.DataFrame:
        """
        Selects data from the provided websites.

        Args:
            conditions: Filter conditions parsed from the SQL WHERE clause.
                Recognized columns are:
                  - ``url`` (``=`` or ``IN``): site(s) to crawl (required);
                  - ``crawl_depth`` (``=``): how deep to follow links;
                  - ``per_url_limit`` (``=``): page limit applied per url;
                  - ``user_agent`` (``=``): sent as the ``User-Agent`` header.
            limit: Overall row limit (SQL LIMIT clause), applied to the
                combined result.

        Returns:
            dataframe: Dataframe containing the crawled data.

        Raises:
            NotImplementedError: If no url was specified in the query.
            ValueError: If a requested url is not in the configured
                ``web_crawling_allowed_sites`` allowlist.
        """
        urls = []
        crawl_depth = None
        per_url_limit = None
        headers = {}
        # BUG FIX: `conditions` defaults to None; the original iterated it
        # unguarded and raised TypeError when no conditions were passed.
        for condition in conditions or []:
            if condition.column == "url":
                if condition.op == FilterOperator.IN:
                    urls = condition.value
                elif condition.op == FilterOperator.EQUAL:
                    urls = [condition.value]
                # Marked applied for any op on `url`, matching original behavior.
                condition.applied = True
            if condition.column == "crawl_depth" and condition.op == FilterOperator.EQUAL:
                crawl_depth = condition.value
                condition.applied = True
            if condition.column == "per_url_limit" and condition.op == FilterOperator.EQUAL:
                per_url_limit = condition.value
                condition.applied = True
            if condition.column.lower() == "user_agent" and condition.op == FilterOperator.EQUAL:
                headers["User-Agent"] = condition.value
                condition.applied = True

        if len(urls) == 0:
            raise NotImplementedError(
                'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"'
            )

        # Optional allowlist: when configured, every requested url must match it.
        allowed_urls = config.get("web_crawling_allowed_sites", [])
        if allowed_urls and not validate_urls(urls, allowed_urls):
            raise ValueError(
                f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}."
            )

        # Default to one page per url when no limiting option was given at all.
        if limit is None and per_url_limit is None and crawl_depth is None:
            per_url_limit = 1
        if per_url_limit is not None:
            # crawl every url separately so the per-url page limit applies to each
            results = []
            for url in urls:
                results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth, headers=headers))
            result = pd.concat(results)
        else:
            result = get_all_websites(urls, limit, crawl_depth=crawl_depth, headers=headers)

        # Enforce the overall SQL LIMIT on the combined dataframe.
        if limit is not None and len(result) > limit:
            result = result[:limit]

        return result

    def get_columns(self) -> List[str]:
        """
        Returns the columns of the crawler table.
        """
        return ["url", "text_content", "error"]

77 

78 

class WebHandler(APIHandler):
    """
    Web handler, handling crawling content from websites.
    """

    def __init__(self, name=None, **kwargs):
        # Register the single `crawler` table this handler exposes.
        super().__init__(name)
        self._register_table("crawler", CrawlerTable(self))

    def check_connection(self) -> HandlerStatusResponse:
        """
        Checks the connection to the web handler.
        @TODO: Implement a better check for the connection

        Returns:
            HandlerStatusResponse: Response containing the status of the connection. Hardcoded to True for now.
        """
        return HandlerStatusResponse(True)