Coverage for mindsdb / integrations / handlers / web_handler / web_handler.py: 87%

56 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-21 00:36 +0000

1from typing import List 

2 

3import pandas as pd 

4from mindsdb.integrations.libs.response import HandlerStatusResponse 

5from mindsdb.utilities.config import config 

6from mindsdb.utilities.security import validate_urls 

7from .urlcrawl_helpers import get_all_websites 

8 

9from mindsdb.integrations.libs.api_handler import APIResource, APIHandler 

10from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator 

11 

12 

class CrawlerTable(APIResource):
    def list(self, conditions: List[FilterCondition] = None, limit: int = None, **kwargs) -> pd.DataFrame:
        """
        Selects data from the provided websites.

        Args:
            conditions: Filter conditions parsed from the SQL WHERE clause.
                Recognized columns are:
                  - ``url`` (``=`` or ``IN``): site(s) to crawl (required);
                  - ``crawl_depth`` (``=``): how deep to follow links;
                  - ``per_url_limit`` (``=``): page limit applied per url;
                  - ``user_agent`` (``=``): sent as the ``User-Agent`` header.
            limit: Overall row limit (SQL LIMIT clause), applied to the
                combined result.

        Returns:
            dataframe: Dataframe containing the crawled data.

        Raises:
            NotImplementedError: If no url was specified in the query.
            ValueError: If a requested url is not in the configured
                ``web_crawling_allowed_sites`` allowlist.
        """
        urls = []
        crawl_depth = None
        per_url_limit = None
        headers = {}
        # BUG FIX: `conditions` defaults to None; the original iterated it
        # unguarded and raised TypeError when no conditions were passed.
        for condition in conditions or []:
            if condition.column == "url":
                if condition.op == FilterOperator.IN:
                    urls = condition.value
                elif condition.op == FilterOperator.EQUAL:
                    urls = [condition.value]
                # Marked applied for any op on `url`, matching original behavior.
                condition.applied = True
            if condition.column == "crawl_depth" and condition.op == FilterOperator.EQUAL:
                crawl_depth = condition.value
                condition.applied = True
            if condition.column == "per_url_limit" and condition.op == FilterOperator.EQUAL:
                per_url_limit = condition.value
                condition.applied = True
            if condition.column.lower() == "user_agent" and condition.op == FilterOperator.EQUAL:
                headers["User-Agent"] = condition.value
                condition.applied = True

        if len(urls) == 0:
            raise NotImplementedError(
                'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"'
            )

        # Optional allowlist: when configured, every requested url must match it.
        allowed_urls = config.get("web_crawling_allowed_sites", [])
        if allowed_urls and not validate_urls(urls, allowed_urls):
            raise ValueError(
                f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}."
            )

        # Default to one page per url when no limiting option was given at all.
        if limit is None and per_url_limit is None and crawl_depth is None:
            per_url_limit = 1
        if per_url_limit is not None:
            # crawl every url separately so the per-url page limit applies to each
            results = []
            for url in urls:
                results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth, headers=headers))
            result = pd.concat(results)
        else:
            result = get_all_websites(urls, limit, crawl_depth=crawl_depth, headers=headers)

        # Enforce the overall SQL LIMIT on the combined dataframe.
        if limit is not None and len(result) > limit:
            result = result[:limit]

        return result

    def get_columns(self) -> List[str]:
        """
        Returns the columns of the crawler table.
        """
        return ["url", "text_content", "error"]

77 

78 

class WebHandler(APIHandler):
    """
    Web handler, handling crawling content from websites.
    """

    def __init__(self, name=None, **kwargs):
        # Register the single `crawler` table this handler exposes.
        super().__init__(name)
        self._register_table("crawler", CrawlerTable(self))

    def check_connection(self) -> HandlerStatusResponse:
        """
        Checks the connection to the web handler.
        @TODO: Implement a better check for the connection

        Returns:
            HandlerStatusResponse: Response containing the status of the connection. Hardcoded to True for now.
        """
        return HandlerStatusResponse(True)