Coverage for mindsdb / integrations / handlers / web_handler / web_handler.py: 87%
56 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-21 00:36 +0000
1from typing import List
3import pandas as pd
4from mindsdb.integrations.libs.response import HandlerStatusResponse
5from mindsdb.utilities.config import config
6from mindsdb.utilities.security import validate_urls
7from .urlcrawl_helpers import get_all_websites
9from mindsdb.integrations.libs.api_handler import APIResource, APIHandler
10from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
class CrawlerTable(APIResource):
    def list(self, conditions: List[FilterCondition] = None, limit: int = None, **kwargs) -> pd.DataFrame:
        """
        Crawls the websites specified in the query conditions and returns
        their content as a dataframe.

        Supported WHERE-clause columns:
            url (= or IN)         -- required; the site(s) to crawl
            crawl_depth (=)       -- how many link levels deep to follow
            per_url_limit (=)     -- page limit applied to each url separately
            user_agent (=)        -- sent as the User-Agent request header

        Args:
            conditions: Parsed filter conditions from the SQL query.
            limit: Overall row limit (from the SQL LIMIT clause).

        Returns:
            dataframe: Dataframe containing the crawled data

        Raises:
            NotImplementedError: If no url condition is provided.
            ValueError: If a url is outside the configured allow-list.
        """
        urls = []
        crawl_depth = None
        per_url_limit = None
        headers = {}
        # Bug fix: `conditions` defaults to None, so guard the iteration
        # instead of crashing with a TypeError on a bare SELECT.
        for condition in conditions or []:
            if condition.column == "url":
                if condition.op == FilterOperator.IN:
                    urls = condition.value
                elif condition.op == FilterOperator.EQUAL:
                    urls = [condition.value]
                condition.applied = True
            if condition.column == "crawl_depth" and condition.op == FilterOperator.EQUAL:
                crawl_depth = condition.value
                condition.applied = True
            if condition.column == "per_url_limit" and condition.op == FilterOperator.EQUAL:
                per_url_limit = condition.value
                condition.applied = True
            if condition.column.lower() == "user_agent" and condition.op == FilterOperator.EQUAL:
                headers["User-Agent"] = condition.value
                condition.applied = True

        if not urls:
            raise NotImplementedError(
                'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"'
            )

        # Optional allow-list: when configured, every requested url must match.
        allowed_urls = config.get("web_crawling_allowed_sites", [])
        if allowed_urls and not validate_urls(urls, allowed_urls):
            raise ValueError(
                f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}."
            )

        # Default to one page per url when the query gives no sizing hints.
        if limit is None and per_url_limit is None and crawl_depth is None:
            per_url_limit = 1
        if per_url_limit is not None:
            # crawl every url separately
            results = [
                get_all_websites([url], per_url_limit, crawl_depth=crawl_depth, headers=headers)
                for url in urls
            ]
            result = pd.concat(results)
        else:
            result = get_all_websites(urls, limit, crawl_depth=crawl_depth, headers=headers)

        # Enforce the overall LIMIT even when per-url crawling over-fetched.
        if limit is not None and len(result) > limit:
            result = result[:limit]

        return result

    def get_columns(self):
        """
        Returns the columns of the crawler table
        """
        return ["url", "text_content", "error"]
class WebHandler(APIHandler):
    """
    Web handler, handling crawling content from websites.
    """

    def __init__(self, name=None, **kwargs):
        # Register the single virtual table this handler exposes.
        super().__init__(name)
        self._register_table("crawler", CrawlerTable(self))

    def check_connection(self) -> HandlerStatusResponse:
        """
        Checks the connection to the web handler
        @TODO: Implement a better check for the connection

        Returns:
            HandlerStatusResponse: Response containing the status of the connection. Hardcoded to True for now.
        """
        return HandlerStatusResponse(True)