Coverage for mindsdb / interfaces / knowledge_base / preprocessing / document_loader.py: 31%
47 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-21 00:36 +0000
1import os
2from typing import Iterator, List
4from mindsdb.interfaces.file.file_controller import FileController
5from mindsdb.integrations.utilities.rag.loaders.file_loader import FileLoader
6from mindsdb.integrations.utilities.rag.splitters.file_splitter import (
7 FileSplitter,
8)
9from mindsdb.interfaces.knowledge_base.preprocessing.models import Document
10from mindsdb.utilities import log
12try: # Optional web handler dependency
13 from mindsdb.integrations.handlers.web_handler.urlcrawl_helpers import get_all_websites
14except ImportError: # pragma: no cover - executed when web handler extras missing
15 get_all_websites = None
17logger = log.getLogger(__name__)
20def _get_langchain_document(feature: str):
21 try:
22 from langchain_core.documents import Document
23 except ModuleNotFoundError as exc: # pragma: no cover - runtime guard
24 if getattr(exc, "name", "").startswith("langchain") or "langchain" in str(exc):
25 raise ImportError(
26 f"{feature} requires the optional knowledge base dependencies. Install them via `pip install mindsdb[kb]`."
27 ) from exc
28 raise
29 return Document
class DocumentLoader:
    """Handles loading documents from various sources including files and web pages."""

    def __init__(
        self,
        file_controller: FileController,
        file_splitter: FileSplitter,
        file_loader_class=FileLoader,
        mysql_proxy=None,
    ):
        """
        Initialize with required dependencies

        Args:
            file_controller: Controller for file operations
            file_splitter: Splitter for file content
            file_loader_class: Class to use for file loading
            mysql_proxy: Proxy for executing MySQL queries
        """
        self.file_controller = file_controller
        self.file_splitter = file_splitter
        self.file_loader_class = file_loader_class
        self.mysql_proxy = mysql_proxy

    def _split_and_yield(self, doc) -> Iterator[Document]:
        """Split a single langchain document and yield knowledge-base Documents.

        The parent document's metadata is copied onto every split, with
        split-specific metadata (if any) overriding matching keys.
        """
        for split_doc in self.file_splitter.split_documents([doc]):
            metadata = doc.metadata.copy()
            metadata.update(split_doc.metadata or {})
            yield Document(content=split_doc.page_content, metadata=metadata)

    def load_files(self, file_names: List[str]) -> Iterator[Document]:
        """Load and split documents from files"""
        for file_name in file_names:
            file_path = self.file_controller.get_file_path(file_name)
            loader = self.file_loader_class(file_path)

            # The extension depends only on the file path, so compute it once
            # per file rather than once per loaded document.
            extension = os.path.splitext(file_path)[1].lower()

            for doc in loader.lazy_load():
                # Extension drives FileSplitter's choice of splitting strategy.
                doc.metadata["extension"] = extension
                doc.metadata["source"] = file_name
                yield from self._split_and_yield(doc)

    def load_web_pages(
        self,
        urls: List[str],
        crawl_depth: int,
        limit: int,
        filters: List[str] = None,
    ) -> Iterator[Document]:
        """Load and split documents from web pages

        Args:
            urls: Seed URLs to crawl
            crawl_depth: How many link levels deep to crawl
            limit: Maximum number of pages to fetch
            filters: Optional URL filter patterns forwarded to the crawler

        Raises:
            RuntimeError: when the optional web handler dependencies are missing
        """
        # Check availability BEFORE crawling; the previous duplicate check
        # after the call was unreachable and has been removed.
        if get_all_websites is None:
            raise RuntimeError(
                "Web crawling requires the optional web handler dependencies. "
                "Install them via `pip install mindsdb[web]` or skip web sources."
            )

        websites_df = get_all_websites(urls, crawl_depth=crawl_depth, limit=limit, filters=filters)

        LangchainDocument = _get_langchain_document("Web page ingestion for knowledge bases")

        for _, row in websites_df.iterrows():
            # Tag as HTML so FileSplitter applies its HTML splitting strategy.
            doc = LangchainDocument(
                page_content=row["text_content"], metadata={"extension": ".html", "url": row["url"]}
            )
            yield from self._split_and_yield(doc)