Coverage for mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py: 31% (47 statements)


import os
from typing import Iterator, List, Optional

from mindsdb.interfaces.file.file_controller import FileController
from mindsdb.integrations.utilities.rag.loaders.file_loader import FileLoader
from mindsdb.integrations.utilities.rag.splitters.file_splitter import (
    FileSplitter,
)
from mindsdb.interfaces.knowledge_base.preprocessing.models import Document
from mindsdb.utilities import log

try:  # Optional web handler dependency
    from mindsdb.integrations.handlers.web_handler.urlcrawl_helpers import get_all_websites
except ImportError:  # pragma: no cover - executed when web handler extras are missing
    get_all_websites = None

logger = log.getLogger(__name__)
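# Note: the sentinel import above keeps this module importable when the web
# handler extras are absent; `load_web_pages` below checks
# `get_all_websites is None` before crawling and fails with an actionable error.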

def _get_langchain_document(feature: str):
    """Import and return langchain's ``Document`` class, or raise a helpful error."""
    try:
        from langchain_core.documents import Document
    except ModuleNotFoundError as exc:  # pragma: no cover - runtime guard
        # ``exc.name`` can be ``None``, so normalize it before calling ``startswith``.
        if (getattr(exc, "name", "") or "").startswith("langchain") or "langchain" in str(exc):
            raise ImportError(
                f"{feature} requires the optional knowledge base dependencies. "
                "Install them via `pip install mindsdb[kb]`."
            ) from exc
        raise
    return Document
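# A minimal usage sketch of the guard above (a hypothetical helper, not part of
# this module's API): callers name the feature so the error message tells users
# which capability needs the `mindsdb[kb]` extras.
def _example_langchain_guard():
    # Raises ImportError with install instructions if langchain is unavailable.
    LangchainDocument = _get_langchain_document("This example")
    return LangchainDocument(page_content="hello", metadata={"source": "example"})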

class DocumentLoader:
    """Handles loading documents from various sources, including files, web pages, and SQL queries."""

    def __init__(
        self,
        file_controller: FileController,
        file_splitter: FileSplitter,
        file_loader_class=FileLoader,
        mysql_proxy=None,
    ):
        """
        Initialize with required dependencies.

        Args:
            file_controller: Controller for file operations
            file_splitter: Splitter for file content
            file_loader_class: Class to use for file loading
            mysql_proxy: Proxy for executing MySQL queries
        """
        self.file_controller = file_controller
        self.file_splitter = file_splitter
        self.file_loader_class = file_loader_class
        self.mysql_proxy = mysql_proxy
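    # A minimal construction sketch (hypothetical wiring; a real caller would
    # obtain these collaborators from the MindsDB session context):
    #
    #     loader = DocumentLoader(
    #         file_controller=FileController(),
    #         file_splitter=file_splitter,  # a configured FileSplitter instance
    #     )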

    def load_files(self, file_names: List[str]) -> Iterator[Document]:
        """Load and split documents from files."""
        for file_name in file_names:
            file_path = self.file_controller.get_file_path(file_name)
            loader = self.file_loader_class(file_path)

            # The extension is constant per file, so compute it once before the loop.
            extension = os.path.splitext(file_path)[1].lower()

            for doc in loader.lazy_load():
                # Add file extension to metadata for proper splitting
                doc.metadata["extension"] = extension
                doc.metadata["source"] = file_name

                # Use FileSplitter to handle the document based on its type
                split_docs = self.file_splitter.split_documents([doc])
                for split_doc in split_docs:
                    # Preserve original metadata while adding split-specific metadata
                    metadata = doc.metadata.copy()
                    metadata.update(split_doc.metadata or {})

                    yield Document(content=split_doc.page_content, metadata=metadata)
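    # A minimal iteration sketch (the file name is a hypothetical example; it
    # must already be registered with the FileController):
    #
    #     for doc in loader.load_files(["my_report.pdf"]):
    #         print(doc.metadata["source"], doc.metadata["extension"], len(doc.content))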

    def load_web_pages(
        self,
        urls: List[str],
        crawl_depth: int,
        limit: int,
        filters: Optional[List[str]] = None,
    ) -> Iterator[Document]:
        """Load and split documents from web pages."""
        if get_all_websites is None:
            raise RuntimeError(
                "Web crawling requires the optional web handler dependencies. "
                "Install them via `pip install mindsdb[web]` or skip web sources."
            )

        websites_df = get_all_websites(urls, crawl_depth=crawl_depth, limit=limit, filters=filters)

        LangchainDocument = _get_langchain_document("Web page ingestion for knowledge bases")

        for _, row in websites_df.iterrows():
            # Create a document with HTML extension for proper splitting
            doc = LangchainDocument(
                page_content=row["text_content"], metadata={"extension": ".html", "url": row["url"]}
            )

            # Use FileSplitter to handle HTML content
            split_docs = self.file_splitter.split_documents([doc])
            for split_doc in split_docs:
                metadata = doc.metadata.copy()
                metadata.update(split_doc.metadata or {})

                yield Document(content=split_doc.page_content, metadata=metadata)
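
# A minimal crawl sketch (the URL and limits are hypothetical examples; the
# `mindsdb[web]` extras must be installed for this to run):
#
#     for doc in loader.load_web_pages(["https://docs.example.com"], crawl_depth=1, limit=10):
#         logger.debug("crawled %s (%d chars)", doc.metadata["url"], len(doc.content))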