Coverage for mindsdb / integrations / utilities / rag / loaders / file_loader.py: 22%

45 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-21 00:36 +0000

1import pathlib 

2from typing import Iterator, List, Any 

3 

4_FILE_LOADER_IMPORT_ERROR = None 

5 

6 

class _BaseLoaderFallback:
    """Placeholder base class used when LangChain is not installed.

    It exists so that ``FileLoader`` can still be declared at import time;
    the constructor accepts and discards whatever arguments it is given.
    """

    def __init__(self, *args, **kwargs):
        # Nothing to set up: arguments are accepted only for call compatibility.
        return None

12 

13 

14try: # pragma: no cover - optional dependency 

15 from langchain_core.document_loaders import BaseLoader as _LangchainBaseLoader 

16except ModuleNotFoundError as exc: # LangChain not installed 

17 BaseLoader = _BaseLoaderFallback 

18 _FILE_LOADER_IMPORT_ERROR = exc 

19else: 

20 BaseLoader = _LangchainBaseLoader 

21 

22try: # pragma: no cover - optional dependency 

23 from langchain_core.documents.base import Document 

24except ModuleNotFoundError as exc: 

25 Document = Any 

26 _FILE_LOADER_IMPORT_ERROR = _FILE_LOADER_IMPORT_ERROR or exc 

27 

28try: # pragma: no cover - optional dependency 

29 from langchain_community.document_loaders.csv_loader import CSVLoader 

30 from langchain_community.document_loaders import ( 

31 PyMuPDFLoader, 

32 TextLoader, 

33 UnstructuredHTMLLoader, 

34 UnstructuredMarkdownLoader, 

35 ) 

36except ModuleNotFoundError as exc: 

37 CSVLoader = PyMuPDFLoader = TextLoader = UnstructuredHTMLLoader = UnstructuredMarkdownLoader = None 

38 _FILE_LOADER_IMPORT_ERROR = _FILE_LOADER_IMPORT_ERROR or exc 

39 

40 

def _require_file_loader_dependency():
    """Raise if an optional import needed for file loading failed.

    Returns:
        None when no import error was recorded at module load.

    Raises:
        ImportError: Chained from the import failure recorded in
            ``_FILE_LOADER_IMPORT_ERROR``.
    """
    recorded_error = _FILE_LOADER_IMPORT_ERROR
    if recorded_error is None:
        return

    message = (
        "File loading requires the optional knowledge base dependencies. Install them via `pip install mindsdb[kb]`."
    )
    # Chain the original failure so the missing module stays visible in the traceback.
    raise ImportError(message) from recorded_error

46 

47 

class FileLoader(BaseLoader):
    """Loads files of various types into vector database document representation.

    The concrete loader is chosen from the file's extension (matched
    case-insensitively); any extension without a dedicated loader is read as
    UTF-8 text.
    """

    def __init__(self, path: str):
        """Remember the file to load.

        Args:
            path: Location of the file to load.

        Raises:
            ImportError: If the optional knowledge base dependencies failed to
                import when this module was loaded.
        """
        # Fail fast: when the optional imports failed, the concrete loader
        # names below may be None and would only error at load time.
        _require_file_loader_dependency()
        self.path = path
        super().__init__()

    def _get_loader_from_extension(self, extension: str, path: str) -> BaseLoader:
        """Build the loader matching a file extension.

        Args:
            extension: File suffix including the leading dot (e.g. ``".pdf"``).
                Matching ignores case, so ``".PDF"`` selects the PDF loader.
            path: Location of the file handed to the loader.

        Returns:
            A loader instance for ``path``; a UTF-8 ``TextLoader`` when the
            extension has no dedicated loader.
        """
        # The table is built per call so the loader names are resolved at
        # call time rather than at class-definition time.
        dedicated_loaders = {
            ".pdf": PyMuPDFLoader,
            ".csv": CSVLoader,
            ".html": UnstructuredHTMLLoader,
            ".md": UnstructuredMarkdownLoader,
        }
        # Normalise case: suffixes such as ".PDF" or ".Csv" must not fall
        # through to the plain-text loader.
        loader_cls = dedicated_loaders.get(extension.lower())
        if loader_cls is None:
            return TextLoader(path, encoding="utf-8")
        return loader_cls(path)

    def _lazy_load_documents_from_file(self, path: str) -> Iterator[Document]:
        """Yield documents from ``path``, tagging each with the file extension.

        Args:
            path: Location of the file to load.

        Yields:
            Documents produced by the selected loader, with
            ``metadata["extension"]`` set to the file's suffix as written
            (original case preserved).
        """
        file_extension = pathlib.Path(path).suffix
        loader = self._get_loader_from_extension(file_extension, path)

        for doc in loader.lazy_load():
            doc.metadata["extension"] = file_extension
            yield doc

    def load(self) -> List[Document]:
        """Loads a file and converts the contents into a vector database Document representation"""
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        """Lazily loads a file and converts the contents into a vector database Document representation"""
        yield from self._lazy_load_documents_from_file(self.path)