Coverage for mindsdb / integrations / utilities / rag / loaders / file_loader.py: 22%
45 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-21 00:36 +0000
1import pathlib
2from typing import Iterator, List, Any
# Holds the first ModuleNotFoundError raised by the optional LangChain imports
# in this module; remains None when none of them fail.
_FILE_LOADER_IMPORT_ERROR = None


class _BaseLoaderFallback:
    """Placeholder base class for when LangChain's ``BaseLoader`` is missing.

    Its only job is to let ``FileLoader`` be declared at import time; the
    constructor accepts any arguments and stores nothing.
    """

    def __init__(self, *_args, **_kwargs):
        return None
# Optional LangChain imports. Each group is attempted on its own; a missing
# module binds a placeholder instead of failing the import of this file, and
# the first failure seen is kept in _FILE_LOADER_IMPORT_ERROR.

try:  # pragma: no cover - optional dependency
    from langchain_core.document_loaders import BaseLoader as _LangchainBaseLoader
except ModuleNotFoundError as err:  # LangChain not installed
    _FILE_LOADER_IMPORT_ERROR = err
    BaseLoader = _BaseLoaderFallback
else:
    BaseLoader = _LangchainBaseLoader

try:  # pragma: no cover - optional dependency
    from langchain_core.documents.base import Document
except ModuleNotFoundError as err:
    # Only used in annotations below, so any placeholder type will do.
    Document = Any
    if _FILE_LOADER_IMPORT_ERROR is None:
        _FILE_LOADER_IMPORT_ERROR = err

try:  # pragma: no cover - optional dependency
    from langchain_community.document_loaders.csv_loader import CSVLoader
    from langchain_community.document_loaders import (
        PyMuPDFLoader,
        TextLoader,
        UnstructuredHTMLLoader,
        UnstructuredMarkdownLoader,
    )
except ModuleNotFoundError as err:
    # Bind every loader name so later references resolve even without the package.
    CSVLoader = None
    PyMuPDFLoader = None
    TextLoader = None
    UnstructuredHTMLLoader = None
    UnstructuredMarkdownLoader = None
    if _FILE_LOADER_IMPORT_ERROR is None:
        _FILE_LOADER_IMPORT_ERROR = err
def _require_file_loader_dependency():
    """Fail fast when an optional import failure has been recorded.

    Raises:
        ImportError: If ``_FILE_LOADER_IMPORT_ERROR`` is not ``None``. The
            recorded exception is attached as the cause.
    """
    if _FILE_LOADER_IMPORT_ERROR is None:
        return

    message = (
        "File loading requires the optional knowledge base dependencies. "
        "Install them via `pip install mindsdb[kb]`."
    )
    raise ImportError(message) from _FILE_LOADER_IMPORT_ERROR
class FileLoader(BaseLoader):
    """Loads files of various types into vector database document representation.

    The concrete loader is chosen from the file's extension, compared
    case-insensitively. PDF, CSV, HTML and Markdown have dedicated loaders;
    every other extension is read as UTF-8 text.
    """

    def __init__(self, path: str):
        """Store the path of the file to load.

        Args:
            path: Location of the file to load.

        Raises:
            ImportError: If the optional knowledge base dependencies are not
                installed.
        """
        # Checked first so a missing dependency surfaces as a clear ImportError
        # rather than as a failure when a loader is instantiated later.
        _require_file_loader_dependency()
        self.path = path
        super().__init__()

    def _get_loader_from_extension(self, extension: str, path: str) -> BaseLoader:
        """Return a loader instance suited to ``extension``.

        Args:
            extension: File suffix including the leading dot (e.g. ``".pdf"``).
                Matching ignores case, so ``".PDF"`` selects the PDF loader.
            path: File handed to the selected loader.

        Returns:
            The loader for ``path``; a UTF-8 ``TextLoader`` when the extension
            has no dedicated loader.
        """
        # Built per call so the loader names are resolved at call time.
        dedicated_loaders = {
            ".pdf": PyMuPDFLoader,
            ".csv": CSVLoader,
            ".html": UnstructuredHTMLLoader,
            ".md": UnstructuredMarkdownLoader,
        }
        # Normalise case: otherwise "REPORT.PDF" would be treated as plain text.
        normalized = extension.lower()
        if normalized in dedicated_loaders:
            return dedicated_loaders[normalized](path)
        return TextLoader(path, encoding="utf-8")

    def _lazy_load_documents_from_file(self, path: str) -> Iterator[Document]:
        """Yield the documents of ``path``, tagging each with its extension.

        Args:
            path: File to load.

        Yields:
            Each document produced by the selected loader, with
            ``metadata["extension"]`` set to the file's suffix.
        """
        file_extension = pathlib.Path(path).suffix
        loader = self._get_loader_from_extension(file_extension, path)

        for doc in loader.lazy_load():
            # Stored exactly as it appears in the file name (case preserved).
            doc.metadata["extension"] = file_extension
            yield doc

    def load(self) -> List[Document]:
        """Load the whole file eagerly and return its documents as a list."""
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        """Yield the file's documents one at a time."""
        yield from self._lazy_load_documents_from_file(self.path)