Coverage for mindsdb/integrations/utilities/rag/loaders/file_loader.py

1import pathlib

2from typing import Iterator, List, Any

4_FILE_LOADER_IMPORT_ERROR = None

class _BaseLoaderFallback:
    """Placeholder base class for ``FileLoader`` when LangChain is unavailable.

    It exists only so the ``FileLoader`` class statement can execute without
    the optional dependency; it provides no loading behaviour of its own.
    """

    def __init__(self, *args, **kwargs):
        # Accept and discard any constructor arguments.
        return None

# LangChain is an optional dependency. Each import group below substitutes a
# placeholder when its module is missing, and the first failure is kept in
# _FILE_LOADER_IMPORT_ERROR so it can be chained to a later ImportError.
try:  # pragma: no cover - optional dependency
    from langchain_core.document_loaders import BaseLoader as _LangchainBaseLoader

    BaseLoader = _LangchainBaseLoader
except ModuleNotFoundError as exc:  # LangChain not installed
    _FILE_LOADER_IMPORT_ERROR = exc
    BaseLoader = _BaseLoaderFallback

try:  # pragma: no cover - optional dependency
    from langchain_core.documents.base import Document
except ModuleNotFoundError as exc:
    # Annotations only need *some* object to refer to.
    Document = Any
    if _FILE_LOADER_IMPORT_ERROR is None:
        _FILE_LOADER_IMPORT_ERROR = exc

try:  # pragma: no cover - optional dependency
    from langchain_community.document_loaders.csv_loader import CSVLoader
    from langchain_community.document_loaders import (
        PyMuPDFLoader,
        TextLoader,
        UnstructuredHTMLLoader,
        UnstructuredMarkdownLoader,
    )
except ModuleNotFoundError as exc:
    # Keep every loader name defined so the module still imports cleanly.
    CSVLoader = None
    PyMuPDFLoader = None
    TextLoader = None
    UnstructuredHTMLLoader = None
    UnstructuredMarkdownLoader = None
    if _FILE_LOADER_IMPORT_ERROR is None:
        _FILE_LOADER_IMPORT_ERROR = exc

def _require_file_loader_dependency():
    """Raise ``ImportError`` if an optional loader import was recorded as failed.

    Returns ``None`` when ``_FILE_LOADER_IMPORT_ERROR`` is unset.

    Raises:
        ImportError: Chained from the recorded import failure.
    """
    if _FILE_LOADER_IMPORT_ERROR is None:
        return

    message = "File loading requires the optional knowledge base dependencies. Install them via `pip install mindsdb[kb]`."
    raise ImportError(message) from _FILE_LOADER_IMPORT_ERROR

class FileLoader(BaseLoader):
    """Loads files of various types into vector database document representation.

    The underlying loader is chosen from the file's extension, compared
    case-insensitively; any unrecognised extension is read as UTF-8 text.
    """

    def __init__(self, path: str):
        """Remember the path of the file to load.

        Args:
            path: Location of the file to load.

        Raises:
            ImportError: If the optional knowledge base dependencies are missing.
        """
        _require_file_loader_dependency()
        self.path = path
        super().__init__()

    def _get_loader_from_extension(self, extension: str, path: str) -> BaseLoader:
        """Return the loader that handles ``extension`` for the file at ``path``.

        Args:
            extension: File suffix including the leading dot, in any case.
            path: Location of the file handed to the chosen loader.
        """
        # Normalise case so ".PDF" or ".Csv" is not silently treated as plain text.
        normalized = extension.lower()
        if normalized == ".pdf":
            return PyMuPDFLoader(path)
        if normalized == ".csv":
            return CSVLoader(path)
        if normalized == ".html":
            return UnstructuredHTMLLoader(path)
        if normalized == ".md":
            return UnstructuredMarkdownLoader(path)
        # Everything else is read as UTF-8 text.
        return TextLoader(path, encoding="utf-8")

    def _lazy_load_documents_from_file(self, path: str) -> Iterator[Document]:
        """Yield the documents of the file at ``path``, tagged with its extension."""
        file_extension = pathlib.Path(path).suffix
        loader = self._get_loader_from_extension(file_extension, path)

        for doc in loader.lazy_load():
            # The metadata keeps the suffix exactly as it appears in the path.
            doc.metadata["extension"] = file_extension
            yield doc

    def load(self) -> List[Document]:
        """Loads a file and converts the contents into a vector database Document representation"""
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yields the file's contents as vector database Document objects."""
        yield from self._lazy_load_documents_from_file(self.path)

Coverage for mindsdb/integrations/utilities/rag/loaders/file_loader.py: 22%

45 statements