Coverage for mindsdb / integrations / utilities / rag / splitters / file_splitter.py: 64%
74 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-21 00:36 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-21 00:36 +0000
1from dataclasses import dataclass
2from functools import lru_cache
3from typing import Callable, List, TYPE_CHECKING, Any
5from mindsdb.interfaces.knowledge_base.preprocessing.models import TextChunkingConfig
7from mindsdb.utilities import log
# Fallback chunking parameters, measured in characters.
DEFAULT_CHUNK_SIZE = 1000
DEFAULT_CHUNK_OVERLAP = 50

# (markdown header prefix, metadata label) pairs handed to the markdown header splitter.
DEFAULT_MARKDOWN_HEADERS_TO_SPLIT_ON = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

# (HTML tag, metadata label) pairs handed to the HTML header splitter.
DEFAULT_HTML_HEADERS_TO_SPLIT_ON = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
    ("h4", "Header 4"),
]
# langchain is an optional dependency: import Document only for static type
# checkers and fall back to ``Any`` at runtime so this module imports without it.
if TYPE_CHECKING:  # pragma: no cover - type checking only
    from langchain_core.documents import Document
else:
    Document = Any

# Module-level logger named after this module.
logger = log.getLogger(__name__)
def _require_kb_dependency(feature: str, exc: ModuleNotFoundError):
    """Raise an ImportError telling the user to install the knowledge base extras.

    Args:
        feature: Name of the feature that needs the optional dependencies;
            it opens the error message.
        exc: The original import failure. Its ``name`` is reported as the
            missing module (or "required module" when no name is set) and it
            is chained as the cause of the new error.

    Raises:
        ImportError: Always.
    """
    module_name = exc.name if exc.name else "required module"
    message = (
        f"{feature} requires the optional knowledge base dependencies (missing {module_name}). "
        "Install them via `pip install mindsdb[kb]`."
    )
    raise ImportError(message) from exc
@lru_cache(maxsize=1)
def _load_splitter_dependencies():
    """Import the langchain classes needed for splitting and return them as a tuple.

    The imports live inside the function so the module can be imported without
    langchain installed; ``lru_cache`` keeps the result of the first successful
    call so later calls reuse it.

    Returns:
        tuple: ``(Document, MarkdownHeaderTextSplitter, HTMLHeaderTextSplitter,
        RecursiveCharacterTextSplitter)`` in that order.
    """
    from langchain_core.documents import Document as LangchainDocument
    from langchain_text_splitters import (
        MarkdownHeaderTextSplitter,
        HTMLHeaderTextSplitter,
        RecursiveCharacterTextSplitter,
    )

    dependencies = (
        LangchainDocument,
        MarkdownHeaderTextSplitter,
        HTMLHeaderTextSplitter,
        RecursiveCharacterTextSplitter,
    )
    return dependencies
def _get_splitter_dependencies(feature: str):
    """Return the lazily imported langchain splitter classes.

    Args:
        feature: Name of the feature requesting the dependencies; used in the
            error message when the optional packages are missing.

    Returns:
        The tuple produced by ``_load_splitter_dependencies``.

    Raises:
        ImportError: If the failed import concerns a langchain module; the
            message carries the install hint from ``_require_kb_dependency``.
        ModuleNotFoundError: Re-raised unchanged when the missing module is
            unrelated to langchain.
    """
    try:
        return _load_splitter_dependencies()
    except ModuleNotFoundError as exc:  # pragma: no cover - runtime guard
        # ``name`` always exists on ModuleNotFoundError but is None when the
        # exception was raised without a module name, so a getattr default
        # alone never applies; normalise None to "" before calling startswith.
        missing_module = getattr(exc, "name", None) or ""
        if missing_module.startswith("langchain") or "langchain" in str(exc):
            _require_kb_dependency(feature, exc)
        raise
@dataclass
class FileSplitterConfig:
    """Chunking parameters and splitter instances used to split files for retrieval.

    Every field is optional. Fields left as ``None`` are filled in by
    ``__post_init__``; the langchain classes are only loaded when at least one
    splitter has to be built.
    """

    # Target chunk size in characters. Splitters treat this as a guideline, not a hard limit.
    chunk_size: int = DEFAULT_CHUNK_SIZE
    # Number of characters neighbouring chunks should share. Also only a guideline.
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
    # Full chunking parameters; derived from chunk_size / chunk_overlap when omitted.
    text_chunking_config: TextChunkingConfig = None
    # Recursive splitter used for plain text and for unsupported file types.
    recursive_splitter: Any = None
    # Splitter used for markdown files.
    markdown_splitter: Any = None
    # Splitter used for HTML files.
    html_splitter: Any = None

    def __post_init__(self):
        """Build the chunking config and any splitter the caller did not supply."""
        feature = "Knowledge base document splitting"

        chunking = self.text_chunking_config
        if chunking is None:
            chunking = TextChunkingConfig(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
            self.text_chunking_config = chunking

        if self.recursive_splitter is None:
            recursive_cls = _get_splitter_dependencies(feature)[3]
            self.recursive_splitter = recursive_cls(
                chunk_size=chunking.chunk_size,
                chunk_overlap=chunking.chunk_overlap,
                length_function=chunking.length_function,
                separators=chunking.separators,
            )

        if self.markdown_splitter is None:
            markdown_cls = _get_splitter_dependencies(feature)[1]
            self.markdown_splitter = markdown_cls(headers_to_split_on=DEFAULT_MARKDOWN_HEADERS_TO_SPLIT_ON)

        if self.html_splitter is None:
            html_cls = _get_splitter_dependencies(feature)[2]
            self.html_splitter = html_cls(headers_to_split_on=DEFAULT_HTML_HEADERS_TO_SPLIT_ON)
class FileSplitter:
    """Chunks Documents that represent files, picking a splitter by file extension."""

    def __init__(self, config: FileSplitterConfig):
        """
        Args:
            config (FileSplitterConfig): Configuration holding the splitter instances to use.
        """
        self.config = config
        # Anything not listed in the extension map is handled by the recursive splitter.
        self.default_splitter = self._recursive_splitter_fn
        self._extension_map = {
            ".pdf": self._recursive_splitter_fn,
            ".md": self._markdown_splitter_fn,
            ".html": self._html_splitter_fn,
        }

    def _split_func_by_extension(self, extension) -> Callable:
        """Return the split callable for ``extension``, or the default one if it is not mapped."""
        factory = self._extension_map.get(extension, self.default_splitter)
        return factory()

    def split_documents(self, documents: List["Document"], default_failover: bool = True) -> List["Document"]:
        """Split each document with the splitter registered for its ``extension`` metadata.

        Args:
            documents (List[Document]): Documents representing the files to split.
            default_failover (bool, optional): When True, a document whose splitter fails is retried
                with the default splitter. When False, the failure is raised as a ValueError. Defaults to True.

        Returns:
            List[Document]: The chunks of all input documents, in input order.

        Raises:
            ValueError: If a splitter fails and ``default_failover`` is False.
        """
        chunks = []
        for doc in documents:
            extension = doc.metadata.get("extension")
            splitter = self._split_func_by_extension(extension=extension)
            try:
                chunks.extend(splitter(doc.page_content))
            except Exception as e:
                logger.exception(f"Error splitting document with extension {extension}:")
                if not default_failover:
                    raise ValueError(f"Error splitting document with extension {extension}") from e
                # Failover is enabled: retry with the default splitter.
                fallback = self._split_func_by_extension(extension=None)
                chunks.extend(fallback(doc.page_content))
        return chunks

    def _markdown_splitter_fn(self) -> Callable:
        """Return the configured markdown splitter's ``split_text`` method."""
        return self.config.markdown_splitter.split_text

    def _html_splitter_fn(self) -> Callable:
        """Return the configured HTML splitter's ``split_text`` method."""
        return self.config.html_splitter.split_text

    def _recursive_splitter_fn(self) -> Callable:
        """Return a callable that splits text recursively and wraps each piece in a Document."""

        # The recursive splitter's split_text yields plain strings, so each
        # piece is wrapped in a Document to match the other splitters.
        def recursive_split(content: str) -> List["Document"]:
            document_cls = _get_splitter_dependencies("Knowledge base document splitting")[0]
            pieces = self.config.recursive_splitter.split_text(content)
            return [document_cls(page_content=piece) for piece in pieces]

        return recursive_split