Coverage for mindsdb / integrations / utilities / rag / splitters / file_splitter.py: 64%

74 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-21 00:36 +0000

1from dataclasses import dataclass 

2from functools import lru_cache 

3from typing import Callable, List, TYPE_CHECKING, Any 

4 

5from mindsdb.interfaces.knowledge_base.preprocessing.models import TextChunkingConfig 

6 

7from mindsdb.utilities import log 

8 

# Fallback chunking parameters, expressed in characters.
DEFAULT_CHUNK_SIZE = 1_000
DEFAULT_CHUNK_OVERLAP = 50

# (marker, label) pairs handed to the Markdown header splitter: "#" through "###".
DEFAULT_MARKDOWN_HEADERS_TO_SPLIT_ON = [("#" * level, f"Header {level}") for level in range(1, 4)]

# (tag, label) pairs handed to the HTML header splitter: "h1" through "h4".
DEFAULT_HTML_HEADERS_TO_SPLIT_ON = [(f"h{level}", f"Header {level}") for level in range(1, 5)]

# The langchain packages are optional, so ``Document`` is only imported for
# static type checkers; at runtime the name is bound to ``Any`` instead.
if not TYPE_CHECKING:
    Document = Any
else:  # pragma: no cover - type checking only
    from langchain_core.documents import Document

# Module-level logger, named after this module.
logger = log.getLogger(__name__)

28 

29 

30def _require_kb_dependency(feature: str, exc: ModuleNotFoundError): 

31 missing = exc.name or "required module" 

32 raise ImportError( 

33 f"{feature} requires the optional knowledge base dependencies (missing {missing}). " 

34 "Install them via `pip install mindsdb[kb]`." 

35 ) from exc 

36 

37 

38@lru_cache(maxsize=1) 

39def _load_splitter_dependencies(): 

40 from langchain_core.documents import Document as LangchainDocument 

41 from langchain_text_splitters import ( 

42 MarkdownHeaderTextSplitter, 

43 HTMLHeaderTextSplitter, 

44 RecursiveCharacterTextSplitter, 

45 ) 

46 

47 return LangchainDocument, MarkdownHeaderTextSplitter, HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter 

48 

49 

def _get_splitter_dependencies(feature: str):
    """Return the langchain splitter classes, or explain how to install them.

    Args:
        feature: Name of the feature requesting the classes; used in the error
            message when langchain is missing.

    Returns:
        tuple: The 4-tuple produced by ``_load_splitter_dependencies()``.

    Raises:
        ImportError: If the failed import concerns a langchain module; the
            message points to ``pip install mindsdb[kb]``.
        ModuleNotFoundError: Re-raised unchanged for any other missing module.
    """
    try:
        return _load_splitter_dependencies()
    except ModuleNotFoundError as exc:  # pragma: no cover - runtime guard
        # ``name`` is always present on import errors but is None unless it
        # was set, so a getattr() default does not protect against None.
        missing_module = exc.name or ""
        if missing_module.startswith("langchain") or "langchain" in str(exc):
            _require_kb_dependency(feature, exc)
        raise

57 

58 

@dataclass
class FileSplitterConfig:
    """Settings and splitter instances used to chunk files for retrieval.

    Every field is optional. Anything left as ``None`` is filled in by
    ``__post_init__``: the chunking config is built from ``chunk_size`` and
    ``chunk_overlap``, and each missing splitter is created from the langchain
    classes returned by ``_get_splitter_dependencies``.
    """

    # Target chunk size in characters. Not all splitters will adhere exactly to this (it's more of a guideline)
    chunk_size: int = DEFAULT_CHUNK_SIZE
    # Number of characters shared between neighbouring chunks. Also a guideline rather than a guarantee.
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
    # Full chunking parameters; when given, these are used instead of chunk_size / chunk_overlap.
    text_chunking_config: TextChunkingConfig = None
    # Recursive splitter used for text files and for unsupported file types.
    recursive_splitter: Any = None
    # Splitter used for Markdown content.
    markdown_splitter: Any = None
    # Splitter used for HTML content.
    html_splitter: Any = None

    def __post_init__(self):
        """Build the chunking config and any splitter the caller did not supply."""
        feature = "Knowledge base document splitting"

        chunking = self.text_chunking_config
        if chunking is None:
            chunking = TextChunkingConfig(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
            self.text_chunking_config = chunking

        # The dependency tuple is ordered (Document, Markdown, HTML, Recursive);
        # langchain is only looked up when a splitter actually has to be built.
        if self.recursive_splitter is None:
            recursive_cls = _get_splitter_dependencies(feature)[3]
            self.recursive_splitter = recursive_cls(
                chunk_size=chunking.chunk_size,
                chunk_overlap=chunking.chunk_overlap,
                length_function=chunking.length_function,
                separators=chunking.separators,
            )

        if self.markdown_splitter is None:
            markdown_cls = _get_splitter_dependencies(feature)[1]
            self.markdown_splitter = markdown_cls(headers_to_split_on=DEFAULT_MARKDOWN_HEADERS_TO_SPLIT_ON)

        if self.html_splitter is None:
            html_cls = _get_splitter_dependencies(feature)[2]
            self.html_splitter = html_cls(headers_to_split_on=DEFAULT_HTML_HEADERS_TO_SPLIT_ON)

97 

98 

class FileSplitter:
    """Splits Documents that represent various file types into chunks for retrieval.

    The splitter is chosen from the ``"extension"`` entry of each document's
    metadata: ``.md`` uses the Markdown splitter, ``.html`` the HTML splitter,
    and ``.pdf`` or anything unrecognised the recursive character splitter.
    """

    def __init__(self, config: "FileSplitterConfig"):
        """
        Args:
            config (FileSplitterConfig): Configuration for the file splitter.
        """
        self.config = config
        # Values are zero-argument factories that return the actual split function.
        self._extension_map = {
            ".pdf": self._recursive_splitter_fn,
            ".md": self._markdown_splitter_fn,
            ".html": self._html_splitter_fn,
        }
        self.default_splitter = self._recursive_splitter_fn

    def _split_func_by_extension(self, extension) -> Callable:
        """Return the split function for ``extension``, or the default one.

        Args:
            extension: File extension including the leading dot (e.g. ``".md"``),
                or ``None`` to request the default splitter.

        Returns:
            Callable: Function that takes the text content and returns the chunks.
        """
        # File names may carry upper-case suffixes (".MD", ".HTML") while the
        # map keys are lower-case, so match case-insensitively.
        key = extension.lower() if isinstance(extension, str) else extension
        factory = self._extension_map.get(key, self.default_splitter)
        return factory()

    def split_documents(self, documents: List["Document"], default_failover: bool = True) -> List["Document"]:
        """Splits a list of documents representing files using the appropriate splitting & chunking strategies

        Args:
            documents (List[Document]): List of documents representing files to split.
            default_failover (bool, optional): Whether to retry with the default splitter when the
                extension-specific splitter raises. Defaults to True.

        Returns:
            List[Document]: List of documents representing the split files.

        Raises:
            ValueError: If splitting a document fails and ``default_failover`` is False.
            Exception: Whatever the default splitter raises if the failover attempt fails too.
        """
        chunks = []
        for document in documents:
            extension = document.metadata.get("extension")
            split_func = self._split_func_by_extension(extension=extension)
            try:
                chunks += split_func(document.page_content)
            except Exception as e:
                logger.exception(f"Error splitting document with extension {extension}:")
                if not default_failover:
                    raise ValueError(f"Error splitting document with extension {extension}") from e
                # Try default splitter as a failover, if enabled.
                split_func = self._split_func_by_extension(extension=None)
                chunks += split_func(document.page_content)
        return chunks

    def _markdown_splitter_fn(self) -> Callable:
        """Return the ``split_text`` method of the configured Markdown splitter."""
        return self.config.markdown_splitter.split_text

    def _html_splitter_fn(self) -> Callable:
        """Return the ``split_text`` method of the configured HTML splitter."""
        return self.config.html_splitter.split_text

    def _recursive_splitter_fn(self) -> Callable:
        """Return a function that splits text recursively and wraps each piece in a Document."""

        # Recursive splitter is a TextSplitter where split_text returns List[str].
        def recursive_split(content: str) -> List["Document"]:
            LangchainDocument, _, _, _ = _get_splitter_dependencies("Knowledge base document splitting")
            split_content = self.config.recursive_splitter.split_text(content)
            return [LangchainDocument(page_content=c) for c in split_content]

        return recursive_split

155 

156 return recursive_split