Coverage report for mindsdb/interfaces/knowledge_base/preprocessing/models.py: 92% of 63 statements covered (generated by coverage.py v7.13.1 at 2026-01-21 00:36 +0000).

from enum import Enum
from typing import List, Dict, Any, Optional, Union, Callable

from pydantic import BaseModel, ConfigDict, Field, model_validator

from mindsdb.integrations.utilities.rag.settings import DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE, LLMConfig

11 

12 

class PreprocessorType(Enum):
    """Enumerates the preprocessing strategies a knowledge base can apply."""

    CONTEXTUAL = "contextual"        # LLM-assisted contextual enrichment of chunks
    TEXT_CHUNKING = "text_chunking"  # plain recursive text splitting
    JSON_CHUNKING = "json_chunking"  # structure-aware chunking of JSON documents

17 

18 

class BasePreprocessingConfig(BaseModel):
    """Shared settings inherited by every preprocessing configuration.

    Subclasses may override the chunking defaults for their own strategy.
    """

    # Target size of each produced chunk.
    chunk_size: int = Field(DEFAULT_CHUNK_SIZE, description="Size of document chunks")
    # Amount of content shared between consecutive chunks.
    chunk_overlap: int = Field(DEFAULT_CHUNK_OVERLAP, description="Overlap between chunks")
    # Metadata key under which the originating document id is stored.
    doc_id_column_name: str = Field("_original_doc_id", description="Name of doc_id columns in metadata")

25 

26 

class ContextualConfig(BasePreprocessingConfig):
    """Settings specific to the contextual (LLM-assisted) preprocessing strategy."""

    # LLM used for generating per-chunk context; a fresh LLMConfig by default.
    llm_config: LLMConfig = Field(
        default_factory=LLMConfig, description="LLM configuration to use for context generation"
    )
    # Optional prompt template overriding the built-in context prompt.
    context_template: Optional[str] = Field(None, description="Custom template for context generation")
    # When True, chunks are returned as summarizations instead of raw text.
    summarize: Optional[bool] = Field(False, description="Whether to return chunks as summarizations")

35 

36 

class TextChunkingConfig(BasePreprocessingConfig):
    """Configuration for the text chunking preprocessor.

    Overrides the inherited chunking defaults (1000 / 200) and adds
    validation: ``chunk_size`` must be positive and ``chunk_overlap``
    non-negative.
    """

    # Pydantic v2 configuration (replaces the deprecated inner `class Config`);
    # arbitrary types are required for the Callable field below.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    chunk_size: int = Field(default=1000, description="The target size of each text chunk", gt=0)
    chunk_overlap: int = Field(default=200, description="The number of characters to overlap between chunks", ge=0)
    # Callable used to measure text length; defaults to built-in len.
    length_function: Callable = Field(default=len, description="Function to measure text length")
    # Tried in order: paragraph break, line break, space, then character split.
    separators: List[str] = Field(
        default=["\n\n", "\n", " ", ""],
        description="List of separators to use for splitting text, in order of priority",
    )

50 

51 

class JSONChunkingConfig(BasePreprocessingConfig):
    """Configuration for the JSON chunking preprocessor.

    Controls how JSON documents are flattened, which fields are chunked,
    and which fields are promoted into chunk metadata for filtering.
    """

    # Pydantic v2 configuration (replaces the deprecated inner `class Config`).
    model_config = ConfigDict(arbitrary_types_allowed=True)

    flatten_nested: bool = Field(default=True, description="Whether to flatten nested JSON structures")
    include_metadata: bool = Field(default=True, description="Whether to include original metadata in chunks")
    chunk_by_object: bool = Field(
        default=True, description="Whether to chunk by top-level objects (True) or create a single document (False)"
    )
    exclude_fields: List[str] = Field(default_factory=list, description="List of fields to exclude from chunking")
    include_fields: List[str] = Field(
        default_factory=list,
        description="List of fields to include in chunking (if empty, all fields except excluded ones are included)",
    )
    metadata_fields: List[str] = Field(
        default_factory=list,
        description="List of fields to extract into metadata for filtering "
        "(can include nested fields using dot notation). "
        "If empty, all primitive fields will be extracted (top-level fields if available, otherwise all primitive fields in the flattened structure).",
    )
    extract_all_primitives: bool = Field(
        default=False, description="Whether to extract all primitive values (strings, numbers, booleans) into metadata"
    )
    # Delimiter joining parent/child keys when nested structures are flattened.
    nested_delimiter: str = Field(default=".", description="Delimiter for flattened nested field names")
    content_column: str = Field(default="content", description="Name of the content column for chunk ID generation")

79 

80 

class PreprocessingConfig(BaseModel):
    """Complete preprocessing configuration.

    Selects one strategy via ``type`` and carries the optional per-strategy
    configuration. After validation, the sub-config matching ``type`` is
    guaranteed to be populated (with defaults when none was supplied), so
    consumers never need to null-check it.
    """

    type: PreprocessorType = Field(default=PreprocessorType.TEXT_CHUNKING, description="Type of preprocessing to apply")
    contextual_config: Optional[ContextualConfig] = Field(
        default=None, description="Configuration for contextual preprocessing"
    )
    text_chunking_config: Optional[TextChunkingConfig] = Field(
        default=None, description="Configuration for text chunking preprocessing"
    )
    json_chunking_config: Optional[JSONChunkingConfig] = Field(
        default=None, description="Configuration for JSON chunking preprocessing"
    )

    @model_validator(mode="after")
    def validate_config_presence(self) -> "PreprocessingConfig":
        """Ensure the appropriate config is present for the chosen type."""
        if self.type == PreprocessorType.CONTEXTUAL and not self.contextual_config:
            self.contextual_config = ContextualConfig()
        if self.type == PreprocessorType.TEXT_CHUNKING and not self.text_chunking_config:
            self.text_chunking_config = TextChunkingConfig()
        if self.type == PreprocessorType.JSON_CHUNKING and not self.json_chunking_config:
            # Use the JSONChunkingConfig defined in this module — it is the
            # field's declared type. The previous deferred import from
            # json_chunker shadowed this class under a "circular imports"
            # rationale that does not apply: the class lives in this file.
            self.json_chunking_config = JSONChunkingConfig()
        return self

108 

109 

class Document(BaseModel):
    """A document with default metadata handling.

    After validation, ``metadata`` is always a non-empty dict: an absent or
    empty value is replaced by ``{"source": "default"}``.
    """

    # Identifier may be numeric or string, or absent before insertion.
    id: Optional[Union[int, str]] = Field(default=None, description="Unique identifier for the document")
    content: str = Field(description="The document content")
    embeddings: Optional[List[float]] = Field(default=None, description="Vector embeddings of the content")
    metadata: Optional[Dict[str, Any]] = Field(default=None, description="Additional document metadata")

    @model_validator(mode="after")
    def validate_metadata(self) -> "Document":
        """Fall back to minimal default metadata when none was provided."""
        self.metadata = self.metadata or {"source": "default"}
        return self

124 

125 

class ProcessedChunk(Document):
    """A processed chunk; field layout aligns with the VectorStoreHandler schema."""