Coverage report for mindsdb/interfaces/knowledge_base/preprocessing/models.py: 92% of 63 statements covered (generated by coverage.py v7.13.1 at 2026-01-21 00:36 +0000).

from enum import Enum
from typing import List, Dict, Any, Optional, Union, Callable

from pydantic import BaseModel, ConfigDict, Field, model_validator

from mindsdb.integrations.utilities.rag.settings import DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE, LLMConfig

11 

12 

class PreprocessorType(Enum):
    """Enumerates the preprocessing strategies a knowledge base can apply."""

    CONTEXTUAL = "contextual"        # LLM-assisted contextual enrichment of chunks
    TEXT_CHUNKING = "text_chunking"  # plain recursive text splitting
    JSON_CHUNKING = "json_chunking"  # structure-aware chunking of JSON documents

17 

18 

class BasePreprocessingConfig(BaseModel):
    """Shared settings inherited by every preprocessing configuration.

    Subclasses may override the chunking defaults for their own strategy.
    """

    # Target size of each produced chunk.
    chunk_size: int = Field(DEFAULT_CHUNK_SIZE, description="Size of document chunks")
    # Amount of content shared between consecutive chunks.
    chunk_overlap: int = Field(DEFAULT_CHUNK_OVERLAP, description="Overlap between chunks")
    # Metadata key under which the originating document id is stored.
    doc_id_column_name: str = Field("_original_doc_id", description="Name of doc_id columns in metadata")

25 

26 

class ContextualConfig(BasePreprocessingConfig):
    """Settings specific to the contextual (LLM-assisted) preprocessing strategy."""

    # LLM used for generating per-chunk context; a fresh LLMConfig by default.
    llm_config: LLMConfig = Field(
        default_factory=LLMConfig, description="LLM configuration to use for context generation"
    )
    # Optional prompt template overriding the built-in context prompt.
    context_template: Optional[str] = Field(None, description="Custom template for context generation")
    # When True, chunks are returned as summarizations instead of raw text.
    summarize: Optional[bool] = Field(False, description="Whether to return chunks as summarizations")

35 

36 

class TextChunkingConfig(BasePreprocessingConfig):
    """Configuration for the text chunking preprocessor.

    Overrides the inherited chunking defaults (1000 / 200) and adds
    validation: ``chunk_size`` must be positive and ``chunk_overlap``
    non-negative.
    """

    # Pydantic v2 configuration (replaces the deprecated inner `class Config`);
    # arbitrary types are required for the Callable field below.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    chunk_size: int = Field(default=1000, description="The target size of each text chunk", gt=0)
    chunk_overlap: int = Field(default=200, description="The number of characters to overlap between chunks", ge=0)
    # Callable used to measure text length; defaults to built-in len.
    length_function: Callable = Field(default=len, description="Function to measure text length")
    # Tried in order: paragraph break, line break, space, then character split.
    separators: List[str] = Field(
        default=["\n\n", "\n", " ", ""],
        description="List of separators to use for splitting text, in order of priority",
    )

50 

51 

class JSONChunkingConfig(BasePreprocessingConfig):
    """Configuration for the JSON chunking preprocessor.

    Controls how JSON documents are flattened, which fields are chunked,
    and which fields are promoted into chunk metadata for filtering.
    """

    # Pydantic v2 configuration (replaces the deprecated inner `class Config`).
    model_config = ConfigDict(arbitrary_types_allowed=True)

    flatten_nested: bool = Field(default=True, description="Whether to flatten nested JSON structures")
    include_metadata: bool = Field(default=True, description="Whether to include original metadata in chunks")
    chunk_by_object: bool = Field(
        default=True, description="Whether to chunk by top-level objects (True) or create a single document (False)"
    )
    exclude_fields: List[str] = Field(default_factory=list, description="List of fields to exclude from chunking")
    include_fields: List[str] = Field(
        default_factory=list,
        description="List of fields to include in chunking (if empty, all fields except excluded ones are included)",
    )
    metadata_fields: List[str] = Field(
        default_factory=list,
        description="List of fields to extract into metadata for filtering "
        "(can include nested fields using dot notation). "
        "If empty, all primitive fields will be extracted (top-level fields if available, otherwise all primitive fields in the flattened structure).",
    )
    extract_all_primitives: bool = Field(
        default=False, description="Whether to extract all primitive values (strings, numbers, booleans) into metadata"
    )
    # Delimiter joining parent/child keys when nested structures are flattened.
    nested_delimiter: str = Field(default=".", description="Delimiter for flattened nested field names")
    content_column: str = Field(default="content", description="Name of the content column for chunk ID generation")

79 

80 

class PreprocessingConfig(BaseModel):
    """Complete preprocessing configuration.

    Selects one strategy via ``type`` and carries the optional per-strategy
    configuration. After validation, the sub-config matching ``type`` is
    guaranteed to be populated (with defaults when none was supplied), so
    consumers never need to null-check it.
    """

    type: PreprocessorType = Field(default=PreprocessorType.TEXT_CHUNKING, description="Type of preprocessing to apply")
    contextual_config: Optional[ContextualConfig] = Field(
        default=None, description="Configuration for contextual preprocessing"
    )
    text_chunking_config: Optional[TextChunkingConfig] = Field(
        default=None, description="Configuration for text chunking preprocessing"
    )
    json_chunking_config: Optional[JSONChunkingConfig] = Field(
        default=None, description="Configuration for JSON chunking preprocessing"
    )

    @model_validator(mode="after")
    def validate_config_presence(self) -> "PreprocessingConfig":
        """Ensure the appropriate config is present for the chosen type."""
        if self.type == PreprocessorType.CONTEXTUAL and not self.contextual_config:
            self.contextual_config = ContextualConfig()
        if self.type == PreprocessorType.TEXT_CHUNKING and not self.text_chunking_config:
            self.text_chunking_config = TextChunkingConfig()
        if self.type == PreprocessorType.JSON_CHUNKING and not self.json_chunking_config:
            # Use the JSONChunkingConfig defined in this module — it is the
            # field's declared type. The previous deferred import from
            # json_chunker shadowed this class under a "circular imports"
            # rationale that does not apply: the class lives in this file.
            self.json_chunking_config = JSONChunkingConfig()
        return self

108 

109 

class Document(BaseModel):
    """A document with default metadata handling.

    After validation, ``metadata`` is always a non-empty dict: an absent or
    empty value is replaced by ``{"source": "default"}``.
    """

    # Identifier may be numeric or string, or absent before insertion.
    id: Optional[Union[int, str]] = Field(default=None, description="Unique identifier for the document")
    content: str = Field(description="The document content")
    embeddings: Optional[List[float]] = Field(default=None, description="Vector embeddings of the content")
    metadata: Optional[Dict[str, Any]] = Field(default=None, description="Additional document metadata")

    @model_validator(mode="after")
    def validate_metadata(self) -> "Document":
        """Fall back to minimal default metadata when none was provided."""
        self.metadata = self.metadata or {"source": "default"}
        return self

124 

125 

class ProcessedChunk(Document):
    """A processed chunk; field layout aligns with the VectorStoreHandler schema."""