Coverage for mindsdb / interfaces / knowledge_base / preprocessing / models.py: 92%
63 statements
« prev ^ index » next — coverage.py v7.13.1, created at 2026-01-21 00:36 +0000
1from enum import Enum
3from typing import List, Dict, Any, Optional, Union, Callable
6from pydantic import BaseModel, Field, model_validator
9from mindsdb.integrations.utilities.rag.settings import DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE
10from mindsdb.integrations.utilities.rag.settings import LLMConfig
class PreprocessorType(Enum):
    """Kinds of preprocessing that can be applied during knowledge-base ingestion.

    The string values are the identifiers used in external configuration
    (see ``PreprocessingConfig.type``).
    """

    CONTEXTUAL = "contextual"        # LLM-assisted contextual chunking
    TEXT_CHUNKING = "text_chunking"  # plain recursive text splitting (default)
    JSON_CHUNKING = "json_chunking"  # structure-aware chunking of JSON documents
class BasePreprocessingConfig(BaseModel):
    """Base configuration for preprocessing"""

    # Target chunk size; module-level default shared with the RAG settings.
    chunk_size: int = Field(default=DEFAULT_CHUNK_SIZE, description="Size of document chunks")
    # How much adjacent chunks overlap each other.
    chunk_overlap: int = Field(default=DEFAULT_CHUNK_OVERLAP, description="Overlap between chunks")
    # Metadata key under which the original document id is preserved.
    doc_id_column_name: str = Field(default="_original_doc_id", description="Name of doc_id columns in metadata")
class ContextualConfig(BasePreprocessingConfig):
    """Configuration specific to contextual preprocessing"""

    # LLM used to generate per-chunk context; a fresh default config per instance.
    llm_config: LLMConfig = Field(
        default_factory=LLMConfig, description="LLM configuration to use for context generation"
    )
    # Optional prompt template overriding the built-in one.
    context_template: Optional[str] = Field(default=None, description="Custom template for context generation")
    # When true, chunks are returned as summaries instead of raw text.
    summarize: Optional[bool] = Field(default=False, description="Whether to return chunks as summarizations")
class TextChunkingConfig(BasePreprocessingConfig):
    """Configuration for text chunking preprocessor using Pydantic"""

    # Overrides the base defaults and adds validation constraints (gt/ge).
    chunk_size: int = Field(default=1000, description="The target size of each text chunk", gt=0)
    chunk_overlap: int = Field(default=200, description="The number of characters to overlap between chunks", ge=0)
    # Callable used to measure chunk length; defaults to character count.
    length_function: Callable = Field(default=len, description="Function to measure text length")
    # Tried in order by the splitter: paragraphs first, then lines, words, characters.
    separators: List[str] = Field(
        default=["\n\n", "\n", " ", ""],
        description="List of separators to use for splitting text, in order of priority",
    )

    class Config:
        # Needed because `length_function` (a bare Callable) is not a type
        # pydantic can validate natively.
        arbitrary_types_allowed = True
class JSONChunkingConfig(BasePreprocessingConfig):
    """Configuration for JSON chunking preprocessor"""

    # --- structural options -------------------------------------------------
    flatten_nested: bool = Field(default=True, description="Whether to flatten nested JSON structures")
    include_metadata: bool = Field(default=True, description="Whether to include original metadata in chunks")
    chunk_by_object: bool = Field(
        default=True, description="Whether to chunk by top-level objects (True) or create a single document (False)"
    )

    # --- field selection ----------------------------------------------------
    exclude_fields: List[str] = Field(default_factory=list, description="List of fields to exclude from chunking")
    include_fields: List[str] = Field(
        default_factory=list,
        description="List of fields to include in chunking (if empty, all fields except excluded ones are included)",
    )
    metadata_fields: List[str] = Field(
        default_factory=list,
        description="List of fields to extract into metadata for filtering "
        "(can include nested fields using dot notation). "
        "If empty, all primitive fields will be extracted (top-level fields if available, otherwise all primitive fields in the flattened structure).",
    )
    extract_all_primitives: bool = Field(
        default=False, description="Whether to extract all primitive values (strings, numbers, booleans) into metadata"
    )

    # --- naming -------------------------------------------------------------
    nested_delimiter: str = Field(default=".", description="Delimiter for flattened nested field names")
    content_column: str = Field(default="content", description="Name of the content column for chunk ID generation")

    class Config:
        arbitrary_types_allowed = True
class PreprocessingConfig(BaseModel):
    """Complete preprocessing configuration"""

    # Which preprocessor to run; text chunking is the default strategy.
    type: PreprocessorType = Field(default=PreprocessorType.TEXT_CHUNKING, description="Type of preprocessing to apply")
    contextual_config: Optional[ContextualConfig] = Field(
        default=None, description="Configuration for contextual preprocessing"
    )
    text_chunking_config: Optional[TextChunkingConfig] = Field(
        default=None, description="Configuration for text chunking preprocessing"
    )
    json_chunking_config: Optional[JSONChunkingConfig] = Field(
        default=None, description="Configuration for JSON chunking preprocessing"
    )

    @model_validator(mode="after")
    def validate_config_presence(self) -> "PreprocessingConfig":
        """Ensure the appropriate config is present for the chosen type"""
        # `type` is a single enum value, so at most one branch applies;
        # each branch backfills a default config when none was supplied.
        if self.type == PreprocessorType.CONTEXTUAL and not self.contextual_config:
            self.contextual_config = ContextualConfig()
        elif self.type == PreprocessorType.TEXT_CHUNKING and not self.text_chunking_config:
            self.text_chunking_config = TextChunkingConfig()
        elif self.type == PreprocessorType.JSON_CHUNKING and not self.json_chunking_config:
            # Import here to avoid circular imports.
            # NOTE(review): this shadows the JSONChunkingConfig defined in this
            # module — presumably json_chunker's variant is the richer one used
            # at runtime; confirm the two classes are compatible.
            from mindsdb.interfaces.knowledge_base.preprocessing.json_chunker import JSONChunkingConfig

            self.json_chunking_config = JSONChunkingConfig()
        return self
class Document(BaseModel):
    """Document model with default metadata handling"""

    # Identifier may be numeric or string-valued; None until assigned.
    id: Optional[Union[int, str]] = Field(default=None, description="Unique identifier for the document")
    content: str = Field(description="The document content")
    embeddings: Optional[List[float]] = Field(default=None, description="Vector embeddings of the content")
    metadata: Optional[Dict[str, Any]] = Field(default=None, description="Additional document metadata")

    @model_validator(mode="after")
    def validate_metadata(self) -> "Document":
        """Ensure metadata is present and valid"""
        # A missing or empty mapping is replaced with a minimal default.
        self.metadata = self.metadata or {"source": "default"}
        return self
class ProcessedChunk(Document):
    """Processed chunk that aligns with VectorStoreHandler schema"""

    # Intentionally adds nothing: the Document fields (id, content,
    # embeddings, metadata) already match the vector-store row shape.
    pass