Coverage for mindsdb/integrations/utilities/datasets/dataset.py: 0%
33 statements
« prev | ^ index | » next | coverage.py v7.13.1, created at 2026-01-21 00:36 +0000
1from pathlib import Path
3import pandas as pd
# Directory containing this module; dataset CSV paths are built relative to it.
DATASETS_BASE_PATH = Path(__file__).parent

# ML task types that validate_dataset accepts.
SUPPORTED_TASK_TYPES = ("question_answering",)
class DatasetNameMissing(Exception):
    """Raised when no dataset name is supplied (``dataset_name`` is ``None``)."""
class MLTaskTypeMissing(Exception):
    """Raised when no ML task type is supplied (``ml_task_type`` is ``None``)."""
class DatasetNotFound(Exception):
    """Raised when the CSV file for the requested dataset does not exist."""
class UnsupportedMLTaskType(Exception):
    """Raised when the ML task type is not one of the supported task types."""
class MissingColumns(Exception):
    """Raised when a dataframe lacks one or more mandatory columns."""
def validate_dataset(ml_task_type=None, dataset_name=None):
    """Validate the task type / dataset name pair and locate its CSV file.

    The checks run in a fixed order: missing task type, missing dataset
    name, unsupported task type, and finally existence of the CSV file at
    ``DATASETS_BASE_PATH / ml_task_type / "<dataset_name>.csv"``.

    Args:
        ml_task_type: Task type; must be a member of ``SUPPORTED_TASK_TYPES``.
        dataset_name: Dataset file name without the ``.csv`` suffix.

    Returns:
        The path to the existing dataset CSV file.

    Raises:
        MLTaskTypeMissing: If ``ml_task_type`` is ``None``.
        DatasetNameMissing: If ``dataset_name`` is ``None``.
        UnsupportedMLTaskType: If ``ml_task_type`` is not supported.
        DatasetNotFound: If the computed CSV path does not exist.
    """
    if ml_task_type is None:
        message = "ML Task type is missing. Please provide a valid 'ml_task_type'."
        raise MLTaskTypeMissing(message)

    if dataset_name is None:
        message = "Dataset name is missing. Please provide a valid 'dataset_name'."
        raise DatasetNameMissing(message)

    if not (ml_task_type in SUPPORTED_TASK_TYPES):
        message = f"ML Task type '{ml_task_type}' is not supported. Supported types are: {SUPPORTED_TASK_TYPES}"
        raise UnsupportedMLTaskType(message)

    # Datasets are laid out as <base>/<task type>/<dataset name>.csv
    task_dir = DATASETS_BASE_PATH / ml_task_type
    csv_path = task_dir / f"{dataset_name}.csv"

    if csv_path.exists():
        return csv_path

    raise DatasetNotFound(
        f"Dataset '{dataset_name}' for ML Task type '{ml_task_type}' not found '{csv_path}'."
    )
def load_dataset(ml_task_type=None, dataset_name=None):
    """Validate the requested dataset and read its CSV file with pandas.

    Args:
        ml_task_type: Task type, forwarded to ``validate_dataset``.
        dataset_name: Dataset name, forwarded to ``validate_dataset``.

    Returns:
        The result of ``pd.read_csv`` on the validated dataset path.

    Raises:
        Whatever ``validate_dataset`` raises for an invalid request.
    """
    return pd.read_csv(validate_dataset(ml_task_type, dataset_name))
def validate_dataframe(df, mandatory_columns):
    """Check that *df* contains every column listed in *mandatory_columns*.

    Args:
        df: Object whose ``columns`` attribute supports membership tests
            (this module works with pandas DataFrames).
        mandatory_columns: Iterable of column names that must be present.

    Returns:
        The same ``df`` object, unchanged, when all columns are present.

    Raises:
        MissingColumns: If at least one mandatory column is absent. The
            message lists only the columns that are actually missing.
    """
    # Collect the absent columns so the error names the real culprits
    # instead of echoing the entire mandatory list back to the caller.
    missing = [col for col in mandatory_columns if col not in df.columns]

    if missing:
        raise MissingColumns(
            f"Columns {missing} are missing from the dataframe."
        )

    return df