Coverage for mindsdb / integrations / utilities / datasets / dataset.py: 0%

33 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-21 00:36 +0000

1from pathlib import Path 

2 

3import pandas as pd 

4 

# Directory that contains this module; dataset files are resolved relative to
# it as ``<base>/<ml_task_type>/<dataset_name>.csv``.
DATASETS_BASE_PATH = Path(__file__).parent

# ML task types accepted by ``validate_dataset``.
SUPPORTED_TASK_TYPES = ("question_answering",)

8 

9 

class DatasetNameMissing(Exception):
    """Raised by ``validate_dataset`` when ``dataset_name`` is ``None``."""

12 

13 

class MLTaskTypeMissing(Exception):
    """Raised by ``validate_dataset`` when ``ml_task_type`` is ``None``."""

16 

17 

class DatasetNotFound(Exception):
    """Raised by ``validate_dataset`` when the resolved dataset path is absent."""

20 

21 

class UnsupportedMLTaskType(Exception):
    """Raised by ``validate_dataset`` for a task type not in ``SUPPORTED_TASK_TYPES``."""

24 

25 

class MissingColumns(Exception):
    """Raised by ``validate_dataframe`` when mandatory columns are absent."""

28 

29 

def validate_dataset(ml_task_type=None, dataset_name=None):
    """Validate the arguments and return the path of the matching CSV dataset.

    The dataset is looked up at
    ``DATASETS_BASE_PATH / ml_task_type / f"{dataset_name}.csv"``.

    Args:
        ml_task_type: Task type; must be one of ``SUPPORTED_TASK_TYPES``.
        dataset_name: Dataset file name without the ``.csv`` suffix.

    Returns:
        The ``pathlib.Path`` of the dataset file.

    Raises:
        MLTaskTypeMissing: If ``ml_task_type`` is ``None``.
        DatasetNameMissing: If ``dataset_name`` is ``None``.
        UnsupportedMLTaskType: If ``ml_task_type`` is not in
            ``SUPPORTED_TASK_TYPES``.
        DatasetNotFound: If no regular file exists at the resolved path.
    """
    if ml_task_type is None:
        raise MLTaskTypeMissing(
            "ML Task type is missing. Please provide a valid 'ml_task_type'."
        )

    if dataset_name is None:
        raise DatasetNameMissing(
            "Dataset name is missing. Please provide a valid 'dataset_name'."
        )

    if ml_task_type not in SUPPORTED_TASK_TYPES:
        raise UnsupportedMLTaskType(
            f"ML Task type '{ml_task_type}' is not supported. Supported types are: {SUPPORTED_TASK_TYPES}"
        )

    dataset_path = DATASETS_BASE_PATH / ml_task_type / f"{dataset_name}.csv"

    # is_file() rather than exists(): a directory that happens to be named
    # "<dataset_name>.csv" is not a loadable dataset and should be reported
    # as not found instead of failing later when the file is read.
    if not dataset_path.is_file():
        raise DatasetNotFound(
            f"Dataset '{dataset_name}' for ML Task type '{ml_task_type}' not found '{dataset_path}'."
        )

    return dataset_path

55 

56 

def load_dataset(ml_task_type=None, dataset_name=None):
    """Load a dataset CSV into a pandas DataFrame.

    The arguments are first checked by ``validate_dataset``; any exception it
    raises propagates unchanged. The path it returns is read with
    ``pd.read_csv`` using default options.

    Args:
        ml_task_type: Task type forwarded to ``validate_dataset``.
        dataset_name: Dataset name forwarded to ``validate_dataset``.

    Returns:
        The result of ``pd.read_csv`` on the validated dataset path.
    """
    dataset_path = validate_dataset(ml_task_type, dataset_name)
    return pd.read_csv(dataset_path)

62 

63 

def validate_dataframe(df, mandatory_columns):
    """Check that ``df`` contains every mandatory column.

    Args:
        df: Object exposing a ``columns`` attribute that supports ``in``
            (e.g. a pandas DataFrame).
        mandatory_columns: Iterable of column names that must be present.

    Returns:
        ``df`` itself, unchanged, when all mandatory columns are present.

    Raises:
        MissingColumns: If at least one mandatory column is absent; the
            message lists only the columns that are actually missing.
    """
    present = df.columns
    # Collect the absent names so the error points at the real problem
    # instead of echoing the whole list of required columns.
    missing = [col for col in mandatory_columns if col not in present]

    if missing:
        raise MissingColumns(
            f"Columns {missing} are missing from the dataframe."
        )

    return df