debeir.datasets.utils
# TODO: Convert a Parser Return Dict (Dict[int, Dict[str, ...])

from debeir.datasets.types import DatasetTypes, InputExample
from debeir.evaluation.cross_validation import CrossValidator
from debeir.evaluation.evaluator import Evaluator

import datasets


class CrossValidatorDataset:
    """
    Cross Validator Dataset

    Pairs a dataset with a CrossValidator and returns individual folds as
    ``datasets.DatasetDict`` objects holding a 'train' and a 'val' split.
    """
    cross_val_cls: CrossValidator

    def __init__(self, dataset, cross_validator, n_folds, x_attr='text', y_attr='label'):
        """
        Store the dataset, the cross validator and the fold configuration.

        :param dataset: Data the folds are drawn from
        :param cross_validator: CrossValidator instance that supplies the fold indices
        :param n_folds: Number of folds
        :param x_attr: Label or idx of the x value
        :param y_attr: Label or idx of the y label
        """
        self.cross_val_cls = cross_validator
        self.dataset = dataset
        self.fold = 0
        self.n_folds = n_folds
        self.x_attr = x_attr
        self.y_attr = y_attr
        self.folds = []

    @classmethod
    def prepare_cross_validator(cls, data, evaluator: Evaluator,
                                n_splits: int, x_attr, y_attr, seed=42) -> 'CrossValidatorDataset':
        """
        Prepare the cross validator dataset object that will internally produce the folds.

        :param data: Dataset to be used. Should be a list of dicts, a list of [x, y] or a
                     Dataset object from the ``datasets`` library
        :param evaluator: Evaluator to use for checking results
        :param n_splits: Number of cross validation splits, k-fold (stratified)
        :param seed: Seed to use (default 42)
        :param y_attr: Label, or idx of the y label
        :param x_attr: Label or idx of the x label (not directly used)
        :return: A CrossValidatorDataset wrapping ``data`` and a freshly built CrossValidator
        """

        return cls(data, CrossValidator(evaluator, data, x_attr, y_attr,
                                        n_splits=n_splits, seed=seed),
                   x_attr=x_attr, y_attr=y_attr,
                   n_folds=n_splits)

    def get_fold(self, idx) -> datasets.DatasetDict:
        """
        Get a fold and return it as a ``datasets.DatasetDict`` of the form
        ``DatasetDict{'train': ..., 'val': ...}``.

        :param idx: Index of the fold, passed through to the cross validator's ``get_fold``
        :return: DatasetDict with 'train' and 'val' entries. For plain list and
                 list-of-dict datasets the result is currently left empty (see TODO below).
        """

        train_idxs, val_idxs = self.cross_val_cls.get_fold(idx)
        # Only the ``datasets`` package is imported, so the class has to be
        # reached through it; a bare ``DatasetDict()`` raises NameError.
        dataset_dict = datasets.DatasetDict()
        dataset_type = self.cross_val_cls.dataset_type

        if dataset_type in [DatasetTypes.List, DatasetTypes.ListDict]:
            # TODO: figure out how to make this into a huggingface dataset object generically
            # NOTE: the subsets are built but not yet stored in dataset_dict.
            train_subset = [self.dataset[i] for i in train_idxs]
            val_subset = [self.dataset[i] for i in val_idxs]
        elif dataset_type == DatasetTypes.ListInputExample:
            train_subset = InputExample.to_dict([self.dataset[i] for i in train_idxs])
            val_subset = InputExample.to_dict([self.dataset[i] for i in val_idxs])

            dataset_dict['train'] = datasets.Dataset.from_dict(train_subset)
            dataset_dict['val'] = datasets.Dataset.from_dict(val_subset)
        elif dataset_type == DatasetTypes.HuggingfaceDataset:
            # select() already returns a Dataset; from_dict expects a column
            # mapping, so the selections are stored directly.
            dataset_dict['train'] = self.dataset.select(train_idxs)
            dataset_dict['val'] = self.dataset.select(val_idxs)

        return dataset_dict
class
CrossValidatorDataset:
class CrossValidatorDataset:
    """
    Cross Validator Dataset

    Pairs a dataset with a CrossValidator and returns individual folds as
    ``datasets.DatasetDict`` objects holding a 'train' and a 'val' split.
    """
    cross_val_cls: CrossValidator

    def __init__(self, dataset, cross_validator, n_folds, x_attr='text', y_attr='label'):
        """
        Store the dataset, the cross validator and the fold configuration.

        :param dataset: Data the folds are drawn from
        :param cross_validator: CrossValidator instance that supplies the fold indices
        :param n_folds: Number of folds
        :param x_attr: Label or idx of the x value
        :param y_attr: Label or idx of the y label
        """
        self.cross_val_cls = cross_validator
        self.dataset = dataset
        self.fold = 0
        self.n_folds = n_folds
        self.x_attr = x_attr
        self.y_attr = y_attr
        self.folds = []

    @classmethod
    def prepare_cross_validator(cls, data, evaluator: Evaluator,
                                n_splits: int, x_attr, y_attr, seed=42) -> 'CrossValidatorDataset':
        """
        Prepare the cross validator dataset object that will internally produce the folds.

        :param data: Dataset to be used. Should be a list of dicts, a list of [x, y] or a
                     Dataset object from the ``datasets`` library
        :param evaluator: Evaluator to use for checking results
        :param n_splits: Number of cross validation splits, k-fold (stratified)
        :param seed: Seed to use (default 42)
        :param y_attr: Label, or idx of the y label
        :param x_attr: Label or idx of the x label (not directly used)
        :return: A CrossValidatorDataset wrapping ``data`` and a freshly built CrossValidator
        """

        return cls(data, CrossValidator(evaluator, data, x_attr, y_attr,
                                        n_splits=n_splits, seed=seed),
                   x_attr=x_attr, y_attr=y_attr,
                   n_folds=n_splits)

    def get_fold(self, idx) -> datasets.DatasetDict:
        """
        Get a fold and return it as a ``datasets.DatasetDict`` of the form
        ``DatasetDict{'train': ..., 'val': ...}``.

        :param idx: Index of the fold, passed through to the cross validator's ``get_fold``
        :return: DatasetDict with 'train' and 'val' entries. For plain list and
                 list-of-dict datasets the result is currently left empty (see TODO below).
        """

        train_idxs, val_idxs = self.cross_val_cls.get_fold(idx)
        # Only the ``datasets`` package is imported, so the class has to be
        # reached through it; a bare ``DatasetDict()`` raises NameError.
        dataset_dict = datasets.DatasetDict()
        dataset_type = self.cross_val_cls.dataset_type

        if dataset_type in [DatasetTypes.List, DatasetTypes.ListDict]:
            # TODO: figure out how to make this into a huggingface dataset object generically
            # NOTE: the subsets are built but not yet stored in dataset_dict.
            train_subset = [self.dataset[i] for i in train_idxs]
            val_subset = [self.dataset[i] for i in val_idxs]
        elif dataset_type == DatasetTypes.ListInputExample:
            train_subset = InputExample.to_dict([self.dataset[i] for i in train_idxs])
            val_subset = InputExample.to_dict([self.dataset[i] for i in val_idxs])

            dataset_dict['train'] = datasets.Dataset.from_dict(train_subset)
            dataset_dict['val'] = datasets.Dataset.from_dict(val_subset)
        elif dataset_type == DatasetTypes.HuggingfaceDataset:
            # select() already returns a Dataset; from_dict expects a column
            # mapping, so the selections are stored directly.
            dataset_dict['train'] = self.dataset.select(train_idxs)
            dataset_dict['val'] = self.dataset.select(val_idxs)

        return dataset_dict
Cross Validator Dataset
@classmethod
def
prepare_cross_validator( cls, data, evaluator: debeir.evaluation.evaluator.Evaluator, n_splits: int, x_attr, y_attr, seed=42) -> debeir.datasets.utils.CrossValidatorDataset:
@classmethod
def prepare_cross_validator(cls, data, evaluator: Evaluator,
                            n_splits: int, x_attr, y_attr, seed=42) -> 'CrossValidatorDataset':
    """
    Build a CrossValidatorDataset that will internally produce the folds.

    A CrossValidator is created from the arguments and handed, together
    with the data, to the class constructor.

    :param data: Dataset to be used. Should be a list of dicts, or list of [x,y] or a
                 Dataset object from the ``datasets`` library
    :param evaluator: Evaluator to use for checking results
    :param n_splits: Number of cross validation splits, k-fold (stratified)
    :param x_attr: Label or idx of the x label (not directly used)
    :param y_attr: Label, or idx of the y label
    :param seed: Seed to use (default 42)
    :return: The newly constructed CrossValidatorDataset
    """
    # Construct the splitter first so the constructor call below stays short.
    fold_splitter = CrossValidator(
        evaluator, data, x_attr, y_attr,
        n_splits=n_splits, seed=seed,
    )

    return cls(
        data,
        fold_splitter,
        n_folds=n_splits,
        x_attr=x_attr,
        y_attr=y_attr,
    )
Prepare the cross validator dataset object that will internally produce the folds.
Parameters
- data: Dataset to be used. Should be a list of dicts, a list of [x, y] pairs, or a Dataset object from the Hugging Face `datasets` library
- evaluator: Evaluator to use for checking results
- n_splits: Number of cross validation splits, k-fold (stratified)
- seed: Seed to use (default 42)
- y_attr: Label, or idx of the y label
- x_attr: Label or idx of the x label (not directly used)
def
get_fold(self, idx) -> datasets.dataset_dict.DatasetDict:
def get_fold(self, idx) -> datasets.DatasetDict:
    """
    Get a fold and return it as a ``datasets.DatasetDict`` of the form
    ``DatasetDict{'train': ..., 'val': ...}``.

    :param idx: Index of the fold, passed through to the cross validator's ``get_fold``
    :return: DatasetDict with 'train' and 'val' entries. For plain list and
             list-of-dict datasets the result is currently left empty (see TODO below).
    """

    train_idxs, val_idxs = self.cross_val_cls.get_fold(idx)
    # Only the ``datasets`` package is imported, so the class has to be
    # reached through it; a bare ``DatasetDict()`` raises NameError.
    dataset_dict = datasets.DatasetDict()
    dataset_type = self.cross_val_cls.dataset_type

    if dataset_type in [DatasetTypes.List, DatasetTypes.ListDict]:
        # TODO: figure out how to make this into a huggingface dataset object generically
        # NOTE: the subsets are built but not yet stored in dataset_dict.
        train_subset = [self.dataset[i] for i in train_idxs]
        val_subset = [self.dataset[i] for i in val_idxs]
    elif dataset_type == DatasetTypes.ListInputExample:
        train_subset = InputExample.to_dict([self.dataset[i] for i in train_idxs])
        val_subset = InputExample.to_dict([self.dataset[i] for i in val_idxs])

        dataset_dict['train'] = datasets.Dataset.from_dict(train_subset)
        dataset_dict['val'] = datasets.Dataset.from_dict(val_subset)
    elif dataset_type == DatasetTypes.HuggingfaceDataset:
        # select() already returns a Dataset; from_dict expects a column
        # mapping, so the selections are stored directly.
        dataset_dict['train'] = self.dataset.select(train_idxs)
        dataset_dict['val'] = self.dataset.select(val_idxs)

    return dataset_dict
Gets the fold and returns a datasets.DatasetDict object of the form DatasetDict{'train': ..., 'val': ...}
Parameters
- idx: Index of the fold to retrieve; it is passed to the cross validator's get_fold