debeir.datasets.utils

 1# TODO: Convert a Parser Return Dict (Dict[int, Dict[str, ...]])
 2
 3from debeir.datasets.types import DatasetTypes, InputExample
 4from debeir.evaluation.cross_validation import CrossValidator
 5from debeir.evaluation.evaluator import Evaluator
 6
 7import datasets
 8
 9
class CrossValidatorDataset:
    """
    Cross Validator Dataset

    Wraps a dataset together with a CrossValidator and turns the fold indices
    produced by that validator into train/val splits.
    """
    cross_val_cls: CrossValidator

    def __init__(self, dataset, cross_validator, n_folds, x_attr='text', y_attr='label'):
        """
        Store the dataset, the cross validator and the fold configuration.

        :param dataset: Data the folds are drawn from
        :param cross_validator: Object providing ``get_fold(idx)`` and ``dataset_type``
        :param n_folds: Number of folds
        :param x_attr: Label or idx of the x value (default 'text')
        :param y_attr: Label or idx of the y value (default 'label')
        """
        self.cross_val_cls = cross_validator
        self.dataset = dataset
        self.fold = 0
        self.n_folds = n_folds
        self.x_attr = x_attr
        self.y_attr = y_attr
        self.folds = []

    @classmethod
    def prepare_cross_validator(cls, data, evaluator: Evaluator,
                                n_splits: int, x_attr, y_attr, seed=42) -> 'CrossValidatorDataset':
        """
        Prepare the cross validator dataset object that will internally produce the folds.

        :param data: Dataset to be used. Should be a list of dicts, or list of [x,y] or a Dataset object
                     from the datasets library
        :param evaluator: Evaluator to use for checking results
        :param n_splits: Number of cross validation splits, k-fold (stratified)
        :param seed: Seed to use (default 42)
        :param y_attr: Label, or idx of the y label
        :param x_attr: Label or idx of the x label (not directly used)
        :return: A new instance wrapping ``data`` and a freshly built CrossValidator
        """

        return cls(data, CrossValidator(evaluator, data, x_attr, y_attr,
                                        n_splits=n_splits, seed=seed),
                   x_attr=x_attr, y_attr=y_attr,
                   n_folds=n_splits)

    def get_fold(self, idx) -> datasets.DatasetDict:
        """
        Get the fold and return a datasets.DatasetDict object of the form
        DatasetDict{'train': ..., 'val': ...}

        :param idx: Index of the fold, passed through to the cross validator
        :return: DatasetDict holding the 'train' and 'val' splits of the fold
        """

        train_idxs, val_idxs = self.cross_val_cls.get_fold(idx)
        # Only the `datasets` module is imported, so the class must be qualified.
        dataset_dict = datasets.DatasetDict()
        dataset_type = self.cross_val_cls.dataset_type

        if dataset_type in (DatasetTypes.List, DatasetTypes.ListDict):
            # TODO: figure out how to make this into a huggingface dataset object generically
            # Until then the raw row lists are returned so the fold is not silently dropped.
            dataset_dict['train'] = [self.dataset[i] for i in train_idxs]
            dataset_dict['val'] = [self.dataset[i] for i in val_idxs]
        elif dataset_type == DatasetTypes.ListInputExample:
            train_subset = InputExample.to_dict([self.dataset[i] for i in train_idxs])
            val_subset = InputExample.to_dict([self.dataset[i] for i in val_idxs])

            dataset_dict['train'] = datasets.Dataset.from_dict(train_subset)
            dataset_dict['val'] = datasets.Dataset.from_dict(val_subset)
        elif dataset_type == DatasetTypes.HuggingfaceDataset:
            # select() already returns a Dataset; from_dict() expects a column mapping.
            dataset_dict['train'] = self.dataset.select(train_idxs)
            dataset_dict['val'] = self.dataset.select(val_idxs)

        return dataset_dict
class CrossValidatorDataset:
class CrossValidatorDataset:
    """
    Cross Validator Dataset

    Wraps a dataset together with a CrossValidator and turns the fold indices
    produced by that validator into train/val splits.
    """
    cross_val_cls: CrossValidator

    def __init__(self, dataset, cross_validator, n_folds, x_attr='text', y_attr='label'):
        """
        Store the dataset, the cross validator and the fold configuration.

        :param dataset: Data the folds are drawn from
        :param cross_validator: Object providing ``get_fold(idx)`` and ``dataset_type``
        :param n_folds: Number of folds
        :param x_attr: Label or idx of the x value (default 'text')
        :param y_attr: Label or idx of the y value (default 'label')
        """
        self.cross_val_cls = cross_validator
        self.dataset = dataset
        self.fold = 0
        self.n_folds = n_folds
        self.x_attr = x_attr
        self.y_attr = y_attr
        self.folds = []

    @classmethod
    def prepare_cross_validator(cls, data, evaluator: Evaluator,
                                n_splits: int, x_attr, y_attr, seed=42) -> 'CrossValidatorDataset':
        """
        Prepare the cross validator dataset object that will internally produce the folds.

        :param data: Dataset to be used. Should be a list of dicts, or list of [x,y] or a Dataset object
                     from the datasets library
        :param evaluator: Evaluator to use for checking results
        :param n_splits: Number of cross validation splits, k-fold (stratified)
        :param seed: Seed to use (default 42)
        :param y_attr: Label, or idx of the y label
        :param x_attr: Label or idx of the x label (not directly used)
        :return: A new instance wrapping ``data`` and a freshly built CrossValidator
        """

        return cls(data, CrossValidator(evaluator, data, x_attr, y_attr,
                                        n_splits=n_splits, seed=seed),
                   x_attr=x_attr, y_attr=y_attr,
                   n_folds=n_splits)

    def get_fold(self, idx) -> datasets.DatasetDict:
        """
        Get the fold and return a datasets.DatasetDict object of the form
        DatasetDict{'train': ..., 'val': ...}

        :param idx: Index of the fold, passed through to the cross validator
        :return: DatasetDict holding the 'train' and 'val' splits of the fold
        """

        train_idxs, val_idxs = self.cross_val_cls.get_fold(idx)
        # Only the `datasets` module is imported, so the class must be qualified.
        dataset_dict = datasets.DatasetDict()
        dataset_type = self.cross_val_cls.dataset_type

        if dataset_type in (DatasetTypes.List, DatasetTypes.ListDict):
            # TODO: figure out how to make this into a huggingface dataset object generically
            # Until then the raw row lists are returned so the fold is not silently dropped.
            dataset_dict['train'] = [self.dataset[i] for i in train_idxs]
            dataset_dict['val'] = [self.dataset[i] for i in val_idxs]
        elif dataset_type == DatasetTypes.ListInputExample:
            train_subset = InputExample.to_dict([self.dataset[i] for i in train_idxs])
            val_subset = InputExample.to_dict([self.dataset[i] for i in val_idxs])

            dataset_dict['train'] = datasets.Dataset.from_dict(train_subset)
            dataset_dict['val'] = datasets.Dataset.from_dict(val_subset)
        elif dataset_type == DatasetTypes.HuggingfaceDataset:
            # select() already returns a Dataset; from_dict() expects a column mapping.
            dataset_dict['train'] = self.dataset.select(train_idxs)
            dataset_dict['val'] = self.dataset.select(val_idxs)

        return dataset_dict

Cross Validator Dataset

CrossValidatorDataset(dataset, cross_validator, n_folds, x_attr='text', y_attr='label')
17    def __init__(self, dataset, cross_validator, n_folds, x_attr='text', y_attr='label'):
18        self.cross_val_cls = cross_validator
19        self.dataset = dataset
20        self.fold = 0
21        self.n_folds = n_folds
22        self.x_attr = x_attr
23        self.y_attr = y_attr
24        self.folds = []
@classmethod
def prepare_cross_validator( cls, data, evaluator: debeir.evaluation.evaluator.Evaluator, n_splits: int, x_attr, y_attr, seed=42) -> debeir.datasets.utils.CrossValidatorDataset:
26    @classmethod
27    def prepare_cross_validator(cls, data, evaluator: Evaluator,
28                                n_splits: int, x_attr, y_attr, seed=42) -> 'CrossValidatorDataset':
29        """
30        Prepare the cross validator dataset object that will internally produce the folds.
31
32        :param data: Dataset to be used. Should be a list of dicts, or list of [x,y] or a Dataset object from data_sets
33        :param evaluator: Evaluator to use for checking results
34        :param n_splits: Number of cross validation splits, k-fold (stratified)
35        :param seed: Seed to use (default 42)
36        :param y_attr: Label, or idx of the y label
37        :param x_attr: Label or idx of the x label (not directly used)
38        """
39
40        return cls(data, CrossValidator(evaluator, data, x_attr, y_attr,
41                                        n_splits=n_splits, seed=seed),
42                   x_attr=x_attr, y_attr=y_attr,
43                   n_folds=n_splits)

Prepare the cross validator dataset object that will internally produce the folds.

Parameters
  • data: Dataset to be used. Should be a list of dicts, a list of [x, y] pairs, or a Dataset object from the datasets library
  • evaluator: Evaluator to use for checking results
  • n_splits: Number of cross validation splits, k-fold (stratified)
  • seed: Seed to use (default 42)
  • y_attr: Label, or idx of the y label
  • x_attr: Label or idx of the x label (not directly used)
def get_fold(self, idx) -> datasets.dataset_dict.DatasetDict:
45    def get_fold(self, idx) -> datasets.DatasetDict:
46        """
47
48        Get the fold and returns a dataset.DataDict object with
49        DataDict{'train': ..., 'val': ...}
50
51        :param idx:
52        """
53
54        train_idxs, val_idxs = self.cross_val_cls.get_fold(idx)
55        dataset_dict = DatasetDict()
56
57        if self.cross_val_cls.dataset_type in [DatasetTypes.List, DatasetTypes.ListDict]:
58            # TODO: figure out how to make this into a huggingface dataset object generically
59            train_subset = [self.dataset[i] for i in train_idxs]
60            val_subset = [self.dataset[i] for i in val_idxs]
61        elif self.cross_val_cls.dataset_type == DatasetTypes.ListInputExample:
62            train_subset = InputExample.to_dict([self.dataset[i] for i in train_idxs])
63            val_subset = InputExample.to_dict([self.dataset[i] for i in val_idxs])
64
65            dataset_dict['train'] = datasets.Dataset.from_dict(train_subset)
66            dataset_dict['val'] = datasets.Dataset.from_dict(val_subset)
67
68        elif self.cross_val_cls.dataset_type == DatasetTypes.HuggingfaceDataset:
69            train_subset = self.dataset.select(train_idxs)
70            val_subset = self.dataset.select(val_idxs)
71
72            dataset_dict['train'] = datasets.Dataset.from_dict(train_subset)
73            dataset_dict['val'] = datasets.Dataset.from_dict(val_subset)
74
75        return dataset_dict

Get the fold and return a datasets.DatasetDict object of the form DatasetDict{'train': ..., 'val': ...}

Parameters
  • idx: Index of the fold to retrieve