debeir.datasets.types
import string
from collections import defaultdict
from enum import Enum
from typing import List, Union


class InputExample:
    """
    Copied from Sentence Transformer Library
    Structure for one input example with texts, the label and a unique id
    """

    def __init__(self, guid: str = '', texts: List[str] = None, label: Union[int, float] = 0):
        """
        Creates one InputExample with the given texts, guid and label

        :param guid
            id for the example
        :param texts
            the texts for the example. Note, str.strip() is called on the texts.
            ``None`` (the default) is treated as "no texts" and stored as an empty list.
        :param label
            the label for the example
        """
        self.guid = guid
        # Iterating over the default ``None`` would raise TypeError, so fall
        # back to an empty list when no texts are supplied.
        self.texts = [text.strip() for text in texts] if texts is not None else []
        self.label = label

    def __str__(self):
        return "<InputExample> label: {}, texts: {}".format(str(self.label), "; ".join(self.texts))

    def get_label(self):
        """
        :return:
            The label stored on this example. Subclasses may override this to
            transform the label; to_dict() reads labels through this method.
        """
        return self.label

    # def __getattr__(self, key):
    #    if key == "label":
    #        return self.get_label()

    #    if key == "texts":
    #        return self.texts

    #    if key in ["guid", "id"]:
    #        return self.guid

    #    raise KeyError()

    @classmethod
    def to_dict(cls, data: List['InputExample']):
        """
        Converts a list of examples into a column-oriented mapping.

        The result has an "id" column (guids), a "label" column (values of
        get_label()) and one "text_<letter>" column per text position
        ("text_a", "text_b", ...). The number of text columns is taken from
        the first example; extra texts on later examples are ignored.

        :param data
            the examples to convert. An empty list yields an empty mapping.
        :return:
            A defaultdict mapping column names to lists of values
        :raises IndexError:
            if the first example has more texts than there are lowercase ascii
            letters, or a later example has fewer texts than the first one
        """
        processed_data = defaultdict(list)

        # Nothing to convert; indexing data[0] below would raise IndexError.
        if not data:
            return processed_data

        text_len = len(data[0].texts)
        # Column names only depend on the position, so build them once.
        text_keys = [f"text_{string.ascii_lowercase[i]}" for i in range(text_len)]

        for datum in data:
            processed_data["id"].append(datum.guid)
            processed_data["label"].append(datum.get_label())

            for i, key in enumerate(text_keys):
                processed_data[key].append(datum.texts[i])

        return processed_data

    @classmethod
    def from_parser_output(cls, data):
        """
        Placeholder: currently does nothing and returns None.
        """
        pass


class RelevanceExample(InputExample):
    """
    Converts Relevance Labels to 0 - 1
    """

    def __init__(self, max_score=2, *args, **kwargs):
        """
        Creates one RelevanceExample.

        :param max_score
            the value the label is divided by in relevance(). Note that this is
            the first positional parameter, so guid/texts/label are best passed
            as keyword arguments.
        :param args, kwargs
            forwarded unchanged to InputExample.__init__
        """
        super().__init__(*args, **kwargs)
        self.max_score = max_score

    def get_label(self):
        """
        :return:
            The normalised label, see relevance()
        """
        return self.relevance()

    def relevance(self):
        """
        :return:
            Returns the label divided by max_score. This lies between 0 and 1
            provided the label lies between 0 and max_score.
        """
        return self.label / self.max_score


class DatasetTypes(Enum):
    """
    A collection of common dataset types that is usable in the library.
    """
    # Members must be assigned with "="; a bare annotation ("List: ...")
    # declares nothing and would leave the enum without any members.
    List = "List"
    ListInputExample = "ListInputExample"
    ListDict = "ListDict"
    HuggingfaceDataset = "HuggingfaceDataset"
class
InputExample:
class InputExample:
    """
    Copied from Sentence Transformer Library
    Structure for one input example with texts, the label and a unique id
    """

    def __init__(self, guid: str = '', texts: List[str] = None, label: Union[int, float] = 0):
        """
        Creates one InputExample with the given texts, guid and label

        :param guid
            id for the example
        :param texts
            the texts for the example. Note, str.strip() is called on the texts.
            ``None`` (the default) is treated as "no texts" and stored as an empty list.
        :param label
            the label for the example
        """
        self.guid = guid
        # Iterating over the default ``None`` would raise TypeError, so fall
        # back to an empty list when no texts are supplied.
        self.texts = [text.strip() for text in texts] if texts is not None else []
        self.label = label

    def __str__(self):
        return "<InputExample> label: {}, texts: {}".format(str(self.label), "; ".join(self.texts))

    def get_label(self):
        """
        :return:
            The label stored on this example. Subclasses may override this to
            transform the label; to_dict() reads labels through this method.
        """
        return self.label

    # def __getattr__(self, key):
    #    if key == "label":
    #        return self.get_label()

    #    if key == "texts":
    #        return self.texts

    #    if key in ["guid", "id"]:
    #        return self.guid

    #    raise KeyError()

    @classmethod
    def to_dict(cls, data: List['InputExample']):
        """
        Converts a list of examples into a column-oriented mapping.

        The result has an "id" column (guids), a "label" column (values of
        get_label()) and one "text_<letter>" column per text position
        ("text_a", "text_b", ...). The number of text columns is taken from
        the first example; extra texts on later examples are ignored.

        :param data
            the examples to convert. An empty list yields an empty mapping.
        :return:
            A defaultdict mapping column names to lists of values
        :raises IndexError:
            if the first example has more texts than there are lowercase ascii
            letters, or a later example has fewer texts than the first one
        """
        processed_data = defaultdict(list)

        # Nothing to convert; indexing data[0] below would raise IndexError.
        if not data:
            return processed_data

        text_len = len(data[0].texts)
        # Column names only depend on the position, so build them once.
        text_keys = [f"text_{string.ascii_lowercase[i]}" for i in range(text_len)]

        for datum in data:
            processed_data["id"].append(datum.guid)
            processed_data["label"].append(datum.get_label())

            for i, key in enumerate(text_keys):
                processed_data[key].append(datum.texts[i])

        return processed_data

    @classmethod
    def from_parser_output(cls, data):
        """
        Placeholder: currently does nothing and returns None.
        """
        pass
Copied from the Sentence Transformers library. Structure for one input example with texts, the label and a unique id.
InputExample( guid: str = '', texts: List[str] = None, label: Union[int, float] = 0)
14 def __init__(self, guid: str = '', texts: List[str] = None, label: Union[int, float] = 0): 15 """ 16 Creates one InputExample with the given texts, guid and label 17 18 :param guid 19 id for the example 20 :param texts 21 the texts for the example. Note, str.strip() is called on the texts 22 :param label 23 the label for the example 24 """ 25 self.guid = guid 26 self.texts = [text.strip() for text in texts] 27 self.label = label
Creates one InputExample with the given texts, guid and label
:param guid id for the example :param texts the texts for the example. Note, str.strip() is called on the texts :param label the label for the example
47 @classmethod 48 def to_dict(cls, data: List['InputExample']): 49 text_len = len(data[0].texts) 50 processed_data = defaultdict(lambda: []) 51 52 for datum in data: 53 # string.ascii_lowercase 54 55 processed_data["id"].append(datum.guid) 56 processed_data["label"].append(datum.get_label()) 57 58 for i in range(text_len): 59 letter = string.ascii_lowercase[i] # abcdefghi 60 # processed_data[text_a] = ... 61 processed_data[f"text_{letter}"].append(datum.texts[i]) 62 63 return processed_data
class RelevanceExample(InputExample):
    """
    An InputExample whose reported label is the stored label divided by max_score
    """

    def __init__(self, max_score=2, *args, **kwargs):
        """
        Creates one RelevanceExample.

        :param max_score
            the value the label is divided by in relevance(). Note that this is
            the first positional parameter, so guid/texts/label are best passed
            as keyword arguments.
        :param args, kwargs
            forwarded unchanged to the parent initialiser
        """
        super().__init__(*args, **kwargs)
        self.max_score = max_score

    def get_label(self):
        """
        :return:
            The normalised label, as computed by relevance()
        """
        normalised = self.relevance()
        return normalised

    def relevance(self):
        """
        :return:
            The label divided by max_score. This lies between 0 and 1
            provided the label lies between 0 and max_score.
        """
        score = self.label / self.max_score
        return score
Converts relevance labels to a normalised score (0 to 1 for labels between 0 and max_score) by dividing the label by max_score.
RelevanceExample(max_score=2, *args, **kwargs)
def __init__(self, max_score=2, *args, **kwargs):
    """
    Creates one RelevanceExample.

    :param max_score
        the value the label is divided by in relevance(). Note that this is
        the first positional parameter, so the remaining arguments are best
        passed as keyword arguments.
    :param args, kwargs
        forwarded unchanged to the parent initialiser
    """
    # The parent initialiser is given everything except max_score.
    parent = super()
    parent.__init__(*args, **kwargs)
    self.max_score = max_score
Creates one InputExample with the given texts, guid and label
:param guid id for the example :param texts the texts for the example. Note, str.strip() is called on the texts :param label the label for the example
def
relevance(self):
def relevance(self):
    """
    :return:
        The label divided by max_score. This lies between 0 and 1
        provided the label lies between 0 and max_score.
    """
    score = self.label / self.max_score
    return score
Returns
Returns a normalised relevance score: the label divided by max_score (between 0 and 1 when the label lies between 0 and max_score).
Inherited Members
class
DatasetTypes(enum.Enum):
class DatasetTypes(Enum):
    """
    A collection of common dataset types that is usable in the library.
    """
    # Members must be assigned with "="; a bare annotation ("List: ...")
    # declares nothing and would leave the enum without any members.
    List = "List"
    ListInputExample = "ListInputExample"
    ListDict = "ListDict"
    HuggingfaceDataset = "HuggingfaceDataset"
A collection of common dataset types that are usable in the library.
Inherited Members
- enum.Enum
- name
- value