debeir.core.converters
from collections import defaultdict
from typing import Dict, Union

from debeir.core.parser import Parser

import datasets


class ParsedTopicsToDataset:
    """
    Turns the topics produced by a parser into a huggingface dataset object.
    """

    @classmethod
    def convert(cls, parser: Parser, output: Dict[Union[str, int], Dict]):
        """
        Pivot per-topic records into per-field columns and wrap them in a dataset.

        Input (traditional parser output), keyed by topic id:
            {topic_id: {"Facet_1": ..., "Facet_2": ...}}

        Columns handed to ``datasets.Dataset.from_dict``:
            {"topic_id": [...], "Facet_1": [...], "Facet_2": [...]}

        Every name in ``parser.parse_fields`` gets one entry per topic; a topic
        that lacks the field contributes ``None`` so the columns stay aligned.

        :param parser: Parser whose ``parse_fields`` lists the fields to extract
        :param output: Topics output from the parser object
        :return: The result of ``datasets.Dataset.from_dict`` on the columns
        """
        columns = defaultdict(list)

        for topic_id, facets in output.items():
            columns["topic_id"].append(topic_id)

            # parse_fields is read per topic, so an empty ``output`` never
            # touches the parser at all.
            for name in parser.parse_fields:
                columns[name].append(facets[name] if name in facets else None)

        return datasets.Dataset.from_dict(columns)
class
ParsedTopicsToDataset:
class ParsedTopicsToDataset:
    """
    Turns the topics produced by a parser into a huggingface dataset object.
    """

    @classmethod
    def convert(cls, parser: Parser, output: Dict[Union[str, int], Dict]):
        """
        Pivot per-topic records into per-field columns and wrap them in a dataset.

        Input (traditional parser output), keyed by topic id:
            {topic_id: {"Facet_1": ..., "Facet_2": ...}}

        Columns handed to ``datasets.Dataset.from_dict``:
            {"topic_id": [...], "Facet_1": [...], "Facet_2": [...]}

        Every name in ``parser.parse_fields`` gets one entry per topic; a topic
        that lacks the field contributes ``None`` so the columns stay aligned.

        :param parser: Parser whose ``parse_fields`` lists the fields to extract
        :param output: Topics output from the parser object
        :return: The result of ``datasets.Dataset.from_dict`` on the columns
        """
        columns = defaultdict(list)

        for topic_id, facets in output.items():
            columns["topic_id"].append(topic_id)

            # parse_fields is read per topic, so an empty ``output`` never
            # touches the parser at all.
            for name in parser.parse_fields:
                columns[name].append(facets[name] if name in facets else None)

        return datasets.Dataset.from_dict(columns)
Converts a parser's output to a huggingface dataset object.
@classmethod
def
convert( cls, parser: debeir.core.parser.Parser, output: Dict[Union[str, int], Dict]):
@classmethod
def convert(cls, parser: Parser, output: Dict[Union[str, int], Dict]):
    """
    Pivot per-topic records into per-field columns and wrap them in a dataset.

    Input (traditional parser output), keyed by topic id:
        {topic_id: {"Facet_1": ..., "Facet_2": ...}}

    Columns handed to ``datasets.Dataset.from_dict``:
        {"topic_id": [...], "Facet_1": [...], "Facet_2": [...]}

    Every name in ``parser.parse_fields`` gets one entry per topic; a topic
    that lacks the field contributes ``None`` so the columns stay aligned.

    :param parser: Parser whose ``parse_fields`` lists the fields to extract
    :param output: Topics output from the parser object
    :return: The result of ``datasets.Dataset.from_dict`` on the columns
    """
    columns = defaultdict(list)

    for topic_id, facets in output.items():
        columns["topic_id"].append(topic_id)

        # parse_fields is read per topic, so an empty ``output`` never
        # touches the parser at all.
        for name in parser.parse_fields:
            columns[name].append(facets[name] if name in facets else None)

    return datasets.Dataset.from_dict(columns)
Flatten a Dict of shape (traditional parser output) {topic_id: { "Facet_1": ... "Facet_2": ... } }
->
To a flattened arrow-like dataset with one column per key: { "topic_id": [], "Facet_1": [], "Facet_2": [], }. A topic that lacks a field contributes None to that field's column.
Parameters
- parser: Parser whose parse_fields lists the fields to extract from each topic
- output: Topics output from the parser object