debeir.core.parser
"""
Topic parsers: load query topics from pickle, XML, CSV/TSV and JSON-lines files.
"""
import abc
import csv
import dataclasses
import json
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List
from xml.etree import ElementTree as ET

import dill
import pandas as pd


# TODO: Parse fields can come from a config or ID_fields
# TODO: move _get_topics to private cls method with arguments, and expose get_topics as an instance method.


@dataclass(init=True)
class Parser:
    """
    Parser interface

    Subclasses implement ``_get_topics``; callers use ``get_topics``.
    """

    id_field: object
    parse_fields: List[str]

    @classmethod
    def normalize(cls, input_dict) -> Dict:
        """
        Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]

        Nested keys are joined with "." and only the first flattened record is returned.

        :param input_dict: (possibly nested) dictionary to flatten
        :return: single-level dictionary
        """
        # pd.json_normalize is the public entry point; the pd.io.json alias is deprecated.
        flattened = pd.json_normalize(input_dict, sep=".")

        return flattened.to_dict(orient='records')[0]

    def get_topics(self, path, *args, **kwargs):
        """
        Instance method for getting topics, forwards the instance attributes as keyword
        arguments to the _get_topics method.

        :param path: topic source, handed to ``_get_topics`` unchanged
        :return: whatever ``_get_topics`` returns
        """
        # Instance attributes win over caller-supplied keywords of the same name.
        kwargs.update(vars(self))

        return self._get_topics(path, *args, **kwargs)

    @classmethod
    @abc.abstractmethod
    def _get_topics(cls, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
        raise NotImplementedError


@dataclasses.dataclass(init=True)
class PickleParser(Parser):
    """
    Load topics from a pickle file
    """

    @classmethod
    def _get_topics(cls, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
        """
        :param path: filesystem path or an already opened binary file object
        :return: the unpickled object
        """
        # NOTE(security): unpickling can execute arbitrary code; only load trusted files.
        if hasattr(path, "read"):
            return dill.load(path)

        # dill.load needs a file object, not a path.
        with open(path, "rb") as pickle_f:
            return dill.load(pickle_f)


@dataclasses.dataclass(init=True)
class XMLParser(Parser):
    """
    Load topics from an XML file
    """
    topic_field_name: str
    id_field: str
    parse_fields: List[str]

    @classmethod
    def _recurse_to_child_node(cls, node: ET.Element, track: List):
        """
        Helper method to get all children nodes for text extraction in an xml.

        :param node: Current node
        :param track: List to track nodes
        :return: the node that was passed in
        """
        # Element.getchildren() was removed in Python 3.9; iterating the element yields its children.
        for child in node:
            track.append(cls._recurse_to_child_node(child, track))

        return node

    @classmethod
    def unwrap(cls, doc_dict, key):
        """
        Converts defaultdict to dict and list of size 1 to just the element

        :param doc_dict: mapping that is modified in place
        :param key: key of the entry to unwrap
        """
        if isinstance(doc_dict[key], defaultdict):
            doc_dict[key] = dict(doc_dict[key])

            for e_key in doc_dict[key]:
                cls.unwrap(doc_dict[key], e_key)

        if isinstance(doc_dict[key], list):
            if len(doc_dict[key]) == 1:
                doc_dict[key] = doc_dict[key][0]

    def _get_topics(self, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
        """
        :param path: XML file name or file object accepted by ``ElementTree.parse``
        :return: topics keyed by id (int when the id attribute is all decimal digits)
        """
        all_topics = ET.parse(path).getroot()
        qtopics = {}

        for topic in all_topics.findall(self.topic_field_name):
            _id = topic.attrib[self.id_field]
            # isdecimal() only accepts characters int() can convert.
            if _id.isdecimal():
                _id = int(_id)

            if self.parse_fields:
                temp = {}
                for field in self.parse_fields:
                    node = topic.find(field)
                    # Missing or empty fields are skipped.
                    if node is None or node.text is None:
                        continue
                    temp[field] = node.text.strip()

                qtopics[_id] = temp
            else:
                # The topic contains the text
                qtopics[_id] = {"query": (topic.text or "").strip()}

        return qtopics


@dataclasses.dataclass
class CSVParser(Parser):
    """
    Loads topics from a CSV file
    """
    # Class-level fallbacks, used when _get_topics is called without explicit fields.
    id_field = "id"
    parse_fields = ["Text"]

    def __init__(self, id_field=None, parse_fields=None):
        if parse_fields is None:
            parse_fields = ["id", "text"]

        if id_field is None:
            id_field = "id"

        super().__init__(id_field, parse_fields)

    @classmethod
    def _get_topics(cls, csvfile, dialect="excel",
                    id_field: str = None,
                    parse_fields: List[str] = None,
                    *args, **kwargs) -> Dict[int, Dict[str, str]]:
        """
        :param csvfile: path (str) or an already opened text file object
        :param dialect: csv dialect name
        :param id_field: column holding the topic id
        :param parse_fields: columns copied into each topic
        :return: topics keyed by the value of the id column
        """
        if id_field is None:
            id_field = cls.id_field

        if parse_fields is None:
            parse_fields = cls.parse_fields

        # Only close files opened here; caller-owned file objects stay open.
        opened = open(csvfile, 'rt', newline='') if isinstance(csvfile, str) else None
        stream = opened if opened is not None else csvfile

        try:
            reader = csv.DictReader(stream, dialect=dialect)

            return {row[id_field]: {field: row[field] for field in parse_fields}
                    for row in reader}
        finally:
            if opened is not None:
                opened.close()


# init=False keeps CSVParser.__init__ (and its defaults) instead of generating a new one.
@dataclasses.dataclass(init=False)
class TSVParser(CSVParser):
    """
    Loads topics from a TSV file
    """

    @classmethod
    def _get_topics(cls, tsvfile, *args, **kwargs) -> Dict[int, Dict[str, str]]:
        # Default to tab-separated; an explicit dialect from the caller is respected.
        kwargs.setdefault("dialect", "excel-tab")

        return super()._get_topics(tsvfile, *args, **kwargs)


@dataclasses.dataclass(init=True)
class JsonLinesParser(Parser):
    """
    Loads topics from a jsonl file,
    a JSON per line

    Provide parse_fields, id_field and whether to ignore full matches on json keys.
    secondary_id appends to the primary id as jsonlines are flattened structure and may contain duplicate ids.
    """
    parse_fields: List[str]
    id_field: str
    ignore_full_match: bool = True
    secondary_id: str = None

    @classmethod
    def _get_topics(cls, jsonlfile, id_field, parse_fields,
                    ignore_full_match=True, secondary_id=None, *args, **kwargs) -> Dict[str, Dict]:
        """
        :param jsonlfile: path of the jsonl file
        :param id_field: key holding the topic id (removed from the topic)
        :param parse_fields: keys to keep
        :param ignore_full_match: keep keys that contain, or are contained in, a parse field
        :param secondary_id: optional key whose value is appended to the id as "<id>_<value>"
        :return: topics keyed by id
        """

        def wanted(key):
            if ignore_full_match:
                return any(key in field or field in key for field in parse_fields)
            return any(field == key for field in parse_fields)

        topics = {}

        with open(jsonlfile, "r", encoding="utf-8") as jsonl_f:
            for jsonl in jsonl_f:
                # Blank lines are not valid JSON documents.
                if not jsonl.strip():
                    continue

                json_dict = json.loads(jsonl)
                _id = json_dict.pop(id_field)

                if secondary_id:
                    _id = str(_id) + "_" + str(json_dict[secondary_id])

                topics[_id] = {key: value for key, value in json_dict.items() if wanted(key)}

        return topics
@dataclass(init=True)
class
Parser:
@dataclass(init=True)
class Parser:
    """
    Parser interface

    Subclasses implement ``_get_topics``; callers use ``get_topics``.
    """

    id_field: object
    parse_fields: List[str]

    @classmethod
    def normalize(cls, input_dict) -> Dict:
        """
        Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]

        Nested keys are joined with "." and only the first flattened record is returned.

        :param input_dict: (possibly nested) dictionary to flatten
        :return: single-level dictionary
        """
        # pd.json_normalize is the public entry point; the pd.io.json alias is deprecated.
        flattened = pd.json_normalize(input_dict, sep=".")

        return flattened.to_dict(orient='records')[0]

    def get_topics(self, path, *args, **kwargs):
        """
        Instance method for getting topics, forwards the instance attributes as keyword
        arguments to the _get_topics method.

        :param path: topic source, handed to ``_get_topics`` unchanged
        :return: whatever ``_get_topics`` returns
        """
        # Instance attributes win over caller-supplied keywords of the same name.
        kwargs.update(vars(self))

        return self._get_topics(path, *args, **kwargs)

    @classmethod
    @abc.abstractmethod
    def _get_topics(cls, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
        raise NotImplementedError
Parser interface
@classmethod
def
normalize(cls, input_dict) -> Dict:
@classmethod
def normalize(cls, input_dict) -> Dict:
    """
    Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]

    Nested keys are joined with "." and only the first flattened record is returned.

    :param input_dict: (possibly nested) dictionary to flatten
    :return: single-level dictionary
    """
    # pd.json_normalize is the public entry point; the pd.io.json alias is deprecated.
    flattened = pd.json_normalize(input_dict, sep=".")

    return flattened.to_dict(orient='records')[0]
Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]
Parameters
- input_dict:
Returns
def
get_topics(self, path, *args, **kwargs):
def get_topics(self, path, *args, **kwargs):
    """
    Instance method for getting topics, forwards the instance attributes as keyword
    arguments to the _get_topics method.

    :param path: topic source, handed to ``_get_topics`` unchanged
    :return: whatever ``_get_topics`` returns
    """
    # Instance attributes win over caller-supplied keywords of the same name.
    kwargs.update(vars(self))

    return self._get_topics(path, *args, **kwargs)
Instance method for getting topics; forwards the instance's attributes as keyword arguments to the _get_topics class method.
@dataclasses.dataclass(init=True)
class PickleParser(Parser):
    """
    Load topics from a pickle file
    """

    @classmethod
    def _get_topics(cls, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
        """
        :param path: filesystem path or an already opened binary file object
        :return: the unpickled object
        """
        # NOTE(security): unpickling can execute arbitrary code; only load trusted files.
        if hasattr(path, "read"):
            return dill.load(path)

        # dill.load needs a file object, not a path.
        with open(path, "rb") as pickle_f:
            return dill.load(pickle_f)
Load topics from a pickle file
Inherited Members
@dataclasses.dataclass(init=True)
class XMLParser(Parser):
    """
    Load topics from an XML file
    """
    topic_field_name: str
    id_field: str
    parse_fields: List[str]

    @classmethod
    def _recurse_to_child_node(cls, node: ET.Element, track: List):
        """
        Helper method to get all children nodes for text extraction in an xml.

        :param node: Current node
        :param track: List to track nodes
        :return: the node that was passed in
        """
        # Element.getchildren() was removed in Python 3.9; iterating the element yields its children.
        for child in node:
            track.append(cls._recurse_to_child_node(child, track))

        return node

    @classmethod
    def unwrap(cls, doc_dict, key):
        """
        Converts defaultdict to dict and list of size 1 to just the element

        :param doc_dict: mapping that is modified in place
        :param key: key of the entry to unwrap
        """
        if isinstance(doc_dict[key], defaultdict):
            doc_dict[key] = dict(doc_dict[key])

            for e_key in doc_dict[key]:
                cls.unwrap(doc_dict[key], e_key)

        if isinstance(doc_dict[key], list):
            if len(doc_dict[key]) == 1:
                doc_dict[key] = doc_dict[key][0]

    def _get_topics(self, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
        """
        :param path: XML file name or file object accepted by ``ElementTree.parse``
        :return: topics keyed by id (int when the id attribute is all decimal digits)
        """
        all_topics = ET.parse(path).getroot()
        qtopics = {}

        for topic in all_topics.findall(self.topic_field_name):
            _id = topic.attrib[self.id_field]
            # isdecimal() only accepts characters int() can convert.
            if _id.isdecimal():
                _id = int(_id)

            if self.parse_fields:
                temp = {}
                for field in self.parse_fields:
                    node = topic.find(field)
                    # Missing or empty fields are skipped.
                    if node is None or node.text is None:
                        continue
                    temp[field] = node.text.strip()

                qtopics[_id] = temp
            else:
                # The topic contains the text
                qtopics[_id] = {"query": (topic.text or "").strip()}

        return qtopics
Load topics from an XML file
@classmethod
def
unwrap(cls, doc_dict, key):
@classmethod
def unwrap(cls, doc_dict, key):
    """
    Converts defaultdict to dict and list of size 1 to just the element

    :param doc_dict: mapping that is modified in place
    :param key: key of the entry to unwrap
    """
    if isinstance(doc_dict[key], defaultdict):
        doc_dict[key] = dict(doc_dict[key])

        # Only values are replaced below, so iterating the keys while recursing is safe.
        for e_key in doc_dict[key]:
            cls.unwrap(doc_dict[key], e_key)

    if isinstance(doc_dict[key], list):
        if len(doc_dict[key]) == 1:
            doc_dict[key] = doc_dict[key][0]
Converts defaultdict to dict and list of size 1 to just the element
Parameters
- doc_dict:
- key:
Inherited Members
@dataclasses.dataclass
class CSVParser(Parser):
    """
    Loads topics from a CSV file
    """
    # Class-level fallbacks, used when _get_topics is called without explicit fields.
    id_field = "id"
    parse_fields = ["Text"]

    def __init__(self, id_field=None, parse_fields=None):
        if parse_fields is None:
            parse_fields = ["id", "text"]

        if id_field is None:
            id_field = "id"

        super().__init__(id_field, parse_fields)

    @classmethod
    def _get_topics(cls, csvfile, dialect="excel",
                    id_field: str = None,
                    parse_fields: List[str] = None,
                    *args, **kwargs) -> Dict[int, Dict[str, str]]:
        """
        :param csvfile: path (str) or an already opened text file object
        :param dialect: csv dialect name
        :param id_field: column holding the topic id
        :param parse_fields: columns copied into each topic
        :return: topics keyed by the value of the id column
        """
        if id_field is None:
            id_field = cls.id_field

        if parse_fields is None:
            parse_fields = cls.parse_fields

        # Only close files opened here; caller-owned file objects stay open.
        opened = open(csvfile, 'rt', newline='') if isinstance(csvfile, str) else None
        stream = opened if opened is not None else csvfile

        try:
            reader = csv.DictReader(stream, dialect=dialect)

            return {row[id_field]: {field: row[field] for field in parse_fields}
                    for row in reader}
        finally:
            if opened is not None:
                opened.close()
Loads topics from a CSV file
Inherited Members
# init=False keeps CSVParser.__init__ (and its defaults) instead of generating a new one.
@dataclasses.dataclass(init=False)
class TSVParser(CSVParser):
    """
    Loads topics from a TSV file
    """

    @classmethod
    def _get_topics(cls, tsvfile, *args, **kwargs) -> Dict[int, Dict[str, str]]:
        # Default to tab-separated; an explicit dialect from the caller is respected.
        kwargs.setdefault("dialect", "excel-tab")

        return super()._get_topics(tsvfile, *args, **kwargs)
Inherited Members
@dataclasses.dataclass(init=True)
class JsonLinesParser(Parser):
    """
    Loads topics from a jsonl file,
    a JSON per line

    Provide parse_fields, id_field and whether to ignore full matches on json keys.
    secondary_id appends to the primary id as jsonlines are flattened structure and may contain duplicate ids.
    """
    parse_fields: List[str]
    id_field: str
    ignore_full_match: bool = True
    secondary_id: str = None

    @classmethod
    def _get_topics(cls, jsonlfile, id_field, parse_fields,
                    ignore_full_match=True, secondary_id=None, *args, **kwargs) -> Dict[str, Dict]:
        """
        :param jsonlfile: path of the jsonl file
        :param id_field: key holding the topic id (removed from the topic)
        :param parse_fields: keys to keep
        :param ignore_full_match: keep keys that contain, or are contained in, a parse field
        :param secondary_id: optional key whose value is appended to the id as "<id>_<value>"
        :return: topics keyed by id
        """

        def wanted(key):
            if ignore_full_match:
                return any(key in field or field in key for field in parse_fields)
            return any(field == key for field in parse_fields)

        topics = {}

        with open(jsonlfile, "r", encoding="utf-8") as jsonl_f:
            for jsonl in jsonl_f:
                # Blank lines are not valid JSON documents.
                if not jsonl.strip():
                    continue

                json_dict = json.loads(jsonl)
                _id = json_dict.pop(id_field)

                if secondary_id:
                    _id = str(_id) + "_" + str(json_dict[secondary_id])

                topics[_id] = {key: value for key, value in json_dict.items() if wanted(key)}

        return topics
Loads topics from a jsonl file, a JSON per line
Provide parse_fields, id_field and whether to ignore full matches on json keys. secondary_id is appended to the primary id, because jsonlines files are a flattened structure and may contain duplicate ids.
JsonLinesParser( id_field: str, parse_fields: List[str], ignore_full_match: bool = True, secondary_id: str = None)