debeir.engines.elasticsearch.generate_script_score
"""Build the Painless ``script`` section of an Elasticsearch script_score query."""
import copy
import re
from typing import Dict, Union

# Template for the script dictionary; ``source`` and ``params`` are filled in
# by generate_script().
base_script = {
    "lang": "painless",
    # Compute faster dot products as all vectors are unit length
    "source": None,
    "params": None,
}


class SourceBuilder:
    """
    Builds Script Score source for NIR-style queries in Elasticsearch.
    Uses the Painless language.

    This is a string builder class: every ``add_*`` method appends one
    statement to the internal string and returns the builder, and
    ``finish`` appends the final statements and returns the whole source.
    """

    def __init__(self):
        # Painless source accumulated so far.
        self.s = ""
        # Index into ``params.weights`` used by the next embedding term.
        self.i = 0
        # Names of the per-field score variables emitted so far.
        self.variables = []

    def _add_line(self, line):
        """Append ``line`` (outer whitespace stripped) and a newline."""
        self.s += line.strip() + "\n"

    def add_preamble(self) -> "SourceBuilder":
        """
        Adds the preamble to the internal string.
        The emitted statement returns ``_score`` unchanged when
        ``params.norm_weight`` is below 0.

        :return:
            SourceBuilder
        """
        self._add_line(
            "if (params.norm_weight < 0.0) {\n"
            "    return _score;\n"
            "}"
        )

        # Returned for chaining, like the other add_* methods.
        return self

    def add_log_score(self, ignore_below_one=False) -> "SourceBuilder":
        """
        Adds the line defining ``log_score``, the logarithm of ``_score``
        in base ``params.norm_weight``.

        :param ignore_below_one: When True, the emitted line sets
            ``log_score`` to 0.0 only if ``params.disable_bm25`` is true
            (no score threshold is applied, despite the name). When False,
            ``log_score`` is 0.0 for any ``_score`` of 0 or below.
        :return:
            SourceBuilder
        """
        guard = "params.disable_bm25" if ignore_below_one else "_score <= 0.0"
        self._add_line(
            f"def log_score = {guard} ? 0.0 : "
            "Math.log(_score)/Math.log(params.norm_weight);"
        )

        return self

    def add_embed_field(self, qfield, field) -> "SourceBuilder":
        """
        Adds a cosine score line.

        The line is weighted by ``params.weights[i]``, where ``i`` counts
        the embedding lines added so far, and shifted by ``params.offset``.
        Documents with an empty field score 0.0.

        :param qfield: Query field; read in the script as ``params.<qfield>``
        :param field: Document facet field. Unless the name already contains
            "embedding" (case-insensitive), dots are replaced with
            underscores and ``_Embedding`` is appended.
        :return:
            SourceBuilder
        """
        if "embedding" not in field.lower():
            field = field.replace(".", "_") + "_Embedding"

        # A field that already contains "embedding" keeps its dots, but a
        # Painless variable name may only hold letters, digits and "_".
        variable_name = re.sub(r"[^0-9A-Za-z_]", "_", f"{field}_{qfield}_score")

        self._add_line(
            f"double {variable_name} = doc['{field}'].isEmpty() ? 0.0 : "
            f"params.weights[{self.i}]*cosineSimilarity(params.{qfield}, '{field}') "
            f"+ params.offset;"
        )
        self.variables.append(variable_name)
        self.i += 1

        return self

    def finish(self):
        """
        Finalises the script score and returns the internal string.

        The final statement references ``log_score``, so ``add_log_score``
        must have been called for the script to be complete.

        :return:
            A string containing the script score source
        """
        # With no embedding terms the sum would be empty and the statement
        # would read "double embed_score = ;".
        embed_sum = " + ".join(self.variables) if self.variables else "0.0"
        self._add_line(f"double embed_score = {embed_sum};")
        self._add_line("return embed_score + log_score;")

        return self.s


def generate_source(qfields: Union[list, str], fields) -> str:
    """
    Generates the script source based off a set of input fields and facets.

    One cosine-similarity line is emitted per (query field, document field)
    pair, query fields in the outer loop.

    :param qfields: Query fields (or topic fields); a single name may be
        passed as a plain string
    :param fields: Document facets to compute cosine similarity on
    :return:
        The Painless source
    """
    builder = SourceBuilder().add_log_score(ignore_below_one=True)

    query_fields = [qfields] if isinstance(qfields, str) else qfields

    for query_field in query_fields:
        for doc_field in fields:
            builder.add_embed_field(query_field, doc_field)

    return builder.finish()


def check_params_is_valid(params, qfields):
    """
    Validate that the parameters for the script score pass a simple sanity
    check: every query field, plus ``weights`` and ``offset``, must be a key
    of ``params``.

    :param params: Script parameters, keyed by name
    :param qfields: Query field name, or a list of query field names
    :raises AssertionError: If any required key is missing
    """
    # A bare string would otherwise be checked one character at a time.
    if isinstance(qfields, str):
        qfields = [qfields]

    missing = [key for key in [*qfields, "weights", "offset"] if key not in params]
    if missing:
        # Raised explicitly so the check is not stripped under ``python -O``.
        raise AssertionError(f"script score params are missing keys: {missing}")


def generate_script(
    fields, params, source_generator=generate_source, qfields="q_eb"
) -> Dict:
    """
    Creates the script dictionary (language, source and params) for a
    script score query.

    :param fields: Document fields to search
    :param params: Parameters for the script
    :param source_generator: Function that will generate the script; called
        as ``source_generator(qfields, fields)``
    :param qfields: Query fields to search from (topic facets)
    :return:
        A copy of ``base_script`` with ``source`` and ``params`` filled in
    :raises AssertionError: If ``params`` fails check_params_is_valid
    """
    script = copy.deepcopy(base_script)
    check_params_is_valid(params, qfields)

    script["lang"] = "painless"
    script["source"] = source_generator(qfields, fields)
    script["params"] = params

    return script
class
SourceBuilder:
class SourceBuilder:
    """
    Builds Script Score source for NIR-style queries in Elasticsearch.
    Uses the Painless language.

    This is a string builder class: every ``add_*`` method appends one
    statement to the internal string and returns the builder, and
    ``finish`` appends the final statements and returns the whole source.
    """

    def __init__(self):
        # Painless source accumulated so far.
        self.s = ""
        # Index into ``params.weights`` used by the next embedding term.
        self.i = 0
        # Names of the per-field score variables emitted so far.
        self.variables = []

    def _add_line(self, line):
        """Append ``line`` (outer whitespace stripped) and a newline."""
        self.s += line.strip() + "\n"

    def add_preamble(self) -> "SourceBuilder":
        """
        Adds the preamble to the internal string.
        The emitted statement returns ``_score`` unchanged when
        ``params.norm_weight`` is below 0.

        :return:
            SourceBuilder
        """
        self._add_line(
            "if (params.norm_weight < 0.0) {\n"
            "    return _score;\n"
            "}"
        )

        # Returned for chaining, like the other add_* methods.
        return self

    def add_log_score(self, ignore_below_one=False) -> "SourceBuilder":
        """
        Adds the line defining ``log_score``, the logarithm of ``_score``
        in base ``params.norm_weight``.

        :param ignore_below_one: When True, the emitted line sets
            ``log_score`` to 0.0 only if ``params.disable_bm25`` is true
            (no score threshold is applied, despite the name). When False,
            ``log_score`` is 0.0 for any ``_score`` of 0 or below.
        :return:
            SourceBuilder
        """
        guard = "params.disable_bm25" if ignore_below_one else "_score <= 0.0"
        self._add_line(
            f"def log_score = {guard} ? 0.0 : "
            "Math.log(_score)/Math.log(params.norm_weight);"
        )

        return self

    def add_embed_field(self, qfield, field) -> "SourceBuilder":
        """
        Adds a cosine score line.

        The line is weighted by ``params.weights[i]``, where ``i`` counts
        the embedding lines added so far, and shifted by ``params.offset``.
        Documents with an empty field score 0.0.

        :param qfield: Query field; read in the script as ``params.<qfield>``
        :param field: Document facet field. Unless the name already contains
            "embedding" (case-insensitive), dots are replaced with
            underscores and ``_Embedding`` is appended.
        :return:
            SourceBuilder
        """
        import re  # local import: only this method needs it

        if "embedding" not in field.lower():
            field = field.replace(".", "_") + "_Embedding"

        # A field that already contains "embedding" keeps its dots, but a
        # Painless variable name may only hold letters, digits and "_".
        variable_name = re.sub(r"[^0-9A-Za-z_]", "_", f"{field}_{qfield}_score")

        self._add_line(
            f"double {variable_name} = doc['{field}'].isEmpty() ? 0.0 : "
            f"params.weights[{self.i}]*cosineSimilarity(params.{qfield}, '{field}') "
            f"+ params.offset;"
        )
        self.variables.append(variable_name)
        self.i += 1

        return self

    def finish(self):
        """
        Finalises the script score and returns the internal string.

        The final statement references ``log_score``, so ``add_log_score``
        must have been called for the script to be complete.

        :return:
            A string containing the script score source
        """
        # With no embedding terms the sum would be empty and the statement
        # would read "double embed_score = ;".
        embed_sum = " + ".join(self.variables) if self.variables else "0.0"
        self._add_line(f"double embed_score = {embed_sum};")
        self._add_line("return embed_score + log_score;")

        return self.s
Builds Script Score source for NIR-style queries in Elasticsearch. Uses the Painless language.
This is a string builder class
def
add_preamble(self):
def add_preamble(self) -> "SourceBuilder":
    """
    Adds the preamble to the internal string.
    The emitted statement returns ``_score`` unchanged when
    ``params.norm_weight`` is below 0.

    :return:
        SourceBuilder
    """
    self._add_line(
        "if (params.norm_weight < 0.0) {\n"
        "    return _score;\n"
        "}"
    )

    # Returned for chaining, like add_log_score and add_embed_field.
    return self
Adds the preamble to the internal string. The preamble returns the BM25 score unchanged if the normalization constant is below 0.
def
add_log_score( self, ignore_below_one=False) -> debeir.engines.elasticsearch.generate_script_score.SourceBuilder:
def add_log_score(self, ignore_below_one=False) -> "SourceBuilder":
    """
    Adds the line defining ``log_score``, the logarithm of ``_score`` in
    base ``params.norm_weight``.

    :param ignore_below_one: When True, the emitted line sets ``log_score``
        to 0.0 only if ``params.disable_bm25`` is true (no score threshold
        is applied, despite the name). When False, ``log_score`` is 0.0 for
        any ``_score`` of 0 or below.
    :return:
        SourceBuilder
    """
    # Only the guard of the ternary differs between the two variants.
    guard = "params.disable_bm25" if ignore_below_one else "_score <= 0.0"
    statement = (
        "def log_score = "
        + guard
        + " ? 0.0 : Math.log(_score)/Math.log(params.norm_weight);"
    )
    self._add_line(statement)

    return self
Adds the BM25 log score line
Parameters
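- ignore_below_one: If true, ignore all scores below 1.0, since log(1) = 0. Otherwise, only ignore scores of 0 and below, for which the logarithm is undefined. (Note: the current implementation keys the "true" branch off `params.disable_bm25` rather than a score threshold.)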
Returns
SourceBuilder
def
add_embed_field( self, qfield, field) -> debeir.engines.elasticsearch.generate_script_score.SourceBuilder:
def add_embed_field(self, qfield, field) -> "SourceBuilder":
    """
    Adds a cosine score line.

    The line is weighted by ``params.weights[i]``, where ``i`` counts the
    embedding lines added so far, and shifted by ``params.offset``.
    Documents with an empty field score 0.0.

    :param qfield: Query field; read in the script as ``params.<qfield>``
    :param field: Document facet field. Unless the name already contains
        "embedding" (case-insensitive), dots are replaced with underscores
        and ``_Embedding`` is appended.
    :return:
        SourceBuilder
    """
    import re  # local import: only this method needs it

    if "embedding" not in field.lower():
        field = field.replace(".", "_") + "_Embedding"

    # A field that already contains "embedding" keeps its dots, but a
    # Painless variable name may only hold letters, digits and "_".
    variable_name = re.sub(r"[^0-9A-Za-z_]", "_", f"{field}_{qfield}_score")

    self._add_line(
        f"double {variable_name} = doc['{field}'].isEmpty() ? 0.0 : "
        f"params.weights[{self.i}]*cosineSimilarity(params.{qfield}, '{field}') "
        f"+ params.offset;"
    )
    self.variables.append(variable_name)
    self.i += 1

    return self
Adds a cosine score line.
Parameters
- qfield: Query field
- field: Document facet field
Returns
def
finish(self):
def finish(self):
    """
    Finalises the script score and returns the internal string.

    The final statement references ``log_score``, so ``add_log_score`` must
    have been called for the script to be complete.

    :return:
        A string containing the script score source
    """
    # With no embedding terms the sum would be empty and the statement
    # would read "double embed_score = ;".
    embed_sum = " + ".join(self.variables) if self.variables else "0.0"
    self._add_line(f"double embed_score = {embed_sum};")
    self._add_line("return embed_score + log_score;")

    return self.s
Finalises the script score and returns the internal string
Returns
A string containing the script score query
def
generate_source(qfields: Union[list, str], fields) -> str:
def generate_source(qfields: Union[list, str], fields) -> str:
    """
    Generates the script source based off a set of input fields and facets.

    The log-score line is added first, then one cosine-similarity line per
    (query field, document field) pair, query fields in the outer loop.

    :param qfields: Query fields (or topic fields); a single name may be
        passed as a plain string
    :param fields: Document facets to compute cosine similarity on
    :return:
        The script source
    """
    builder = SourceBuilder()
    builder.add_log_score(ignore_below_one=True)

    # A single query field name is treated as a one-element list.
    query_fields = [qfields] if isinstance(qfields, str) else qfields

    for query_field in query_fields:
        for doc_field in fields:
            builder.add_embed_field(query_field, doc_field)

    return builder.finish()
Generates the script source based off a set of input fields and facets
Parameters
- qfields: Query fields (or topic fields)
- fields: Document facets to compute cosine similarity on
Returns
def
check_params_is_valid(params, qfields):
def check_params_is_valid(params, qfields):
    """
    Validate that the parameters for the script score pass a simple sanity
    check: every query field, plus ``weights`` and ``offset``, must be a key
    of ``params``.

    :param params: Script parameters, keyed by name
    :param qfields: Query field name, or a list of query field names
    :raises AssertionError: If any required key is missing
    """
    # A bare string would otherwise be checked one character at a time.
    if isinstance(qfields, str):
        qfields = [qfields]

    missing = [key for key in [*qfields, "weights", "offset"] if key not in params]
    if missing:
        # Raised explicitly so the check is not stripped under ``python -O``;
        # the exception type matches what the bare asserts used to raise.
        raise AssertionError(f"script score params are missing keys: {missing}")
Validate if the parameters for the script score passes a simple sanity check.
Parameters
- params:
- qfields:
def
generate_script( fields, params, source_generator=<function generate_source>, qfields='q_eb') -> Dict:
def generate_script(
    fields, params, source_generator=generate_source, qfields="q_eb"
) -> Dict:
    """
    Creates the script dictionary (language, source and params) for a
    script score query.

    :param fields: Document fields to search
    :param params: Parameters for the script
    :param source_generator: Function that will generate the script; called
        as ``source_generator(qfields, fields)``
    :param qfields: Query fields to search from (topic facets); a single
        name may be passed as a plain string
    :return:
        A copy of ``base_script`` with ``source`` and ``params`` filled in
    """
    script = copy.deepcopy(base_script)

    # Normalise a single field name to a list so the validator always sees
    # whole field names, never the individual characters of a string.
    qfield_names = [qfields] if isinstance(qfields, str) else qfields
    check_params_is_valid(params, qfield_names)

    script["lang"] = "painless"
    # The generator receives qfields exactly as the caller supplied it.
    script["source"] = source_generator(qfields, fields)
    script["params"] = params

    return script
Creates the script dictionary (language, source and parameters) for a script score query.
Parameters
- fields: Document fields to search
- params: Parameters for the script
- source_generator: Function that will generate the script
- qfields: Query fields to search from (topic facets)