debeir.evaluation.evaluator
from collections import defaultdict
from typing import Dict, List, Union

import loguru
from analysis_tools_ir import evaluate, sigtests
from debeir.core.config import GenericConfig, MetricsConfig


class Evaluator:
    """
    Evaluation class for computing metrics from TREC-style files
    """

    def __init__(self, qrels: str, metrics: List[str]):
        self.qrels = qrels
        self.metrics = []
        self.depths = []

        try:
            self._validate_and_setup_metrics(metrics)
        except AssertionError:
            raise ValueError("Metrics must be of the form metric@depth")

    def _validate_and_setup_metrics(self, metrics):
        for metric in metrics:
            assert "@" in metric
            try:
                metric, depth = metric.split("@")
            except:
                raise RuntimeError(f"Unable to parse metric {metric}")

            assert metric.isalpha()
            assert depth.isdigit()

            self.metrics.append(metric)
            self.depths.append(int(depth))

    def evaluate_runs(self, res: Union[str, List[str]], **kwargs):
        """
        Evaluates the TREC-style results from an input result list or file

        :param res: Results file path or raw results list
        :param kwargs: Keyword arguments to pass to the underlying analysis_tools_ir.parse_run library
        :return:
        """
        results = defaultdict(lambda: {})
        for metric, depth in zip(self.metrics, self.depths):
            results[metric][depth] = evaluate.parse_run(
                res, self.qrels,
                metric=metric, depth=depth,
                **kwargs
            )

        return results

    def average_all_metrics(self, runs: Dict, logger: loguru.logger):
        """
        Averages the metric per topic scores into a single averaged score.

        :param runs: Parsed run dictionary: {metric_name@depth: Run object}
        :param logger: Logger to print metrics
        """
        for metric, depth in zip(self.metrics, self.depths):
            run = runs[metric][depth].run
            logger.info(f"{metric}@{depth} Average: {sum(run.values()) / len(run):.4}")

    def sigtests(self, results_a, results_b):
        """
        Run a paired significance test on two result files

        :param results_a:
        :param results_b:
        :return:
        """
        return sigtests.paired.paired_t_test(results_a, results_b, self.qrels)

    @classmethod
    def build_from_config(cls, config: GenericConfig, metrics_config: MetricsConfig):
        return cls(config.qrels, metrics_config.metrics)
class Evaluator:
Evaluation class for computing metrics from TREC-style files
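A minimal usage sketch for constructing an Evaluator follows; the qrels path and metric strings are hypothetical, and the set of supported metric names depends on the underlying analysis_tools_ir library. Each metric must be of the form metric@depth.

from debeir.evaluation.evaluator import Evaluator

# Hypothetical qrels path and metric strings; each metric must follow the
# "metric@depth" pattern (an alphabetic metric name and an integer depth),
# otherwise the constructor raises ValueError.
evaluator = Evaluator(
    qrels="qrels/trec_covid.txt",
    metrics=["ndcg@10", "p@20"],
)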
def evaluate_runs(self, res: Union[str, List[str]], **kwargs):
Evaluates TREC-style results from an input result list or file against the qrels supplied at construction.
Parameters
- res: Results file path or raw results list
- kwargs: Keyword arguments passed through to the underlying analysis_tools_ir.evaluate.parse_run call
Returns
A nested dictionary of parsed runs keyed by metric name and then depth: {metric: {depth: Run object}}
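A short sketch of evaluating a single run, assuming a hypothetical TREC-style run file at runs/bm25.res and the evaluator constructed above:

results = evaluator.evaluate_runs("runs/bm25.res")

# results is nested by metric name and then depth; each leaf is the object
# returned by analysis_tools_ir.evaluate.parse_run for that metric/depth.
ndcg_at_10 = results["ndcg"][10]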
def average_all_metrics(self, runs: Dict, logger: loguru.logger):
Averages the per-topic scores of each metric into a single mean score and logs it.
Parameters
- runs: Nested dictionary of parsed runs, as returned by evaluate_runs: {metric_name: {depth: Run object}}
- logger: Logger used to print the averaged metrics
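A sketch of logging the averaged metrics with loguru's default logger, assuming results comes from the evaluate_runs example above:

from loguru import logger

# Logs one averaged score per configured metric, e.g. "ndcg@10 Average: ...",
# computed as the mean of the per-topic scores in each Run object.
evaluator.average_all_metrics(results, logger=logger)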
def sigtests(self, results_a, results_b):
Runs a paired significance test (paired t-test) on two result files, judged against the qrels supplied at construction.
Parameters
- results_a: First result file to compare
- results_b: Second result file to compare
Returns
The output of analysis_tools_ir.sigtests.paired.paired_t_test for the two result sets
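A sketch comparing two hypothetical run files; the shape of the returned object is determined by analysis_tools_ir.sigtests.paired.paired_t_test:

# Paired t-test between two hypothetical run files, scored against the
# qrels given at construction time.
ttest_result = evaluator.sigtests("runs/bm25.res", "runs/neural.res")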
@classmethod
def build_from_config(cls, config: debeir.core.config.GenericConfig, metrics_config: debeir.core.config.MetricsConfig):
Builds an Evaluator from configuration objects, taking the qrels path from config.qrels and the metric list from metrics_config.metrics.
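A sketch assuming config and metrics_config are already-loaded GenericConfig and MetricsConfig instances (for example, parsed elsewhere by debeir's configuration handling); only config.qrels and metrics_config.metrics are read here:

# `config` and `metrics_config` are assumed to be pre-loaded configuration
# objects; this constructor only uses config.qrels and metrics_config.metrics.
evaluator = Evaluator.build_from_config(config, metrics_config)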