debeir.training.train_sentence_encoder

from functools import partial
from typing import List, Union

import transformers
from debeir.datasets.types import InputExample, RelevanceExample
from debeir.training.utils import _train_sentence_transformer, tokenize_function
from sentence_transformers.evaluation import SentenceEvaluator
from transformers import AutoModel, SchedulerType, Trainer, TrainingArguments

import datasets

def train_biencoder(model_fp_or_name: str, output_dir: str, train_examples: List[InputExample],
                    dev_examples: List[InputExample], train_batch_size=32, num_epochs=3, warmup_steps=None,
                    evaluate_every_n_step: int = 1000,
                    special_tokens=None, pooling_mode=None, loss_func=None,
                    evaluator: SentenceEvaluator = None, *args, **kwargs):
    """
    Train a universal sentence encoder

    :param model_fp_or_name: The model name or path to the model
    :param output_dir: Output directory to save model, logs etc.
    :param train_examples: Training Examples
    :param dev_examples: Dev examples
    :param train_batch_size: Training batch size
    :param num_epochs: Number of epochs
    :param warmup_steps: Warmup steps for the scheduler
    :param evaluate_every_n_step: Evaluate the model every n steps
    :param special_tokens: Special tokens to add
    :param pooling_mode: Pooling mode for a sentence transformer model
    :param loss_func: Loss function(s) to use
    :param evaluator: Evaluator to use
    """

    return _train_sentence_transformer(model_fp_or_name, output_dir, train_examples, dev_examples, train_batch_size,
                                       num_epochs, warmup_steps, evaluate_every_n_step, special_tokens,
                                       pooling_mode, loss_func, evaluator)

def train_huggingface_transformer(model_fp_or_name_or_cls: Union[str, transformers.PreTrainedModel],
                                  tokenizer: transformers.PreTrainedTokenizer,
                                  output_dir: str,
                                  compute_metric_fn,
                                  metric: datasets.Metric,
                                  dataset: datasets.DatasetDict = None,
                                  train_dataset: List[Union[RelevanceExample, InputExample, datasets.Dataset]] = None,
                                  eval_dataset: List[Union[RelevanceExample, InputExample, datasets.Dataset]] = None,
                                  train_batch_size=32, num_epochs=3,
                                  learning_rate=5e-5,
                                  lr_scheduler_type: SchedulerType = SchedulerType.CONSTANT_WITH_WARMUP,
                                  optimizer: str = "adamw_hf",
                                  warmup_ratio=0.1, evaluate_every_n_step: int = 1000,
                                  pooling_mode=None, loss_func=None,
                                  model_args=None, model_kwargs=None,
                                  padding_strategy="max_length",
                                  truncate=True,
                                  special_tokens=None,
                                  seed=42,
                                  *args, **kwargs) -> Trainer:
    """
    Train a transformer model using the Huggingface API

    :param model_fp_or_name_or_cls: Model name/path, a model class to instantiate, or an already-instantiated model
    :param tokenizer: Tokenizer
    :param output_dir: Output directory to write to
    :param compute_metric_fn: Metric function to compute metrics
    :param metric: Metric used by the compute_metric_fn
    :param dataset: Huggingface DatasetDict with "train" and "dev" splits
    :param train_dataset: Training dataset to be used by the Trainer class
    :param eval_dataset: Evaluation dataset to be used by the Trainer class
    :param train_batch_size: Batch size to use for training
    :param num_epochs: Number of training epochs (default: 3)
    :param learning_rate: Learning rate (default: 5e-5)
    :param lr_scheduler_type: Learning rate scheduler type, see SchedulerType
    :param optimizer: Optimizer
    :param warmup_ratio: Warmup, as a ratio of total training steps (default: 0.1)
    :param evaluate_every_n_step: Evaluate the model every n steps
    :param pooling_mode: Pooling mode for your model
    :param loss_func: Loss function passed to the model constructor (when a class is given)
    :param model_args: Positional arguments passed to the model constructor
    :param model_kwargs: Keyword arguments passed to the model constructor
    :param padding_strategy: Tokenization padding strategy (default: "max_length")
    :param truncate: Whether to truncate inputs during tokenization
    :param special_tokens: Special tokens to add to the tokenizer
    :param seed: Dataset shuffle seed
    :param args: Unused additional positional arguments
    :param kwargs: Unused additional keyword arguments
    :return: The Huggingface Trainer used for training
    """

    if isinstance(model_fp_or_name_or_cls, str):
        # A model name or path: load the pretrained weights
        model = AutoModel.from_pretrained(model_fp_or_name_or_cls)
    elif isinstance(model_fp_or_name_or_cls, type):
        # A model class: instantiate it with the supplied arguments
        model = model_fp_or_name_or_cls(*(model_args or ()),
                                        loss_func=loss_func,
                                        pooling_mode=pooling_mode,
                                        **(model_kwargs or {}))
    else:
        # Already an instantiated model: use it as-is
        model = model_fp_or_name_or_cls

    if special_tokens:
        tokenizer.add_tokens(special_tokens, special_tokens=True)
        # resize_token_embeddings lives on the model, not the tokenizer
        model.resize_token_embeddings(len(tokenizer))

    # Tokenize and shuffle the DatasetDict when one is supplied; otherwise the
    # caller-provided train_dataset/eval_dataset are used as-is.
    if dataset is not None:
        tokenized_datasets = dataset.map(
            partial(
                tokenize_function, tokenizer,
                padding_strategy=padding_strategy,
                truncate=truncate
            ), batched=True)

        train_dataset = tokenized_datasets["train"].shuffle(seed=seed)
        eval_dataset = tokenized_datasets["dev"].shuffle(seed=seed)

    training_args = TrainingArguments(output_dir=output_dir,
                                      per_device_train_batch_size=train_batch_size,
                                      num_train_epochs=num_epochs,
                                      warmup_ratio=warmup_ratio,
                                      evaluation_strategy="steps",
                                      eval_steps=evaluate_every_n_step,
                                      learning_rate=learning_rate,
                                      lr_scheduler_type=lr_scheduler_type,
                                      optim=optimizer,
                                      # Only report the loss when no metric/metric fn is supplied
                                      prediction_loss_only=metric is None and compute_metric_fn is None,
                                      fp16=True)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=partial(compute_metric_fn, metric) if compute_metric_fn else None,
    )

    # Trainer exposes train(), not fit()
    trainer.train()

    return trainer
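
The dataset.map call above partially applies tokenize_function from debeir.training.utils with the tokenizer and the padding/truncation options, so the mapped function receives each batch as its next positional argument. The helper's exact signature and the column names it reads are not shown in this module; the following is only an inferred sketch of the interface it appears to expect, with the "text" column name being an assumption.

# Inferred sketch only -- the real tokenize_function lives in debeir.training.utils
# and may read different column names or accept different options.
def tokenize_function(tokenizer, examples, padding_strategy="max_length", truncate=True):
    # `examples` is a batch (a dict of lists) because dataset.map(..., batched=True)
    # hands over whole batches; the "text" column name is assumed here.
    return tokenizer(examples["text"], padding=padding_strategy, truncation=truncate)
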
def train_biencoder(
    model_fp_or_name: str,
    output_dir: str,
    train_examples: List[debeir.datasets.types.InputExample],
    dev_examples: List[debeir.datasets.types.InputExample],
    train_batch_size=32,
    num_epochs=3,
    warmup_steps=None,
    evaluate_every_n_step: int = 1000,
    special_tokens=None,
    pooling_mode=None,
    loss_func=None,
    evaluator: sentence_transformers.evaluation.SentenceEvaluator = None,
    *args,
    **kwargs
):

Train a universal sentence encoder

Parameters
  • model_fp_or_name: The model name or path to the model
  • output_dir: Output directory to save model, logs etc.
  • train_examples: Training Examples
  • dev_examples: Dev examples
  • train_batch_size: Training batch size
  • num_epochs: Number of epochs
  • warmup_steps: Warmup steps for the scheduler
  • evaluate_every_n_step: Evaluate the model every n steps
  • special_tokens: Special tokens to add
  • pooling_mode: Pooling mode for a sentence transformer model
  • loss_func: Loss function(s) to use
  • evaluator: Evaluator to use
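
A minimal usage sketch for train_biencoder. It assumes debeir.datasets.types.InputExample mirrors the sentence_transformers InputExample(texts=..., label=...) constructor and that the default loss and evaluator are acceptable; the checkpoint name and example data are purely illustrative, not the library's confirmed API.

from debeir.datasets.types import InputExample
from debeir.training.train_sentence_encoder import train_biencoder

# Query/document pairs with a relevance label; the constructor shape is assumed
# to match sentence_transformers.InputExample.
train_examples = [
    InputExample(texts=["what causes migraines", "Common migraine triggers include stress and poor sleep."], label=1.0),
    InputExample(texts=["what causes migraines", "The stadium seats roughly 40,000 spectators."], label=0.0),
]
dev_examples = [
    InputExample(texts=["treatment for type 2 diabetes", "Metformin is a common first-line therapy."], label=1.0),
]

train_biencoder(
    model_fp_or_name="sentence-transformers/all-MiniLM-L6-v2",  # any SBERT/HF checkpoint
    output_dir="./output/biencoder",
    train_examples=train_examples,
    dev_examples=dev_examples,
    train_batch_size=16,
    num_epochs=1,
    evaluate_every_n_step=500,
)
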
def train_huggingface_transformer(
    model_fp_or_name_or_cls: Union[str, transformers.modeling_utils.PreTrainedModel],
    tokenizer: transformers.tokenization_utils.PreTrainedTokenizer,
    output_dir: str,
    compute_metric_fn,
    metric: datasets.metric.Metric,
    dataset: datasets.dataset_dict.DatasetDict = None,
    train_dataset: List[Union[debeir.datasets.types.RelevanceExample, debeir.datasets.types.InputExample, datasets.arrow_dataset.Dataset]] = None,
    eval_dataset: List[Union[debeir.datasets.types.RelevanceExample, debeir.datasets.types.InputExample, datasets.arrow_dataset.Dataset]] = None,
    train_batch_size=32,
    num_epochs=3,
    learning_rate=5e-05,
    lr_scheduler_type: transformers.trainer_utils.SchedulerType = SchedulerType.CONSTANT_WITH_WARMUP,
    optimizer: str = 'adamw_hf',
    warmup_ratio=0.1,
    evaluate_every_n_step: int = 1000,
    pooling_mode=None,
    loss_func=None,
    model_args=None,
    model_kwargs=None,
    padding_strategy='max_length',
    truncate=True,
    special_tokens=None,
    seed=42,
    *args,
    **kwargs
) -> transformers.trainer.Trainer:

Train a transformer model using the Huggingface API

Parameters
  • model_fp_or_name_or_cls: Model name/path, a model class to instantiate, or an already-instantiated model
  • tokenizer: Tokenizer
  • output_dir: Output directory to write to
  • compute_metric_fn: Metric function to compute metrics
  • metric: Metric used by the compute_metric_fn
  • dataset: Huggingface DatasetDict with "train" and "dev" splits
  • train_dataset: Training dataset to be used by the Trainer class
  • eval_dataset: Evaluation dataset to be used by the Trainer class
  • train_batch_size: Batch size to use for training
  • num_epochs: Number of training epochs (default: 3)
  • learning_rate: Learning rate (default: 5e-5)
  • lr_scheduler_type: Learning rate scheduler type, see SchedulerType
  • optimizer: Optimizer
  • warmup_ratio: Warmup, as a ratio of total training steps (default: 0.1)
  • evaluate_every_n_step: Evaluate the model every n steps
  • pooling_mode: Pooling mode for your model
  • loss_func: Loss function passed to the model constructor (when a class is given)
  • model_args: Positional arguments passed to the model constructor
  • model_kwargs: Keyword arguments passed to the model constructor
  • padding_strategy: Tokenization padding strategy (default: "max_length")
  • truncate: Whether to truncate inputs during tokenization
  • special_tokens: Special tokens to add to the tokenizer
  • seed: Dataset shuffle seed
  • args: Unused additional positional arguments
  • kwargs: Unused additional keyword arguments
Returns
  • The Huggingface Trainer used for training
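
An end-to-end usage sketch. Because compute_metric_fn is partially applied with metric, the Trainer calls it as compute_metric_fn(metric, eval_pred). The DatasetDict must provide "train" and "dev" splits; the "text"/"label" column names, the accuracy metric, and the bert-base-uncased checkpoint are illustrative assumptions. An already-instantiated classification model is passed so the Trainer can compute a loss (a plain model name would be loaded with AutoModel, which has no task head).

import numpy as np
import datasets
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from debeir.training.train_sentence_encoder import train_huggingface_transformer

# Called by the Trainer as compute_metric_fn(metric, eval_pred) after partial().
def compute_metric_fn(metric, eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

metric = datasets.load_metric("accuracy")  # any datasets.Metric works here

# Toy DatasetDict with the expected "train"/"dev" splits; the "text"/"label"
# column names are assumptions about what tokenize_function reads.
dataset = DatasetDict({
    "train": Dataset.from_dict({"text": ["a relevant passage", "an off-topic passage"], "label": [1, 0]}),
    "dev": Dataset.from_dict({"text": ["another relevant passage"], "label": [1]}),
})

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

trainer = train_huggingface_transformer(
    model_fp_or_name_or_cls=model,       # already instantiated, so it is used as-is
    tokenizer=tokenizer,
    output_dir="./output/hf-transformer",
    compute_metric_fn=compute_metric_fn,
    metric=metric,
    dataset=dataset,
    train_batch_size=16,
    num_epochs=1,
    evaluate_every_n_step=100,
)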