Source code for bootleg.tasks.ned_task

"""NED task definitions."""
import torch
import torch.nn.functional as F
from emmental.scorer import Scorer
from emmental.task import Action, EmmentalTask
from torch import nn
from transformers import AutoModel

from bootleg.layers.bert_encoder import Encoder
from bootleg.layers.static_entity_embeddings import EntityEmbedding
from bootleg.scorer import BootlegSlicedScorer
from bootleg.task_config import NED_TASK
from bootleg.utils import eval_utils


class DisambigLoss:
    """Disambiguation loss."""

    def __init__(self, normalize, temperature, entity_encoder_key):
        """Disambiguation loss initializer."""
        self.normalize = normalize
        self.temperature = temperature
        self.entity_encoder_key = entity_encoder_key

    def disambig_output(self, intermediate_output_dict):
        """Return the probs for a task in Emmental.

        Args:
            intermediate_output_dict: output dict from Emmental task flow

        Returns:
            NED probabilities for candidates (B x M x K)
        """
        mask = intermediate_output_dict["_input_"]["entity_cand_eval_mask"]
        out = intermediate_output_dict["context_encoder"][0].unsqueeze(1)
        ent_out = intermediate_output_dict[self.entity_encoder_key][0]
        if self.normalize:
            out = F.normalize(out, p=2, dim=-1)
            ent_out = F.normalize(ent_out, p=2, dim=-1)
        pred = torch.bmm(out, ent_out.transpose(-2, -1))
        mask = mask.reshape(*pred.shape)
        ret = eval_utils.masked_class_logsoftmax(
            pred=pred, mask=~mask, temp=self.temperature
        ).squeeze(1)  # Squeeze single alias
        return ret.exp()

    def disambig_loss(self, intermediate_output_dict, Y):
        """Return the entity disambiguation loss on prediction heads.

        Args:
            intermediate_output_dict: output dict from the Emmental task flow
            Y: gold labels

        Returns:
            loss
        """
        # Grab the first value of training (when doing distributed training,
        # we will have one per process)
        if len(intermediate_output_dict["context_encoder"][1].shape) <= 0:
            training = intermediate_output_dict["context_encoder"][1].item()
        else:
            training = intermediate_output_dict["context_encoder"][1][0].item()
        assert type(training) is bool
        mask = intermediate_output_dict["_input_"]["entity_cand_eval_mask"]
        out = intermediate_output_dict["context_encoder"][0].unsqueeze(1)
        ent_out = intermediate_output_dict[self.entity_encoder_key][0]
        if self.normalize:
            out = F.normalize(out, p=2, dim=-1)
            ent_out = F.normalize(ent_out, p=2, dim=-1)
        pred = torch.bmm(out, ent_out.transpose(-2, -1))
        mask = mask.reshape(*pred.shape)
        labels = Y
        # During eval, even if our model does not predict a NIC candidate, we allow
        # for a NIC gold QID. This qid gets assigned the label of -2 and is always
        # incorrect. As NLLLoss assumes classes of 0 to #classes-1 except for the
        # pad idx, we manually mask the -2 labels for the loss computation only.
        # As this is just for eval, it won't matter. (The sketch after this class
        # illustrates this remapping.)
        masked_labels = labels
        if not training:
            label_mask = labels == -2
            masked_labels = torch.where(
                ~label_mask, labels, torch.ones_like(labels) * -1
            )
        log_probs = eval_utils.masked_class_logsoftmax(
            pred=pred, mask=~mask, temp=self.temperature
        ).squeeze(1)  # Squeeze single alias
        loss = nn.NLLLoss(ignore_index=-1)(log_probs, masked_labels.long())
        return loss

    def batch_cands_disambig_output(self, intermediate_output_dict):
        """Return the probs for a task in Emmental.

        Args:
            intermediate_output_dict: output dict from Emmental task flow

        Returns:
            NED probabilities over the in-batch candidates (B x num batch candidates)
        """
        out = intermediate_output_dict["context_encoder"][0]
        ent_out = intermediate_output_dict[self.entity_encoder_key][0]
        if self.normalize:
            out = F.normalize(out, p=2, dim=-1)
            ent_out = F.normalize(ent_out, p=2, dim=-1)
        score = torch.mm(out, ent_out.t()) / self.temperature
        return F.softmax(score, dim=-1)

    def batch_cands_disambig_loss(self, intermediate_output_dict, Y):
        """Return the entity disambiguation loss on prediction heads.

        Args:
            intermediate_output_dict: output dict from the Emmental task flow
            Y: gold labels

        Returns:
            loss
        """
        # Grab the first value of training (when doing distributed training,
        # we will have one per process)
        training = intermediate_output_dict["context_encoder"][1].item()
        assert type(training) is bool
        out = intermediate_output_dict["context_encoder"][0]
        ent_out = intermediate_output_dict[self.entity_encoder_key][0]
        if self.normalize:
            out = F.normalize(out, p=2, dim=-1)
            ent_out = F.normalize(ent_out, p=2, dim=-1)
        score = torch.mm(out, ent_out.t()) / self.temperature
        labels = Y
        masked_labels = labels.reshape(out.shape[0])
        if not training:
            label_mask = labels == -2
            masked_labels = torch.where(
                ~label_mask, labels, torch.ones_like(labels) * -1
            )
            masked_labels = masked_labels.reshape(out.shape[0])
        loss = nn.CrossEntropyLoss(ignore_index=-1)(score, masked_labels.long())
        return loss
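

# The helper below is an illustrative sketch, not part of Bootleg's API. It
# mirrors the in-batch scoring used by ``batch_cands_disambig_loss`` above:
# every entity encoding in the batch serves as a candidate for every mention,
# so the score matrix is B x num-batch-candidates and the gold label indexes
# into the batch. A gold entity outside the candidate set is labeled -2 at
# eval time; since ``nn.CrossEntropyLoss`` only skips ``ignore_index`` (-1
# here), the -2 labels are first remapped to -1 so those mentions contribute
# nothing to the loss. The diagonal gold placement is purely for illustration;
# the real labels come from the dataset.
def _sketch_batch_cands_loss(training=False, temperature=0.1):
    """Toy example of the in-batch loss with the -2 -> -1 label remapping."""
    B, H = 4, 8  # 4 mentions / entities in the batch, hidden size 8
    out = F.normalize(torch.randn(B, H), p=2, dim=-1)  # mention encodings
    ent_out = F.normalize(torch.randn(B, H), p=2, dim=-1)  # entity encodings
    score = torch.mm(out, ent_out.t()) / temperature  # B x B candidate scores

    # For this toy example, mention i's gold entity sits at batch position i;
    # at eval a gold outside the candidate set is marked -2 (mention 2 below).
    labels = torch.arange(B) if training else torch.tensor([0, 1, -2, 3])

    masked_labels = labels
    if not training:
        masked_labels = torch.where(
            labels == -2, torch.full_like(labels, -1), labels
        )
    return nn.CrossEntropyLoss(ignore_index=-1)(score, masked_labels.long())
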
def create_task(
    args, use_batch_cands, len_context_tok, slice_datasets=None, entity_emb_file=None
):
    """Return an EmmentalTask for named entity disambiguation (NED).

    Args:
        args: model and data config args
        use_batch_cands: use batch candidates for training
        len_context_tok: length of the context tokenizer
        slice_datasets: slice datasets used in scorer (default None)
        entity_emb_file: file for pretrained entity embeddings - used for EVAL only

    Returns:
        EmmentalTask for NED
    """
    if entity_emb_file is None:
        entity_model = AutoModel.from_pretrained(
            args.data_config.word_embedding.bert_model
        )
        entity_model.encoder.layer = entity_model.encoder.layer[
            : args.data_config.word_embedding.entity_layers
        ]
        entity_model.resize_token_embeddings(len_context_tok)
        entity_model = Encoder(entity_model, args.model_config.hidden_size)
        entity_inputs = [
            ("_input_", "entity_cand_input_ids"),
            ("_input_", "entity_cand_attention_mask"),
            ("_input_", "entity_cand_token_type_ids"),
        ]
        entity_encoder_str = "entity_encoder"
    else:
        entity_model = EntityEmbedding(entity_emb_file)
        entity_inputs = [("_input_", "entity_cand_eid")]
        entity_encoder_str = "entity_encoder_static"

    # Create sentence encoder
    context_model = AutoModel.from_pretrained(
        args.data_config.word_embedding.bert_model
    )
    context_model.encoder.layer = context_model.encoder.layer[
        : args.data_config.word_embedding.context_layers
    ]
    context_model.resize_token_embeddings(len_context_tok)
    context_model = Encoder(context_model, args.model_config.hidden_size)

    sliced_scorer = BootlegSlicedScorer(
        args.data_config.train_in_candidates, slice_datasets
    )

    disambig_loss = DisambigLoss(
        args.model_config.normalize, args.model_config.temperature, entity_encoder_str
    )
    output_func = disambig_loss.disambig_output
    loss_func = disambig_loss.disambig_loss

    if use_batch_cands:
        loss_func = disambig_loss.batch_cands_disambig_loss
        output_func = disambig_loss.batch_cands_disambig_output

    # Create module pool and combine with embedding module pool
    module_pool = nn.ModuleDict(
        {
            "context_encoder": context_model,
            entity_encoder_str: entity_model,
        }
    )

    # Create task flow
    task_flow = [
        Action(
            name=entity_encoder_str, module=entity_encoder_str, inputs=entity_inputs
        ),
        Action(
            name="context_encoder",
            module="context_encoder",
            inputs=[
                ("_input_", "input_ids"),
                ("_input_", "token_type_ids"),
                ("_input_", "attention_mask"),
            ],
        ),
    ]

    return EmmentalTask(
        name=NED_TASK,
        module_pool=module_pool,
        task_flow=task_flow,
        loss_func=loss_func,
        output_func=output_func,
        require_prob_for_eval=False,
        require_pred_for_eval=True,
        # action_outputs are used to stitch together sentence fragments
        action_outputs=[
            ("_input_", "sent_idx"),
            ("_input_", "subsent_idx"),
            ("_input_", "alias_orig_list_pos"),
            ("_input_", "for_dump_gold_cand_K_idx_train"),
            (entity_encoder_str, 0),  # entity embeddings
        ],
        scorer=Scorer(
            customize_metric_funcs={f"{NED_TASK}_scorer": sliced_scorer.bootleg_score}
        ),
    )
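

# Minimal usage sketch (illustrative only). The config object Bootleg normally
# builds carries many more fields; ``create_task`` above only reads the ones
# assembled below. The BERT backbone, layer counts, and hidden size here are
# assumed values rather than Bootleg defaults.
if __name__ == "__main__":
    from types import SimpleNamespace

    import emmental
    from emmental.model import EmmentalModel
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    args = SimpleNamespace(
        data_config=SimpleNamespace(
            train_in_candidates=True,
            word_embedding=SimpleNamespace(
                bert_model="bert-base-uncased",
                context_layers=6,  # assumed: keep only the first 6 encoder layers
                entity_layers=6,
            ),
        ),
        model_config=SimpleNamespace(
            hidden_size=768,  # assumed output dim for the Encoder projection
            normalize=True,
            temperature=0.1,
        ),
    )

    emmental.init(log_dir="_emmental_logs")  # Emmental must be initialized first
    ned_task = create_task(
        args, use_batch_cands=True, len_context_tok=len(tokenizer)
    )
    model = EmmentalModel(name="Bootleg", tasks=[ned_task])
    print(model)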