Source code for vnlp.named_entity_recognizer.charner

from typing import List, Tuple

import pickle

import numpy as np
import tensorflow as tf

from ..tokenizer import WordPunctTokenize
from ..utils import check_and_download, load_keras_tokenizer
from .utils import ner_to_displacy_format
from ._charner_utils import create_charner_model

# Resolving parent dependencies
from inspect import getsourcefile
import os
import sys
current_path = os.path.abspath(getsourcefile(lambda:0))
current_dir = os.path.dirname(current_path)
parent_dir = current_dir[:current_dir.rfind(os.path.sep)]
sys.path.insert(0, parent_dir)

RESOURCES_PATH = os.path.join(os.path.dirname(__file__), "resources/")

PROD_WEIGHTS_LOC = RESOURCES_PATH + "NER_CharNER_prod.weights"
EVAL_WEIGHTS_LOC = RESOURCES_PATH + "NER_CharNER_eval.weights"

PROD_WEIGHTS_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/NER_CharNER_prod.weights"
EVAL_WEIGHTS_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/NER_CharNER_eval.weights"

TOKENIZER_CHAR_LOC = RESOURCES_PATH + "CharNER_char_tokenizer.json"
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "NER_label_tokenizer.json"

CHAR_VOCAB_SIZE = 150
SEQ_LEN_MAX = 256
OOV_TOKEN = '<OOV>'
PADDING_STRAT = 'post'

EMBED_SIZE = 32
RNN_DIM = 128
NUM_RNN_STACKS = 5
MLP_DIM = 32
NUM_CLASSES = 5 # Equal to len(tokenizer_label.index_word) + 1; the extra 1 is reserved for 0, which corresponds to padded values.
DROPOUT = 0.3
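
# Illustrative sanity check (not part of the original module): with the four entity
# labels listed in the class docstring below ('O', 'PER', 'LOC', 'ORG'), the label
# tokenizer assigns one index per label and index 0 is kept for padding, which is
# where NUM_CLASSES = 4 + 1 comes from.
assert NUM_CLASSES == len(['O', 'PER', 'LOC', 'ORG']) + 1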

class CharNER:
    """
    CharNER Named Entity Recognizer.

    - This is an implementation of `CharNER: Character-Level Named Entity Recognition <https://aclanthology.org/C16-1087/>`_.
    - There are slight modifications to the original paper:
        - This version is trained for the Turkish language only.
        - This version uses a simple mode operation over the character predictions of each token, instead of a Viterbi decoder.
    - It achieves 0.9589 Accuracy and 0.9200 F1_macro_score.
    - Input data is processed by NLTK.tokenize.WordPunctTokenizer so that each punctuation mark becomes a new token.
    - Entity labels are: ['O', 'PER', 'LOC', 'ORG'].
    - For more details about the training procedure, dataset and evaluation metrics, see the `ReadMe <https://github.com/vngrs-ai/VNLP/blob/main/vnlp/named_entity_recognizer/ReadMe.md>`_.
    """

    def __init__(self, evaluate):
        self.model = create_charner_model(CHAR_VOCAB_SIZE, EMBED_SIZE, SEQ_LEN_MAX,
                                          NUM_RNN_STACKS, RNN_DIM, MLP_DIM,
                                          NUM_CLASSES, DROPOUT)

        # Check and download model weights
        if evaluate:
            MODEL_WEIGHTS_LOC = EVAL_WEIGHTS_LOC
            MODEL_WEIGHTS_LINK = EVAL_WEIGHTS_LINK
        else:
            MODEL_WEIGHTS_LOC = PROD_WEIGHTS_LOC
            MODEL_WEIGHTS_LINK = PROD_WEIGHTS_LINK

        check_and_download(MODEL_WEIGHTS_LOC, MODEL_WEIGHTS_LINK)

        # Load model weights
        with open(MODEL_WEIGHTS_LOC, 'rb') as fp:
            model_weights = pickle.load(fp)

        # Set model weights
        self.model.set_weights(model_weights)

        tokenizer_char = load_keras_tokenizer(TOKENIZER_CHAR_LOC)
        tokenizer_label = load_keras_tokenizer(TOKENIZER_LABEL_LOC)

        self.tokenizer_char = tokenizer_char
        self.tokenizer_label = tokenizer_label

    def _predict_char_level(self, word_punct_tokenized: List[str]) -> List[int]:
        """
        Returns char-level predictions as integers, which will be passed to the decoder.

        Args:
            word_punct_tokenized: List of tokens, tokenized by WordPunctTokenizer.

        Returns:
            List of integers, indicating entity classes for each character.
        """
        white_space_joined_word_punct_tokens = " ".join(word_punct_tokenized)
        white_space_joined_word_punct_tokens = [char for char in white_space_joined_word_punct_tokens]
        sequences = self.tokenizer_char.texts_to_sequences([white_space_joined_word_punct_tokens])
        padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=SEQ_LEN_MAX, padding=PADDING_STRAT)
        raw_pred = self.model([padded]).numpy()
        arg_max_pred = np.argmax(raw_pred, axis=2).reshape(-1)

        return arg_max_pred

    def _charner_decoder(self, word_punct_tokenized: List[str], arg_max_pred: List[int]) -> List[str]:
        """
        Args:
            word_punct_tokenized: List of tokens, tokenized by WordPunctTokenizer.
            arg_max_pred: List of integers, indicating entity classes for each character.

        Returns:
            decoded_entities: List of entities, one entity per token.
        """
        lens = [0] + [len(token) + 1 for token in word_punct_tokenized]
        cumsum_of_lens = np.cumsum(lens)

        decoded_entities = []
        for idx in range(len(cumsum_of_lens) - 1):
            lower_bound = cumsum_of_lens[idx]
            upper_bound = cumsum_of_lens[idx + 1] - 1  # minus one prevents including the whitespace after the token
            island = arg_max_pred[lower_bound:upper_bound]

            # Extracting the mode value
            vals, counts = np.unique(island, return_counts=True)
            mode_value = vals[np.argmax(counts)]
            detokenized_pred = self.tokenizer_label.sequences_to_texts([[mode_value]])[0]
            decoded_entities.append(detokenized_pred)

        return decoded_entities
    def predict(self, text: str, displacy_format: bool = False) -> List[Tuple[str, str]]:
        """
        Args:
            text: Input text.
            displacy_format: When set to True, returns the result in spacy.displacy format to allow visualization.

        Returns:
            NER result as pairs of (token, entity).
        """
        word_punct_tokenized = WordPunctTokenize(text)

        # If the number of characters (including whitespaces) exceeds the sequence length, split the input recursively
        len_text = len(list(" ".join(word_punct_tokenized)))
        if len_text > SEQ_LEN_MAX:
            num_tokens = len(word_punct_tokenized)

            first_half_result = self.predict(" ".join(word_punct_tokenized[:num_tokens // 2]))
            first_half_tokens = [pair[0] for pair in first_half_result]
            first_half_entities = [pair[1] for pair in first_half_result]

            second_half_result = self.predict(" ".join(word_punct_tokenized[(num_tokens // 2):]))
            second_half_tokens = [pair[0] for pair in second_half_result]
            second_half_entities = [pair[1] for pair in second_half_result]

            word_punct_tokenized = first_half_tokens + second_half_tokens
            decoded_entities = first_half_entities + second_half_entities
        else:
            charlevel_pred = self._predict_char_level(word_punct_tokenized)
            decoded_entities = self._charner_decoder(word_punct_tokenized, charlevel_pred)

        ner_result = [(t, e) for t, e in zip(word_punct_tokenized, decoded_entities)]

        if not displacy_format:
            return ner_result
        else:
            return ner_to_displacy_format(text, ner_result)
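
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how this class is typically driven, assuming the package is
# installed as `vnlp` and the pretrained weights can be downloaded on first use.
# The sample sentence below is made up for illustration; any Turkish text works.
if __name__ == "__main__":
    ner = CharNER(evaluate=False)  # production weights; pass evaluate=True for the evaluation weights

    # Returns a list of (token, entity) pairs, with entities drawn from 'O', 'PER', 'LOC', 'ORG'.
    print(ner.predict("Mustafa Kemal Atatürk 1919 yılında Samsun'a çıktı."))

    # With displacy_format=True, the same prediction is converted by ner_to_displacy_format
    # into the structure expected by spacy.displacy for visualization.
    print(ner.predict("Mustafa Kemal Atatürk 1919 yılında Samsun'a çıktı.", displacy_format=True))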