from typing import List, Tuple
import pickle
import numpy as np
import tensorflow as tf
from ..tokenizer import WordPunctTokenize
from ..utils import check_and_download, load_keras_tokenizer
from .utils import ner_to_displacy_format
from ._charner_utils import create_charner_model
# Resolving parent dependencies
from inspect import getsourcefile
import os
import sys
current_path = os.path.abspath(getsourcefile(lambda: 0))
current_dir = os.path.dirname(current_path)
parent_dir = current_dir[: current_dir.rfind(os.path.sep)]
sys.path.insert(0, parent_dir)
RESOURCES_PATH = os.path.join(os.path.dirname(__file__), "resources/")
PROD_WEIGHTS_LOC = RESOURCES_PATH + "NER_CharNER_prod.weights"
EVAL_WEIGHTS_LOC = RESOURCES_PATH + "NER_CharNER_eval.weights"
PROD_WEIGHTS_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/NER_CharNER_prod.weights"
EVAL_WEIGHTS_LINK = "https://vnlp-model-weights.s3.eu-west-1.amazonaws.com/NER_CharNER_eval.weights"
TOKENIZER_CHAR_LOC = RESOURCES_PATH + "CharNER_char_tokenizer.json"
TOKENIZER_LABEL_LOC = RESOURCES_PATH + "NER_label_tokenizer.json"
CHAR_VOCAB_SIZE = 150
SEQ_LEN_MAX = 256
OOV_TOKEN = "<OOV>"
PADDING_STRAT = "post"
EMBED_SIZE = 32
RNN_DIM = 128
NUM_RNN_STACKS = 5
MLP_DIM = 32
NUM_CLASSES = 5  # Equal to len(tokenizer_label.index_word) + 1; the extra 1 is reserved for 0, which corresponds to padded values.
DROPOUT = 0.3
class CharNER:
"""
CharNER Named Entity Recognizer.
- This is an implementation of `CharNER: Character-Level Named Entity Recognition <https://aclanthology.org/C16-1087/>`_.
- There are slight modifications to the original paper:
- This version is trained for the Turkish language only.
- This version uses a simple mode operation over the character-level predictions of each token, instead of a Viterbi decoder.
- It achieves 0.9589 accuracy and 0.9200 macro F1 score.
- Input text is tokenized by nltk.tokenize.WordPunctTokenizer so that each punctuation mark becomes a separate token.
- Entity labels are: ['O', 'PER', 'LOC', 'ORG']
- For more details about the training procedure, dataset and evaluation metrics, see `ReadMe <https://github.com/vngrs-ai/VNLP/blob/main/vnlp/named_entity_recognizer/ReadMe.md>`_.
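- Example usage (illustrative; the sample sentence and the labels shown are an assumed typical output, not a recorded model result)::

    ner = CharNER(evaluate=False)  # False loads the production weights
    ner.predict("Mustafa Kemal Atatürk 1881 yılında Selanik'te doğdu.")
    # [('Mustafa', 'PER'), ('Kemal', 'PER'), ('Atatürk', 'PER'), ('1881', 'O'), ('yılında', 'O'),
    #  ('Selanik', 'LOC'), ("'", 'O'), ('te', 'O'), ('doğdu', 'O'), ('.', 'O')]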
"""
def __init__(self, evaluate: bool):
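# Build the CharNER model graph with the hyperparameters defined above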
self.model = create_charner_model(
CHAR_VOCAB_SIZE,
EMBED_SIZE,
SEQ_LEN_MAX,
NUM_RNN_STACKS,
RNN_DIM,
MLP_DIM,
NUM_CLASSES,
DROPOUT,
)
# Check and download model weights
if evaluate:
MODEL_WEIGHTS_LOC = EVAL_WEIGHTS_LOC
MODEL_WEIGHTS_LINK = EVAL_WEIGHTS_LINK
else:
MODEL_WEIGHTS_LOC = PROD_WEIGHTS_LOC
MODEL_WEIGHTS_LINK = PROD_WEIGHTS_LINK
check_and_download(MODEL_WEIGHTS_LOC, MODEL_WEIGHTS_LINK)
# Load Model weights
with open(MODEL_WEIGHTS_LOC, "rb") as fp:
model_weights = pickle.load(fp)
# Set model weights
self.model.set_weights(model_weights)
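# Load the fitted character and label tokenizers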
tokenizer_char = load_keras_tokenizer(TOKENIZER_CHAR_LOC)
tokenizer_label = load_keras_tokenizer(TOKENIZER_LABEL_LOC)
self.tokenizer_char = tokenizer_char
self.tokenizer_label = tokenizer_label
def _predict_char_level(
self, word_punct_tokenized: List[str]
) -> List[int]:
"""
Returns character-level predictions as integers, which will be passed to the decoder.
Args:
word_punct_tokenized:
List of tokens, tokenized by WordPunctTokenizer.
Returns:
List of integers, indicating entity classes for each character.
"""
white_space_joined_word_punct_tokens = " ".join(word_punct_tokenized)
white_space_joined_word_punct_tokens = list(white_space_joined_word_punct_tokens)
sequences = self.tokenizer_char.texts_to_sequences(
[white_space_joined_word_punct_tokens]
)
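# Pad (or truncate) the character id sequence to SEQ_LEN_MAX positions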
padded = tf.keras.preprocessing.sequence.pad_sequences(
sequences, maxlen=SEQ_LEN_MAX, padding=PADDING_STRAT
)
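# Run the model and take the most likely class for every character position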
raw_pred = self.model([padded]).numpy()
arg_max_pred = np.argmax(raw_pred, axis=2).reshape(-1)
return arg_max_pred
def _charner_decoder(
self, word_punct_tokenized: List[str], arg_max_pred: List[int]
) -> List[str]:
"""
Args:
word_punct_tokenized:
List of tokens, tokenized by WordPunctTokenizer.
arg_max_pred:
List of integers, indicating entity classes for each character.
Returns:
decoded_entities: List of entities, one entity per token.
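
Example (illustrative; the integer ids and their label mapping are hypothetical):
    For the token 'Ankara' with character predictions [2, 2, 2, 1, 2, 2],
    the mode is 2, so the whole token is assigned the single entity that
    id 2 maps to in the label tokenizer (e.g. 'LOC').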
"""
lens = [0] + [len(token) + 1 for token in word_punct_tokenized]
cumsum_of_lens = np.cumsum(lens)
decoded_entities = []
for idx in range(len(cumsum_of_lens) - 1):
lower_bound = cumsum_of_lens[idx]
upper_bound = (
cumsum_of_lens[idx + 1] - 1
) # minus one prevents including the whitespace after the token
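# Slice out the character-level predictions belonging to the current token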
island = arg_max_pred[lower_bound:upper_bound]
# Extracting mode value
vals, counts = np.unique(island, return_counts=True)
mode_value = vals[np.argmax(counts)]
detokenized_pred = self.tokenizer_label.sequences_to_texts(
[[mode_value]]
)[0]
decoded_entities.append(detokenized_pred)
return decoded_entities
def predict(
self, text: str, displacy_format: bool = False
) -> List[Tuple[str, str]]:
"""
Args:
text:
Input text.
displacy_format:
When set to True, returns the result in spacy.displacy format to allow visualization.
Returns:
NER result as pairs of (token, entity).
"""
word_punct_tokenized = WordPunctTokenize(text)
# If the number of characters (including whitespaces) exceeds SEQ_LEN_MAX, split the text in half and predict each half recursively
len_text = len(" ".join(word_punct_tokenized))
if len_text > SEQ_LEN_MAX:
num_tokens = len(word_punct_tokenized)
first_half_result = self.predict(
" ".join(word_punct_tokenized[: num_tokens // 2])
)
first_half_tokens = [pair[0] for pair in first_half_result]
first_half_entities = [pair[1] for pair in first_half_result]
second_half_result = self.predict(
" ".join(word_punct_tokenized[(num_tokens // 2) :])
)
second_half_tokens = [pair[0] for pair in second_half_result]
second_half_entities = [pair[1] for pair in second_half_result]
word_punct_tokenized = first_half_tokens + second_half_tokens
decoded_entities = first_half_entities + second_half_entities
else:
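# Predict a class for every character, then collapse them into one entity per token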
charlevel_pred = self._predict_char_level(word_punct_tokenized)
decoded_entities = self._charner_decoder(
word_punct_tokenized, charlevel_pred
)
ner_result = [
(t, e) for t, e in zip(word_punct_tokenized, decoded_entities)
]
if not displacy_format:
return ner_result
else:
return ner_to_displacy_format(text, ner_result)