# Source code for vnlp.normalizer.normalizer

from typing import List
from pathlib import Path

from spylls.hunspell import Dictionary

from ._deasciifier import Deasciifier
from ..stemmer_morph_analyzer import StemmerAnalyzer

# String path of the `resources` directory located two levels above this file
# (i.e. next to this module's parent package directory).
RESOURCES_PATH = str(Path(__file__).parent.parent / 'resources')


class Normalizer:
    """
    Text normalization utilities for Turkish.

    - It contains the following functions to process and normalize text:

        - Spelling/Typo correction
        - Deasciification
        - Convert numbers to word form
        - Lower case
        - Punctuation Remover
        - Remove accent marks

    - For more details about the algorithms and datasets, see `Readme <https://github.com/vngrs-ai/VNLP/blob/main/vnlp/normalizer/ReadMe.md>`_.
    """

    # Upper-case letters whose Turkish lower-case form must be substituted before
    # calling ``str.lower`` (e.g. the built-in maps "I" to "i" instead of "ı").
    _TURKISH_LOWERCASE_TABLE = str.maketrans("İIĞÜÖŞÇ", "iığüöşç")

    # Circumflexed vowels mapped to their plain counterparts, preserving case.
    _ACCENT_MARKS_TABLE = str.maketrans({
        'â': 'a', 'ô': 'o', 'î': 'i', 'ê': 'e', 'û': 'u',
        'Â': 'A', 'Ô': 'O', 'Î': 'İ', 'Ê': 'E', 'Û': 'U',
    })

    # Words for a single digit (index == digit value); index 0 is unused.
    _ONES = ("", "bir", "iki", "üç", "dört", "beş", "altı", "yedi", "sekiz", "dokuz")
    # Words for the tens digit (index == digit value); index 0 is unused.
    _TENS = ("", "on", "yirmi", "otuz", "kırk", "elli", "altmış", "yetmiş", "seksen", "doksan")
    # Scale word of the k-th three-digit group, i.e. of 10 ** (3 * k).
    # The last entry, "vigintilyon", stands for 10 ** 63.
    _SCALES = ("", "bin", "milyon", "milyar", "trilyon", "katrilyon", "kentilyon",
               "seksilyon", "septilyon", "oktilyon", "nonilyon", "desilyon", "undesilyon",
               "dodesilyon", "tredesilyon", "katordesilyon", "kendesilyon", "seksdesilyon",
               "septendesilyon", "oktodesilyon", "novemdesilyon", "vigintilyon")

    def __init__(self):
        """
        Loads the known-words lexicon, the stemmer/morphological analyzer and
        the Hunspell dictionary from the package resources.
        """
        # Word Lexicon merged from TDK-Zemberek, Zargan, Bilkent Creative Writing, Turkish Broadcast News
        with open(RESOURCES_PATH + '/turkish_known_words_lexicon.txt', 'r', encoding='utf-8') as f:
            # Stored as dict keys so that membership tests are constant time.
            self._words_lexicon = dict.fromkeys(line.strip() for line in f)

        self._stemmer_analyzer = StemmerAnalyzer()

        self._dictionary = Dictionary.from_files(
            RESOURCES_PATH + '/tdd-hunspell-tr-1.1.0/tr_TR')

    @staticmethod
    def lower_case(text: str) -> str:
        """
        Converts a string of text to lowercase for Turkish language.

        The Turkish-specific upper-case letters are substituted explicitly
        before ``str.lower`` is applied, because the built-in conversion is
        not aware of Turkish casing rules (e.g. "I" must become "ı").

        Args:
            text:
                Input text.

        Returns:
            Text in lowercase form.

        Example::

            from vnlp import Normalizer
            Normalizer.lower_case("Test karakterleri: İIĞÜÖŞÇ")

            'test karakterleri: iığüöşç'
        """
        return text.translate(Normalizer._TURKISH_LOWERCASE_TABLE).lower()

    @staticmethod
    def remove_punctuations(text: str) -> str:
        """
        Removes punctuations from the given string.

        Only alphanumeric characters and the plain space character are kept;
        every other character (including tabs and newlines) is dropped.

        Args:
            text: Input text.

        Returns:
            Text stripped from punctuations.

        Example::

            from vnlp import Normalizer
            Normalizer.remove_punctuations("merhaba,.!")

            'merhaba'
        """
        return ''.join(char for char in text if char.isalnum() or char == " ")

    @staticmethod
    def remove_accent_marks(text: str) -> str:
        """
        Removes accent marks (circumflexes) from the given string.

        The letter case of each replaced character is preserved.

        Args:
            text:
                Input text.

        Returns:
            Text stripped from accent marks.

        Example::

            from vnlp import Normalizer
            Normalizer.remove_accent_marks("merhâbâ")

            'merhaba'
        """
        return text.translate(Normalizer._ACCENT_MARKS_TABLE)

    @staticmethod
    def deasciify(tokens: List[str]) -> List[str]:
        """
        Deasciifies the given text for Turkish.

        This function uses `Emre Sevinç's implementation <https://github.com/emres/turkish-deasciifier>`_.

        Args:
            tokens:
                List of input tokens.

        Returns:
            List of deasciified tokens.

        Example::

            from vnlp import Normalizer
            Normalizer.deasciify("dusunuyorum da boyle sey gormedim duymadim".split())

            ["düşünüyorum", "da", "böyle", "şey", "görmedim", "duymadım"]
        """
        return [Deasciifier(token).convert_to_turkish() for token in tokens]

    def correct_typos(self, tokens: List[str]) -> List[str]:
        """
        Detects and corrects spelling mistakes and typos.

        This implementation uses StemmerAnalyzer and Hunspell to detect typos.
        Detected typos are corrected by Hunspell algorithm using "tdd-hunspell-tr-1.1.0" dict.

        Args:
            tokens:
                List of input tokens.

        Returns:
            List of corrected tokens. A token is returned unchanged when it is
            considered valid or when Hunspell offers no suggestion for it.

        Example::

            from vnlp import Normalizer
            normalizer = Normalizer()
            normalizer.correct_typos("Kasıtlı yazişm hatasıı ekliyoruum".split())

            ["Kasıtlı", "yazım", "hatası", "ekliyorum"]
        """
        corrected_tokens = []
        for token in tokens:
            if self._is_token_valid_turkish(token) or self._dictionary.lookup(token):
                corrected_tokens.append(token)
            else:
                # Only the first suggestion is used, so take it without
                # materializing the remaining ones; fall back to the original
                # token when there is no suggestion at all.
                corrected_tokens.append(next(iter(self._dictionary.suggest(token)), token))

        return corrected_tokens

    def convert_numbers_to_words(self, tokens: List[str], num_dec_digits: int = 6, decimal_seperator: str = ',') -> List[str]:
        """
        Converts numbers to word form.

        Args:
            tokens:
                List of input tokens.
            num_dec_digits:
                Number of precision (decimal points) for floats.
            decimal_seperator:
                Decimal separator character. Can be either "." or ",".
                The other one of the two is treated as the thousands separator.

        Returns:
            List of converted tokens. A numeric token is replaced by one token
            per word; tokens that cannot be parsed as a number are returned
            exactly as they were given.

        Raises:
            ValueError: Given 'decimal seperator' is not a valid decimal separator value. Use either "." or ",".

        Example::

            from vnlp import Normalizer
            normalizer = Normalizer()
            normalizer.convert_numbers_to_words("sabah 3 yumurta yedim ve tartıldığımda 1,15 kilogram aldığımı gördüm".split())

            ['sabah',
            'üç',
            'yumurta',
            'yedim',
            've',
            'tartıldığımda',
            'bir',
            'virgül',
            'on',
            'beş',
            'kilogram',
            'aldığımı',
            'gördüm']
        """
        # Validate once, up front, so the outcome does not depend on whether
        # the input happens to contain a numeric token.
        if decimal_seperator == ',':
            thousands_seperator = '.'
        elif decimal_seperator == '.':
            thousands_seperator = ','
        else:
            raise ValueError(
                '{!r} is not a valid decimal seperator value. Use either "." or ","'.format(decimal_seperator))

        converted_tokens = []
        for token in tokens:
            # Tokens without any numeric character can never be a number.
            if not any(char.isnumeric() for char in token):
                converted_tokens.append(token)
                continue

            # Rewrite into Python's number syntax: the thousands separator
            # becomes an underscore and the decimal separator becomes a dot.
            # The rewritten text is only used for parsing; the original token
            # is what gets returned when parsing fails.
            candidate = token.replace(thousands_seperator, '_').replace(decimal_seperator, '.')
            try:
                words = self._num_to_words(float(candidate), num_dec_digits)
            except (ValueError, OverflowError):
                # Not a (representable) number, e.g. "v1.2" or "1e999".
                converted_tokens.append(token)
            else:
                converted_tokens.extend(words.split())

        return converted_tokens

    def _is_token_valid_turkish(self, token):
        """
        Checks whether given token is valid according to Turkish.

        A token is valid when it is in the known-words lexicon or when the
        first analysis candidate produced by the stemmer analyzer does not end
        with the 'Unknown' tag.
        """
        # The lexicon lookup is a cheap dict membership test, so it is tried
        # before the morphological analysis.
        if token in self._words_lexicon:
            return True
        candidates = self._stemmer_analyzer.candidate_generator.get_analysis_candidates(token)
        return candidates[0][-1] != 'Unknown'

    def _int_to_words(self, main_num, put_commas=False):
        """
        Returns the Turkish lexical representation of an integer.

        The number is split into three-digit groups which are read from the
        most significant one down; groups that are entirely zero are skipped.

        Args:
            main_num:
                Integer to convert. Negative values are prefixed with "eksi".
            put_commas:
                If True, a comma is appended right after every scale word
                ("bin", "milyon", ...).

        Returns:
            The number written in words, words separated by single spaces.

        Raises:
            ValueError: The number has more three-digit groups than there are
                scale words available (it is 10 ** 66 or larger).
        """
        main_num = int(main_num)
        if main_num == 0:
            return "sıfır"
        if main_num < 0:
            return "eksi " + self._int_to_words(-main_num, put_commas)

        # Least significant group first: groups[k] is the multiplier of 10 ** (3 * k).
        groups = []
        remaining = main_num
        while remaining:
            remaining, group = divmod(remaining, 1000)
            groups.append(group)

        if len(groups) > len(self._SCALES):
            raise ValueError('{} is too large to be converted to words'.format(main_num))

        words = []
        for scale_index in range(len(groups) - 1, -1, -1):
            group = groups[scale_index]
            if group == 0:
                continue

            hundreds, below_hundred = divmod(group, 100)
            tens, ones = divmod(below_hundred, 10)

            if hundreds:
                # 100 is read "yüz", not "bir yüz".
                if hundreds > 1:
                    words.append(self._ONES[hundreds])
                words.append("yüz")
            if tens:
                words.append(self._TENS[tens])
            # Exactly one thousand is read "bin", not "bir bin".
            if ones and not (scale_index == 1 and group == 1):
                words.append(self._ONES[ones])

            if scale_index:
                scale_word = self._SCALES[scale_index]
                words.append(scale_word + "," if put_commas else scale_word)

        return " ".join(words)

    def _num_to_words(self, num, num_dec_digits):
        """
        Returns the Turkish lexical representation of a (possibly fractional) number.

        The number is rounded to ``num_dec_digits`` decimal digits. If no
        fractional digit remains it is read as an integer; otherwise it is read
        as "<integer part> virgül <fraction>", where every leading zero of the
        fraction is read as "sıfır" and the remaining digits are read as one
        integer (trailing zeros are ignored).

        Args:
            num:
                Number to convert.
            num_dec_digits:
                Number of decimal digits to keep; negative values are treated as 0.

        Returns:
            The number written in words.

        Raises:
            ValueError: ``num`` is NaN or infinite, or its integer part is too
                large to be converted.
        """
        precision = max(int(num_dec_digits), 0)

        # Fixed-point formatting rounds the number as a whole, so a fraction
        # that rounds up carries into the integer part (1.9999999 -> "2.000000").
        formatted = '{:.{precision}f}'.format(abs(num), precision=precision)
        integer_text, _, fraction_text = formatted.partition('.')
        # NaN and infinity are formatted as "nan"/"inf", which int() rejects.
        integer_part = int(integer_text)

        significant_digits = fraction_text.strip("0")
        words = self._int_to_words(integer_part)
        if significant_digits:
            leading_zeros = len(fraction_text) - len(fraction_text.lstrip("0"))
            words += " virgül " + "sıfır " * leading_zeros + self._int_to_words(int(significant_digits))

        # Do not produce "eksi sıfır" for values that round to zero.
        if num < 0 and (integer_part or significant_digits):
            words = "eksi " + words
        return words