# Source code for vnlp.normalizer.normalizer

from typing import List
from pathlib import Path

from spylls.hunspell import Dictionary

from ._deasciifier import Deasciifier
from ..stemmer_morph_analyzer import StemmerAnalyzer

# String path of the ``resources`` directory that lives next to the directory
# containing this module (i.e. two ``parent`` hops up from this file). Kept as
# a plain ``str`` because callers build file names by string concatenation.
RESOURCES_PATH = str(Path(__file__).parent.parent.joinpath('resources'))

class Normalizer:
    """
    Text normalizer for Turkish.

    Provides the following operations to process and normalize text:

    - Spelling/Typo correction
    - Deasciification
    - Convert numbers to word form
    - Lower case
    - Punctuation Remover
    - Remove accent marks

    For more details about the algorithms and datasets, see the project Readme.
    """

    # Upper-case letters whose Turkish lower-case form differs from (or must be
    # forced ahead of) what ``str.lower`` produces, e.g. "İ" -> "i", "I" -> "ı".
    _LOWER_TABLE = str.maketrans({"İ": "i", "I": "ı", "Ğ": "ğ", "Ü": "ü",
                                  "Ö": "ö", "Ş": "ş", "Ç": "ç"})

    # Circumflexed vowels -> plain vowels. Case is preserved, and the dotted
    # "î"/"Î" map to the dotted "i"/"İ" (Turkish distinguishes i/İ from ı/I).
    _ACCENT_TABLE = str.maketrans({'â': 'a', 'ô': 'o', 'î': 'i', 'ê': 'e', 'û': 'u',
                                   'Â': 'A', 'Ô': 'O', 'Î': 'İ', 'Ê': 'E', 'Û': 'U'})

    # Digit and decade names; index 0 is intentionally empty.
    _ONES = ("", "bir", "iki", "üç", "dört", "beş", "altı", "yedi", "sekiz", "dokuz")
    _TENS = ("", "on", "yirmi", "otuz", "kırk", "elli", "altmış", "yetmiş", "seksen", "doksan")

    # Name of the k-th three-digit group, i.e. _SCALES[k] names 10 ** (3 * k).
    # The last entry, "vigintilyon", is 10 ** 63.
    _SCALES = ("", "bin", "milyon", "milyar", "trilyon", "katrilyon", "kentilyon",
               "seksilyon", "septilyon", "oktilyon", "nonilyon", "desilyon",
               "undesilyon", "dodesilyon", "tredesilyon", "katordesilyon",
               "kendesilyon", "seksdesilyon", "septendesilyon", "oktodesilyon",
               "novemdesilyon", "vigintilyon")

    def __init__(self):
        # Word lexicon merged from TDK-Zemberek, Zargan, Bilkent Creative Writing,
        # Turkish Broadcast News. Stored as dict keys for O(1) membership tests.
        with open(RESOURCES_PATH + '/turkish_known_words_lexicon.txt', 'r', encoding='utf-8') as f:
            self._words_lexicon = dict.fromkeys(line.strip() for line in f)

        self._stemmer_analyzer = StemmerAnalyzer()
        self._dictionary = Dictionary.from_files(RESOURCES_PATH + '/tdd-hunspell-tr-1.1.0/tr_TR')

    @staticmethod
    def lower_case(text: str) -> str:
        """
        Converts a string of text to lowercase for Turkish language.

        This is needed because Python does not properly handle all Turkish
        characters, e.g., "İ" -> "i".

        Args:
            text: Input text.

        Returns:
            Text in lowercase form.

        Example::

            from vnlp import Normalizer
            Normalizer.lower_case("Test karakterleri: İIĞÜÖŞÇ")

            'test karakterleri: iığüöşç'
        """
        # Turkish-specific letters are mapped first in a single pass; everything
        # else is then handled by the regular ``str.lower``.
        return text.translate(Normalizer._LOWER_TABLE).lower()

    @staticmethod
    def remove_punctuations(text: str) -> str:
        """
        Removes punctuations from the given string.

        Only alphanumeric characters and the plain space character are kept.

        Args:
            text: Input text.

        Returns:
            Text stripped from punctuations.

        Example::

            from vnlp import Normalizer
            Normalizer.remove_punctuations("merhaba,.!")

            'merhaba'
        """
        return ''.join(char for char in text if char.isalnum() or char == " ")

    @staticmethod
    def remove_accent_marks(text: str) -> str:
        """
        Removes accent marks (circumflex) from the given string.

        The letter case of each character is preserved ("Ô" -> "O"), and the
        circumflexed dotted i maps to the dotted i ("î" -> "i", "Î" -> "İ").

        Args:
            text: Input text.

        Returns:
            Text stripped from accent marks.

        Example::

            from vnlp import Normalizer
            Normalizer.remove_accent_marks("merhâbâ")

            'merhaba'
        """
        return text.translate(Normalizer._ACCENT_TABLE)

    @staticmethod
    def deasciify(tokens: List[str]) -> List[str]:
        """
        Deasciifies the given text for Turkish.

        This function uses Emre Sevinç's deasciifier implementation.

        Args:
            tokens: List of input tokens.

        Returns:
            List of deasciified tokens.

        Example::

            from vnlp import Normalizer
            Normalizer.deasciify("dusunuyorum da boyle sey gormedim duymadim".split())

            ["düşünüyorum", "da", "böyle", "şey", "görmedim", "duymadım"]
        """
        return [Deasciifier(token).convert_to_turkish() for token in tokens]

    def correct_typos(self, tokens: List[str]) -> List[str]:
        """
        Detects and corrects spelling mistakes and typos.

        This implementation uses StemmerAnalyzer and Hunspell to detect typos.
        Detected typos are corrected by Hunspell algorithm using
        "tdd-hunspell-tr-1.1.0" dict.

        Args:
            tokens: List of input tokens.

        Returns:
            List of corrected tokens. A token that is considered valid, or for
            which Hunspell offers no suggestion, is returned unchanged.

        Example::

            from vnlp import Normalizer
            normalizer = Normalizer()
            normalizer.correct_typos("Kasıtlı yazişm hatasıı ekliyoruum".split())

            ["Kasıtlı", "yazım", "hatası", "ekliyorum"]
        """
        corrected_tokens = []
        for token in tokens:
            if self._is_token_valid_turkish(token) or self._dictionary.lookup(token):
                corrected_tokens.append(token)
                continue

            # Only the first suggestion is used, so do not materialize them all.
            best_suggestion = next(iter(self._dictionary.suggest(token)), None)
            corrected_tokens.append(token if best_suggestion is None else best_suggestion)

        return corrected_tokens

    def convert_numbers_to_words(self, tokens: List[str], num_dec_digits: int = 6,
                                 decimal_seperator: str = ',') -> List[str]:
        """
        Converts numbers to word form.

        Args:
            tokens: List of input tokens.
            num_dec_digits: Number of precision (decimal points) for floats.
            decimal_seperator: Decimal seperator character. Can be either "." or ",".

        Returns:
            List of converted tokens. Tokens that do not contain a numeric
            character, or that cannot be parsed as a number, are returned
            unchanged. Negative numbers are prefixed with "eksi".

        Raises:
            ValueError: Given 'decimal seperator' is not a valid decimal
                seperator value. Use either "." or ",".

        Example::

            from vnlp import Normalizer
            normalizer = Normalizer()
            normalizer.convert_numbers_to_words("sabah 3 yumurta yedim ve tartıldığımda 1,15 kilogram aldığımı gördüm".split())

            ['sabah', 'üç', 'yumurta', 'yedim', 've', 'tartıldığımda', 'bir', 'virgül', 'on', 'beş', 'kilogram', 'aldığımı', 'gördüm']
        """
        if decimal_seperator == ',':
            # Comma is the decimal seperator, so dot is the thousands seperator:
            # dot -> Python's thousands seperator "_", comma -> Python's ".".
            seperator_table = str.maketrans({'.': '_', ',': '.'})
        elif decimal_seperator == '.':
            # Dot is the decimal seperator, so comma is the thousands seperator.
            seperator_table = str.maketrans({',': '_'})
        else:
            raise ValueError('{!r} is not a valid decimal seperator value. '
                             'Use either "." or ","'.format(decimal_seperator))

        converted_tokens = []
        for token in tokens:
            if not any(char.isnumeric() for char in token):
                converted_tokens.append(token)
                continue

            normalized = token.translate(seperator_table)
            try:
                try:
                    # Parse integers exactly; going through float would lose
                    # precision above 2 ** 53.
                    number = int(normalized)
                except ValueError:
                    number = float(normalized)
                words = self._num_to_words(number, num_dec_digits)
            except (ValueError, OverflowError):
                # Not a (representable) number: keep the token exactly as given,
                # not the seperator-rewritten copy.
                converted_tokens.append(token)
            else:
                converted_tokens.extend(words.split())

        return converted_tokens

    def _is_token_valid_turkish(self, token):
        """
        Checks whether given token is valid according to Turkish.

        A token is valid when it is in the known-words lexicon or when the
        stemmer analyzer's first analysis candidate is not tagged 'Unknown'.
        """
        # Cheap dictionary lookup first; only fall back to morphological
        # analysis when the lexicon does not know the token.
        if token in self._words_lexicon:
            return True
        candidates = self._stemmer_analyzer.candidate_generator.get_analysis_candidates(token)
        return candidates[0][-1] != 'Unknown'

    def _int_to_words(self, main_num, put_commas=False):
        """
        Spells out an integer in Turkish words.

        Args:
            main_num: Integer to convert. Negative values are prefixed with "eksi".
            put_commas: If True, three-digit groups are separated by ", "
                instead of a single space.

        Returns:
            The number in words, e.g. 1010 -> "bin on", 0 -> "sıfır".

        Raises:
            ValueError: The number needs a scale name beyond "vigintilyon".
        """
        main_num = int(main_num)
        if main_num == 0:
            return "sıfır"
        if main_num < 0:
            return "eksi " + self._int_to_words(-main_num, put_commas)

        # Split into three-digit groups, least significant first.
        groups = []
        remaining = main_num
        while remaining:
            remaining, group = divmod(remaining, 1000)
            groups.append(group)

        if len(groups) > len(self._SCALES):
            raise ValueError("number is too large to be converted to words")

        chunks = []
        for scale_index in range(len(groups) - 1, -1, -1):
            group = groups[scale_index]
            if group == 0:
                # An empty group contributes nothing, not even its scale name.
                continue

            hundreds, below_hundred = divmod(group, 100)
            tens, ones = divmod(below_hundred, 10)

            words = []
            if hundreds == 1:
                # 100 is "yüz", never "bir yüz".
                words.append("yüz")
            elif hundreds:
                words.extend((self._ONES[hundreds], "yüz"))
            if tens:
                words.append(self._TENS[tens])
            # 1000 is "bin", never "bir bin"; this only applies when the
            # thousands group is exactly 1 (21000 is still "yirmi bir bin").
            if ones and not (scale_index == 1 and group == 1):
                words.append(self._ONES[ones])
            if scale_index:
                words.append(self._SCALES[scale_index])
            chunks.append(" ".join(words))

        return (", " if put_commas else " ").join(chunks)

    def _num_to_words(self, num, num_dec_digits):
        """
        Spells out an integer or float in Turkish words.

        Args:
            num: Number to convert.
            num_dec_digits: Number of decimal digits the fractional part is
                rounded to before conversion.

        Returns:
            The number in words. The fractional part, if any, follows the word
            "virgül"; each leading zero of the fraction is read as "sıfır" and
            trailing zeros are dropped, e.g. 1.05 -> "bir virgül sıfır beş".

        Raises:
            ValueError: The number is NaN/infinite or too large to be named.
        """
        if isinstance(num, int):
            return self._int_to_words(num)
        if num != num or num in (float("inf"), float("-inf")):
            raise ValueError("cannot convert a non-finite number to words")

        # Fixed-point formatting rounds to the requested precision and carries
        # into the integer part when needed (2.9999999 -> "3.000000").
        fixed = "{:.{}f}".format(abs(num), max(num_dec_digits, 0))
        integer_digits, _, fraction_digits = fixed.partition(".")
        integer_part = int(integer_digits)
        fraction_digits = fraction_digits.rstrip("0")

        integer_words = self._int_to_words(integer_part)
        if not fraction_digits:
            # Avoid "eksi sıfır" for tiny negatives that round to zero.
            if num < 0 and integer_part != 0:
                return "eksi " + integer_words
            return integer_words

        significant_digits = fraction_digits.lstrip("0")
        leading_zeros = len(fraction_digits) - len(significant_digits)
        sign = "eksi " if num < 0 else ""
        return (sign + integer_words + " virgül " + "sıfır " * leading_zeros
                + self._int_to_words(int(significant_digits)))