from typing import List
from pathlib import Path
from spylls.hunspell import Dictionary
from ._deasciifier import Deasciifier
from ..stemmer_morph_analyzer import StemmerAnalyzer
# Absolute path to the package-level "resources" directory (two levels up
# from this module); used below to locate the lexicon and Hunspell files.
RESOURCES_PATH = str(Path(__file__).parent.parent / "resources")
class Normalizer:
    """
    Normalizer class.

    It contains the following functions to process and normalize text:

    - Spelling/Typo correction
    - Deasciification
    - Convert numbers to word form
    - Lower case
    - Punctuation Remover
    - Remove accent marks

    For more details about the algorithms and datasets, see
    `Readme <https://github.com/vngrs-ai/VNLP/blob/main/vnlp/normalizer/ReadMe.md>`_.
    """
def __init__(self):
    """Load the known-words lexicon, the stemmer and the Hunspell dictionary."""
    # Word lexicon merged from TDK-Zemberek, Zargan, Bilkent Creative
    # Writing and Turkish Broadcast News.
    lexicon_path = RESOURCES_PATH + "/turkish_known_words_lexicon.txt"
    with open(lexicon_path, "r", encoding="utf-8") as lexicon_file:
        known_words = [entry.strip() for entry in lexicon_file]
    # dict.fromkeys gives O(1) membership checks on the lexicon.
    self._words_lexicon = dict.fromkeys(known_words)
    self._stemmer_analyzer = StemmerAnalyzer()
    self._dictionary = Dictionary.from_files(
        RESOURCES_PATH + "/tdd-hunspell-tr-1.1.0/tr_TR"
    )
[docs] @staticmethod
def lower_case(text: str) -> str:
"""
Converts a string of text to lowercase for Turkish language.
This is needed because Python does not properly handle all Turkish characters, e.g., "İ" -> "i".
Args:
text:
Input text.
Returns:
Text in lowercase form.
Example::
from vnlp import Normalizer
Normalizer.lower_case("Test karakterleri: İIĞÜÖŞÇ")
'test karakterleri: iığüöşç'
"""
turkish_lowercase_dict = {
"İ": "i",
"I": "ı",
"Ğ": "ğ",
"Ü": "ü",
"Ö": "ö",
"Ş": "ş",
"Ç": "ç",
}
for k, v in turkish_lowercase_dict.items():
text = text.replace(k, v)
return text.lower()
[docs] @staticmethod
def remove_punctuations(text: str) -> str:
"""
Removes punctuations from the given string.
Args:
text: Input text.
Returns:
Text stripped from punctuations.
Example::
from vnlp import Normalizer
Normalizer.remove_punctuations("merhaba,.!")
'merhaba'
"""
return "".join([t for t in text if (t.isalnum() or t == " ")])
[docs] @staticmethod
def remove_accent_marks(text: str) -> str:
"""
Removes accent marks from the given string.
Args:
text:
Input text.
Returns:
Text stripped from accent marks.
Example::
from vnlp import Normalizer
Normalizer.remove_accent_marks("merhâbâ")
'merhaba'
"""
_non_turkish_accent_marks = {
"â": "a",
"ô": "o",
"î": "ı",
"ê": "e",
"û": "u",
"Â": "A",
"Ô": "o",
"Î": "ı",
"Ê": "e",
"Û": "u",
}
return "".join(
_non_turkish_accent_marks.get(char, char) for char in text
)
@staticmethod
def deasciify(tokens: List[str]) -> List[str]:
    """
    Deasciifies the given text for Turkish.

    This function uses `Emre Sevinç's implementation
    <https://github.com/emres/turkish-deasciifier>`_.

    Args:
        tokens:
            List of input tokens.

    Returns:
        List of deasciified tokens.

    Example::

        from vnlp import Normalizer
        Normalizer.deasciify("dusunuyorum da boyle sey gormedim duymadim".split())
        ["düşünüyorum", "da", "böyle", "şey", "görmedim", "duymadım"]
    """
    # Each token gets its own Deasciifier instance, as the converter is
    # stateful per input string.
    return [Deasciifier(token).convert_to_turkish() for token in tokens]
[docs] def correct_typos(self, tokens: List[str]) -> List[str]:
"""
Detects and corrects spelling mistakes and typos.
This implementation uses StemmerAnalyzer and Hunspell to detect typos.
Detected typos are corrected by Hunspell algorithm using "tdd-hunspell-tr-1.1.0" dict.
Args:
tokens:
List of input tokens.
Returns:
List of corrected tokens.
Example::
from vnlp import Normalizer
normalizer = Normalizer()
normalizer.correct_typos("Kasıtlı yazişm hatasıı ekliyoruum".split())
["Kasıtlı", "yazım", "hatası", "ekliyorum"]
"""
corrected_tokens = []
for token in tokens:
if (self._is_token_valid_turkish(token)) or (
self._dictionary.lookup(token)
):
corrected_tokens.append(token)
else:
hunspell_suggestions = list(self._dictionary.suggest(token))
if len(hunspell_suggestions) > 0:
corrected_token = hunspell_suggestions[0]
corrected_tokens.append(corrected_token)
else:
# there is no suggestion so return the original token
corrected_tokens.append(token)
return corrected_tokens
[docs] def convert_numbers_to_words(
self,
tokens: List[str],
num_dec_digits: int = 6,
decimal_seperator: str = ",",
) -> List[str]:
"""
Converts numbers to word form.
Args:
tokens:
List of input tokens.
num_dec_digits:
Number of precision (decimal points) for floats.
decimal_seperator:
Decimal seperator character. Can be either "." or ",".
Returns:
List of converted tokens
Raises:
ValueError: Given 'decimal seperator' is not a valid decimal seperator value. Use either "." or ",".
Example::
from vnlp import Normalizer
normalizer = Normalizer()
normalizer.convert_numbers_to_words("sabah 3 yumurta yedim ve tartıldığımda 1,15 kilogram aldığımı gördüm".split())
['sabah',
'üç',
'yumurta',
'yedim',
've',
'tartıldığımda',
'bir',
'virgül',
'on',
'beş',
'kilogram',
'aldığımı',
'gördüm']
"""
converted_tokens = []
for token in tokens:
# if there's any numeric character in token
if any([char.isnumeric() for char in token]):
if decimal_seperator == ",":
# if decimal seperator is comma, then thousands seperator is dot and it will be converted to python's
# thousands seperator underscore.
# furthermore, comma will be converted to dot, python's decimal seperator.
token = token.replace(".", "_").replace(",", ".")
elif decimal_seperator == ".":
# if decimal seperator is dot, then thousands seperator is comma and it will be converted to python's
# thousands seperator underscore.
token = token.replace(",", "_")
else:
raise ValueError(
decimal_seperator,
'is not a valid decimal seperator value. Use either "." or ","',
)
# Try to convert token to number
try:
num = float(token)
converted_tokens += self._num_to_words(
num, num_dec_digits
).split()
# If fails, then return it as string
except ValueError:
converted_tokens.append(token)
return converted_tokens
def _is_token_valid_turkish(self, token):
"""
Checks whether given token is valid according to Turkish.
"""
valid_according_to_stemmer_analyzer = not (
self._stemmer_analyzer.candidate_generator.get_analysis_candidates(
token
)[0][-1]
== "Unknown"
)
valid_according_to_lexicon = token in self._words_lexicon
return (
valid_according_to_stemmer_analyzer or valid_according_to_lexicon
)
def _int_to_words(self, main_num, put_commas=False):
"""
This function is adapted from:
https://github.com/Omerktn/Turkish-Lexical-Representation-of-Numbers/blob/master/src.py
It had a few bugs with numbers like 1000 and 1010, which are resolved.
"""
# yüz=10^2 ve vigintilyon=10^63, ith element is 10^3 times greater then (i-1)th.
tp = [
" yüz",
" bin",
"",
"",
" milyon",
" milyar",
" trilyon",
" katrilyon",
" kentilyon",
" seksilyon",
" septilyon",
" oktilyon",
" nonilyon",
" desilyon",
" undesilyon",
" dodesilyon",
" tredesilyon",
" katordesilyon",
" seksdesilyon",
" septendesilyon",
" oktodesilyon",
" nove mdesilyon",
" vigintilyon",
]
# dec[]: every decimal digit, ten[]: every tenth number
dec = [
"",
" bir",
" iki",
" üç",
" dört",
" beş",
" altı",
" yedi",
" sekiz",
" dokuz",
]
ten = [
"",
" on",
" yirmi",
" otuz",
" kırk",
" elli",
" altmış",
" yetmiş",
" seksen",
" doksan",
]
text = ""
# get length of main_num
num = main_num
leng = 0
while num != 0:
num = num // 10
leng += 1
if main_num == 0:
text = " sıfır"
# split main_num to (three digit) pieces and read them by mod 3.
for i in range(leng, 0, -1):
digit = int((main_num // (10 ** (i - 1))) % 10)
if i % 3 == 0:
if digit == 1:
text += tp[0]
elif digit == 0:
text += dec[digit]
else:
text += dec[digit] + tp[0]
elif i % 3 == 1:
if i > 3:
if main_num > 1999:
text += dec[digit] + tp[i - 3]
else:
text += tp[i - 3]
else:
text += dec[digit]
if i > 3 and put_commas:
text += ","
elif i % 3 == 2:
text += ten[digit]
return text[1:]
def _num_to_words(self, num, num_dec_digits):
integer_part = int(num)
decimal_part = round(num % 1, num_dec_digits)
# if number is int (considering significant decimal digits)
if decimal_part < 10**-num_dec_digits:
return self._int_to_words(integer_part)
# if number is float
else:
str_decimal = "{:f}".format(round(num % 1, num_dec_digits))[2:]
zeros_after_decimal = 0
for char in str_decimal:
if char == "0":
zeros_after_decimal += 1
else:
break
str_decimal_stripped_from_zeros = str_decimal.strip(
"0"
) # strip gets rid of heading and trailing 0s in string form
if str_decimal_stripped_from_zeros == "":
decimal_part = 0
else:
decimal_part = int(str_decimal_stripped_from_zeros)
return (
self._int_to_words(integer_part)
+ " virgül "
+ "sıfır " * zeros_after_decimal
+ self._int_to_words(decimal_part)
)