# Source code for vnlp.sentence_splitter.sentence_splitter

from typing import List
from pathlib import Path
from enum import Enum
import regex

# Absolute path (as a string) to the bundled resources directory that sits
# next to this module; Path normalizes away the relative "../" segment's
# trailing slash.
PATH = str(Path(__file__).parent / "../resources")


class SentenceSplitter:
    """
    This is a rule based sentence splitter adapted from `Philipp Koehn and
    Josh Schroeder's project <https://pypi.org/project/sentence-splitter/>`_.

    The code is reduced and simplified for the Turkish language.
    The abbreviations lexicon is expanded.
    """

    class _PrefixType(Enum):
        # DEFAULT: the prefix never ends a sentence (e.g. honorifics).
        # NUMERIC_ONLY: the prefix suppresses a break only when the next
        # word starts with a digit.
        DEFAULT = 1
        NUMERIC_ONLY = 2

    def __init__(self):
        # Lexicon of Turkish non-breaking prefixes (abbreviations), mapping
        # each prefix string to its _PrefixType.
        self._non_breaking_prefixes = dict()
        with open(
            PATH + "/non_breaking_prefixes_tr.txt", mode="r", encoding="utf-8"
        ) as prefix_file:
            # Iterate the file lazily instead of materializing readlines().
            for line in prefix_file:
                if "#NUMERIC_ONLY#" in line:
                    prefix_type = SentenceSplitter._PrefixType.NUMERIC_ONLY
                else:
                    prefix_type = SentenceSplitter._PrefixType.DEFAULT
                # The non_breaking_prefixes_tr file contains inline comments
                # for ease of reading; strip them before storing the entry.
                line = regex.sub(
                    pattern=r"#.*",
                    repl="",
                    string=line,
                    flags=regex.DOTALL | regex.UNICODE,
                )
                line = line.strip()
                if not line:
                    continue
                self._non_breaking_prefixes[line] = prefix_type

    def _is_honorific_prefix(self, prefix, starting_punct):
        """Return True when ``prefix`` is a known DEFAULT (honorific)
        abbreviation and no closing punctuation follows it."""
        if prefix and not starting_punct:
            return (
                self._non_breaking_prefixes.get(prefix)
                == SentenceSplitter._PrefixType.DEFAULT
            )
        return False

    def _is_numeric_prefix(self, prefix, starting_punct, next_word):
        """Return True when ``prefix`` is a NUMERIC_ONLY abbreviation, no
        closing punctuation follows it, and ``next_word`` starts with a
        digit (possibly after initial quotes handled by the caller)."""
        if prefix and not starting_punct:
            if (
                self._non_breaking_prefixes.get(prefix)
                == SentenceSplitter._PrefixType.NUMERIC_ONLY
            ):
                return bool(
                    regex.search(
                        pattern="^[0-9]+",
                        string=next_word,
                        flags=regex.UNICODE,
                    )
                )
        return False

    # Lower level function used by the class to split the given string into
    # a list of strings, thus sentences.
    # TODO: reduce complexity of this function
    def _split(self, text):  # noqa: C901
        # Guard clause: None and the empty string both yield no sentences.
        # ("not text" is True for None as well, so no separate None check.)
        if not text:
            return []

        # Sentence breaker rules:
        # Sentence markers such as "?", "!" that are not a period, followed
        # by a sentence starter.
        text = regex.sub(
            pattern=r'([?!]) +([\'"([\u00bf\u00A1\p{Initial_Punctuation}]*[\p{Uppercase_Letter}\p{Other_Letter}])',
            repl="\\1\n\\2",
            string=text,
            flags=regex.UNICODE,
        )
        # Multiple dots ("...") followed by a sentence starter.
        text = regex.sub(
            pattern=r'(\.[\.]+) +([\'"([\u00bf\u00A1\p{Initial_Punctuation}]*[\p{Uppercase_Letter}\p{Other_Letter}])',
            repl="\\1\n\\2",
            string=text,
            flags=regex.UNICODE,
        )
        # Sentence ending with punctuation, within quotation marks or
        # parentheses, followed by sentence-starter punctuation and upper
        # case.
        text = regex.sub(
            pattern=(
                r'([?!\.][\ ]*[\'")\]\p{Final_Punctuation}]+) +([\'"([\u00bf\u00A1\p{Initial_Punctuation}]*[\ ]*'
                r"[\p{Uppercase_Letter}\p{Other_Letter}])"
            ),
            repl="\\1\n\\2",
            string=text,
            flags=regex.UNICODE,
        )
        # Sentence ending with punctuation and followed by sentence-starter
        # punctuation and a capital letter.
        text = regex.sub(
            pattern=(
                r'([?!\.]) +([\'"[\u00bf\u00A1\p{Initial_Punctuation}]+[\ ]*[\p{Uppercase_Letter}\p{Other_Letter}])'
            ),
            repl="\\1\n\\2",
            string=text,
            flags=regex.UNICODE,
        )

        # Special punctuation cases: walk word by word with one-token
        # look-ahead and decide whether a trailing period ends a sentence.
        words = regex.split(pattern=r" +", string=text, flags=regex.UNICODE)
        text = ""
        for i in range(0, len(words) - 1):
            match = regex.search(
                pattern=r"([\w\.\-]*)([\'\"\)\]\%\p{Final_Punctuation}]*)(\.+)$",
                string=words[i],
                flags=regex.UNICODE,
            )
            if match:
                prefix = match.group(1)
                starting_punct = match.group(2)
                if self._is_honorific_prefix(prefix, starting_punct):
                    # Known honorific/abbreviation: do not break.
                    pass
                elif regex.search(
                    pattern=r"(\.)[\p{Uppercase_Letter}\p{Other_Letter}\-]+(\.+)$",
                    string=words[i],
                    flags=regex.UNICODE,
                ):
                    # Upper case acronym (e.g. "T.C."): do not break.
                    pass
                elif regex.search(
                    # The next word has a bunch of initial quotes, maybe a
                    # space, then either upper case or a number.
                    pattern=(
                        r'^([ ]*[\'"([\u00bf\u00A1\p{Initial_Punctuation}]*[ ]*[\p{Uppercase_Letter}'
                        r"\p{Other_Letter}0-9])"
                    ),
                    string=words[i + 1],
                    flags=regex.UNICODE,
                ):
                    # A break is always added unless there is a numeric
                    # non-breaker followed by a number start.
                    if not self._is_numeric_prefix(
                        prefix, starting_punct, words[i + 1]
                    ):
                        words[i] = words[i] + "\n"
            text = text + words[i] + " "
        # The loop stopped one token before the end so that easy look-ahead
        # was possible; append the final token now.
        text = text + words[-1]

        # Collapse double spaces and strip spaces around the inserted
        # newlines, then trim the head and tail.
        text = regex.sub(pattern=" +", repl=" ", string=text)
        text = regex.sub(pattern="\n ", repl="\n", string=text)
        text = regex.sub(pattern=" \n", repl="\n", string=text)
        text = text.strip()

        sentences = text.split("\n")
        return sentences

    # Higher level function that is called by the user to split text.
    def split_sentences(self, text: str) -> List[str]:
        """
        Given a string of sentences, returns a list of strings, where each
        string in the list is a sentence.

        Args:
            text: Input sentences.

        Returns:
            List of splitted sentences.

        Example::

            from vnlp import SentenceSplitter
            sentence_splitter = SentenceSplitter()
            sentence_splitter.split_sentences('Av. Meryem Beşer, 3.5 yıldır süren dava ile ilgili dedi ki, "Duruşma bitti, dava lehimize sonuçlandı." Bu harika bir haber.')

            ['Av. Meryem Beşer, 3.5 yıldır süren dava ile ilgili dedi ki, "Duruşma bitti, dava lehimize sonuçlandı."', 'Bu harika bir haber.']
        """
        return self._split(text)