# Source code for vnlp.sentence_splitter.sentence_splitter

from typing import List
from pathlib import Path
from enum import Enum
import regex

# Absolute path (as a string) to the bundled resources directory that sits
# next to this module; Path normalizes away the relative "../" segment's
# trailing slash.
PATH = str(Path(__file__).parent / "../resources")


class SentenceSplitter:
    """
    This is a rule based sentence splitter adapted from `Philipp Koehn and
    Josh Schroeder's project <https://pypi.org/project/sentence-splitter/>`_.

    The code is reduced and simplified for the Turkish language.
    The abbreviations lexicon is expanded.
    """

    class _PrefixType(Enum):
        # DEFAULT: the prefix never ends a sentence (e.g. honorifics).
        # NUMERIC_ONLY: the prefix suppresses a break only when the next
        # word starts with a digit.
        DEFAULT = 1
        NUMERIC_ONLY = 2

    def __init__(self):
        # Lexicon of Turkish non-breaking prefixes (abbreviations), mapping
        # each prefix string to its _PrefixType.
        self._non_breaking_prefixes = dict()
        with open(
            PATH + "/non_breaking_prefixes_tr.txt", mode="r", encoding="utf-8"
        ) as prefix_file:
            # Iterate the file lazily instead of materializing readlines().
            for line in prefix_file:
                if "#NUMERIC_ONLY#" in line:
                    prefix_type = SentenceSplitter._PrefixType.NUMERIC_ONLY
                else:
                    prefix_type = SentenceSplitter._PrefixType.DEFAULT
                # The non_breaking_prefixes_tr file contains inline comments
                # for ease of reading; strip them before storing the entry.
                line = regex.sub(
                    pattern=r"#.*",
                    repl="",
                    string=line,
                    flags=regex.DOTALL | regex.UNICODE,
                )
                line = line.strip()
                if not line:
                    continue
                self._non_breaking_prefixes[line] = prefix_type

    def _is_honorific_prefix(self, prefix, starting_punct):
        """Return True when ``prefix`` is a known DEFAULT (honorific)
        abbreviation and no closing punctuation follows it."""
        if prefix and not starting_punct:
            return (
                self._non_breaking_prefixes.get(prefix)
                == SentenceSplitter._PrefixType.DEFAULT
            )
        return False

    def _is_numeric_prefix(self, prefix, starting_punct, next_word):
        """Return True when ``prefix`` is a NUMERIC_ONLY abbreviation, no
        closing punctuation follows it, and ``next_word`` starts with a
        digit (possibly after initial quotes handled by the caller)."""
        if prefix and not starting_punct:
            if (
                self._non_breaking_prefixes.get(prefix)
                == SentenceSplitter._PrefixType.NUMERIC_ONLY
            ):
                return bool(
                    regex.search(
                        pattern="^[0-9]+",
                        string=next_word,
                        flags=regex.UNICODE,
                    )
                )
        return False

    # Lower level function used by the class to split the given string into
    # a list of strings, thus sentences.
    # TODO: reduce complexity of this function
    def _split(self, text):  # noqa: C901
        # Guard clause: None and the empty string both yield no sentences.
        # ("not text" is True for None as well, so no separate None check.)
        if not text:
            return []

        # Sentence breaker rules:
        # Sentence markers such as "?", "!" that are not a period, followed
        # by a sentence starter.
        text = regex.sub(
            pattern=r'([?!]) +([\'"([\u00bf\u00A1\p{Initial_Punctuation}]*[\p{Uppercase_Letter}\p{Other_Letter}])',
            repl="\\1\n\\2",
            string=text,
            flags=regex.UNICODE,
        )
        # Multiple dots ("...") followed by a sentence starter.
        text = regex.sub(
            pattern=r'(\.[\.]+) +([\'"([\u00bf\u00A1\p{Initial_Punctuation}]*[\p{Uppercase_Letter}\p{Other_Letter}])',
            repl="\\1\n\\2",
            string=text,
            flags=regex.UNICODE,
        )
        # Sentence ending with punctuation, within quotation marks or
        # parentheses, followed by sentence-starter punctuation and upper
        # case.
        text = regex.sub(
            pattern=(
                r'([?!\.][\ ]*[\'")\]\p{Final_Punctuation}]+) +([\'"([\u00bf\u00A1\p{Initial_Punctuation}]*[\ ]*'
                r"[\p{Uppercase_Letter}\p{Other_Letter}])"
            ),
            repl="\\1\n\\2",
            string=text,
            flags=regex.UNICODE,
        )
        # Sentence ending with punctuation and followed by sentence-starter
        # punctuation and a capital letter.
        text = regex.sub(
            pattern=(
                r'([?!\.]) +([\'"[\u00bf\u00A1\p{Initial_Punctuation}]+[\ ]*[\p{Uppercase_Letter}\p{Other_Letter}])'
            ),
            repl="\\1\n\\2",
            string=text,
            flags=regex.UNICODE,
        )

        # Special punctuation cases: walk word by word with one-token
        # look-ahead and decide whether a trailing period ends a sentence.
        words = regex.split(pattern=r" +", string=text, flags=regex.UNICODE)
        text = ""
        for i in range(0, len(words) - 1):
            match = regex.search(
                pattern=r"([\w\.\-]*)([\'\"\)\]\%\p{Final_Punctuation}]*)(\.+)$",
                string=words[i],
                flags=regex.UNICODE,
            )
            if match:
                prefix = match.group(1)
                starting_punct = match.group(2)
                if self._is_honorific_prefix(prefix, starting_punct):
                    # Known honorific/abbreviation: do not break.
                    pass
                elif regex.search(
                    pattern=r"(\.)[\p{Uppercase_Letter}\p{Other_Letter}\-]+(\.+)$",
                    string=words[i],
                    flags=regex.UNICODE,
                ):
                    # Upper case acronym (e.g. "T.C."): do not break.
                    pass
                elif regex.search(
                    # The next word has a bunch of initial quotes, maybe a
                    # space, then either upper case or a number.
                    pattern=(
                        r'^([ ]*[\'"([\u00bf\u00A1\p{Initial_Punctuation}]*[ ]*[\p{Uppercase_Letter}'
                        r"\p{Other_Letter}0-9])"
                    ),
                    string=words[i + 1],
                    flags=regex.UNICODE,
                ):
                    # A break is always added unless there is a numeric
                    # non-breaker followed by a number start.
                    if not self._is_numeric_prefix(
                        prefix, starting_punct, words[i + 1]
                    ):
                        words[i] = words[i] + "\n"
            text = text + words[i] + " "
        # The loop stopped one token before the end so that easy look-ahead
        # was possible; append the final token now.
        text = text + words[-1]

        # Collapse double spaces and strip spaces around the inserted
        # newlines, then trim the head and tail.
        text = regex.sub(pattern=" +", repl=" ", string=text)
        text = regex.sub(pattern="\n ", repl="\n", string=text)
        text = regex.sub(pattern=" \n", repl="\n", string=text)
        text = text.strip()

        sentences = text.split("\n")
        return sentences

    # Higher level function that is called by the user to split text.
    def split_sentences(self, text: str) -> List[str]:
        """
        Given a string of sentences, returns a list of strings, where each
        string in the list is a sentence.

        Args:
            text: Input sentences.

        Returns:
            List of splitted sentences.

        Example::

            from vnlp import SentenceSplitter
            sentence_splitter = SentenceSplitter()
            sentence_splitter.split_sentences('Av. Meryem Beşer, 3.5 yıldır süren dava ile ilgili dedi ki, "Duruşma bitti, dava lehimize sonuçlandı." Bu harika bir haber.')

            ['Av. Meryem Beşer, 3.5 yıldır süren dava ile ilgili dedi ki, "Duruşma bitti, dava lehimize sonuçlandı."', 'Bu harika bir haber.']
        """
        return self._split(text)