Source code for vnlp.stopword_remover.stopword_remover

from typing import List
from pathlib import Path

import numpy as np

# Suppress NumPy divide-by-zero and invalid-value (NaN) warnings;
# the resulting NaNs are filtered out explicitly below.
np.seterr(divide="ignore", invalid="ignore")

PATH = "../resources/"
PATH = str(Path(__file__).parent / PATH)


class StopwordRemover:
    """
    Stopword Remover class.

    Consists of static and dynamic stopword detection methods.

    - The static stopword list is taken from `Zemberek <https://github.com/ahmetax/trstop>`_,
      with some minor improvements.
    - The dynamic stopword algorithm is implemented according to two papers:

      - `On Stopwords, Filtering and Data Sparsity for Sentiment Analysis of Twitter
        <https://aclanthology.org/L14-1265/>`_ proposes classifying stopwords
        according to their frequency.
      - `Finding a "Kneedle" in a Haystack: Detecting Knee Points in System Behavior
        <https://ieeexplore.ieee.org/document/5961514>`_ proposes determining
        the cut-off point automatically.
    """

    def __init__(self):
        # Load the static stop words from the lexicon file.
        with open(PATH + "/turkish_stop_words.txt", encoding="utf-8") as f:
            self.stop_words = dict.fromkeys(line.strip() for line in f)
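
    # Note: dict.fromkeys stores the stop words as dictionary keys (values
    # are None), giving O(1) membership tests in drop_stop_words below while
    # preserving the lexicon's insertion order.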

    def dynamically_detect_stop_words(
        self, list_of_tokens: List[str], rare_words_freq: int = 0
    ) -> List[str]:
        """
        Dynamically detects stop words and returns them as a list of tokens.
        Use a large corpus with at least hundreds of unique tokens for a
        reasonable result.

        Args:
            list_of_tokens: List of input tokens.
            rare_words_freq: Maximum frequency of words when deciding rarity.
                The default value is 0, so no rare words are detected by default.

        Returns:
            List of dynamically detected stop words.

        Raises:
            ValueError: Number of unique tokens must be at least 3 for Dynamic
                Stop Word Detection.

        Example::

            from vnlp import StopwordRemover
            stopword_remover = StopwordRemover()
            stopword_remover.dynamically_detect_stop_words("ben bugün gidip aşı olacağım sonra da eve gelip telefon açacağım aşı nasıl etkiledi eve gelip anlatırım aşı olmak bu dönemde çok ama ama ama ama çok önemli".split())

            ['ama', 'aşı', 'gelip', 'eve']
        """
        unq, cnts = np.unique(list_of_tokens, return_counts=True)
        # Sort unique tokens by count in descending order.
        sorted_indices = cnts.argsort()[::-1]
        unq = unq[sorted_indices]
        cnts = cnts[sorted_indices]

        if len(unq) < 3:
            raise ValueError(
                "Number of unique tokens must be at least 3 for Dynamic Stop Word Detection."
            )

        # Below is equivalent to:
        # df_words['counts'].pct_change().abs().pct_change().abs().dropna().idxmax()

        # First derivative
        diffs_one = np.diff(cnts)
        pct_change_one = np.abs(diffs_one / cnts[:-1])

        # Second derivative
        diffs_two = np.diff(pct_change_one)
        pct_change_two = np.abs(diffs_two / pct_change_one[:-1])
        pct_change_two = pct_change_two[~np.isnan(pct_change_two)]  # remove NaNs

        argmax_second_der = np.argmax(pct_change_two)

        # +2 compensates for the two np.diff() calls, each of which shortens
        # the array by one.
        detected_stop_words = unq[: argmax_second_der + 2].tolist()

        # Determine rare words according to the given rare_words_freq value
        # and add them to the detected stop words.
        rare_words = unq[cnts <= rare_words_freq].tolist()
        detected_stop_words += rare_words

        return detected_stop_words
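
    # Worked sketch of the cut-point logic above, on hypothetical counts
    # (illustrative numbers, not produced by the library):
    #   cnts (descending)  = [9, 7, 2, 1, 1]
    #   diffs_one          = [-2, -5, -1, 0]
    #   pct_change_one     = [0.222, 0.714, 0.500, 0.000]
    #   diffs_two          = [0.492, -0.214, -0.500]
    #   pct_change_two     = [2.214, 0.300, 1.000]
    #   argmax_second_der  = 0, so unq[:2], i.e. the two most frequent
    #   tokens, are flagged as stop words.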

    def add_to_stop_words(self, novel_stop_words: List[str]):
        """
        Updates self.stop_words by adding the given novel_stop_words to the
        existing dictionary.

        Args:
            novel_stop_words: Tokens to be added to the existing stop_words
                dictionary.

        Example::

            from vnlp import StopwordRemover
            stopword_remover = StopwordRemover()
            stopword_remover.add_to_stop_words(['ama', 'aşı', 'gelip', 'eve'])
        """
        self.stop_words.update(dict.fromkeys(novel_stop_words))

    def drop_stop_words(self, list_of_tokens: List[str]) -> List[str]:
        """
        Given a list of tokens, drops the stop words and returns the list of
        remaining tokens.

        Args:
            list_of_tokens: List of input tokens.

        Returns:
            List of tokens stripped of stop words.

        Example::

            from vnlp import StopwordRemover
            stopword_remover = StopwordRemover()
            stopword_remover.drop_stop_words("acaba bugün kahvaltıda kahve yerine çay mı içsem ya da neyse süt içeyim".split())

            ['bugün', 'kahvaltıda', 'kahve', 'çay', 'içsem', 'süt', 'içeyim']
        """
        tokens_without_stopwords = [
            token for token in list_of_tokens if token not in self.stop_words
        ]
        return tokens_without_stopwords
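

# Minimal end-to-end usage sketch (illustrative only, not part of the module):
# detect stop words dynamically on a corpus, merge them into the static
# lexicon, then strip them from new input. Token strings are taken from the
# docstring examples above.
if __name__ == "__main__":
    remover = StopwordRemover()
    corpus_tokens = (
        "ben bugün gidip aşı olacağım sonra da eve gelip telefon açacağım "
        "aşı nasıl etkiledi eve gelip anlatırım aşı olmak bu dönemde çok "
        "ama ama ama ama çok önemli"
    ).split()
    dynamic_stop_words = remover.dynamically_detect_stop_words(corpus_tokens)
    remover.add_to_stop_words(dynamic_stop_words)
    print(remover.drop_stop_words("aşı olmak bu dönemde çok önemli".split()))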