updates

2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions
--- a/Backend/venv/lib/python3.12/site-packages/nltk/stem/cistem.py
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/stem/cistem.py
@@ -0,0 +1,209 @@
+# Natural Language Toolkit: CISTEM Stemmer for German
+# Copyright (C) 2001-2025 NLTK Project
+# Author: Leonie Weissweiler <l.weissweiler@outlook.de>
+#         Tom Aarsen <> (modifications)
+# Algorithm: Leonie Weissweiler <l.weissweiler@outlook.de>
+#            Alexander Fraser <fraser@cis.lmu.de>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+import re
+from typing import Tuple
+
+from nltk.stem.api import StemmerI
+
+
+class Cistem(StemmerI):
+    """
+    CISTEM Stemmer for German
+
+    This is the official Python implementation of the CISTEM stemmer.
+    It is based on the paper
+    Leonie Weissweiler, Alexander Fraser (2017). Developing a Stemmer for German
+    Based on a Comparative Analysis of Publicly Available Stemmers.
+    In Proceedings of the German Society for Computational Linguistics and Language
+    Technology (GSCL)
+    which can be read here:
+    https://www.cis.lmu.de/~weissweiler/cistem/
+
+    In the paper, we conducted an analysis of publicly available stemmers,
+    developed two gold standards for German stemming and evaluated the stemmers
+    based on the two gold standards. We then proposed the stemmer implemented here
+    and show that it achieves slightly better f-measure than the other stemmers and
+    is thrice as fast as the Snowball stemmer for German while being about as fast
+    as most other stemmers.
+
+    case_insensitive is a a boolean specifying if case-insensitive stemming
+    should be used. Case insensitivity improves performance only if words in the
+    text may be incorrectly upper case. For all-lowercase and correctly cased
+    text, best performance is achieved by setting case_insensitive for false.
+
+    :param case_insensitive: if True, the stemming is case insensitive. False by default.
+    :type case_insensitive: bool
+    """
+
+    strip_ge = re.compile(r"^ge(.{4,})")
+    repl_xx = re.compile(r"(.)\1")
+    strip_emr = re.compile(r"e[mr]$")
+    strip_nd = re.compile(r"nd$")
+    strip_t = re.compile(r"t$")
+    strip_esn = re.compile(r"[esn]$")
+    repl_xx_back = re.compile(r"(.)\*")
+
+    def __init__(self, case_insensitive: bool = False):
+        self._case_insensitive = case_insensitive
+
+    @staticmethod
+    def replace_to(word: str) -> str:
+        word = word.replace("sch", "$")
+        word = word.replace("ei", "%")
+        word = word.replace("ie", "&")
+        word = Cistem.repl_xx.sub(r"\1*", word)
+
+        return word
+
+    @staticmethod
+    def replace_back(word: str) -> str:
+        word = Cistem.repl_xx_back.sub(r"\1\1", word)
+        word = word.replace("%", "ei")
+        word = word.replace("&", "ie")
+        word = word.replace("$", "sch")
+
+        return word
+
+    def stem(self, word: str) -> str:
+        """Stems the input word.
+
+        :param word: The word that is to be stemmed.
+        :type word: str
+        :return: The stemmed word.
+        :rtype: str
+
+        >>> from nltk.stem.cistem import Cistem
+        >>> stemmer = Cistem()
+        >>> s1 = "Speicherbehältern"
+        >>> stemmer.stem(s1)
+        'speicherbehalt'
+        >>> s2 = "Grenzpostens"
+        >>> stemmer.stem(s2)
+        'grenzpost'
+        >>> s3 = "Ausgefeiltere"
+        >>> stemmer.stem(s3)
+        'ausgefeilt'
+        >>> stemmer = Cistem(True)
+        >>> stemmer.stem(s1)
+        'speicherbehal'
+        >>> stemmer.stem(s2)
+        'grenzpo'
+        >>> stemmer.stem(s3)
+        'ausgefeil'
+        """
+        if len(word) == 0:
+            return word
+
+        upper = word[0].isupper()
+        word = word.lower()
+
+        word = word.replace("ü", "u")
+        word = word.replace("ö", "o")
+        word = word.replace("ä", "a")
+        word = word.replace("ß", "ss")
+
+        word = Cistem.strip_ge.sub(r"\1", word)
+
+        return self._segment_inner(word, upper)[0]
+
+    def segment(self, word: str) -> Tuple[str, str]:
+        """
+        This method works very similarly to stem (:func:'cistem.stem'). The difference is that in
+        addition to returning the stem, it also returns the rest that was removed at
+        the end. To be able to return the stem unchanged so the stem and the rest
+        can be concatenated to form the original word, all subsitutions that altered
+        the stem in any other way than by removing letters at the end were left out.
+
+        :param word: The word that is to be stemmed.
+        :type word: str
+        :return: A tuple of the stemmed word and the removed suffix.
+        :rtype: Tuple[str, str]
+
+        >>> from nltk.stem.cistem import Cistem
+        >>> stemmer = Cistem()
+        >>> s1 = "Speicherbehältern"
+        >>> stemmer.segment(s1)
+        ('speicherbehält', 'ern')
+        >>> s2 = "Grenzpostens"
+        >>> stemmer.segment(s2)
+        ('grenzpost', 'ens')
+        >>> s3 = "Ausgefeiltere"
+        >>> stemmer.segment(s3)
+        ('ausgefeilt', 'ere')
+        >>> stemmer = Cistem(True)
+        >>> stemmer.segment(s1)
+        ('speicherbehäl', 'tern')
+        >>> stemmer.segment(s2)
+        ('grenzpo', 'stens')
+        >>> stemmer.segment(s3)
+        ('ausgefeil', 'tere')
+        """
+        if len(word) == 0:
+            return ("", "")
+
+        upper = word[0].isupper()
+        word = word.lower()
+
+        return self._segment_inner(word, upper)
+
+    def _segment_inner(self, word: str, upper: bool):
+        """Inner method for iteratively applying the code stemming regexes.
+        This method receives a pre-processed variant of the word to be stemmed,
+        or the word to be segmented, and returns a tuple of the word and the
+        removed suffix.
+
+        :param word: A pre-processed variant of the word that is to be stemmed.
+        :type word: str
+        :param upper: Whether the original word started with a capital letter.
+        :type upper: bool
+        :return: A tuple of the stemmed word and the removed suffix.
+        :rtype: Tuple[str, str]
+        """
+
+        rest_length = 0
+        word_copy = word[:]
+
+        # Pre-processing before applying the substitution patterns
+        word = Cistem.replace_to(word)
+        rest = ""
+
+        # Apply the substitution patterns
+        while len(word) > 3:
+            if len(word) > 5:
+                word, n = Cistem.strip_emr.subn("", word)
+                if n != 0:
+                    rest_length += 2
+                    continue
+
+                word, n = Cistem.strip_nd.subn("", word)
+                if n != 0:
+                    rest_length += 2
+                    continue
+
+            if not upper or self._case_insensitive:
+                word, n = Cistem.strip_t.subn("", word)
+                if n != 0:
+                    rest_length += 1
+                    continue
+
+            word, n = Cistem.strip_esn.subn("", word)
+            if n != 0:
+                rest_length += 1
+                continue
+            else:
+                break
+
+        # Post-processing after applying the substitution patterns
+        word = Cistem.replace_back(word)
+
+        if rest_length:
+            rest = word_copy[-rest_length:]
+
+        return (word, rest)