updates
@@ -0,0 +1,147 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Christopher Hench <chris.l.hench@gmail.com>
#         Alex Estes
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

"""
The Legality Principle is a language-agnostic principle maintaining that syllable
onsets and codas (the beginnings and ends of syllables, not including the vowel)
are only legal if they are found as word onsets or codas in the language. The English
word ''admit'' must then be syllabified as ''ad-mit'' since ''dm'' is not found
word-initially in the English language (Bartlett et al.). This principle was first proposed
in Daniel Kahn's 1976 dissertation, ''Syllable-based generalizations in English phonology''.

Kahn further argues that there is a ''strong tendency to syllabify in such a way that
initial clusters are of maximal length, consistent with the general constraints on
word-initial consonant clusters.'' Consequently, in addition to being legal onsets,
the longest legal onset is preferable---''Onset Maximization''.
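
For example, a minimal illustration of the ''ad-mit'' case above (assuming the
NLTK ``words`` corpus has been downloaded to serve as the source lexicon):

>>> from nltk.tokenize import LegalitySyllableTokenizer
>>> from nltk.corpus import words
>>> LP = LegalitySyllableTokenizer(words.words())
>>> LP.tokenize("admit")
['ad', 'mit']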

The default implementation assumes an English vowel set, but the `vowels` attribute
can be set to IPA or any other alphabet's vowel set for the use-case.
Both a valid set of vowels and a text corpus of words in the language
are necessary to determine legal onsets and subsequently syllabify words.

The legality principle with onset maximization is a universal syllabification algorithm,
but that does not mean it performs equally well across languages. Bartlett et al. (2009)
is a good benchmark for English accuracy if utilizing IPA (pg. 311).

References:

- Otto Jespersen. 1904. Lehrbuch der Phonetik.
  Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
- Theo Vennemann. 1972. On the Theory of Syllabic Phonology. p. 11.
- Daniel Kahn. 1976. Syllable-based generalizations in English phonology.
  PhD dissertation, MIT.
- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
  In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
  Cambridge, MIT Press. pp. 107-136.
- Jeremy Goslin and Ulrich Frauenfelder. 2001. A comparison of theoretical
  and human syllabification. Language and Speech, 44:409–436.
- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
  In HLT-NAACL. pp. 308-316.
- Christopher Hench. 2017. Resonances in Middle High German: New Methodologies
  in Prosody. UC Berkeley.
"""

from collections import Counter

from nltk.tokenize.api import TokenizerI


class LegalitySyllableTokenizer(TokenizerI):
    """
    Syllabifies words based on the Legality Principle and Onset Maximization.

    >>> from nltk.tokenize import LegalitySyllableTokenizer
    >>> from nltk import word_tokenize
    >>> from nltk.corpus import words
    >>> text = "This is a wonderful sentence."
    >>> text_words = word_tokenize(text)
    >>> LP = LegalitySyllableTokenizer(words.words())
    >>> [LP.tokenize(word) for word in text_words]
    [['This'], ['is'], ['a'], ['won', 'der', 'ful'], ['sen', 'ten', 'ce'], ['.']]
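
    Note that ``tokenize`` operates on a single word or token at a time;
    for example, with the same illustrative setup:

    >>> LP.tokenize("wonderful")
    ['won', 'der', 'ful']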
    """

    def __init__(
        self, tokenized_source_text, vowels="aeiouy", legal_frequency_threshold=0.001
    ):
        """
        :param tokenized_source_text: List of valid tokens in the language
        :type tokenized_source_text: list(str)
        :param vowels: Valid vowels in language or IPA representation
        :type vowels: str
        :param legal_frequency_threshold: Lowest relative frequency (count of an
            onset divided by the total number of onsets) for that onset to be
            considered legal
        :type legal_frequency_threshold: float
        """
        self.legal_frequency_threshold = legal_frequency_threshold
        self.vowels = vowels
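        # Precompute the set of legal onsets from the source corpus once, so
        # that tokenize() only needs fast set-membership checks per character.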
        self.legal_onsets = self.find_legal_onsets(tokenized_source_text)

    def find_legal_onsets(self, words):
        """
        Gather all onsets, then return only those above the frequency threshold.

        :param words: List of words in a language
        :type words: list(str)
        :return: Set of legal onsets
        :rtype: set(str)
        """
        onsets = [self.onset(word) for word in words]
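        # Keep an onset only if its relative frequency among all observed
        # onsets exceeds the threshold, filtering out rare or noisy clusters.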
        legal_onsets = [
            k
            for k, v in Counter(onsets).items()
            if (v / len(onsets)) > self.legal_frequency_threshold
        ]
        return set(legal_onsets)

    def onset(self, word):
        """
        Return the consonant cluster of a word, i.e. all characters up to
        (but not including) the first vowel.

        :param word: Single word or token
        :type word: str
        :return: String of characters of onset
        :rtype: str
        """
        onset = ""
        for c in word.lower():
            if c in self.vowels:
                return onset
            else:
                onset += c
        return onset

    def tokenize(self, token):
        """
        Apply the Legality Principle in combination with
        Onset Maximization to return a list of syllables.

        :param token: Single word or token
        :type token: str
        :return: Single word or token broken up into syllables
        :rtype: list(str)
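
        Example (assuming the NLTK ``words`` corpus is available to supply the
        source lexicon, as in the class docstring):

        >>> from nltk.corpus import words
        >>> LP = LegalitySyllableTokenizer(words.words())
        >>> LP.tokenize("sentence")
        ['sen', 'ten', 'ce']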
        """
        syllables = []
        syllable, current_onset = "", ""
        vowel, onset = False, False
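        # Scan the token right to left, building each syllable in reverse.
        # A consonant joins the current syllable only while the growing
        # cluster remains a legal onset, which yields Onset Maximization.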
        for char in token[::-1]:
            char_lower = char.lower()
            if not vowel:
                syllable += char
                vowel = bool(char_lower in self.vowels)
            else:
                if char_lower + current_onset[::-1] in self.legal_onsets:
                    syllable += char
                    current_onset += char_lower
                    onset = True
                elif char_lower in self.vowels and not onset:
                    syllable += char
                    current_onset += char_lower
                else:
                    syllables.append(syllable)
                    syllable = char
                    current_onset = ""
                    vowel = bool(char_lower in self.vowels)
        syllables.append(syllable)
        syllables_ordered = [syllable[::-1] for syllable in syllables][::-1]
        return syllables_ordered