updates
This commit is contained in:
@@ -0,0 +1,193 @@
|
||||
# Natural Language Toolkit: Language ID module using TextCat algorithm
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Avital Pekker <avital.pekker@utoronto.ca>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A module for language identification using the TextCat algorithm.
|
||||
An implementation of the text categorization algorithm
|
||||
presented in Cavnar, W. B. and J. M. Trenkle,
|
||||
"N-Gram-Based Text Categorization".
|
||||
|
||||
The algorithm takes advantage of Zipf's law and uses
|
||||
n-gram frequencies to profile languages and text — yet to
be identified — then compares using a distance measure.
|
||||
|
||||
Language n-grams are provided by the "An Crubadan"
|
||||
project. A corpus reader was created separately to read
|
||||
those files.
|
||||
|
||||
For details regarding the algorithm, see:
|
||||
https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
|
||||
|
||||
For details about An Crubadan, see:
|
||||
https://borel.slu.edu/crubadan/index.html
|
||||
"""
|
||||
|
||||
from sys import maxsize
|
||||
|
||||
from nltk.util import trigrams
|
||||
|
||||
# Note: this is NOT "re" you're likely used to. The regex module
|
||||
# is an alternative to the standard re module that supports
|
||||
# Unicode codepoint properties with the \p{} syntax.
|
||||
# You may have to "pip install regex"
|
||||
try:
|
||||
import regex as re
|
||||
except ImportError:
|
||||
re = None
|
||||
######################################################################
|
||||
## Language identification using TextCat
|
||||
######################################################################
|
||||
|
||||
|
||||
class TextCat:
    """Language identifier implementing the Cavnar & Trenkle
    "N-Gram-Based Text Categorization" (TextCat) algorithm.

    Language trigram frequency profiles come from the An Crubadan
    project, accessed through ``nltk.corpus.crubadan``.  A text is
    classified by building its own trigram profile and summing, per
    trigram, the "out-of-place" rank distance against each language
    profile; the language with the smallest total distance wins.
    """

    # Class-level attributes, preserved from the original design.
    # _corpus is rebound per instance in __init__; last_distances is
    # rebound per call in guess_language.
    _corpus = None
    fingerprints = {}
    _START_CHAR = "<"
    _END_CHAR = ">"

    last_distances = {}

    def __init__(self):
        """Load the Crubadan corpus and pre-cache every language profile.

        :raises OSError: if the third-party ``regex`` module (needed for
            the ``\\p{}`` Unicode-property syntax in
            :meth:`remove_punctuation`) is not installed.
        """
        if not re:
            raise OSError(
                "classify.textcat requires the regex module that "
                "supports unicode. Try '$ pip install regex' and "
                "see https://pypi.python.org/pypi/regex for "
                "further details."
            )

        from nltk.corpus import crubadan

        self._corpus = crubadan
        # Load all language ngrams into cache
        for lang in self._corpus.langs():
            self._corpus.lang_freq(lang)

    def remove_punctuation(self, text):
        """Get rid of punctuation except apostrophes"""
        # \P{P} matches any non-punctuation codepoint, so the class
        # [^\P{P}\'] is "punctuation other than the apostrophe".
        return re.sub(r"[^\P{P}\']+", "", text)

    def profile(self, text):
        """Create FreqDist of trigrams within text"""
        from nltk import FreqDist, word_tokenize

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            # Pad each token with start/end markers so boundary trigrams
            # are distinct from interior ones.
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            # FreqDist is a Counter subclass: missing keys default to 0,
            # so no membership test is needed before incrementing.
            for tri in token_trigram_tuples:
                fingerprint["".join(tri)] += 1

        return fingerprint

    def calc_dist(self, lang, trigram, text_profile):
        """Calculate the "out-of-place" measure between the
        text and language profile for a single trigram.

        :return: the absolute difference of the trigram's rank
            (position in the profile's key order) between the two
            profiles, or ``sys.maxsize`` when the language profile does
            not contain the trigram at all.
        """
        lang_fd = self._corpus.lang_freq(lang)

        if trigram in lang_fd:
            idx_lang_profile = list(lang_fd.keys()).index(trigram)
            idx_text = list(text_profile.keys()).index(trigram)
            return abs(idx_lang_profile - idx_text)

        # Arbitrary penalty, but should be larger than any possible
        # trigram file length in terms of total lines.
        return maxsize

    def lang_dists(self, text):
        """Calculate the "out-of-place" measure between
        the text and all languages"""

        distances = {}
        profile = self.profile(text)
        # For all the languages
        for lang in self._corpus._all_lang_freq.keys():
            # Sum the distance metric over every trigram in the
            # input text to be identified.
            lang_dist = 0
            for trigram in profile:
                lang_dist += self.calc_dist(lang, trigram, profile)

            distances[lang] = lang_dist

        return distances

    def guess_language(self, text):
        """Find the language with the min distance
        to the text and return its ISO 639-3 code"""
        self.last_distances = self.lang_dists(text)

        return min(self.last_distances, key=self.last_distances.get)
|
||||
|
||||
|
||||
def demo():
    """Demonstrate TextCat on UDHR samples.

    For each language in ``langs``, assemble a text sample from the
    UDHR corpus, run :class:`TextCat` on it, and print the detected
    ISO 639-3 code alongside a human-friendly language name.
    """
    from nltk.corpus import udhr

    langs = [
        "Kurdish-UTF8",
        "Abkhaz-UTF8",
        "Farsi_Persian-UTF8",
        "Hindi-UTF8",
        "Hawaiian-UTF8",
        "Russian-UTF8",
        "Vietnamese-UTF8",
        "Serbian_Srpski-UTF8",
        "Esperanto-UTF8",
    ]

    friendly = {
        "kmr": "Northern Kurdish",
        "abk": "Abkhazian",
        "pes": "Iranian Persian",
        "hin": "Hindi",
        "haw": "Hawaiian",
        "rus": "Russian",
        "vie": "Vietnamese",
        "srp": "Serbian",
        "epo": "Esperanto",
    }

    tc = TextCat()

    for cur_lang in langs:
        # Get raw data from UDHR corpus
        raw_sentences = udhr.sents(cur_lang)

        # Generate a sample text of the language: each sentence is
        # prefixed with a space and its words joined by spaces.
        # NOTE: the original demo iterated range(len - 1), skipping the
        # final sentence; that behavior is preserved by the [:-1] slice.
        sample = "".join(
            " " + " ".join(sentence) for sentence in raw_sentences[:-1]
        )

        # Try to detect what it is
        print("Language snippet: " + sample[0:140] + "...")
        guess = tc.guess_language(sample)
        print(f"Language detection: {guess} ({friendly[guess]})")
        print("#" * 140)
|
||||
|
||||
|
||||
# Run the demonstration when executed as a script.
if __name__ == "__main__":
    demo()
|
||||
Reference in New Issue
Block a user