updates

2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions
--- a/Backend/venv/lib/python3.12/site-packages/nltk/tokenize/nist.py
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/tokenize/nist.py
@@ -0,0 +1,179 @@
+# Natural Language Toolkit: Python port of the mteval-v14.pl tokenizer.
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Liling Tan (ported from ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-v14.pl)
+# Contributors: Ozan Caglayan, Wiktor Stribizew
+#
+# URL: <https://www.nltk.org>
+# For license information, see LICENSE.TXT
+
+"""
+This is an NLTK port of the tokenizer used in the NIST BLEU evaluation script,
+https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
+which was also ported into Python in
+https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
+"""
+
+
+import io
+import re
+
+from nltk.corpus import perluniprops
+from nltk.tokenize.api import TokenizerI
+from nltk.tokenize.util import xml_unescape
+
+
+class NISTTokenizer(TokenizerI):
+    """
+    This NIST tokenizer is sentence-based instead of the original
+    paragraph-based tokenization from mteval-14.pl; The sentence-based
+    tokenization is consistent with the other tokenizers available in NLTK.
+
+    >>> from nltk.tokenize.nist import NISTTokenizer
+    >>> nist = NISTTokenizer()
+    >>> s = "Good muffins cost $3.88 in New York."
+    >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
+    >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
+    >>> nist.tokenize(s, lowercase=False) == expected_cased
+    True
+    >>> nist.tokenize(s, lowercase=True) == expected_lower  # Lowercased.
+    True
+
+    The international_tokenize() is the preferred function when tokenizing
+    non-european text, e.g.
+
+    >>> from nltk.tokenize.nist import NISTTokenizer
+    >>> nist = NISTTokenizer()
+
+    # Input strings.
+    >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
+    >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
+    >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'
+
+    # Expected tokens.
+    >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'\u963f\u91cc\u5df4\u5df4\u96c6\u56e2\u63a7\u80a1', u'\u6709\u9650\u516c\u53f8', u')']
+    >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'\u02c8\xe6', u'm']
+    >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'\u697d\u5929\u682a\u5f0f\u4f1a\u793e', u'Rakuten', u'Kabushiki', u'-', u'gaisha']
+
+    >>> nist.international_tokenize(albb)[:10] == expected_albb
+    True
+    >>> nist.international_tokenize(amz)[:10] == expected_amz
+    True
+    >>> nist.international_tokenize(rkt)[:10] == expected_rkt
+    True
+
+    # Doctest for patching issue #1926
+    >>> sent = u'this is a foo\u2604sentence.'
+    >>> expected_sent = [u'this', u'is', u'a', u'foo', u'\u2604', u'sentence', u'.']
+    >>> nist.international_tokenize(sent) == expected_sent
+    True
+    """
+
+    # Each rule below is a (compiled pattern, replacement) pair; this one strips "<skipped>" tags.
+    STRIP_SKIP = re.compile("<skipped>"), ""
+    # Join lines: U+2028 is replaced by a space (despite the name, no hyphen is matched).
+    STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
+    # Pad ASCII punctuation/symbols with spaces; the class omits ' , - and . (comma, period and dash have rules below).
+    PUNCT = re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 "
+    # Tokenize period and comma unless preceded by a digit.
+    PERIOD_COMMA_PRECEED = re.compile(r"([^0-9])([\.,])"), "\\1 \\2 "
+    # Tokenize period and comma unless followed by a digit.
+    PERIOD_COMMA_FOLLOW = re.compile(r"([\.,])([^0-9])"), " \\1 \\2"
+    # Tokenize dash when preceded by a digit
+    DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 "
+
+    LANG_DEPENDENT_REGEXES = [  # applied in this order by tokenize() when western_lang is true
+        PUNCT,
+        PERIOD_COMMA_PRECEED,
+        PERIOD_COMMA_FOLLOW,
+        DASH_PRECEED_DIGIT,
+    ]
+
+    # Perluniprops characters used in NIST tokenizer.
+    pup_number = str("".join(set(perluniprops.chars("Number"))))  # i.e. \p{N}
+    pup_punct = str("".join(set(perluniprops.chars("Punctuation"))))  # i.e. \p{P}
+    pup_symbol = str("".join(set(perluniprops.chars("Symbol"))))  # i.e. \p{S}
+
+    # Backslash-escape the characters that are special inside a regex character class
+    # (], ^, \ and -) before interpolating; see https://stackoverflow.com/q/45670950/610569
+    number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
+    punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
+    symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)
+
+    # Note: In the original perl implementation, \p{Z} and \p{Zl} were used to
+    #       (i) strip trailing and leading spaces  and
+    #       (ii) de-duplicate spaces.
+    #       In Python, this would do: ' '.join(str.strip().split())
+    # Thus, the next two lines were commented out.
+    # Line_Separator = str(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl}
+    # Separator = str(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z}
+
+    # Pads each run of ASCII characters (\x00-\x7f) with spaces, separating it from adjacent non-ASCII text.
+    NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
+    # Split punctuation (\p{P}) from an adjacent number (\p{N}) character. NOTE(review): the inherited wording, "unless followed AND preceded by a digit", does not match these patterns - confirm against mteval-v14.pl.
+    PUNCT_1 = (
+        re.compile(f"([{number_regex}])([{punct_regex}])"),
+        "\\1 \\2 ",
+    )
+    PUNCT_2 = (
+        re.compile(f"([{punct_regex}])([{number_regex}])"),
+        " \\1 \\2",
+    )
+    # Pad every symbol (\p{S}) character with spaces.
+    SYMBOLS = re.compile(f"([{symbol_regex}])"), " \\1 "
+
+    INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]  # applied in this order by international_tokenize()
+
+    def lang_independent_sub(self, text):
+        """Performs the language independent string substituitions."""
+        # It's a strange order of regexes.
+        # It'll be better to unescape after STRIP_EOL_HYPHEN
+        # but let's keep it close to the original NIST implementation.
+        regexp, substitution = self.STRIP_SKIP
+        text = regexp.sub(substitution, text)
+        text = xml_unescape(text)
+        regexp, substitution = self.STRIP_EOL_HYPHEN
+        text = regexp.sub(substitution, text)
+        return text
+
+    def tokenize(self, text, lowercase=False, western_lang=True, return_str=False):
+        text = str(text)
+        # Language independent substitutions (STRIP_SKIP, xml_unescape, STRIP_EOL_HYPHEN).
+        text = self.lang_independent_sub(text)
+        # Language dependent regex.
+        if western_lang:  # NOTE: `lowercase` is only honoured inside this branch
+            # Pad string with whitespace so rules that need a neighbouring character can match at both ends.
+            text = " " + text + " "
+            if lowercase:
+                text = text.lower()
+            for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
+                text = regexp.sub(substitution, text)
+        # Remove contiguous whitespaces.
+        text = " ".join(text.split())
+        # Finally, strip leading and trailing spaces and coerce to str
+        # (the join above already leaves no outer whitespace).
+        text = str(text.strip())
+        return text if return_str else text.split()  # one space-joined str, or a list of tokens
+
+    def international_tokenize(
+        self, text, lowercase=False, split_non_ascii=True, return_str=False
+    ):  # NOTE(review): `split_non_ascii` is never read in this body; NONASCII is always applied
+        text = str(text)
+        # Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
+        # first before unescaping.
+        regexp, substitution = self.STRIP_SKIP
+        text = regexp.sub(substitution, text)
+        regexp, substitution = self.STRIP_EOL_HYPHEN
+        text = regexp.sub(substitution, text)
+        text = xml_unescape(text)
+
+        if lowercase:
+            text = text.lower()
+
+        for regexp, substitution in self.INTERNATIONAL_REGEXES:
+            text = regexp.sub(substitution, text)
+
+        # Make sure that there's only one space only between words.
+        # Strip leading and trailing spaces.
+        text = " ".join(text.strip().split())
+        return text if return_str else text.split()  # one space-joined str, or a list of tokens