updates

2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions
--- a/Backend/venv/lib/python3.12/site-packages/nltk/translate/api.py
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/translate/api.py
@@ -0,0 +1,335 @@
+# Natural Language Toolkit: API for alignment and translation objects
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: Will Zhang <wilzzha@gmail.com>
+#         Guan Gui <ggui@student.unimelb.edu.au>
+#         Steven Bird <stevenbird1@gmail.com>
+#         Tah Wei Hoon <hoon.tw@gmail.com>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+import subprocess
+from collections import namedtuple
+
+
+class AlignedSent:
+    """
+    Return an aligned sentence object, which encapsulates two sentences
+    along with an ``Alignment`` between them.
+
+    Typically used in machine translation to represent a sentence and
+    its translation.
+
+        >>> from nltk.translate import AlignedSent, Alignment
+        >>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'],
+        ...     ['the', 'house', 'is', 'small'], Alignment.fromstring('0-3 1-2 2-0 3-1'))
+        >>> algnsent.words
+        ['klein', 'ist', 'das', 'Haus']
+        >>> algnsent.mots
+        ['the', 'house', 'is', 'small']
+        >>> algnsent.alignment
+        Alignment([(0, 3), (1, 2), (2, 0), (3, 1)])
+        >>> from nltk.corpus import comtrans
+        >>> print(comtrans.aligned_sents()[54])
+        <AlignedSent: 'Weshalb also sollten...' -> 'So why should EU arm...'>
+        >>> print(comtrans.aligned_sents()[54].alignment)
+        0-0 0-1 1-0 2-2 3-4 3-5 4-7 5-8 6-3 7-9 8-9 9-10 9-11 10-12 11-6 12-6 13-13
+
+    :param words: Words in the target language sentence
+    :type words: list(str)
+    :param mots: Words in the source language sentence
+    :type mots: list(str)
+    :param alignment: Word-level alignments between ``words`` and ``mots``.
+        Each alignment is represented as a 2-tuple (words_index, mots_index).
+    :type alignment: Alignment
+    """
+
+    def __init__(self, words, mots, alignment=None):
+        self._words = words
+        self._mots = mots
+        # Assigning through the ``alignment`` property runs
+        # ``_check_alignment``, which verifies both the type and the index
+        # bounds, so no separate type check is needed here.
+        self.alignment = Alignment([]) if alignment is None else alignment
+
+    @property
+    def words(self):
+        """
+        The first sentence of the pair, as passed to the constructor.
+        """
+        return self._words
+
+    @property
+    def mots(self):
+        """
+        The second sentence of the pair, as passed to the constructor.
+        """
+        return self._mots
+
+    def _get_alignment(self):
+        return self._alignment
+
+    def _set_alignment(self, alignment):
+        # Reject alignments that point outside either sentence before
+        # storing them.
+        _check_alignment(len(self.words), len(self.mots), alignment)
+        self._alignment = alignment
+
+    alignment = property(_get_alignment, _set_alignment)
+
+    def __repr__(self):
+        """
+        Return a string representation for this ``AlignedSent``.
+
+        :rtype: str
+        """
+        words = "[%s]" % (", ".join("'%s'" % w for w in self._words))
+        mots = "[%s]" % (", ".join("'%s'" % w for w in self._mots))
+
+        return f"AlignedSent({words}, {mots}, {self._alignment!r})"
+
+    @staticmethod
+    def _dot_label(text):
+        """
+        Escape ``text`` so it can be embedded in a double-quoted DOT label.
+
+        :rtype: str
+        """
+        return str(text).replace("\\", "\\\\").replace('"', '\\"')
+
+    def _to_dot(self):
+        """
+        Dot representation of the aligned sentence
+
+        Nodes are identified by their position in the sentence rather than
+        by the word text, so a word occurring several times in a sentence
+        yields one node per occurrence.
+
+        :rtype: str
+        """
+        # The "source"/"target" node names follow this method's historical
+        # naming: "source" nodes come from ``words``, "target" from ``mots``.
+        source_ids = [f'"source_{i}"' for i in range(len(self._words))]
+        target_ids = [f'"target_{j}"' for j in range(len(self._mots))]
+
+        lines = ["graph align {", "node[shape=plaintext]"]
+
+        # Declare node
+        lines.extend(
+            f'{node} [label="{self._dot_label(w)}"]'
+            for node, w in zip(source_ids, self._words)
+        )
+        lines.extend(
+            f'{node} [label="{self._dot_label(w)}"]'
+            for node, w in zip(target_ids, self._mots)
+        )
+
+        # Alignment
+        for pair in self._alignment:
+            # Index instead of unpacking: pairs may carry extra fields, and
+            # ``_check_alignment`` accepts ``None`` as the second element.
+            u, v = pair[0], pair[1]
+            if v is None:
+                continue
+            lines.append(f"{source_ids[u]} -- {target_ids[v]}")
+
+        # Connect the source words
+        lines.extend(
+            f"{left} -- {right} [style=invis]"
+            for left, right in zip(source_ids, source_ids[1:])
+        )
+
+        # Connect the target words
+        lines.extend(
+            f"{left} -- {right} [style=invis]"
+            for left, right in zip(target_ids, target_ids[1:])
+        )
+
+        # Put it in the same rank
+        lines.append("{rank = same; %s}" % " ".join(source_ids))
+        lines.append("{rank = same; %s}" % " ".join(target_ids))
+
+        lines.append("}")
+
+        return "\n".join(lines)
+
+    def _repr_svg_(self):
+        """
+        Ipython magic : show SVG representation of this ``AlignedSent``.
+
+        Pipes the output of ``_to_dot`` through the ``dot`` executable and
+        returns whatever it writes to standard output, decoded as UTF-8.
+
+        :rtype: str
+        :raise Exception: if the ``dot`` process cannot be started
+        """
+        dot_string = self._to_dot().encode("utf8")
+        output_format = "svg"
+        try:
+            process = subprocess.Popen(
+                ["dot", "-T%s" % output_format],
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
+        except OSError as e:
+            raise Exception("Cannot find the dot binary from Graphviz package") from e
+        # Standard error is captured so it does not leak to the console; it
+        # is not inspected.
+        out, _err = process.communicate(dot_string)
+
+        return out.decode("utf8")
+
+    def __str__(self):
+        """
+        Return a human-readable string representation for this ``AlignedSent``.
+
+        Each sentence is joined with spaces, cut to its first 20 characters
+        and followed by ``...`` (also when nothing was cut).
+
+        :rtype: str
+        """
+        source = " ".join(self._words)[:20] + "..."
+        target = " ".join(self._mots)[:20] + "..."
+        return f"<AlignedSent: '{source}' -> '{target}'>"
+
+    def invert(self):
+        """
+        Return the aligned sentence pair, reversing the directionality
+
+        :rtype: AlignedSent
+        """
+        return AlignedSent(self._mots, self._words, self._alignment.invert())
+
+
+class Alignment(frozenset):
+    """
+    A storage class for representing alignment between two sequences, s1, s2.
+    In general, an alignment is a set of tuples of the form (i, j, ...)
+    representing an alignment between the i-th element of s1 and the
+    j-th element of s2.  Tuples are extensible (they might contain
+    additional data, such as a boolean to indicate sure vs possible alignments).
+
+        >>> from nltk.translate import Alignment
+        >>> a = Alignment([(0, 0), (0, 1), (1, 2), (2, 2)])
+        >>> a.invert()
+        Alignment([(0, 0), (1, 0), (2, 1), (2, 2)])
+        >>> print(a.invert())
+        0-0 1-0 2-1 2-2
+        >>> a[0]
+        [(0, 1), (0, 0)]
+        >>> a.invert()[2]
+        [(2, 1), (2, 2)]
+        >>> b = Alignment([(0, 0), (0, 1)])
+        >>> b.issubset(a)
+        True
+        >>> c = Alignment.fromstring('0-0 0-1')
+        >>> b == c
+        True
+    """
+
+    def __new__(cls, pairs):
+        self = frozenset.__new__(cls, pairs)
+        # Largest first-sequence position (0 for an empty alignment); it
+        # determines the size of the lazily built lookup index.
+        self._len = max((p[0] for p in self), default=0)
+        # Built on first use by ``_build_index``.
+        self._index = None
+        return self
+
+    @classmethod
+    def fromstring(cls, s):
+        """
+        Read a giza-formatted string and return an Alignment object.
+
+            >>> Alignment.fromstring('0-0 2-1 9-2 21-3 10-4 7-5')
+            Alignment([(0, 0), (2, 1), (7, 5), (9, 2), (10, 4), (21, 3)])
+
+        :type s: str
+        :param s: the positional alignments in giza format
+        :rtype: Alignment
+        :return: An Alignment object corresponding to the string representation ``s``.
+        """
+
+        return Alignment([_giza2pair(a) for a in s.split()])
+
+    def __getitem__(self, key):
+        """
+        Look up the alignments that map from a given index or slice.
+        """
+        if self._index is None:
+            self._build_index()
+        return self._index.__getitem__(key)
+
+    def invert(self):
+        """
+        Return an Alignment object, being the inverted mapping.
+
+        The first two elements of every tuple are swapped; any further
+        elements are carried over unchanged.
+        """
+        return Alignment(((p[1], p[0]) + p[2:]) for p in self)
+
+    def range(self, positions=None):
+        """
+        Work out the range of the mapping from the given positions.
+        If no positions are specified, compute the range of the entire mapping.
+
+        :return: the sorted, de-duplicated second elements of the matching pairs
+        :rtype: list
+        """
+        if self._index is None:
+            self._build_index()
+        if not positions:
+            positions = range(len(self._index))
+        image = set()
+        for p in positions:
+            # Index rather than unpack: tuples may hold more than two fields.
+            image.update(pair[1] for pair in self._index[p])
+        return sorted(image)
+
+    def __repr__(self):
+        """
+        Produce a constructor-style representation listing the sorted pairs.
+        """
+        return "Alignment(%r)" % sorted(self)
+
+    def __str__(self):
+        """
+        Produce a Giza-formatted string representing the alignment.
+
+        Only the first two elements of each tuple are written.
+        """
+        return " ".join("%d-%d" % p[:2] for p in sorted(self))
+
+    def _build_index(self):
+        """
+        Build a list self._index such that self._index[i] is a list
+        of the alignments originating from word i.
+        """
+        self._index = [[] for _ in range(self._len + 1)]
+        for p in self:
+            self._index[p[0]].append(p)
+
+
+def _giza2pair(pair_string):
+    """
+    Convert one ``"i-j"`` token into a tuple of two ints.
+
+    :type pair_string: str
+    :rtype: tuple(int, int)
+    """
+    first_text, second_text = pair_string.split("-")
+    first = int(first_text)
+    second = int(second_text)
+    return (first, second)
+
+
+def _naacl2pair(pair_string):
+    """
+    Convert one ``"i-j-x"`` token into a tuple of two ints.
+
+    The third dash-separated field must be present but is discarded.
+
+    :type pair_string: str
+    :rtype: tuple(int, int)
+    """
+    first_text, second_text, _ignored = pair_string.split("-")
+    first = int(first_text)
+    second = int(second_text)
+    return (first, second)
+
+
+def _check_alignment(num_words, num_mots, alignment):
+    """
+    Verify that every pair of ``alignment`` stays inside both sentences.
+
+    :param num_words: length of the sentence indexed by the first element
+        of each pair
+    :type num_words: int
+    :param num_mots: length of the sentence indexed by the second element
+        of each pair
+    :type num_mots: int
+    :param alignment: alignment to be checked
+    :type alignment: Alignment
+    :raise IndexError: if alignment falls outside the sentence
+    """
+
+    assert type(alignment) is Alignment
+
+    if any(not (0 <= pair[0] < num_words) for pair in alignment):
+        raise IndexError("Alignment is outside boundary of words")
+    # A second element of ``None`` is accepted without a bounds check.
+    if any(
+        pair[1] is not None and not (0 <= pair[1] < num_mots) for pair in alignment
+    ):
+        raise IndexError("Alignment is outside boundary of mots")
+
+
+# Record pairing a target phrase with its log probability.
+PhraseTableEntry = namedtuple("PhraseTableEntry", "trg_phrase log_prob")
+
+
+class PhraseTable:
+    """
+    In-memory store mapping a source phrase to its translations, each
+    kept together with its log probability
+    """
+
+    def __init__(self):
+        self.src_phrases = {}
+
+    def translations_for(self, src_phrase):
+        """
+        Look up the stored translations of a source language phrase
+
+        :param src_phrase: Source language phrase of interest
+        :type src_phrase: tuple(str)
+
+        :return: The target language phrases recorded for
+            ``src_phrase``, most likely first. Each element holds the
+            target phrase and its log probability.
+        :rtype: list(PhraseTableEntry)
+        :raise KeyError: if nothing was stored for ``src_phrase``
+        """
+        candidates = self.src_phrases[src_phrase]
+        return candidates
+
+    def add(self, src_phrase, trg_phrase, log_prob):
+        """
+        Record ``trg_phrase`` as a translation of ``src_phrase``
+
+        :type src_phrase: tuple(str)
+        :type trg_phrase: tuple(str)
+
+        :param log_prob: Log probability that given ``src_phrase``,
+            ``trg_phrase`` is its translation
+        :type log_prob: float
+        """
+        new_entry = PhraseTableEntry(trg_phrase=trg_phrase, log_prob=log_prob)
+        candidates = self.src_phrases.setdefault(src_phrase, [])
+        candidates.append(new_entry)
+        # Re-sort after each insertion so the list stays ordered from the
+        # highest log probability to the lowest.
+        candidates.sort(key=lambda item: item.log_prob, reverse=True)
+
+    def __contains__(self, src_phrase):
+        known = self.src_phrases
+        return src_phrase in known