updates
Backend/venv/lib/python3.12/site-packages/nltk/translate/ibm1.py (new file, 251 lines)
@@ -0,0 +1,251 @@
# Natural Language Toolkit: IBM Model 1
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Chin Yee Lee <c.lee32@student.unimelb.edu.au>
#         Hengfeng Li <hengfeng12345@gmail.com>
#         Ruxin Hou <r.hou@student.unimelb.edu.au>
#         Calvin Tanujaya Lim <c.tanujayalim@gmail.com>
# Based on earlier version by:
#         Will Zhang <wilzzha@gmail.com>
#         Guan Gui <ggui@student.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Lexical translation model that ignores word order.

In IBM Model 1, word order is ignored for simplicity. As long as the
word alignments are equivalent, it doesn't matter where the word occurs
in the source or target sentence. Thus, the following three alignments
are equally likely::

    Source: je mange du jambon
    Target: i eat some ham
    Alignment: (0,0) (1,1) (2,2) (3,3)

    Source: je mange du jambon
    Target: some ham eat i
    Alignment: (0,2) (1,3) (2,1) (3,0)

    Source: du jambon je mange
    Target: eat i some ham
    Alignment: (0,3) (1,2) (2,0) (3,1)

Note that an alignment is represented here as
(word_index_in_target, word_index_in_source).

The EM algorithm used in Model 1 is:

:E step: In the training data, count how many times a source language
         word is translated into a target language word, weighted by
         the prior probability of the translation.

:M step: Estimate the new probability of translation based on the
         counts from the Expectation step.
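
For a concrete illustration, consider a tiny corpus of two pairs,
``das haus`` -> ``the house`` and ``das buch`` -> ``the book``. The
target vocabulary is {the, house, book}, so every P(t | s), including
P(t | NULL), is initialized to 1/3. One EM iteration then proceeds as
follows::

    E step: in each pair, a target word t can align to NULL or to
            either source word, and sum over s of P(t | s) = 1, so
            each of the three candidate source words receives a
            fractional count of 1/3 for t.
    M step: count('the' | 'das') = 2/3 ('das' co-occurs with 'the'
            in both pairs), and count(any t | 'das') = 4/3, so
            P('the' | 'das') = 1/2, while P('house' | 'das') =
            P('book' | 'das') = 1/4.

After a single iteration, 'das' already prefers 'the' over the nouns,
and further iterations sharpen that preference.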

Notations
---------

:i: Position in the source sentence
    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
:j: Position in the target sentence
    Valid values are 1, 2, ..., length of target sentence
:s: A word in the source language
:t: A word in the target language
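
For instance, for the pair ``je mange du jambon`` -> ``i eat some
ham``, i ranges over 0 (NULL), 1 (je), 2 (mange), 3 (du), 4 (jambon),
and j over 1 (i), 2 (eat), 3 (some), 4 (ham). This one-based
convention is internal to the model; the ``Alignment`` tuples shown
earlier are zero-indexed.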

References
----------

Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""

import warnings
from collections import defaultdict

from nltk.translate import AlignedSent, Alignment, IBMModel
from nltk.translate.ibm_model import Counts


class IBMModel1(IBMModel):
    """
    Lexical translation model that ignores word order

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))

    >>> ibm1 = IBMModel1(bitext, 5)

    >>> print(round(ibm1.translation_table['buch']['book'], 3))
    0.889
    >>> print(round(ibm1.translation_table['das']['book'], 3))
    0.062
    >>> print(round(ibm1.translation_table['buch'][None], 3))
    0.113
    >>> print(round(ibm1.translation_table['ja'][None], 3))
    0.073

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])

    """

    def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, the following entry must be present:
            ``translation_table``.
            See ``IBMModel`` for the type and purpose of this table.
        :type probability_tables: dict[str]: object
        """
        super().__init__(sentence_aligned_corpus)

        if probability_tables is None:
            self.set_uniform_probabilities(sentence_aligned_corpus)
        else:
            # Set user-defined probabilities
            self.translation_table = probability_tables["translation_table"]
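            # A hypothetical seeding sketch (names invented here for
            # illustration), mirroring the nested defaultdict layout of
            # ``IBMModel.translation_table``, i.e. table[t][s] = P(t | s):
            #
            #     table = defaultdict(lambda: defaultdict(lambda: 0.01))
            #     table['haus']['house'] = 0.9
            #     IBMModel1(bitext, 5, {'translation_table': table})
            #
            # With a defaultdict-backed table, pairs that are not filled
            # in fall back to the default value, so the table does not
            # need to be exhaustive.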

        for n in range(0, iterations):
            self.train(sentence_aligned_corpus)

        self.align_all(sentence_aligned_corpus)

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        initial_prob = 1 / len(self.trg_vocab)
        if initial_prob < IBMModel.MIN_PROB:
            warnings.warn(
                "Target language vocabulary is too large ("
                + str(len(self.trg_vocab))
                + " words). "
                "Results may be less accurate."
            )

        for t in self.trg_vocab:
            self.translation_table[t] = defaultdict(lambda: initial_prob)
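        # The shared lambda default means any source word not yet seen,
        # including NULL (keyed as None), also evaluates to the uniform
        # initial probability; entries are materialized on first access.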

    def train(self, parallel_corpus):
        counts = Counts()
        for aligned_sentence in parallel_corpus:
            trg_sentence = aligned_sentence.words
            src_sentence = [None] + aligned_sentence.mots
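            # Prepending None lets every target word align to the NULL
            # token instead of a real source word.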

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_all_alignments(src_sentence, trg_sentence)

            # E step (b): Collect counts
            for t in trg_sentence:
                for s in src_sentence:
                    count = self.prob_alignment_point(s, t)
                    normalized_count = count / total_count[t]
                    counts.t_given_s[t][s] += normalized_count
                    counts.any_t_given_s[s] += normalized_count

        # M step: Update probabilities with maximum likelihood estimate
        self.maximize_lexical_translation_probabilities(counts)

    def prob_all_alignments(self, src_sentence, trg_sentence):
        """
        Computes the probability of all possible word alignments,
        expressed as a marginal distribution over target words t

        Each entry in the return value represents the contribution to
        the total alignment probability by the target word t.

        To obtain probability(alignment | src_sentence, trg_sentence),
        simply sum the entries in the return value.

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        """
        alignment_prob_for_t = defaultdict(float)
        for t in trg_sentence:
            for s in src_sentence:
                alignment_prob_for_t[t] += self.prob_alignment_point(s, t)
        return alignment_prob_for_t

    def prob_alignment_point(self, s, t):
        """
        Probability that word ``t`` in the target sentence is aligned to
        word ``s`` in the source sentence
        """
        return self.translation_table[t][s]

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence
        """
        prob = 1.0

        for j, i in enumerate(alignment_info.alignment):
            if j == 0:
                continue  # skip the dummy zeroth element
            trg_word = alignment_info.trg_sentence[j]
            src_word = alignment_info.src_sentence[i]
            prob *= self.translation_table[trg_word][src_word]
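        # The constant factor epsilon / (l + 1)^m from Model 1's
        # definition of P(t, a | s) in Brown et al. (1993) is omitted,
        # presumably because callers compare alignments for the same
        # sentence pair, where that factor cancels.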

        return max(prob, IBMModel.MIN_PROB)

    def align_all(self, parallel_corpus):
        for sentence_pair in parallel_corpus:
            self.align(sentence_pair)

    def align(self, sentence_pair):
        """
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are
        zero-indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        """
        best_alignment = []

        for j, trg_word in enumerate(sentence_pair.words):
            # Initialize trg_word to align with the NULL token
            best_prob = max(self.translation_table[trg_word][None], IBMModel.MIN_PROB)
            best_alignment_point = None
            for i, src_word in enumerate(sentence_pair.mots):
                align_prob = self.translation_table[trg_word][src_word]
                if align_prob >= best_prob:  # prefer newer word in case of tie
                    best_prob = align_prob
                    best_alignment_point = i

            best_alignment.append((j, best_alignment_point))

        sentence_pair.alignment = Alignment(best_alignment)