updates
Backend/venv/lib/python3.12/site-packages/nltk/translate/ibm1.py (new file, 251 lines)
@@ -0,0 +1,251 @@
# Natural Language Toolkit: IBM Model 1
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Chin Yee Lee <c.lee32@student.unimelb.edu.au>
#         Hengfeng Li <hengfeng12345@gmail.com>
#         Ruxin Hou <r.hou@student.unimelb.edu.au>
#         Calvin Tanujaya Lim <c.tanujayalim@gmail.com>
# Based on earlier version by:
#         Will Zhang <wilzzha@gmail.com>
#         Guan Gui <ggui@student.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Lexical translation model that ignores word order.

In IBM Model 1, word order is ignored for simplicity. As long as the
word alignments are equivalent, it doesn't matter where the word occurs
in the source or target sentence. Thus, the following three alignments
are equally likely::

    Source: je mange du jambon
    Target: i eat some ham
    Alignment: (0,0) (1,1) (2,2) (3,3)

    Source: je mange du jambon
    Target: some ham eat i
    Alignment: (0,2) (1,3) (2,1) (3,0)

    Source: du jambon je mange
    Target: eat i some ham
    Alignment: (0,3) (1,2) (2,0) (3,1)

Note that an alignment is represented here as
(word_index_in_target, word_index_in_source).

The EM algorithm used in Model 1 is:

:E step: In the training data, count how many times a source language
         word is translated into a target language word, weighted by
         the prior probability of the translation.

:M step: Estimate the new probability of translation based on the
         counts from the Expectation step.
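
For a concrete illustration, consider a tiny corpus of two pairs,
``das haus`` -> ``the house`` and ``das buch`` -> ``the book``. The
target vocabulary is {the, house, book}, so every P(t | s), including
P(t | NULL), is initialized to 1/3. One EM iteration then proceeds as
follows::

    E step: in each pair, a target word t can align to NULL or to
            either source word, and sum over s of P(t | s) = 1, so
            each of the three candidate source words receives a
            fractional count of 1/3 for t.
    M step: count('the' | 'das') = 2/3 ('das' co-occurs with 'the'
            in both pairs), and count(any t | 'das') = 4/3, so
            P('the' | 'das') = 1/2, while P('house' | 'das') =
            P('book' | 'das') = 1/4.

After a single iteration, 'das' already prefers 'the' over the nouns,
and further iterations sharpen that preference.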

Notations
---------

:i: Position in the source sentence
    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
:j: Position in the target sentence
    Valid values are 1, 2, ..., length of target sentence
:s: A word in the source language
:t: A word in the target language
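
For instance, for the pair ``je mange du jambon`` -> ``i eat some
ham``, i ranges over 0 (NULL), 1 (je), 2 (mange), 3 (du), 4 (jambon),
and j over 1 (i), 2 (eat), 3 (some), 4 (ham). This one-based
convention is internal to the model; the ``Alignment`` tuples shown
earlier are zero-indexed.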

References
----------

Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""

import warnings
from collections import defaultdict

from nltk.translate import AlignedSent, Alignment, IBMModel
from nltk.translate.ibm_model import Counts


class IBMModel1(IBMModel):
    """
    Lexical translation model that ignores word order

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))

    >>> ibm1 = IBMModel1(bitext, 5)

    >>> print(round(ibm1.translation_table['buch']['book'], 3))
    0.889
    >>> print(round(ibm1.translation_table['das']['book'], 3))
    0.062
    >>> print(round(ibm1.translation_table['buch'][None], 3))
    0.113
    >>> print(round(ibm1.translation_table['ja'][None], 3))
    0.073

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])

    """

    def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, the following entry must be present:
            ``translation_table``.
            See ``IBMModel`` for the type and purpose of this table.
        :type probability_tables: dict[str]: object
        """
        super().__init__(sentence_aligned_corpus)

        if probability_tables is None:
            self.set_uniform_probabilities(sentence_aligned_corpus)
        else:
            # Set user-defined probabilities
            self.translation_table = probability_tables["translation_table"]
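            # A hypothetical seeding sketch (names invented here for
            # illustration), mirroring the nested defaultdict layout of
            # ``IBMModel.translation_table``, i.e. table[t][s] = P(t | s):
            #
            #     table = defaultdict(lambda: defaultdict(lambda: 0.01))
            #     table['haus']['house'] = 0.9
            #     IBMModel1(bitext, 5, {'translation_table': table})
            #
            # With a defaultdict-backed table, pairs that are not filled
            # in fall back to the default value, so the table does not
            # need to be exhaustive.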

        for n in range(0, iterations):
            self.train(sentence_aligned_corpus)

        self.align_all(sentence_aligned_corpus)

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        initial_prob = 1 / len(self.trg_vocab)
        if initial_prob < IBMModel.MIN_PROB:
            warnings.warn(
                "Target language vocabulary is too large ("
                + str(len(self.trg_vocab))
                + " words). "
                "Results may be less accurate."
            )

        for t in self.trg_vocab:
            self.translation_table[t] = defaultdict(lambda: initial_prob)
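        # The shared lambda default means any source word not yet seen,
        # including NULL (keyed as None), also evaluates to the uniform
        # initial probability; entries are materialized on first access.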

    def train(self, parallel_corpus):
        counts = Counts()
        for aligned_sentence in parallel_corpus:
            trg_sentence = aligned_sentence.words
            src_sentence = [None] + aligned_sentence.mots
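            # Prepending None lets every target word align to the NULL
            # token instead of a real source word.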

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_all_alignments(src_sentence, trg_sentence)

            # E step (b): Collect counts
            for t in trg_sentence:
                for s in src_sentence:
                    count = self.prob_alignment_point(s, t)
                    normalized_count = count / total_count[t]
                    counts.t_given_s[t][s] += normalized_count
                    counts.any_t_given_s[s] += normalized_count

        # M step: Update probabilities with maximum likelihood estimate
        self.maximize_lexical_translation_probabilities(counts)

    def prob_all_alignments(self, src_sentence, trg_sentence):
        """
        Computes the probability of all possible word alignments,
        expressed as a marginal distribution over target words t

        Each entry in the return value represents the contribution to
        the total alignment probability by the target word t.

        To obtain probability(alignment | src_sentence, trg_sentence),
        simply sum the entries in the return value.

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        """
        alignment_prob_for_t = defaultdict(float)
        for t in trg_sentence:
            for s in src_sentence:
                alignment_prob_for_t[t] += self.prob_alignment_point(s, t)
        return alignment_prob_for_t

    def prob_alignment_point(self, s, t):
        """
        Probability that word ``t`` in the target sentence is aligned to
        word ``s`` in the source sentence
        """
        return self.translation_table[t][s]

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence
        """
        prob = 1.0

        for j, i in enumerate(alignment_info.alignment):
            if j == 0:
                continue  # skip the dummy zeroth element
            trg_word = alignment_info.trg_sentence[j]
            src_word = alignment_info.src_sentence[i]
            prob *= self.translation_table[trg_word][src_word]
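        # The constant factor epsilon / (l + 1)^m from Model 1's
        # definition of P(t, a | s) in Brown et al. (1993) is omitted,
        # presumably because callers compare alignments for the same
        # sentence pair, where that factor cancels.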

        return max(prob, IBMModel.MIN_PROB)

    def align_all(self, parallel_corpus):
        for sentence_pair in parallel_corpus:
            self.align(sentence_pair)

    def align(self, sentence_pair):
        """
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are
        zero-indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        """
        best_alignment = []

        for j, trg_word in enumerate(sentence_pair.words):
            # Initialize trg_word to align with the NULL token
            best_prob = max(self.translation_table[trg_word][None], IBMModel.MIN_PROB)
            best_alignment_point = None
            for i, src_word in enumerate(sentence_pair.mots):
                align_prob = self.translation_table[trg_word][src_word]
                if align_prob >= best_prob:  # prefer newer word in case of tie
                    best_prob = align_prob
                    best_alignment_point = i

            best_alignment.append((j, best_alignment_point))

        sentence_pair.alignment = Alignment(best_alignment)