updates
This commit is contained in:
@@ -0,0 +1,549 @@
|
||||
# Natural Language Toolkit: IBM Model Core
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Common methods and classes for all IBM models. See ``IBMModel1``,
|
||||
``IBMModel2``, ``IBMModel3``, ``IBMModel4``, and ``IBMModel5``
|
||||
for specific implementations.
|
||||
|
||||
The IBM models are a series of generative models that learn lexical
|
||||
translation probabilities, p(target language word|source language word),
|
||||
given a sentence-aligned parallel corpus.
|
||||
|
||||
The models increase in sophistication from model 1 to 5. Typically, the
|
||||
output of lower models is used to seed the higher models. All models
|
||||
use the Expectation-Maximization (EM) algorithm to learn various
|
||||
probability tables.
|
||||
|
||||
Words in a sentence are one-indexed. The first word of a sentence has
|
||||
position 1, not 0. Index 0 is reserved in the source sentence for the
|
||||
NULL token. The concept of position does not apply to NULL, but it is
|
||||
indexed at 0 by convention.
|
||||
|
||||
Each target word is aligned to exactly one source word or the NULL
|
||||
token.
|
||||
|
||||
References:
|
||||
Philipp Koehn. 2010. Statistical Machine Translation.
|
||||
Cambridge University Press, New York.
|
||||
|
||||
Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
|
||||
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
|
||||
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
|
||||
263-311.
|
||||
"""
|
||||
|
||||
from bisect import insort_left
|
||||
from collections import defaultdict
|
||||
from copy import deepcopy
|
||||
from math import ceil
|
||||
|
||||
|
||||
def longest_target_sentence_length(sentence_aligned_corpus):
|
||||
"""
|
||||
:param sentence_aligned_corpus: Parallel corpus under consideration
|
||||
:type sentence_aligned_corpus: list(AlignedSent)
|
||||
:return: Number of words in the longest target language sentence
|
||||
of ``sentence_aligned_corpus``
|
||||
"""
|
||||
max_m = 0
|
||||
for aligned_sentence in sentence_aligned_corpus:
|
||||
m = len(aligned_sentence.words)
|
||||
max_m = max(m, max_m)
|
||||
return max_m
|
||||
|
||||
|
||||
class IBMModel:
|
||||
"""
|
||||
Abstract base class for all IBM models
|
||||
"""
|
||||
|
||||
# Avoid division by zero and precision errors by imposing a minimum
|
||||
# value for probabilities. Note that this approach is theoretically
|
||||
# incorrect, since it may create probabilities that sum to more
|
||||
# than 1. In practice, the contribution of probabilities with MIN_PROB
|
||||
# is tiny enough that the value of MIN_PROB can be treated as zero.
|
||||
MIN_PROB = 1.0e-12 # GIZA++ is more liberal and uses 1.0e-7
|
||||
|
||||
def __init__(self, sentence_aligned_corpus):
|
||||
self.init_vocab(sentence_aligned_corpus)
|
||||
self.reset_probabilities()
|
||||
|
||||
def reset_probabilities(self):
|
||||
self.translation_table = defaultdict(
|
||||
lambda: defaultdict(lambda: IBMModel.MIN_PROB)
|
||||
)
|
||||
"""
|
||||
dict[str][str]: float. Probability(target word | source word).
|
||||
Values accessed as ``translation_table[target_word][source_word]``.
|
||||
"""
|
||||
|
||||
self.alignment_table = defaultdict(
|
||||
lambda: defaultdict(
|
||||
lambda: defaultdict(lambda: defaultdict(lambda: IBMModel.MIN_PROB))
|
||||
)
|
||||
)
|
||||
"""
|
||||
dict[int][int][int][int]: float. Probability(i | j,l,m).
|
||||
Values accessed as ``alignment_table[i][j][l][m]``.
|
||||
Used in model 2 and hill climbing in models 3 and above
|
||||
"""
|
||||
|
||||
self.fertility_table = defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
|
||||
"""
|
||||
dict[int][str]: float. Probability(fertility | source word).
|
||||
Values accessed as ``fertility_table[fertility][source_word]``.
|
||||
Used in model 3 and higher.
|
||||
"""
|
||||
|
||||
self.p1 = 0.5
|
||||
"""
|
||||
Probability that a generated word requires another target word
|
||||
that is aligned to NULL.
|
||||
Used in model 3 and higher.
|
||||
"""
|
||||
|
||||
def set_uniform_probabilities(self, sentence_aligned_corpus):
|
||||
"""
|
||||
Initialize probability tables to a uniform distribution
|
||||
|
||||
Derived classes should implement this accordingly.
|
||||
"""
|
||||
pass
|
||||
|
||||
def init_vocab(self, sentence_aligned_corpus):
|
||||
src_vocab = set()
|
||||
trg_vocab = set()
|
||||
for aligned_sentence in sentence_aligned_corpus:
|
||||
trg_vocab.update(aligned_sentence.words)
|
||||
src_vocab.update(aligned_sentence.mots)
|
||||
# Add the NULL token
|
||||
src_vocab.add(None)
|
||||
|
||||
self.src_vocab = src_vocab
|
||||
"""
|
||||
set(str): All source language words used in training
|
||||
"""
|
||||
|
||||
self.trg_vocab = trg_vocab
|
||||
"""
|
||||
set(str): All target language words used in training
|
||||
"""
|
||||
|
||||
def sample(self, sentence_pair):
|
||||
"""
|
||||
Sample the most probable alignments from the entire alignment
|
||||
space
|
||||
|
||||
First, determine the best alignment according to IBM Model 2.
|
||||
With this initial alignment, use hill climbing to determine the
|
||||
best alignment according to a higher IBM Model. Add this
|
||||
alignment and its neighbors to the sample set. Repeat this
|
||||
process with other initial alignments obtained by pegging an
|
||||
alignment point.
|
||||
|
||||
Hill climbing may be stuck in a local maxima, hence the pegging
|
||||
and trying out of different alignments.
|
||||
|
||||
:param sentence_pair: Source and target language sentence pair
|
||||
to generate a sample of alignments from
|
||||
:type sentence_pair: AlignedSent
|
||||
|
||||
:return: A set of best alignments represented by their ``AlignmentInfo``
|
||||
and the best alignment of the set for convenience
|
||||
:rtype: set(AlignmentInfo), AlignmentInfo
|
||||
"""
|
||||
sampled_alignments = set()
|
||||
l = len(sentence_pair.mots)
|
||||
m = len(sentence_pair.words)
|
||||
|
||||
# Start from the best model 2 alignment
|
||||
initial_alignment = self.best_model2_alignment(sentence_pair)
|
||||
potential_alignment = self.hillclimb(initial_alignment)
|
||||
sampled_alignments.update(self.neighboring(potential_alignment))
|
||||
best_alignment = potential_alignment
|
||||
|
||||
# Start from other model 2 alignments,
|
||||
# with the constraint that j is aligned (pegged) to i
|
||||
for j in range(1, m + 1):
|
||||
for i in range(0, l + 1):
|
||||
initial_alignment = self.best_model2_alignment(sentence_pair, j, i)
|
||||
potential_alignment = self.hillclimb(initial_alignment, j)
|
||||
neighbors = self.neighboring(potential_alignment, j)
|
||||
sampled_alignments.update(neighbors)
|
||||
if potential_alignment.score > best_alignment.score:
|
||||
best_alignment = potential_alignment
|
||||
|
||||
return sampled_alignments, best_alignment
|
||||
|
||||
def best_model2_alignment(self, sentence_pair, j_pegged=None, i_pegged=0):
|
||||
"""
|
||||
Finds the best alignment according to IBM Model 2
|
||||
|
||||
Used as a starting point for hill climbing in Models 3 and
|
||||
above, because it is easier to compute than the best alignments
|
||||
in higher models
|
||||
|
||||
:param sentence_pair: Source and target language sentence pair
|
||||
to be word-aligned
|
||||
:type sentence_pair: AlignedSent
|
||||
|
||||
:param j_pegged: If specified, the alignment point of j_pegged
|
||||
will be fixed to i_pegged
|
||||
:type j_pegged: int
|
||||
|
||||
:param i_pegged: Alignment point to j_pegged
|
||||
:type i_pegged: int
|
||||
"""
|
||||
src_sentence = [None] + sentence_pair.mots
|
||||
trg_sentence = ["UNUSED"] + sentence_pair.words # 1-indexed
|
||||
|
||||
l = len(src_sentence) - 1 # exclude NULL
|
||||
m = len(trg_sentence) - 1
|
||||
|
||||
alignment = [0] * (m + 1) # init all alignments to NULL
|
||||
cepts = [[] for i in range(l + 1)] # init all cepts to empty list
|
||||
|
||||
for j in range(1, m + 1):
|
||||
if j == j_pegged:
|
||||
# use the pegged alignment instead of searching for best one
|
||||
best_i = i_pegged
|
||||
else:
|
||||
best_i = 0
|
||||
max_alignment_prob = IBMModel.MIN_PROB
|
||||
t = trg_sentence[j]
|
||||
|
||||
for i in range(0, l + 1):
|
||||
s = src_sentence[i]
|
||||
alignment_prob = (
|
||||
self.translation_table[t][s] * self.alignment_table[i][j][l][m]
|
||||
)
|
||||
|
||||
if alignment_prob >= max_alignment_prob:
|
||||
max_alignment_prob = alignment_prob
|
||||
best_i = i
|
||||
|
||||
alignment[j] = best_i
|
||||
cepts[best_i].append(j)
|
||||
|
||||
return AlignmentInfo(
|
||||
tuple(alignment), tuple(src_sentence), tuple(trg_sentence), cepts
|
||||
)
|
||||
|
||||
def hillclimb(self, alignment_info, j_pegged=None):
|
||||
"""
|
||||
Starting from the alignment in ``alignment_info``, look at
|
||||
neighboring alignments iteratively for the best one
|
||||
|
||||
There is no guarantee that the best alignment in the alignment
|
||||
space will be found, because the algorithm might be stuck in a
|
||||
local maximum.
|
||||
|
||||
:param j_pegged: If specified, the search will be constrained to
|
||||
alignments where ``j_pegged`` remains unchanged
|
||||
:type j_pegged: int
|
||||
|
||||
:return: The best alignment found from hill climbing
|
||||
:rtype: AlignmentInfo
|
||||
"""
|
||||
alignment = alignment_info # alias with shorter name
|
||||
max_probability = self.prob_t_a_given_s(alignment)
|
||||
|
||||
while True:
|
||||
old_alignment = alignment
|
||||
for neighbor_alignment in self.neighboring(alignment, j_pegged):
|
||||
neighbor_probability = self.prob_t_a_given_s(neighbor_alignment)
|
||||
|
||||
if neighbor_probability > max_probability:
|
||||
alignment = neighbor_alignment
|
||||
max_probability = neighbor_probability
|
||||
|
||||
if alignment == old_alignment:
|
||||
# Until there are no better alignments
|
||||
break
|
||||
|
||||
alignment.score = max_probability
|
||||
return alignment
|
||||
|
||||
def neighboring(self, alignment_info, j_pegged=None):
|
||||
"""
|
||||
Determine the neighbors of ``alignment_info``, obtained by
|
||||
moving or swapping one alignment point
|
||||
|
||||
:param j_pegged: If specified, neighbors that have a different
|
||||
alignment point from j_pegged will not be considered
|
||||
:type j_pegged: int
|
||||
|
||||
:return: A set neighboring alignments represented by their
|
||||
``AlignmentInfo``
|
||||
:rtype: set(AlignmentInfo)
|
||||
"""
|
||||
neighbors = set()
|
||||
|
||||
l = len(alignment_info.src_sentence) - 1 # exclude NULL
|
||||
m = len(alignment_info.trg_sentence) - 1
|
||||
original_alignment = alignment_info.alignment
|
||||
original_cepts = alignment_info.cepts
|
||||
|
||||
for j in range(1, m + 1):
|
||||
if j != j_pegged:
|
||||
# Add alignments that differ by one alignment point
|
||||
for i in range(0, l + 1):
|
||||
new_alignment = list(original_alignment)
|
||||
new_cepts = deepcopy(original_cepts)
|
||||
old_i = original_alignment[j]
|
||||
|
||||
# update alignment
|
||||
new_alignment[j] = i
|
||||
|
||||
# update cepts
|
||||
insort_left(new_cepts[i], j)
|
||||
new_cepts[old_i].remove(j)
|
||||
|
||||
new_alignment_info = AlignmentInfo(
|
||||
tuple(new_alignment),
|
||||
alignment_info.src_sentence,
|
||||
alignment_info.trg_sentence,
|
||||
new_cepts,
|
||||
)
|
||||
neighbors.add(new_alignment_info)
|
||||
|
||||
for j in range(1, m + 1):
|
||||
if j != j_pegged:
|
||||
# Add alignments that have two alignment points swapped
|
||||
for other_j in range(1, m + 1):
|
||||
if other_j != j_pegged and other_j != j:
|
||||
new_alignment = list(original_alignment)
|
||||
new_cepts = deepcopy(original_cepts)
|
||||
other_i = original_alignment[other_j]
|
||||
i = original_alignment[j]
|
||||
|
||||
# update alignments
|
||||
new_alignment[j] = other_i
|
||||
new_alignment[other_j] = i
|
||||
|
||||
# update cepts
|
||||
new_cepts[other_i].remove(other_j)
|
||||
insort_left(new_cepts[other_i], j)
|
||||
new_cepts[i].remove(j)
|
||||
insort_left(new_cepts[i], other_j)
|
||||
|
||||
new_alignment_info = AlignmentInfo(
|
||||
tuple(new_alignment),
|
||||
alignment_info.src_sentence,
|
||||
alignment_info.trg_sentence,
|
||||
new_cepts,
|
||||
)
|
||||
neighbors.add(new_alignment_info)
|
||||
|
||||
return neighbors
|
||||
|
||||
def maximize_lexical_translation_probabilities(self, counts):
|
||||
for t, src_words in counts.t_given_s.items():
|
||||
for s in src_words:
|
||||
estimate = counts.t_given_s[t][s] / counts.any_t_given_s[s]
|
||||
self.translation_table[t][s] = max(estimate, IBMModel.MIN_PROB)
|
||||
|
||||
def maximize_fertility_probabilities(self, counts):
|
||||
for phi, src_words in counts.fertility.items():
|
||||
for s in src_words:
|
||||
estimate = counts.fertility[phi][s] / counts.fertility_for_any_phi[s]
|
||||
self.fertility_table[phi][s] = max(estimate, IBMModel.MIN_PROB)
|
||||
|
||||
def maximize_null_generation_probabilities(self, counts):
|
||||
p1_estimate = counts.p1 / (counts.p1 + counts.p0)
|
||||
p1_estimate = max(p1_estimate, IBMModel.MIN_PROB)
|
||||
# Clip p1 if it is too large, because p0 = 1 - p1 should not be
|
||||
# smaller than MIN_PROB
|
||||
self.p1 = min(p1_estimate, 1 - IBMModel.MIN_PROB)
|
||||
|
||||
def prob_of_alignments(self, alignments):
|
||||
probability = 0
|
||||
for alignment_info in alignments:
|
||||
probability += self.prob_t_a_given_s(alignment_info)
|
||||
return probability
|
||||
|
||||
def prob_t_a_given_s(self, alignment_info):
|
||||
"""
|
||||
Probability of target sentence and an alignment given the
|
||||
source sentence
|
||||
|
||||
All required information is assumed to be in ``alignment_info``
|
||||
and self.
|
||||
|
||||
Derived classes should override this method
|
||||
"""
|
||||
return 0.0
|
||||
|
||||
|
||||
class AlignmentInfo:
|
||||
"""
|
||||
Helper data object for training IBM Models 3 and up
|
||||
|
||||
Read-only. For a source sentence and its counterpart in the target
|
||||
language, this class holds information about the sentence pair's
|
||||
alignment, cepts, and fertility.
|
||||
|
||||
Warning: Alignments are one-indexed here, in contrast to
|
||||
nltk.translate.Alignment and AlignedSent, which are zero-indexed
|
||||
This class is not meant to be used outside of IBM models.
|
||||
"""
|
||||
|
||||
def __init__(self, alignment, src_sentence, trg_sentence, cepts):
|
||||
if not isinstance(alignment, tuple):
|
||||
raise TypeError(
|
||||
"The alignment must be a tuple because it is used "
|
||||
"to uniquely identify AlignmentInfo objects."
|
||||
)
|
||||
|
||||
self.alignment = alignment
|
||||
"""
|
||||
tuple(int): Alignment function. ``alignment[j]`` is the position
|
||||
in the source sentence that is aligned to the position j in the
|
||||
target sentence.
|
||||
"""
|
||||
|
||||
self.src_sentence = src_sentence
|
||||
"""
|
||||
tuple(str): Source sentence referred to by this object.
|
||||
Should include NULL token (None) in index 0.
|
||||
"""
|
||||
|
||||
self.trg_sentence = trg_sentence
|
||||
"""
|
||||
tuple(str): Target sentence referred to by this object.
|
||||
Should have a dummy element in index 0 so that the first word
|
||||
starts from index 1.
|
||||
"""
|
||||
|
||||
self.cepts = cepts
|
||||
"""
|
||||
list(list(int)): The positions of the target words, in
|
||||
ascending order, aligned to a source word position. For example,
|
||||
cepts[4] = (2, 3, 7) means that words in positions 2, 3 and 7
|
||||
of the target sentence are aligned to the word in position 4 of
|
||||
the source sentence
|
||||
"""
|
||||
|
||||
self.score = None
|
||||
"""
|
||||
float: Optional. Probability of alignment, as defined by the
|
||||
IBM model that assesses this alignment
|
||||
"""
|
||||
|
||||
def fertility_of_i(self, i):
|
||||
"""
|
||||
Fertility of word in position ``i`` of the source sentence
|
||||
"""
|
||||
return len(self.cepts[i])
|
||||
|
||||
def is_head_word(self, j):
|
||||
"""
|
||||
:return: Whether the word in position ``j`` of the target
|
||||
sentence is a head word
|
||||
"""
|
||||
i = self.alignment[j]
|
||||
return self.cepts[i][0] == j
|
||||
|
||||
def center_of_cept(self, i):
|
||||
"""
|
||||
:return: The ceiling of the average positions of the words in
|
||||
the tablet of cept ``i``, or 0 if ``i`` is None
|
||||
"""
|
||||
if i is None:
|
||||
return 0
|
||||
|
||||
average_position = sum(self.cepts[i]) / len(self.cepts[i])
|
||||
return int(ceil(average_position))
|
||||
|
||||
def previous_cept(self, j):
|
||||
"""
|
||||
:return: The previous cept of ``j``, or None if ``j`` belongs to
|
||||
the first cept
|
||||
"""
|
||||
i = self.alignment[j]
|
||||
if i == 0:
|
||||
raise ValueError(
|
||||
"Words aligned to NULL cannot have a previous "
|
||||
"cept because NULL has no position"
|
||||
)
|
||||
previous_cept = i - 1
|
||||
while previous_cept > 0 and self.fertility_of_i(previous_cept) == 0:
|
||||
previous_cept -= 1
|
||||
|
||||
if previous_cept <= 0:
|
||||
previous_cept = None
|
||||
return previous_cept
|
||||
|
||||
def previous_in_tablet(self, j):
|
||||
"""
|
||||
:return: The position of the previous word that is in the same
|
||||
tablet as ``j``, or None if ``j`` is the first word of the
|
||||
tablet
|
||||
"""
|
||||
i = self.alignment[j]
|
||||
tablet_position = self.cepts[i].index(j)
|
||||
if tablet_position == 0:
|
||||
return None
|
||||
return self.cepts[i][tablet_position - 1]
|
||||
|
||||
def zero_indexed_alignment(self):
|
||||
"""
|
||||
:return: Zero-indexed alignment, suitable for use in external
|
||||
``nltk.translate`` modules like ``nltk.translate.Alignment``
|
||||
:rtype: list(tuple)
|
||||
"""
|
||||
zero_indexed_alignment = []
|
||||
for j in range(1, len(self.trg_sentence)):
|
||||
i = self.alignment[j] - 1
|
||||
if i < 0:
|
||||
i = None # alignment to NULL token
|
||||
zero_indexed_alignment.append((j - 1, i))
|
||||
return zero_indexed_alignment
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.alignment == other.alignment
|
||||
|
||||
def __ne__(self, other):
|
||||
return not self == other
|
||||
|
||||
def __hash__(self):
|
||||
return hash(self.alignment)
|
||||
|
||||
|
||||
class Counts:
|
||||
"""
|
||||
Data object to store counts of various parameters during training
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.t_given_s = defaultdict(lambda: defaultdict(float))
|
||||
self.any_t_given_s = defaultdict(float)
|
||||
self.p0 = 0.0
|
||||
self.p1 = 0.0
|
||||
self.fertility = defaultdict(lambda: defaultdict(float))
|
||||
self.fertility_for_any_phi = defaultdict(float)
|
||||
|
||||
def update_lexical_translation(self, count, alignment_info, j):
|
||||
i = alignment_info.alignment[j]
|
||||
t = alignment_info.trg_sentence[j]
|
||||
s = alignment_info.src_sentence[i]
|
||||
self.t_given_s[t][s] += count
|
||||
self.any_t_given_s[s] += count
|
||||
|
||||
def update_null_generation(self, count, alignment_info):
|
||||
m = len(alignment_info.trg_sentence) - 1
|
||||
fertility_of_null = alignment_info.fertility_of_i(0)
|
||||
self.p1 += fertility_of_null * count
|
||||
self.p0 += (m - 2 * fertility_of_null) * count
|
||||
|
||||
def update_fertility(self, count, alignment_info):
|
||||
for i in range(0, len(alignment_info.src_sentence)):
|
||||
s = alignment_info.src_sentence[i]
|
||||
phi = alignment_info.fertility_of_i(i)
|
||||
self.fertility[phi][s] += count
|
||||
self.fertility_for_any_phi[s] += count
|
||||
Reference in New Issue
Block a user