This commit is contained in:
Iliyan Angelov
2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions

View File

@@ -0,0 +1,33 @@
# Natural Language Toolkit: Machine Translation
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>, Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Experimental features for machine translation.
These interfaces are prone to change.
isort:skip_file
"""
from nltk.translate.api import AlignedSent, Alignment, PhraseTable
from nltk.translate.ibm_model import IBMModel
from nltk.translate.ibm1 import IBMModel1
from nltk.translate.ibm2 import IBMModel2
from nltk.translate.ibm3 import IBMModel3
from nltk.translate.ibm4 import IBMModel4
from nltk.translate.ibm5 import IBMModel5
from nltk.translate.bleu_score import sentence_bleu as bleu
from nltk.translate.ribes_score import sentence_ribes as ribes
from nltk.translate.meteor_score import meteor_score as meteor
from nltk.translate.metrics import alignment_error_rate
from nltk.translate.stack_decoder import StackDecoder
from nltk.translate.nist_score import sentence_nist as nist
from nltk.translate.chrf_score import sentence_chrf as chrf
from nltk.translate.gale_church import trace
from nltk.translate.gdfa import grow_diag_final_and
from nltk.translate.gleu_score import sentence_gleu as gleu
from nltk.translate.phrase_based import extract
from nltk.translate.lepor import sentence_lepor as lepor, corpus_lepor
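
A minimal usage sketch for the re-exported names above (illustrative sentences, assuming a standard NLTK install):

from nltk.translate import AlignedSent, Alignment, bleu
pair = AlignedSent(['klein', 'ist', 'das', 'Haus'], ['the', 'house', 'is', 'small'], Alignment.fromstring('0-3 1-2 2-0 3-1'))
print(pair.alignment)                                                              # 0-3 1-2 2-0 3-1
print(bleu([['the', 'house', 'is', 'small']], ['the', 'house', 'is', 'small']))    # 1.0 for an exact match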

View File

@@ -0,0 +1,335 @@
# Natural Language Toolkit: API for alignment and translation objects
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Will Zhang <wilzzha@gmail.com>
# Guan Gui <ggui@student.unimelb.edu.au>
# Steven Bird <stevenbird1@gmail.com>
# Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import subprocess
from collections import namedtuple
class AlignedSent:
"""
Return an aligned sentence object, which encapsulates two sentences
along with an ``Alignment`` between them.
Typically used in machine translation to represent a sentence and
its translation.
>>> from nltk.translate import AlignedSent, Alignment
>>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'],
... ['the', 'house', 'is', 'small'], Alignment.fromstring('0-3 1-2 2-0 3-1'))
>>> algnsent.words
['klein', 'ist', 'das', 'Haus']
>>> algnsent.mots
['the', 'house', 'is', 'small']
>>> algnsent.alignment
Alignment([(0, 3), (1, 2), (2, 0), (3, 1)])
>>> from nltk.corpus import comtrans
>>> print(comtrans.aligned_sents()[54])
<AlignedSent: 'Weshalb also sollten...' -> 'So why should EU arm...'>
>>> print(comtrans.aligned_sents()[54].alignment)
0-0 0-1 1-0 2-2 3-4 3-5 4-7 5-8 6-3 7-9 8-9 9-10 9-11 10-12 11-6 12-6 13-13
:param words: Words in the target language sentence
:type words: list(str)
:param mots: Words in the source language sentence
:type mots: list(str)
:param alignment: Word-level alignments between ``words`` and ``mots``.
Each alignment is represented as a 2-tuple (words_index, mots_index).
:type alignment: Alignment
"""
def __init__(self, words, mots, alignment=None):
self._words = words
self._mots = mots
if alignment is None:
self.alignment = Alignment([])
else:
assert type(alignment) is Alignment
self.alignment = alignment
@property
def words(self):
return self._words
@property
def mots(self):
return self._mots
def _get_alignment(self):
return self._alignment
def _set_alignment(self, alignment):
_check_alignment(len(self.words), len(self.mots), alignment)
self._alignment = alignment
alignment = property(_get_alignment, _set_alignment)
def __repr__(self):
"""
Return a string representation for this ``AlignedSent``.
:rtype: str
"""
words = "[%s]" % (", ".join("'%s'" % w for w in self._words))
mots = "[%s]" % (", ".join("'%s'" % w for w in self._mots))
return f"AlignedSent({words}, {mots}, {self._alignment!r})"
def _to_dot(self):
"""
Dot representation of the aligned sentence
"""
s = "graph align {\n"
s += "node[shape=plaintext]\n"
# Declare node
s += "".join([f'"{w}_source" [label="{w}"] \n' for w in self._words])
s += "".join([f'"{w}_target" [label="{w}"] \n' for w in self._mots])
# Alignment
s += "".join(
[
f'"{self._words[u]}_source" -- "{self._mots[v]}_target" \n'
for u, v in self._alignment
]
)
# Connect the source words
for i in range(len(self._words) - 1):
s += '"{}_source" -- "{}_source" [style=invis]\n'.format(
self._words[i],
self._words[i + 1],
)
# Connect the target words
for i in range(len(self._mots) - 1):
s += '"{}_target" -- "{}_target" [style=invis]\n'.format(
self._mots[i],
self._mots[i + 1],
)
# Put it in the same rank
s += "{rank = same; %s}\n" % (" ".join('"%s_source"' % w for w in self._words))
s += "{rank = same; %s}\n" % (" ".join('"%s_target"' % w for w in self._mots))
s += "}"
return s
def _repr_svg_(self):
"""
Ipython magic : show SVG representation of this ``AlignedSent``.
"""
dot_string = self._to_dot().encode("utf8")
output_format = "svg"
try:
process = subprocess.Popen(
["dot", "-T%s" % output_format],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
except OSError as e:
raise Exception("Cannot find the dot binary from Graphviz package") from e
out, err = process.communicate(dot_string)
return out.decode("utf8")
def __str__(self):
"""
Return a human-readable string representation for this ``AlignedSent``.
:rtype: str
"""
source = " ".join(self._words)[:20] + "..."
target = " ".join(self._mots)[:20] + "..."
return f"<AlignedSent: '{source}' -> '{target}'>"
def invert(self):
"""
Return the aligned sentence pair, reversing the directionality
:rtype: AlignedSent
"""
return AlignedSent(self._mots, self._words, self._alignment.invert())
class Alignment(frozenset):
"""
A storage class for representing alignment between two sequences, s1, s2.
In general, an alignment is a set of tuples of the form (i, j, ...)
representing an alignment between the i-th element of s1 and the
j-th element of s2. Tuples are extensible (they might contain
additional data, such as a boolean to indicate sure vs possible alignments).
>>> from nltk.translate import Alignment
>>> a = Alignment([(0, 0), (0, 1), (1, 2), (2, 2)])
>>> a.invert()
Alignment([(0, 0), (1, 0), (2, 1), (2, 2)])
>>> print(a.invert())
0-0 1-0 2-1 2-2
>>> a[0]
[(0, 1), (0, 0)]
>>> a.invert()[2]
[(2, 1), (2, 2)]
>>> b = Alignment([(0, 0), (0, 1)])
>>> b.issubset(a)
True
>>> c = Alignment.fromstring('0-0 0-1')
>>> b == c
True
"""
def __new__(cls, pairs):
self = frozenset.__new__(cls, pairs)
self._len = max(p[0] for p in self) if self != frozenset([]) else 0
self._index = None
return self
@classmethod
def fromstring(cls, s):
"""
Read a giza-formatted string and return an Alignment object.
>>> Alignment.fromstring('0-0 2-1 9-2 21-3 10-4 7-5')
Alignment([(0, 0), (2, 1), (7, 5), (9, 2), (10, 4), (21, 3)])
:type s: str
:param s: the positional alignments in giza format
:rtype: Alignment
:return: An Alignment object corresponding to the string representation ``s``.
"""
return Alignment([_giza2pair(a) for a in s.split()])
def __getitem__(self, key):
"""
Look up the alignments that map from a given index or slice.
"""
if not self._index:
self._build_index()
return self._index.__getitem__(key)
def invert(self):
"""
Return an Alignment object, being the inverted mapping.
"""
return Alignment(((p[1], p[0]) + p[2:]) for p in self)
def range(self, positions=None):
"""
Work out the range of the mapping from the given positions.
If no positions are specified, compute the range of the entire mapping.
"""
image = set()
if not self._index:
self._build_index()
if not positions:
positions = list(range(len(self._index)))
for p in positions:
image.update(f for _, f in self._index[p])
return sorted(image)
def __repr__(self):
"""
Produce a Giza-formatted string representing the alignment.
"""
return "Alignment(%r)" % sorted(self)
def __str__(self):
"""
Produce a Giza-formatted string representing the alignment.
"""
return " ".join("%d-%d" % p[:2] for p in sorted(self))
def _build_index(self):
"""
Build a list self._index such that self._index[i] is a list
of the alignments originating from word i.
"""
self._index = [[] for _ in range(self._len + 1)]
for p in self:
self._index[p[0]].append(p)
def _giza2pair(pair_string):
i, j = pair_string.split("-")
return int(i), int(j)
def _naacl2pair(pair_string):
i, j, p = pair_string.split("-")
return int(i), int(j)
def _check_alignment(num_words, num_mots, alignment):
"""
Check whether the alignments are legal.
:param num_words: the number of source language words
:type num_words: int
:param num_mots: the number of target language words
:type num_mots: int
:param alignment: alignment to be checked
:type alignment: Alignment
:raise IndexError: if alignment falls outside the sentence
"""
assert type(alignment) is Alignment
if not all(0 <= pair[0] < num_words for pair in alignment):
raise IndexError("Alignment is outside boundary of words")
if not all(pair[1] is None or 0 <= pair[1] < num_mots for pair in alignment):
raise IndexError("Alignment is outside boundary of mots")
PhraseTableEntry = namedtuple("PhraseTableEntry", ["trg_phrase", "log_prob"])
class PhraseTable:
"""
In-memory store of translations for a given phrase, and the log
probability of those translations
"""
def __init__(self):
self.src_phrases = dict()
def translations_for(self, src_phrase):
"""
Get the translations for a source language phrase
:param src_phrase: Source language phrase of interest
:type src_phrase: tuple(str)
:return: A list of target language phrases that are translations
of ``src_phrase``, ordered in decreasing order of
likelihood. Each list element is a tuple of the target
phrase and its log probability.
:rtype: list(PhraseTableEntry)
"""
return self.src_phrases[src_phrase]
def add(self, src_phrase, trg_phrase, log_prob):
"""
:type src_phrase: tuple(str)
:type trg_phrase: tuple(str)
:param log_prob: Log probability that given ``src_phrase``,
``trg_phrase`` is its translation
:type log_prob: float
"""
entry = PhraseTableEntry(trg_phrase=trg_phrase, log_prob=log_prob)
if src_phrase not in self.src_phrases:
self.src_phrases[src_phrase] = []
self.src_phrases[src_phrase].append(entry)
self.src_phrases[src_phrase].sort(key=lambda e: e.log_prob, reverse=True)
def __contains__(self, src_phrase):
return src_phrase in self.src_phrases
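
A quick, hypothetical sketch of the PhraseTable API defined above (the phrases and log probabilities are made up):

table = PhraseTable()
table.add(('das', 'Haus'), ('the', 'house'), log_prob=-0.2)
table.add(('das', 'Haus'), ('this', 'house'), log_prob=-1.6)
print(('das', 'Haus') in table)                     # True
print(table.translations_for(('das', 'Haus'))[0])   # best-scoring PhraseTableEntry first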

View File

@@ -0,0 +1,714 @@
# Natural Language Toolkit: BLEU Score
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""BLEU score implementation."""
import math
import sys
import warnings
from collections import Counter
from fractions import Fraction as _Fraction
from nltk.util import ngrams
class Fraction(_Fraction):
"""Fraction with _normalize=False support for 3.12"""
def __new__(cls, numerator=0, denominator=None, _normalize=False):
if sys.version_info >= (3, 12):
self = super().__new__(cls, numerator, denominator)
else:
self = super().__new__(cls, numerator, denominator, _normalize=_normalize)
self._normalize = _normalize
self._original_numerator = numerator
self._original_denominator = denominator
return self
@property
def numerator(self):
if not self._normalize:
return self._original_numerator
return super().numerator
@property
def denominator(self):
if not self._normalize:
return self._original_denominator
return super().denominator
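
# For illustration: the un-normalised numerator and denominator stay visible,
# which corpus_bleu relies on when summing per-order counts, e.g.
#     Fraction(0, 7, _normalize=False).numerator     -> 0
#     Fraction(0, 7, _normalize=False).denominator   -> 7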
def sentence_bleu(
references,
hypothesis,
weights=(0.25, 0.25, 0.25, 0.25),
smoothing_function=None,
auto_reweigh=False,
):
"""
Calculate BLEU score (Bilingual Evaluation Understudy) from
Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
"BLEU: a method for automatic evaluation of machine translation."
In Proceedings of ACL. https://www.aclweb.org/anthology/P02-1040.pdf
>>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
... 'ensures', 'that', 'the', 'military', 'always',
... 'obeys', 'the', 'commands', 'of', 'the', 'party']
>>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
... 'forever', 'hearing', 'the', 'activity', 'guidebook',
... 'that', 'party', 'direct']
>>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
... 'ensures', 'that', 'the', 'military', 'will', 'forever',
... 'heed', 'Party', 'commands']
>>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
... 'guarantees', 'the', 'military', 'forces', 'always',
... 'being', 'under', 'the', 'command', 'of', 'the',
... 'Party']
>>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
... 'army', 'always', 'to', 'heed', 'the', 'directions',
... 'of', 'the', 'party']
>>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
0.5045...
If there is no ngrams overlap for any order of n-grams, BLEU returns the
value 0. This is because the precision for the order of n-grams without
overlap is 0, and the geometric mean in the final BLEU score computation
multiplies the 0 with the precision of other n-grams. This results in 0
(independently of the precision of the other n-gram orders). The following
example has zero 3-gram and 4-gram overlaps:
>>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS
0.0
To avoid this harsh behaviour when no ngram overlaps are found a smoothing
function can be used.
>>> chencherry = SmoothingFunction()
>>> sentence_bleu([reference1, reference2, reference3], hypothesis2,
... smoothing_function=chencherry.method1) # doctest: +ELLIPSIS
0.0370...
The default BLEU calculates a score for up to 4-grams using uniform
weights (this is called BLEU-4). To evaluate your translations with
higher/lower order ngrams, use customized weights. E.g. when accounting
for up to 5-grams with uniform weights (this is called BLEU-5) use:
>>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.)
>>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
0.3920...
Multiple BLEU scores can be computed at once, by supplying a list of weights.
E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use:
>>> weights = [
... (1./2., 1./2.),
... (1./3., 1./3., 1./3.),
... (1./4., 1./4., 1./4., 1./4.)
... ]
>>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
[0.7453..., 0.6240..., 0.5045...]
:param references: reference sentences
:type references: list(list(str))
:param hypothesis: a hypothesis sentence
:type hypothesis: list(str)
:param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights)
:type weights: tuple(float) / list(tuple(float))
:param smoothing_function:
:type smoothing_function: SmoothingFunction
:param auto_reweigh: Option to re-normalize the weights uniformly.
:type auto_reweigh: bool
:return: The sentence-level BLEU score. Returns a list if multiple weights were supplied.
:rtype: float / list(float)
"""
return corpus_bleu(
[references], [hypothesis], weights, smoothing_function, auto_reweigh
)
def corpus_bleu(
list_of_references,
hypotheses,
weights=(0.25, 0.25, 0.25, 0.25),
smoothing_function=None,
auto_reweigh=False,
):
"""
Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
the hypotheses and their respective references.
Instead of averaging the sentence level BLEU scores (i.e. macro-average
precision), the original BLEU metric (Papineni et al. 2002) accounts for
the micro-average precision (i.e. summing the numerators and denominators
for each hypothesis-reference(s) pairs before the division).
>>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
... 'ensures', 'that', 'the', 'military', 'always',
... 'obeys', 'the', 'commands', 'of', 'the', 'party']
>>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
... 'ensures', 'that', 'the', 'military', 'will', 'forever',
... 'heed', 'Party', 'commands']
>>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
... 'guarantees', 'the', 'military', 'forces', 'always',
... 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
>>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
... 'army', 'always', 'to', 'heed', 'the', 'directions',
... 'of', 'the', 'party']
>>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
... 'interested', 'in', 'world', 'history']
>>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
... 'because', 'he', 'read', 'the', 'book']
>>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
>>> hypotheses = [hyp1, hyp2]
>>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
0.5920...
The example below shows that corpus_bleu() is different from averaging
sentence_bleu() over the hypotheses
>>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
>>> score2 = sentence_bleu([ref2a], hyp2)
>>> (score1 + score2) / 2 # doctest: +ELLIPSIS
0.6223...
Custom weights may be supplied to fine-tune the BLEU score further.
A tuple of float weights for unigrams, bigrams, trigrams and so on can be given.
>>> weights = (0.1, 0.3, 0.5, 0.1)
>>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
0.5818...
This particular weight gave extra value to trigrams.
Furthermore, multiple weights can be given, resulting in multiple BLEU scores.
>>> weights = [
... (0.5, 0.5),
... (0.333, 0.333, 0.334),
... (0.25, 0.25, 0.25, 0.25),
... (0.2, 0.2, 0.2, 0.2, 0.2)
... ]
>>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
[0.8242..., 0.7067..., 0.5920..., 0.4719...]
:param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
:type list_of_references: list(list(list(str)))
:param hypotheses: a list of hypothesis sentences
:type hypotheses: list(list(str))
:param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights)
:type weights: tuple(float) / list(tuple(float))
:param smoothing_function:
:type smoothing_function: SmoothingFunction
:param auto_reweigh: Option to re-normalize the weights uniformly.
:type auto_reweigh: bool
:return: The corpus-level BLEU score.
:rtype: float
"""
# Before proceeding to compute BLEU, perform sanity checks.
p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
hyp_lengths, ref_lengths = 0, 0
assert len(list_of_references) == len(hypotheses), (
"The number of hypotheses and their reference(s) should be the same"
)
# If a single tuple of weights was supplied, wrap it in a list so that
# one or several BLEU scores can be computed uniformly.
try:
weights[0][0]
except TypeError:
weights = [weights]
max_weight_length = max(len(weight) for weight in weights)
# Iterate through each hypothesis and their corresponding references.
for references, hypothesis in zip(list_of_references, hypotheses):
# For each order of ngram, calculate the numerator and
# denominator for the corpus-level modified precision.
for i in range(1, max_weight_length + 1):
p_i = modified_precision(references, hypothesis, i)
p_numerators[i] += p_i.numerator
p_denominators[i] += p_i.denominator
# Calculate the hypothesis length and the closest reference length.
# Adds them to the corpus-level hypothesis and reference counts.
hyp_len = len(hypothesis)
hyp_lengths += hyp_len
ref_lengths += closest_ref_length(references, hyp_len)
# Calculate corpus-level brevity penalty.
bp = brevity_penalty(ref_lengths, hyp_lengths)
# Collects the various precision values for the different ngram orders.
p_n = [
Fraction(p_numerators[i], p_denominators[i], _normalize=False)
for i in range(1, max_weight_length + 1)
]
# Returns 0 if there are no matching n-grams.
# We only need to check p_numerators[1] == 0, since if there are no
# unigram matches, there won't be any higher order n-gram matches.
if p_numerators[1] == 0:
return 0 if len(weights) == 1 else [0] * len(weights)
# If no smoothing function is supplied, use method0 from the SmoothingFunction class.
if not smoothing_function:
smoothing_function = SmoothingFunction().method0
# Smoothen the modified precision.
# Note: smoothing_function() may convert values into floats;
# it tries to retain the Fraction object as much as the
# smoothing method allows.
p_n = smoothing_function(
p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
)
bleu_scores = []
for weight in weights:
# Uniformly re-weighting based on maximum hypothesis lengths if largest
# order of n-grams < 4 and weights is set at default.
if auto_reweigh:
if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25):
weight = (1 / hyp_lengths,) * hyp_lengths
s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0)
s = bp * math.exp(math.fsum(s))
bleu_scores.append(s)
return bleu_scores[0] if len(weights) == 1 else bleu_scores
def modified_precision(references, hypothesis, n):
"""
Calculate modified ngram precision.
The normal precision method may lead to some wrong translations with
high-precision, e.g., the translation, in which a word of reference
repeats several times, has very high precision.
This function only returns the Fraction object that contains the numerator
and denominator necessary to calculate the corpus-level precision.
To calculate the modified precision for a single pair of hypothesis and
references, cast the Fraction object into a float.
The famous "the the the ... " example shows that you can get BLEU precision
by duplicating high frequency words.
>>> reference1 = 'the cat is on the mat'.split()
>>> reference2 = 'there is a cat on the mat'.split()
>>> hypothesis1 = 'the the the the the the the'.split()
>>> references = [reference1, reference2]
>>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
0.2857...
In the modified n-gram precision, a reference word will be considered
exhausted after a matching hypothesis word is identified, e.g.
>>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
... 'ensures', 'that', 'the', 'military', 'will',
... 'forever', 'heed', 'Party', 'commands']
>>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
... 'guarantees', 'the', 'military', 'forces', 'always',
... 'being', 'under', 'the', 'command', 'of', 'the',
... 'Party']
>>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
... 'army', 'always', 'to', 'heed', 'the', 'directions',
... 'of', 'the', 'party']
>>> hypothesis = 'of the'.split()
>>> references = [reference1, reference2, reference3]
>>> float(modified_precision(references, hypothesis, n=1))
1.0
>>> float(modified_precision(references, hypothesis, n=2))
1.0
An example of a normal machine translation hypothesis:
>>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
... 'ensures', 'that', 'the', 'military', 'always',
... 'obeys', 'the', 'commands', 'of', 'the', 'party']
>>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
... 'forever', 'hearing', 'the', 'activity', 'guidebook',
... 'that', 'party', 'direct']
>>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
... 'ensures', 'that', 'the', 'military', 'will',
... 'forever', 'heed', 'Party', 'commands']
>>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
... 'guarantees', 'the', 'military', 'forces', 'always',
... 'being', 'under', 'the', 'command', 'of', 'the',
... 'Party']
>>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
... 'army', 'always', 'to', 'heed', 'the', 'directions',
... 'of', 'the', 'party']
>>> references = [reference1, reference2, reference3]
>>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
0.9444...
>>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS
0.5714...
>>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS
0.5882352941176471
>>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
0.07692...
:param references: A list of reference translations.
:type references: list(list(str))
:param hypothesis: A hypothesis translation.
:type hypothesis: list(str)
:param n: The ngram order.
:type n: int
:return: BLEU's modified precision for the nth order ngram.
:rtype: Fraction
"""
# Extracts all ngrams in hypothesis
# Set an empty Counter if hypothesis is empty.
counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
# Extract a union of references' counts.
# max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
max_counts = {}
for reference in references:
reference_counts = (
Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
)
for ngram in counts:
max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])
# Assigns the intersection between hypothesis and references' counts.
clipped_counts = {
ngram: min(count, max_counts[ngram]) for ngram, count in counts.items()
}
numerator = sum(clipped_counts.values())
# Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
# Usually this happens when the ngram order is > len(reference).
denominator = max(1, sum(counts.values()))
return Fraction(numerator, denominator, _normalize=False)
def closest_ref_length(references, hyp_len):
"""
This function finds the reference that is the closest length to the
hypothesis. The closest reference length is referred to as the *r* variable
in the brevity penalty formula of Papineni et al. (2002).
:param references: A list of reference translations.
:type references: list(list(str))
:param hyp_len: The length of the hypothesis.
:type hyp_len: int
:return: The length of the reference that's closest to the hypothesis.
:rtype: int
"""
ref_lens = (len(reference) for reference in references)
closest_ref_len = min(
ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len)
)
return closest_ref_len
def brevity_penalty(closest_ref_len, hyp_len):
"""
Calculate brevity penalty.
As the modified n-gram precision still has the problem from the short
length sentence, brevity penalty is used to modify the overall BLEU
score according to length.
An example from the paper: there are three references with lengths 12, 15
and 17, and a hypothesis of length 12. The brevity penalty is 1.
>>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12
>>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15
>>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17
>>> hypothesis = list('aaaaaaaaaaaa') # i.e. ['a'] * 12
>>> references = [reference1, reference2, reference3]
>>> hyp_len = len(hypothesis)
>>> closest_ref_len = closest_ref_length(references, hyp_len)
>>> brevity_penalty(closest_ref_len, hyp_len)
1.0
In case a hypothesis translation is shorter than the references, penalty is
applied.
>>> references = [['a'] * 28, ['a'] * 28]
>>> hypothesis = ['a'] * 12
>>> hyp_len = len(hypothesis)
>>> closest_ref_len = closest_ref_length(references, hyp_len)
>>> brevity_penalty(closest_ref_len, hyp_len)
0.2635971381157267
The length of the closest reference is used to compute the penalty. If the
length of a hypothesis is 12, and the reference lengths are 13 and 2, the
penalty is applied because the hypothesis length (12) is less than the
closest reference length (13).
>>> references = [['a'] * 13, ['a'] * 2]
>>> hypothesis = ['a'] * 12
>>> hyp_len = len(hypothesis)
>>> closest_ref_len = closest_ref_length(references, hyp_len)
>>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
0.9200...
The brevity penalty doesn't depend on reference order. More importantly,
when two reference sentences are at the same distance, the shortest
reference sentence length is used.
>>> references = [['a'] * 13, ['a'] * 11]
>>> hypothesis = ['a'] * 12
>>> hyp_len = len(hypothesis)
>>> closest_ref_len = closest_ref_length(references, hyp_len)
>>> bp1 = brevity_penalty(closest_ref_len, hyp_len)
>>> hyp_len = len(hypothesis)
>>> closest_ref_len = closest_ref_length(reversed(references), hyp_len)
>>> bp2 = brevity_penalty(closest_ref_len, hyp_len)
>>> bp1 == bp2 == 1
True
A test example from mteval-v13a.pl (starting from the line 705):
>>> references = [['a'] * 11, ['a'] * 8]
>>> hypothesis = ['a'] * 7
>>> hyp_len = len(hypothesis)
>>> closest_ref_len = closest_ref_length(references, hyp_len)
>>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
0.8668...
>>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
>>> hypothesis = ['a'] * 7
>>> hyp_len = len(hypothesis)
>>> closest_ref_len = closest_ref_length(references, hyp_len)
>>> brevity_penalty(closest_ref_len, hyp_len)
1.0
:param hyp_len: The length of the hypothesis for a single sentence OR the
sum of all the hypotheses' lengths for a corpus
:type hyp_len: int
:param closest_ref_len: The length of the closest reference for a single
hypothesis OR the sum of all the closest references for every hypotheses.
:type closest_ref_len: int
:return: BLEU's brevity penalty.
:rtype: float
"""
if hyp_len > closest_ref_len:
return 1
# If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0
elif hyp_len == 0:
return 0
else:
return math.exp(1 - closest_ref_len / hyp_len)
class SmoothingFunction:
"""
This is an implementation of the smoothing techniques
for segment-level BLEU scores that was presented in
Boxing Chen and Colin Cherry (2014) A Systematic Comparison of
Smoothing Techniques for Sentence-Level BLEU. In WMT14.
http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
"""
def __init__(self, epsilon=0.1, alpha=5, k=5):
"""
This will initialize the parameters required for the various smoothing
techniques, the default values are set to the numbers used in the
experiments from Chen and Cherry (2014).
>>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
... 'that', 'the', 'military', 'always', 'obeys', 'the',
... 'commands', 'of', 'the', 'party']
>>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
... 'that', 'the', 'military', 'will', 'forever', 'heed',
... 'Party', 'commands']
>>> chencherry = SmoothingFunction()
>>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
0.4118...
>>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
0.4118...
>>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
0.4118...
>>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
0.4452...
>>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
0.4118...
>>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
0.4118...
>>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
0.4905...
>>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
0.4135...
>>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
0.4905...
:param epsilon: the epsilon value used in method 1
:type epsilon: float
:param alpha: the alpha value used in method 6
:type alpha: int
:param k: the k value used in method 4
:type k: int
"""
self.epsilon = epsilon
self.alpha = alpha
self.k = k
def method0(self, p_n, *args, **kwargs):
"""
No smoothing.
"""
p_n_new = []
for i, p_i in enumerate(p_n):
if p_i.numerator != 0:
p_n_new.append(p_i)
else:
_msg = str(
"\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
"Therefore the BLEU score evaluates to 0, independently of\n"
"how many N-gram overlaps of lower order it contains.\n"
"Consider using lower n-gram order or use "
"SmoothingFunction()"
).format(i + 1)
warnings.warn(_msg)
# When numerator==0 (whether the denominator is 0 or not), the
# precision score for that order is 0 or undefined. Because BLEU's
# geometric mean is computed in logarithm space, we append
# sys.float_info.min instead, so that math.log() stays defined and
# the order contributes an effectively-zero precision score.
p_n_new.append(sys.float_info.min)
return p_n_new
def method1(self, p_n, *args, **kwargs):
"""
Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
"""
return [
(
(p_i.numerator + self.epsilon) / p_i.denominator
if p_i.numerator == 0
else p_i
)
for p_i in p_n
]
def method2(self, p_n, *args, **kwargs):
"""
Smoothing method 2: Add 1 to both numerator and denominator from
Chin-Yew Lin and Franz Josef Och (2004) ORANGE: a Method for
Evaluating Automatic Evaluation Metrics for Machine Translation.
In COLING 2004.
"""
return [
(
Fraction(p_n[i].numerator + 1, p_n[i].denominator + 1, _normalize=False)
if i != 0
else p_n[0]
)
for i in range(len(p_n))
]
def method3(self, p_n, *args, **kwargs):
"""
Smoothing method 3: NIST geometric sequence smoothing
The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
precision score whose matching n-gram count is null.
k is 1 for the first 'n' value for which the n-gram match count is null.
For example, if the text contains:
- one 2-gram match
- and (consequently) two 1-gram matches
the n-gram count for each individual precision score would be:
- n=1 => prec_count = 2 (two unigrams)
- n=2 => prec_count = 1 (one bigram)
- n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
- n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
"""
incvnt = 1 # From the mteval-v13a.pl, it's referred to as k.
for i, p_i in enumerate(p_n):
if p_i.numerator == 0:
p_n[i] = 1 / (2**incvnt * p_i.denominator)
incvnt += 1
return p_n
def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
"""
Smoothing method 4:
Shorter translations may have inflated precision values due to having
smaller denominators; therefore, we give them proportionally
smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
suggest dividing by 1/ln(len(T)), where T is the length of the translation.
"""
incvnt = 1
hyp_len = hyp_len if hyp_len else len(hypothesis)
for i, p_i in enumerate(p_n):
if p_i.numerator == 0 and hyp_len > 1:
# incvnt = i + 1 * self.k / math.log(
# hyp_len
# ) # Note that this K is different from the K from NIST.
# p_n[i] = incvnt / p_i.denominator\
numerator = 1 / (2**incvnt * self.k / math.log(hyp_len))
p_n[i] = numerator / p_i.denominator
incvnt += 1
return p_n
def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
"""
Smoothing method 5:
The matched counts for similar values of n should be similar. To
calculate the n-gram matched count, it averages the n-1, n and n+1 gram
matched counts.
"""
hyp_len = hyp_len if hyp_len else len(hypothesis)
m = {}
# Requires a precision value for an additional ngram order.
p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
m[-1] = p_n[0] + 1
for i, p_i in enumerate(p_n):
p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3
m[i] = p_n[i]
return p_n
def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
"""
Smoothing method 6:
Interpolates the maximum likelihood estimate of the precision *p_n* with
a prior estimate *pi0*. The prior is estimated by assuming that the ratio
between p_n and p_n-1 will be the same as that between p_n-1 and p_n-2; from
Gao and He (2013) Training MRF-Based Phrase Translation Models using
Gradient Ascent. In NAACL.
"""
hyp_len = hyp_len if hyp_len else len(hypothesis)
# This smoothing only works when p_1 and p_2 are non-zero.
# Raise an error with an appropriate message when the input is too short
# to use this smoothing technique.
assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
for i, p_i in enumerate(p_n):
if i in [0, 1]: # Skips the first 2 orders of ngrams.
continue
else:
pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
# No. of ngrams in translation that matches the reference.
m = p_i.numerator
# No. of ngrams in translation.
l = sum(1 for _ in ngrams(hypothesis, i + 1))
# Calculates the interpolated precision.
p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
return p_n
def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
"""
Smoothing method 7:
Interpolates methods 4 and 5.
"""
hyp_len = hyp_len if hyp_len else len(hypothesis)
p_n = self.method4(p_n, references, hypothesis, hyp_len)
p_n = self.method5(p_n, references, hypothesis, hyp_len)
return p_n
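
A short sketch of the scoring helpers above on invented sentences; the unsmoothed call may emit the method0 warning when a higher n-gram order has no overlap:

reference = 'the cat sat on the mat'.split()
hypothesis = 'the cat is on the mat'.split()
chencherry = SmoothingFunction()
print(sentence_bleu([reference], hypothesis))                                          # may warn; near-zero without smoothing
print(sentence_bleu([reference], hypothesis, smoothing_function=chencherry.method1))   # smoothed score
print(corpus_bleu([[reference]], [hypothesis], weights=[(0.5, 0.5), (0.25, 0.25, 0.25, 0.25)]))   # BLEU-2 and BLEU-4 in one call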

View File

@@ -0,0 +1,221 @@
# Natural Language Toolkit: ChrF score
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Maja Popovic
# Contributors: Liling Tan, Aleš Tamchyna (Memsource)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
""" ChrF score implementation """
import re
from collections import Counter, defaultdict
from nltk.util import ngrams
def sentence_chrf(
reference, hypothesis, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
):
"""
Calculates the sentence level CHRF (Character n-gram F-score) described in
- Maja Popovic. 2015. CHRF: Character n-gram F-score for Automatic MT Evaluation.
In Proceedings of the 10th Workshop on Machine Translation.
https://www.statmt.org/wmt15/pdf/WMT49.pdf
- Maja Popovic. 2016. CHRF Deconstructed: β Parameters and n-gram Weights.
In Proceedings of the 1st Conference on Machine Translation.
https://www.statmt.org/wmt16/pdf/W16-2341.pdf
This implementation of CHRF only supports a single reference at the moment.
For details not reported in the paper, consult Maja Popovic's original
implementation: https://github.com/m-popovic/chrF
The code should output results equivalent to running CHRF++ with the
following options: -nw 0 -b 3
An example from the original BLEU paper
https://www.aclweb.org/anthology/P02-1040.pdf
>>> ref1 = str('It is a guide to action that ensures that the military '
... 'will forever heed Party commands').split()
>>> hyp1 = str('It is a guide to action which ensures that the military '
... 'always obeys the commands of the party').split()
>>> hyp2 = str('It is to insure the troops forever hearing the activity '
... 'guidebook that party direct').split()
>>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
0.6349...
>>> sentence_chrf(ref1, hyp2) # doctest: +ELLIPSIS
0.3330...
The infamous "the the the ... " example
>>> ref = 'the cat is on the mat'.split()
>>> hyp = 'the the the the the the the'.split()
>>> sentence_chrf(ref, hyp) # doctest: +ELLIPSIS
0.1468...
An example to show that this function allows users to use strings instead of
tokens, i.e. list(str) as inputs.
>>> ref1 = str('It is a guide to action that ensures that the military '
... 'will forever heed Party commands')
>>> hyp1 = str('It is a guide to action which ensures that the military '
... 'always obeys the commands of the party')
>>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
0.6349...
>>> type(ref1) == type(hyp1) == str
True
>>> sentence_chrf(ref1.split(), hyp1.split()) # doctest: +ELLIPSIS
0.6349...
To skip the unigrams and only use 2- to 3-grams:
>>> sentence_chrf(ref1, hyp1, min_len=2, max_len=3) # doctest: +ELLIPSIS
0.6617...
:param reference: reference sentence
:type reference: list(str) / str
:param hypothesis: a hypothesis sentence
:type hypothesis: list(str) / str
:param min_len: The minimum order of n-gram this function should extract.
:type min_len: int
:param max_len: The maximum order of n-gram this function should extract.
:type max_len: int
:param beta: the parameter to assign more importance to recall over precision
:type beta: float
:param ignore_whitespace: ignore whitespace characters in scoring
:type ignore_whitespace: bool
:return: the sentence level CHRF score.
:rtype: float
"""
return corpus_chrf(
[reference],
[hypothesis],
min_len,
max_len,
beta=beta,
ignore_whitespace=ignore_whitespace,
)
def _preprocess(sent, ignore_whitespace):
if type(sent) != str:
# turn list of tokens into a string
sent = " ".join(sent)
if ignore_whitespace:
sent = re.sub(r"\s+", "", sent)
return sent
def chrf_precision_recall_fscore_support(
reference, hypothesis, n, beta=3.0, epsilon=1e-16
):
"""
This function computes the precision, recall and fscore from the ngram
overlaps. It returns the `support` which is the true positive score.
By underspecifying the input type, the function is agnostic as to how
the ngrams are computed and simply takes whichever elements are in the
list; they could be either tokens or characters.
:param reference: The reference sentence.
:type reference: list
:param hypothesis: The hypothesis sentence.
:type hypothesis: list
:param n: Extract up to the n-th order ngrams
:type n: int
:param beta: The parameter to assign more importance to recall over precision.
:type beta: float
:param epsilon: The fallback value if the hypothesis or reference is empty.
:type epsilon: float
:return: Returns the precision, recall and f-score and support (true positive).
:rtype: tuple(float)
"""
ref_ngrams = Counter(ngrams(reference, n))
hyp_ngrams = Counter(ngrams(hypothesis, n))
# calculate the number of ngram matches
overlap_ngrams = ref_ngrams & hyp_ngrams
tp = sum(overlap_ngrams.values()) # True positives.
tpfp = sum(hyp_ngrams.values()) # True positives + False positives.
tpfn = sum(ref_ngrams.values()) # True positives + False negatives.
try:
prec = tp / tpfp # precision
rec = tp / tpfn # recall
factor = beta**2
fscore = (1 + factor) * (prec * rec) / (factor * prec + rec)
except ZeroDivisionError:
prec = rec = fscore = epsilon
return prec, rec, fscore, tp
def corpus_chrf(
references, hypotheses, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
):
"""
Calculates the corpus level CHRF (Character n-gram F-score), which is the
macro-averaged value of the sentence/segment level CHRF scores.
This implementation of CHRF only supports a single reference at the moment.
>>> ref1 = str('It is a guide to action that ensures that the military '
... 'will forever heed Party commands').split()
>>> ref2 = str('It is the guiding principle which guarantees the military '
... 'forces always being under the command of the Party').split()
>>>
>>> hyp1 = str('It is a guide to action which ensures that the military '
... 'always obeys the commands of the party').split()
>>> hyp2 = str('It is to insure the troops forever hearing the activity '
... 'guidebook that party direct')
>>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS
0.3910...
:param references: a corpus of list of reference sentences, w.r.t. hypotheses
:type references: list(list(str))
:param hypotheses: a list of hypothesis sentences
:type hypotheses: list(list(str))
:param min_len: The minimum order of n-gram this function should extract.
:type min_len: int
:param max_len: The maximum order of n-gram this function should extract.
:type max_len: int
:param beta: the parameter to assign more importance to recall over precision
:type beta: float
:param ignore_whitespace: ignore whitespace characters in scoring
:type ignore_whitespace: bool
:return: the corpus level CHRF score.
:rtype: float
"""
assert len(references) == len(
hypotheses
), "The number of hypotheses and their references should be the same"
num_sents = len(hypotheses)
# Keep f-scores for each n-gram order separate
ngram_fscores = defaultdict(list)
# Iterate through each hypothesis and their corresponding references.
for reference, hypothesis in zip(references, hypotheses):
# preprocess both reference and hypothesis
reference = _preprocess(reference, ignore_whitespace)
hypothesis = _preprocess(hypothesis, ignore_whitespace)
# Calculate f-scores for each sentence and for each n-gram order
# separately.
for n in range(min_len, max_len + 1):
# Compute the precision, recall, fscore and support.
prec, rec, fscore, tp = chrf_precision_recall_fscore_support(
reference, hypothesis, n, beta=beta
)
ngram_fscores[n].append(fscore)
# how many n-gram sizes
num_ngram_sizes = len(ngram_fscores)
# sum of f-scores over all sentences for each n-gram order
total_scores = [sum(fscores) for n, fscores in ngram_fscores.items()]
# macro-average over n-gram orders and over all sentences
return (sum(total_scores) / num_ngram_sizes) / num_sents
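
A brief sketch of the chrF helpers above on invented sentences (both raw strings and token lists are accepted):

ref = 'the cat is on the mat'
hyp = 'the cat sat on the mat'
print(sentence_chrf(ref, hyp))                   # character n-gram F-score in [0, 1]
print(sentence_chrf(ref.split(), hyp.split()))   # same value with tokenised input
print(corpus_chrf([ref, ref], [hyp, ref]))       # macro-average over two segments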

View File

@@ -0,0 +1,263 @@
# Natural Language Toolkit: Gale-Church Aligner
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Torsten Marek <marek@ifi.uzh.ch>
# Contributor: Cassidy Laidlaw, Liling Tan
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A port of the Gale-Church Aligner.
Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora.
https://aclweb.org/anthology/J93-1004.pdf
"""
import math
# Use scipy's log survival function when scipy is available; otherwise fall
# back to the pure-Python implementation below.
try:
from scipy.stats import norm
norm_logsf = norm.logsf
except ImportError:
def erfcc(x):
"""Complementary error function."""
z = abs(x)
t = 1 / (1 + 0.5 * z)
r = t * math.exp(
-z * z
- 1.26551223
+ t
* (
1.00002368
+ t
* (
0.37409196
+ t
* (
0.09678418
+ t
* (
-0.18628806
+ t
* (
0.27886807
+ t
* (
-1.13520398
+ t
* (1.48851587 + t * (-0.82215223 + t * 0.17087277))
)
)
)
)
)
)
)
if x >= 0.0:
return r
else:
return 2.0 - r
def norm_cdf(x):
"""Return the area under the normal distribution from M{-∞..x}."""
return 1 - 0.5 * erfcc(x / math.sqrt(2))
def norm_logsf(x):
try:
return math.log(1 - norm_cdf(x))
except ValueError:
return float("-inf")
LOG2 = math.log(2)
class LanguageIndependent:
# These are the language-independent probabilities and parameters
# given in Gale & Church
# for the computation, l_1 is always the language with fewer characters
PRIORS = {
(1, 0): 0.0099,
(0, 1): 0.0099,
(1, 1): 0.89,
(2, 1): 0.089,
(1, 2): 0.089,
(2, 2): 0.011,
}
AVERAGE_CHARACTERS = 1
VARIANCE_CHARACTERS = 6.8
def trace(backlinks, source_sents_lens, target_sents_lens):
"""
Traverses the alignment backlinks (tracebacks) and retrieves the
appropriate sentence pairs.
:param backlinks: A dictionary where the keys are alignment points and the values are the chosen alignment types (keys of LanguageIndependent.PRIORS)
:type backlinks: dict
:param source_sents_lens: A list of source sentences' lengths
:type source_sents_lens: list(int)
:param target_sents_lens: A list of target sentences' lengths
:type target_sents_lens: list(int)
"""
links = []
position = (len(source_sents_lens), len(target_sents_lens))
while position != (0, 0) and all(p >= 0 for p in position):
try:
s, t = backlinks[position]
except TypeError:
position = (position[0] - 1, position[1] - 1)
continue
for i in range(s):
for j in range(t):
links.append((position[0] - i - 1, position[1] - j - 1))
position = (position[0] - s, position[1] - t)
return links[::-1]
def align_log_prob(i, j, source_sents, target_sents, alignment, params):
"""Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]}
being aligned with a specific C{alignment}.
@param i: The offset of the source sentence.
@param j: The offset of the target sentence.
@param source_sents: The list of source sentence lengths.
@param target_sents: The list of target sentence lengths.
@param alignment: The alignment type, a tuple of two integers.
@param params: The sentence alignment parameters.
@returns: The log probability of a specific alignment between the two sentences, given the parameters.
"""
l_s = sum(source_sents[i - offset - 1] for offset in range(alignment[0]))
l_t = sum(target_sents[j - offset - 1] for offset in range(alignment[1]))
try:
# actually, the paper says l_s * params.VARIANCE_CHARACTERS, this is based on the C
# reference implementation. With l_s in the denominator, insertions are impossible.
m = (l_s + l_t / params.AVERAGE_CHARACTERS) / 2
delta = (l_s * params.AVERAGE_CHARACTERS - l_t) / math.sqrt(
m * params.VARIANCE_CHARACTERS
)
except ZeroDivisionError:
return float("-inf")
return -(LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment]))
def align_blocks(source_sents_lens, target_sents_lens, params=LanguageIndependent):
"""Return the sentence alignment of two text blocks (usually paragraphs).
>>> align_blocks([5,5,5], [7,7,7])
[(0, 0), (1, 1), (2, 2)]
>>> align_blocks([10,5,5], [12,20])
[(0, 0), (1, 1), (2, 1)]
>>> align_blocks([12,20], [10,5,5])
[(0, 0), (1, 1), (1, 2)]
>>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12])
[(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)]
@param source_sents_lens: The list of source sentence lengths.
@param target_sents_lens: The list of target sentence lengths.
@param params: the sentence alignment parameters.
@return: The sentence alignments, a list of index pairs.
"""
alignment_types = list(params.PRIORS.keys())
# there are always three rows in the history (with the last of them being filled)
D = [[]]
backlinks = {}
for i in range(len(source_sents_lens) + 1):
for j in range(len(target_sents_lens) + 1):
min_dist = float("inf")
min_align = None
for a in alignment_types:
prev_i = -1 - a[0]
prev_j = j - a[1]
if prev_i < -len(D) or prev_j < 0:
continue
p = D[prev_i][prev_j] + align_log_prob(
i, j, source_sents_lens, target_sents_lens, a, params
)
if p < min_dist:
min_dist = p
min_align = a
if min_dist == float("inf"):
min_dist = 0
backlinks[(i, j)] = min_align
D[-1].append(min_dist)
if len(D) > 2:
D.pop(0)
D.append([])
return trace(backlinks, source_sents_lens, target_sents_lens)
def align_texts(source_blocks, target_blocks, params=LanguageIndependent):
"""Creates the sentence alignment of two texts.
Texts can consist of several blocks. Block boundaries cannot be crossed by sentence
alignment links.
Each block consists of a list that contains the lengths (in characters) of the sentences
in this block.
@param source_blocks: The list of blocks in the source text.
@param target_blocks: The list of blocks in the target text.
@param params: the sentence alignment parameters.
@returns: A list of sentence alignment lists
"""
if len(source_blocks) != len(target_blocks):
raise ValueError(
"Source and target texts do not have the same number of blocks."
)
return [
align_blocks(source_block, target_block, params)
for source_block, target_block in zip(source_blocks, target_blocks)
]
# File I/O functions; may belong in a corpus reader
def split_at(it, split_value):
"""Splits an iterator C{it} at values of C{split_value}.
Each instance of C{split_value} is swallowed. The iterator produces
subiterators which need to be consumed fully before the next subiterator
can be used.
"""
def _chunk_iterator(first):
v = first
while v != split_value:
yield v
v = next(it)
while True:
yield _chunk_iterator(next(it))
def parse_token_stream(stream, soft_delimiter, hard_delimiter):
"""Parses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens)
and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function.
"""
return [
[
sum(len(token) for token in sentence_it)
for sentence_it in split_at(block_it, soft_delimiter)
]
for block_it in split_at(stream, hard_delimiter)
]
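
A compact sketch of align_blocks() and align_texts() on hypothetical sentence lengths in characters (one block of three sentences per text):

source_lengths = [42, 37, 51]
target_lengths = [45, 40, 48]
print(align_blocks(source_lengths, target_lengths))      # e.g. [(0, 0), (1, 1), (2, 2)]
print(align_texts([source_lengths], [target_lengths]))   # one alignment list per block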

View File

@@ -0,0 +1,138 @@
# Natural Language Toolkit: GDFA word alignment symmetrization
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Liling Tan
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from collections import defaultdict
def grow_diag_final_and(srclen, trglen, e2f, f2e):
"""
This function symmetrizes the source-to-target and target-to-source
word alignment outputs using the grow-diag-final-and (GDFA) algorithm (Koehn, 2005).
Step 1: Find the intersection of the bidirectional alignment.
Step 2: Search for additional neighbor alignment points to be added, given
these criteria: (i) neighbor alignments points are not in the
intersection and (ii) neighbor alignments are in the union.
Step 3: Add all other alignment points that are not in the intersection, not in
the neighboring alignments that met the criteria but in the original
forward/backward alignment outputs.
>>> forw = ('0-0 2-1 9-2 21-3 10-4 7-5 11-6 9-7 12-8 1-9 3-10 '
... '4-11 17-12 17-13 25-14 13-15 24-16 11-17 28-18')
>>> back = ('0-0 1-9 2-9 3-10 4-11 5-12 6-6 7-5 8-6 9-7 10-4 '
... '11-6 12-8 13-12 15-12 17-13 18-13 19-12 20-13 '
... '21-3 22-12 23-14 24-17 25-15 26-17 27-18 28-18')
>>> srctext = ("この よう な ハロー 白色 わい 星 の 関数 "
... " と 共 に 不連続 に 増加 する こと が "
... "期待 さ れる こと を 示し た 。")
>>> trgtext = ("Therefore , we expect that the luminosity function "
... "of such halo white dwarfs increases discontinuously "
... "with the luminosity .")
>>> srclen = len(srctext.split())
>>> trglen = len(trgtext.split())
>>>
>>> gdfa = grow_diag_final_and(srclen, trglen, forw, back)
>>> gdfa == sorted(set([(28, 18), (6, 6), (24, 17), (2, 1), (15, 12), (13, 12),
... (2, 9), (3, 10), (26, 17), (25, 15), (8, 6), (9, 7), (20,
... 13), (18, 13), (0, 0), (10, 4), (13, 15), (23, 14), (7, 5),
... (25, 14), (1, 9), (17, 13), (4, 11), (11, 17), (9, 2), (22,
... 12), (27, 18), (24, 16), (21, 3), (19, 12), (17, 12), (5,
... 12), (11, 6), (12, 8)]))
True
References:
Koehn, P., A. Axelrod, A. Birch, C. Callison, M. Osborne, and D. Talbot.
2005. Edinburgh System Description for the 2005 IWSLT Speech
Translation Evaluation. In MT Eval Workshop.
:type srclen: int
:param srclen: the number of tokens in the source language
:type trglen: int
:param trglen: the number of tokens in the target language
:type e2f: str
:param e2f: the forward word alignment outputs from source-to-target
language (in pharaoh output format)
:type f2e: str
:param f2e: the backward word alignment outputs from target-to-source
language (in pharaoh output format)
:rtype: list(tuple(int))
:return: the sorted, symmetrized alignment points from the GDFA algorithm
"""
# Converts pharaoh text format into list of tuples.
e2f = [tuple(map(int, a.split("-"))) for a in e2f.split()]
f2e = [tuple(map(int, a.split("-"))) for a in f2e.split()]
neighbors = [(-1, 0), (0, -1), (1, 0), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1)]
alignment = set(e2f).intersection(set(f2e)) # Find the intersection.
union = set(e2f).union(set(f2e))
# *aligned* is used to check if neighbors are aligned in grow_diag()
aligned = defaultdict(set)
for i, j in alignment:
aligned["e"].add(i)
aligned["f"].add(j)
def grow_diag():
"""
Search for neighboring points and add them to the intersected alignment
points if the criteria are met.
"""
prev_len = len(alignment) - 1
# iterate until no new points added
while prev_len < len(alignment):
no_new_points = True
# for english word e = 0 ... en
for e in range(srclen):
# for foreign word f = 0 ... fn
for f in range(trglen):
# if ( e aligned with f)
if (e, f) in alignment:
# for each neighboring point (e-new, f-new)
for neighbor in neighbors:
neighbor = tuple(i + j for i, j in zip((e, f), neighbor))
e_new, f_new = neighbor
# if ( ( e-new not aligned and f-new not aligned)
# and (e-new, f-new in union(e2f, f2e) )
if (
e_new not in aligned and f_new not in aligned
) and neighbor in union:
alignment.add(neighbor)
aligned["e"].add(e_new)
aligned["f"].add(f_new)
prev_len += 1
no_new_points = False
# iterate until no new points added
if no_new_points:
break
def final_and(a):
"""
Adds remaining points that are not in the intersection, not in the
neighboring alignments but in the original *e2f* and *f2e* alignments
"""
# for english word e = 0 ... en
for e_new in range(srclen):
# for foreign word f = 0 ... fn
for f_new in range(trglen):
# if ( ( e-new not aligned and f-new not aligned)
# and (e-new, f-new in union(e2f, f2e) )
if (
e_new not in aligned
and f_new not in aligned
and (e_new, f_new) in union
):
alignment.add((e_new, f_new))
aligned["e"].add(e_new)
aligned["f"].add(f_new)
grow_diag()
final_and(e2f)
final_and(f2e)
return sorted(alignment)
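
A tiny sketch of grow_diag_final_and() on made-up pharaoh-format alignments over three source and three target tokens:

forward = '0-0 1-1 2-2'
backward = '0-0 1-1 1-2'
print(grow_diag_final_and(srclen=3, trglen=3, e2f=forward, f2e=backward))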

View File

@@ -0,0 +1,190 @@
# Natural Language Toolkit: GLEU Score
#
# Copyright (C) 2001-2025 NLTK Project
# Authors:
# Contributors: Mike Schuster, Michael Wayne Goodman, Liling Tan
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
""" GLEU score implementation. """
from collections import Counter
from nltk.util import everygrams, ngrams
def sentence_gleu(references, hypothesis, min_len=1, max_len=4):
"""
Calculates the sentence level GLEU (Google-BLEU) score described in
Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi,
Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey,
Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser,
Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens,
George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith,
Jason Riesa, Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes,
Jeffrey Dean. (2016) Google's Neural Machine Translation System:
Bridging the Gap between Human and Machine Translation.
eprint arXiv:1609.08144. https://arxiv.org/pdf/1609.08144v2.pdf
Retrieved on 27 Oct 2016.
From Wu et al. (2016):
"The BLEU score has some undesirable properties when used for single
sentences, as it was designed to be a corpus measure. We therefore
use a slightly different score for our RL experiments which we call
the 'GLEU score'. For the GLEU score, we record all sub-sequences of
1, 2, 3 or 4 tokens in output and target sequence (n-grams). We then
compute a recall, which is the ratio of the number of matching n-grams
to the number of total n-grams in the target (ground truth) sequence,
and a precision, which is the ratio of the number of matching n-grams
to the number of total n-grams in the generated output sequence. Then
GLEU score is simply the minimum of recall and precision. This GLEU
score's range is always between 0 (no matches) and 1 (all match) and
it is symmetrical when switching output and target. According to
our experiments, GLEU score correlates quite well with the BLEU
metric on a corpus level but does not have its drawbacks for our per
sentence reward objective."
Note: The initial implementation only allowed a single reference, but now
a list of references is required (which is consistent with
bleu_score.sentence_bleu()).
The infamous "the the the ... " example
>>> ref = 'the cat is on the mat'.split()
>>> hyp = 'the the the the the the the'.split()
>>> sentence_gleu([ref], hyp) # doctest: +ELLIPSIS
0.0909...
An example to evaluate normal machine translation outputs
>>> ref1 = str('It is a guide to action that ensures that the military '
... 'will forever heed Party commands').split()
>>> hyp1 = str('It is a guide to action which ensures that the military '
... 'always obeys the commands of the party').split()
>>> hyp2 = str('It is to insure the troops forever hearing the activity '
... 'guidebook that party direct').split()
>>> sentence_gleu([ref1], hyp1) # doctest: +ELLIPSIS
0.4393...
>>> sentence_gleu([ref1], hyp2) # doctest: +ELLIPSIS
0.1206...
:param references: a list of reference sentences
:type references: list(list(str))
:param hypothesis: a hypothesis sentence
:type hypothesis: list(str)
:param min_len: The minimum order of n-gram this function should extract.
:type min_len: int
:param max_len: The maximum order of n-gram this function should extract.
:type max_len: int
:return: the sentence level GLEU score.
:rtype: float
"""
return corpus_gleu([references], [hypothesis], min_len=min_len, max_len=max_len)
def corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4):
"""
Calculate a single corpus-level GLEU score (aka. system-level GLEU) for all
the hypotheses and their respective references.
Instead of averaging the sentence level GLEU scores (i.e. macro-average
precision), Wu et al. (2016) sum up the matching tokens and the max of
hypothesis and reference tokens for each sentence, then compute the score
from these aggregate values.
From Mike Schuster (via email):
"For the corpus, we just add up the two statistics n_match and
n_all = max(n_all_output, n_all_target) for all sentences, then
calculate gleu_score = n_match / n_all, so it is not just a mean of
the sentence gleu scores (in our case, longer sentences count more,
which I think makes sense as they are more difficult to translate)."
>>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
... 'ensures', 'that', 'the', 'military', 'always',
... 'obeys', 'the', 'commands', 'of', 'the', 'party']
>>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
... 'ensures', 'that', 'the', 'military', 'will', 'forever',
... 'heed', 'Party', 'commands']
>>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
... 'guarantees', 'the', 'military', 'forces', 'always',
... 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
>>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
... 'army', 'always', 'to', 'heed', 'the', 'directions',
... 'of', 'the', 'party']
>>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
... 'interested', 'in', 'world', 'history']
>>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
... 'because', 'he', 'read', 'the', 'book']
>>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
>>> hypotheses = [hyp1, hyp2]
>>> corpus_gleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
0.5673...
The example below shows that corpus_gleu() is different from averaging
sentence_gleu() for hypotheses
>>> score1 = sentence_gleu([ref1a], hyp1)
>>> score2 = sentence_gleu([ref2a], hyp2)
>>> (score1 + score2) / 2 # doctest: +ELLIPSIS
0.6144...
:param list_of_references: a list of reference sentences, w.r.t. hypotheses
:type list_of_references: list(list(list(str)))
:param hypotheses: a list of hypothesis sentences
:type hypotheses: list(list(str))
:param min_len: The minimum order of n-gram this function should extract.
:type min_len: int
:param max_len: The maximum order of n-gram this function should extract.
:type max_len: int
:return: The corpus-level GLEU score.
:rtype: float
"""
# sanity check
assert len(list_of_references) == len(
hypotheses
), "The number of hypotheses and their reference(s) should be the same"
# sum matches and max-token-lengths over all sentences
corpus_n_match = 0
corpus_n_all = 0
for references, hypothesis in zip(list_of_references, hypotheses):
hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
tpfp = sum(hyp_ngrams.values()) # True positives + False positives.
hyp_counts = []
for reference in references:
ref_ngrams = Counter(everygrams(reference, min_len, max_len))
tpfn = sum(ref_ngrams.values()) # True positives + False negatives.
overlap_ngrams = ref_ngrams & hyp_ngrams
tp = sum(overlap_ngrams.values()) # True positives.
# While GLEU is defined as the minimum of precision and
# recall, we can reduce the number of division operations by one by
# instead finding the maximum of the denominators for the precision
# and recall formulae, since the numerators are the same:
# precision = tp / tpfp
# recall = tp / tpfn
# gleu_score = min(precision, recall) == tp / max(tpfp, tpfn)
n_all = max(tpfp, tpfn)
if n_all > 0:
hyp_counts.append((tp, n_all))
# use the reference yielding the highest score
if hyp_counts:
n_match, n_all = max(hyp_counts, key=lambda hc: hc[0] / hc[1])
corpus_n_match += n_match
corpus_n_all += n_all
# corner case: empty corpus or empty references---don't divide by zero!
if corpus_n_all == 0:
gleu_score = 0.0
else:
gleu_score = corpus_n_match / corpus_n_all
return gleu_score
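A small illustrative sketch (not part of the module) of the identity used in the loop above, min(precision, recall) == tp / max(tpfp, tpfn), worked through on the "the the the ..." doctest from ``sentence_gleu``; every value below follows directly from counting n-grams of orders 1-4.

from collections import Counter

from nltk.util import everygrams

ref = "the cat is on the mat".split()
hyp = "the the the the the the the".split()
hyp_ngrams = Counter(everygrams(hyp, 1, 4))
ref_ngrams = Counter(everygrams(ref, 1, 4))
tp = sum((ref_ngrams & hyp_ngrams).values())  # matching n-grams: 2 ("the" twice)
tpfp = sum(hyp_ngrams.values())  # n-grams in the hypothesis: 7 + 6 + 5 + 4 = 22
tpfn = sum(ref_ngrams.values())  # n-grams in the reference: 6 + 5 + 4 + 3 = 18
assert min(tp / tpfp, tp / tpfn) == tp / max(tpfp, tpfn)  # 2 / 22 = 0.0909...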

View File

@@ -0,0 +1,251 @@
# Natural Language Toolkit: IBM Model 1
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Chin Yee Lee <c.lee32@student.unimelb.edu.au>
# Hengfeng Li <hengfeng12345@gmail.com>
# Ruxin Hou <r.hou@student.unimelb.edu.au>
# Calvin Tanujaya Lim <c.tanujayalim@gmail.com>
# Based on earlier version by:
# Will Zhang <wilzzha@gmail.com>
# Guan Gui <ggui@student.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Lexical translation model that ignores word order.
In IBM Model 1, word order is ignored for simplicity. As long as the
word alignments are equivalent, it doesn't matter where the word occurs
in the source or target sentence. Thus, the following three alignments
are equally likely::
Source: je mange du jambon
Target: i eat some ham
Alignment: (0,0) (1,1) (2,2) (3,3)
Source: je mange du jambon
Target: some ham eat i
Alignment: (0,2) (1,3) (2,1) (3,0)
Source: du jambon je mange
Target: eat i some ham
Alignment: (0,3) (1,2) (2,0) (3,1)
Note that an alignment is represented here as
(word_index_in_target, word_index_in_source).
The EM algorithm used in Model 1 is:
:E step: In the training data, count how many times a source language
word is translated into a target language word, weighted by
the prior probability of the translation.
:M step: Estimate the new probability of translation based on the
counts from the Expectation step.
Notations
---------
:i: Position in the source sentence
Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
:j: Position in the target sentence
Valid values are 1, 2, ..., length of target sentence
:s: A word in the source language
:t: A word in the target language
References
----------
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.
Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""
import warnings
from collections import defaultdict
from nltk.translate import AlignedSent, Alignment, IBMModel
from nltk.translate.ibm_model import Counts
class IBMModel1(IBMModel):
"""
Lexical translation model that ignores word order
>>> bitext = []
>>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
>>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
>>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
>>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
>>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
>>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
>>> ibm1 = IBMModel1(bitext, 5)
>>> print(round(ibm1.translation_table['buch']['book'], 3))
0.889
>>> print(round(ibm1.translation_table['das']['book'], 3))
0.062
>>> print(round(ibm1.translation_table['buch'][None], 3))
0.113
>>> print(round(ibm1.translation_table['ja'][None], 3))
0.073
>>> test_sentence = bitext[2]
>>> test_sentence.words
['das', 'buch', 'ist', 'ja', 'klein']
>>> test_sentence.mots
['the', 'book', 'is', 'small']
>>> test_sentence.alignment
Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])
"""
def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
"""
Train on ``sentence_aligned_corpus`` and create a lexical
translation model.
Translation direction is from ``AlignedSent.mots`` to
``AlignedSent.words``.
:param sentence_aligned_corpus: Sentence-aligned parallel corpus
:type sentence_aligned_corpus: list(AlignedSent)
:param iterations: Number of iterations to run training algorithm
:type iterations: int
:param probability_tables: Optional. Use this to pass in custom
probability values. If not specified, probabilities will be
set to a uniform distribution, or some other sensible value.
If specified, the following entry must be present:
``translation_table``.
See ``IBMModel`` for the type and purpose of this table.
:type probability_tables: dict[str]: object
"""
super().__init__(sentence_aligned_corpus)
if probability_tables is None:
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
self.translation_table = probability_tables["translation_table"]
for n in range(0, iterations):
self.train(sentence_aligned_corpus)
self.align_all(sentence_aligned_corpus)
def set_uniform_probabilities(self, sentence_aligned_corpus):
initial_prob = 1 / len(self.trg_vocab)
if initial_prob < IBMModel.MIN_PROB:
warnings.warn(
"Target language vocabulary is too large ("
+ str(len(self.trg_vocab))
+ " words). "
"Results may be less accurate."
)
for t in self.trg_vocab:
self.translation_table[t] = defaultdict(lambda: initial_prob)
def train(self, parallel_corpus):
counts = Counts()
for aligned_sentence in parallel_corpus:
trg_sentence = aligned_sentence.words
src_sentence = [None] + aligned_sentence.mots
# E step (a): Compute normalization factors to weigh counts
total_count = self.prob_all_alignments(src_sentence, trg_sentence)
# E step (b): Collect counts
for t in trg_sentence:
for s in src_sentence:
count = self.prob_alignment_point(s, t)
normalized_count = count / total_count[t]
counts.t_given_s[t][s] += normalized_count
counts.any_t_given_s[s] += normalized_count
# M step: Update probabilities with maximum likelihood estimate
self.maximize_lexical_translation_probabilities(counts)
def prob_all_alignments(self, src_sentence, trg_sentence):
"""
Computes the probability of all possible word alignments,
expressed as a marginal distribution over target words t
Each entry in the return value represents the contribution to
the total alignment probability by the target word t.
To obtain probability(alignment | src_sentence, trg_sentence),
simply sum the entries in the return value.
:return: Probability of t for all s in ``src_sentence``
:rtype: dict(str): float
"""
alignment_prob_for_t = defaultdict(float)
for t in trg_sentence:
for s in src_sentence:
alignment_prob_for_t[t] += self.prob_alignment_point(s, t)
return alignment_prob_for_t
def prob_alignment_point(self, s, t):
"""
Probability that word ``t`` in the target sentence is aligned to
word ``s`` in the source sentence
"""
return self.translation_table[t][s]
def prob_t_a_given_s(self, alignment_info):
"""
Probability of target sentence and an alignment given the
source sentence
"""
prob = 1.0
for j, i in enumerate(alignment_info.alignment):
if j == 0:
continue # skip the dummy zeroeth element
trg_word = alignment_info.trg_sentence[j]
src_word = alignment_info.src_sentence[i]
prob *= self.translation_table[trg_word][src_word]
return max(prob, IBMModel.MIN_PROB)
def align_all(self, parallel_corpus):
for sentence_pair in parallel_corpus:
self.align(sentence_pair)
def align(self, sentence_pair):
"""
Determines the best word alignment for one sentence pair from
the corpus that the model was trained on.
The best alignment will be set in ``sentence_pair`` when the
method returns. In contrast with the internal implementation of
IBM models, the word indices in the ``Alignment`` are zero-
indexed, not one-indexed.
:param sentence_pair: A sentence in the source language and its
counterpart sentence in the target language
:type sentence_pair: AlignedSent
"""
best_alignment = []
for j, trg_word in enumerate(sentence_pair.words):
# Initialize trg_word to align with the NULL token
best_prob = max(self.translation_table[trg_word][None], IBMModel.MIN_PROB)
best_alignment_point = None
for i, src_word in enumerate(sentence_pair.mots):
align_prob = self.translation_table[trg_word][src_word]
if align_prob >= best_prob: # prefer newer word in case of tie
best_prob = align_prob
best_alignment_point = i
best_alignment.append((j, best_alignment_point))
sentence_pair.alignment = Alignment(best_alignment)
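An illustrative sketch (not part of the module) of the E-step weighting performed in ``train`` above: each (target word, source word) count is the translation probability normalized by the total probability mass that every source position, including NULL, assigns to that target word. The toy table and sentence pair below are made up for the example.

from collections import defaultdict

# Made-up lexical probabilities t(t | s) for a single German/English pair.
translation_table = {
    "das": {None: 0.2, "the": 0.6, "house": 0.2},
    "haus": {None: 0.1, "the": 0.2, "house": 0.7},
}
src_sentence = [None, "the", "house"]  # NULL prepended, as in train()
trg_sentence = ["das", "haus"]

counts = defaultdict(lambda: defaultdict(float))
for t in trg_sentence:
    total = sum(translation_table[t][s] for s in src_sentence)  # normalization factor
    for s in src_sentence:
        counts[t][s] += translation_table[t][s] / total  # expected (fractional) count
# counts["haus"]["house"] == 0.7 here, because t("haus" | .) already sums to 1.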

View File

@@ -0,0 +1,319 @@
# Natural Language Toolkit: IBM Model 2
#
# Copyright (C) 2001-2013 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Lexical translation model that considers word order.
IBM Model 2 improves on Model 1 by accounting for word order.
An alignment probability is introduced, a(i | j,l,m), which predicts
a source word position, given its aligned target word's position.
The EM algorithm used in Model 2 is:
:E step: In the training data, collect counts, weighted by prior
probabilities.
- (a) count how many times a source language word is translated
into a target language word
- (b) count how many times a particular position in the source
sentence is aligned to a particular position in the target
sentence
:M step: Estimate new probabilities based on the counts from the E step
Notations
---------
:i: Position in the source sentence
Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
:j: Position in the target sentence
Valid values are 1, 2, ..., length of target sentence
:l: Number of words in the source sentence, excluding NULL
:m: Number of words in the target sentence
:s: A word in the source language
:t: A word in the target language
References
----------
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.
Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""
import warnings
from collections import defaultdict
from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel1
from nltk.translate.ibm_model import Counts
class IBMModel2(IBMModel):
"""
Lexical translation model that considers word order
>>> bitext = []
>>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
>>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
>>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
>>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
>>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
>>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
>>> ibm2 = IBMModel2(bitext, 5)
>>> print(round(ibm2.translation_table['buch']['book'], 3))
1.0
>>> print(round(ibm2.translation_table['das']['book'], 3))
0.0
>>> print(round(ibm2.translation_table['buch'][None], 3))
0.0
>>> print(round(ibm2.translation_table['ja'][None], 3))
0.0
>>> print(round(ibm2.alignment_table[1][1][2][2], 3))
0.939
>>> print(round(ibm2.alignment_table[1][2][2][2], 3))
0.0
>>> print(round(ibm2.alignment_table[2][2][4][5], 3))
1.0
>>> test_sentence = bitext[2]
>>> test_sentence.words
['das', 'buch', 'ist', 'ja', 'klein']
>>> test_sentence.mots
['the', 'book', 'is', 'small']
>>> test_sentence.alignment
Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])
"""
def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
"""
Train on ``sentence_aligned_corpus`` and create a lexical
translation model and an alignment model.
Translation direction is from ``AlignedSent.mots`` to
``AlignedSent.words``.
:param sentence_aligned_corpus: Sentence-aligned parallel corpus
:type sentence_aligned_corpus: list(AlignedSent)
:param iterations: Number of iterations to run training algorithm
:type iterations: int
:param probability_tables: Optional. Use this to pass in custom
probability values. If not specified, probabilities will be
set to a uniform distribution, or some other sensible value.
If specified, all the following entries must be present:
``translation_table``, ``alignment_table``.
See ``IBMModel`` for the type and purpose of these tables.
:type probability_tables: dict[str]: object
"""
super().__init__(sentence_aligned_corpus)
if probability_tables is None:
# Get translation probabilities from IBM Model 1
# Run more iterations of training for Model 1, since it is
# faster than Model 2
ibm1 = IBMModel1(sentence_aligned_corpus, 2 * iterations)
self.translation_table = ibm1.translation_table
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
self.translation_table = probability_tables["translation_table"]
self.alignment_table = probability_tables["alignment_table"]
for n in range(0, iterations):
self.train(sentence_aligned_corpus)
self.align_all(sentence_aligned_corpus)
def set_uniform_probabilities(self, sentence_aligned_corpus):
# a(i | j,l,m) = 1 / (l+1) for all i, j, l, m
l_m_combinations = set()
for aligned_sentence in sentence_aligned_corpus:
l = len(aligned_sentence.mots)
m = len(aligned_sentence.words)
if (l, m) not in l_m_combinations:
l_m_combinations.add((l, m))
initial_prob = 1 / (l + 1)
if initial_prob < IBMModel.MIN_PROB:
warnings.warn(
"A source sentence is too long ("
+ str(l)
+ " words). Results may be less accurate."
)
for i in range(0, l + 1):
for j in range(1, m + 1):
self.alignment_table[i][j][l][m] = initial_prob
def train(self, parallel_corpus):
counts = Model2Counts()
for aligned_sentence in parallel_corpus:
src_sentence = [None] + aligned_sentence.mots
trg_sentence = ["UNUSED"] + aligned_sentence.words # 1-indexed
l = len(aligned_sentence.mots)
m = len(aligned_sentence.words)
# E step (a): Compute normalization factors to weigh counts
total_count = self.prob_all_alignments(src_sentence, trg_sentence)
# E step (b): Collect counts
for j in range(1, m + 1):
t = trg_sentence[j]
for i in range(0, l + 1):
s = src_sentence[i]
count = self.prob_alignment_point(i, j, src_sentence, trg_sentence)
normalized_count = count / total_count[t]
counts.update_lexical_translation(normalized_count, s, t)
counts.update_alignment(normalized_count, i, j, l, m)
# M step: Update probabilities with maximum likelihood estimates
self.maximize_lexical_translation_probabilities(counts)
self.maximize_alignment_probabilities(counts)
def maximize_alignment_probabilities(self, counts):
MIN_PROB = IBMModel.MIN_PROB
for i, j_s in counts.alignment.items():
for j, src_sentence_lengths in j_s.items():
for l, trg_sentence_lengths in src_sentence_lengths.items():
for m in trg_sentence_lengths:
estimate = (
counts.alignment[i][j][l][m]
/ counts.alignment_for_any_i[j][l][m]
)
self.alignment_table[i][j][l][m] = max(estimate, MIN_PROB)
def prob_all_alignments(self, src_sentence, trg_sentence):
"""
Computes the probability of all possible word alignments,
expressed as a marginal distribution over target words t
Each entry in the return value represents the contribution to
the total alignment probability by the target word t.
To obtain probability(alignment | src_sentence, trg_sentence),
simply sum the entries in the return value.
:return: Probability of t for all s in ``src_sentence``
:rtype: dict(str): float
"""
alignment_prob_for_t = defaultdict(float)
for j in range(1, len(trg_sentence)):
t = trg_sentence[j]
for i in range(0, len(src_sentence)):
alignment_prob_for_t[t] += self.prob_alignment_point(
i, j, src_sentence, trg_sentence
)
return alignment_prob_for_t
def prob_alignment_point(self, i, j, src_sentence, trg_sentence):
"""
Probability that position j in ``trg_sentence`` is aligned to
position i in the ``src_sentence``
"""
l = len(src_sentence) - 1
m = len(trg_sentence) - 1
s = src_sentence[i]
t = trg_sentence[j]
return self.translation_table[t][s] * self.alignment_table[i][j][l][m]
def prob_t_a_given_s(self, alignment_info):
"""
Probability of target sentence and an alignment given the
source sentence
"""
prob = 1.0
l = len(alignment_info.src_sentence) - 1
m = len(alignment_info.trg_sentence) - 1
for j, i in enumerate(alignment_info.alignment):
if j == 0:
continue # skip the dummy zeroeth element
trg_word = alignment_info.trg_sentence[j]
src_word = alignment_info.src_sentence[i]
prob *= (
self.translation_table[trg_word][src_word]
* self.alignment_table[i][j][l][m]
)
return max(prob, IBMModel.MIN_PROB)
def align_all(self, parallel_corpus):
for sentence_pair in parallel_corpus:
self.align(sentence_pair)
def align(self, sentence_pair):
"""
Determines the best word alignment for one sentence pair from
the corpus that the model was trained on.
The best alignment will be set in ``sentence_pair`` when the
method returns. In contrast with the internal implementation of
IBM models, the word indices in the ``Alignment`` are zero-
indexed, not one-indexed.
:param sentence_pair: A sentence in the source language and its
counterpart sentence in the target language
:type sentence_pair: AlignedSent
"""
best_alignment = []
l = len(sentence_pair.mots)
m = len(sentence_pair.words)
for j, trg_word in enumerate(sentence_pair.words):
# Initialize trg_word to align with the NULL token
best_prob = (
self.translation_table[trg_word][None]
* self.alignment_table[0][j + 1][l][m]
)
best_prob = max(best_prob, IBMModel.MIN_PROB)
best_alignment_point = None
for i, src_word in enumerate(sentence_pair.mots):
align_prob = (
self.translation_table[trg_word][src_word]
* self.alignment_table[i + 1][j + 1][l][m]
)
if align_prob >= best_prob:
best_prob = align_prob
best_alignment_point = i
best_alignment.append((j, best_alignment_point))
sentence_pair.alignment = Alignment(best_alignment)
class Model2Counts(Counts):
"""
Data object to store counts of various parameters during training.
Includes counts for alignment.
"""
def __init__(self):
super().__init__()
self.alignment = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
)
self.alignment_for_any_i = defaultdict(
lambda: defaultdict(lambda: defaultdict(float))
)
def update_lexical_translation(self, count, s, t):
self.t_given_s[t][s] += count
self.any_t_given_s[s] += count
def update_alignment(self, count, i, j, l, m):
self.alignment[i][j][l][m] += count
self.alignment_for_any_i[j][l][m] += count
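An illustrative sketch (not part of the module) of the ``probability_tables`` hook documented in the constructor above: a separately trained Model 1 lexical table is reused, and a nested defaultdict with an arbitrary constant stands in for ``alignment_table`` (training reestimates it; the constant 0.3 is a placeholder, not the uniform 1/(l+1) initialization).

from collections import defaultdict

from nltk.translate import AlignedSent, IBMModel1, IBMModel2

bitext = [
    AlignedSent(["das", "haus"], ["the", "house"]),
    AlignedSent(["das", "buch"], ["the", "book"]),
    AlignedSent(["ein", "buch"], ["a", "book"]),
]
ibm1 = IBMModel1(bitext, 10)  # lexical probabilities only
alignment_table = defaultdict(
    lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.3)))
)
ibm2 = IBMModel2(
    bitext,
    5,
    probability_tables={
        "translation_table": ibm1.translation_table,
        "alignment_table": alignment_table,
    },
)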

View File

@@ -0,0 +1,346 @@
# Natural Language Toolkit: IBM Model 3
#
# Copyright (C) 2001-2013 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Translation model that considers how a word can be aligned to
multiple words in another language.
IBM Model 3 improves on Model 2 by directly modeling the phenomenon
where a word in one language may be translated into zero or more words
in another. This is expressed by the fertility probability,
n(phi | source word).
If a source word translates into more than one word, it is possible to
generate sentences that have the same alignment in multiple ways. This
is modeled by a distortion step. The distortion probability, d(j|i,l,m),
predicts a target word position, given its aligned source word's
position. The distortion probability replaces the alignment probability
of Model 2.
The fertility probability is not applicable for NULL. Target words that
align to NULL are assumed to be distributed uniformly in the target
sentence. The existence of these words is modeled by p1, the probability
that a target word produced by a real source word requires another
target word that is produced by NULL.
The EM algorithm used in Model 3 is:
:E step: In the training data, collect counts, weighted by prior
probabilities.
- (a) count how many times a source language word is translated
into a target language word
- (b) count how many times a particular position in the target
sentence is aligned to a particular position in the source
sentence
- (c) count how many times a source word is aligned to phi number
of target words
- (d) count how many times NULL is aligned to a target word
:M step: Estimate new probabilities based on the counts from the E step
Because there are too many possible alignments, only the most probable
ones are considered. First, the best alignment is determined using prior
probabilities. Then, a hill climbing approach is used to find other good
candidates.
Notations
---------
:i: Position in the source sentence
Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
:j: Position in the target sentence
Valid values are 1, 2, ..., length of target sentence
:l: Number of words in the source sentence, excluding NULL
:m: Number of words in the target sentence
:s: A word in the source language
:t: A word in the target language
:phi: Fertility, the number of target words produced by a source word
:p1: Probability that a target word produced by a source word is
accompanied by another target word that is aligned to NULL
:p0: 1 - p1
References
----------
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.
Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""
import warnings
from collections import defaultdict
from math import factorial
from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel2
from nltk.translate.ibm_model import Counts
class IBMModel3(IBMModel):
"""
Translation model that considers how a word can be aligned to
multiple words in another language
>>> bitext = []
>>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
>>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
>>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
>>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
>>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
>>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
>>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
>>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
>>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))
>>> ibm3 = IBMModel3(bitext, 5)
>>> print(round(ibm3.translation_table['buch']['book'], 3))
1.0
>>> print(round(ibm3.translation_table['das']['book'], 3))
0.0
>>> print(round(ibm3.translation_table['ja'][None], 3))
1.0
>>> print(round(ibm3.distortion_table[1][1][2][2], 3))
1.0
>>> print(round(ibm3.distortion_table[1][2][2][2], 3))
0.0
>>> print(round(ibm3.distortion_table[2][2][4][5], 3))
0.75
>>> print(round(ibm3.fertility_table[2]['summarize'], 3))
1.0
>>> print(round(ibm3.fertility_table[1]['book'], 3))
1.0
>>> print(round(ibm3.p1, 3))
0.054
>>> test_sentence = bitext[2]
>>> test_sentence.words
['das', 'buch', 'ist', 'ja', 'klein']
>>> test_sentence.mots
['the', 'book', 'is', 'small']
>>> test_sentence.alignment
Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])
"""
def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
"""
Train on ``sentence_aligned_corpus`` and create a lexical
translation model, a distortion model, a fertility model, and a
model for generating NULL-aligned words.
Translation direction is from ``AlignedSent.mots`` to
``AlignedSent.words``.
:param sentence_aligned_corpus: Sentence-aligned parallel corpus
:type sentence_aligned_corpus: list(AlignedSent)
:param iterations: Number of iterations to run training algorithm
:type iterations: int
:param probability_tables: Optional. Use this to pass in custom
probability values. If not specified, probabilities will be
set to a uniform distribution, or some other sensible value.
If specified, all the following entries must be present:
``translation_table``, ``alignment_table``,
``fertility_table``, ``p1``, ``distortion_table``.
See ``IBMModel`` for the type and purpose of these tables.
:type probability_tables: dict[str]: object
"""
super().__init__(sentence_aligned_corpus)
self.reset_probabilities()
if probability_tables is None:
# Get translation and alignment probabilities from IBM Model 2
ibm2 = IBMModel2(sentence_aligned_corpus, iterations)
self.translation_table = ibm2.translation_table
self.alignment_table = ibm2.alignment_table
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
self.translation_table = probability_tables["translation_table"]
self.alignment_table = probability_tables["alignment_table"]
self.fertility_table = probability_tables["fertility_table"]
self.p1 = probability_tables["p1"]
self.distortion_table = probability_tables["distortion_table"]
for n in range(0, iterations):
self.train(sentence_aligned_corpus)
def reset_probabilities(self):
super().reset_probabilities()
self.distortion_table = defaultdict(
lambda: defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
)
)
"""
dict[int][int][int][int]: float. Probability(j | i,l,m).
Values accessed as ``distortion_table[j][i][l][m]``.
"""
def set_uniform_probabilities(self, sentence_aligned_corpus):
# d(j | i,l,m) = 1 / m for all i, j, l, m
l_m_combinations = set()
for aligned_sentence in sentence_aligned_corpus:
l = len(aligned_sentence.mots)
m = len(aligned_sentence.words)
if (l, m) not in l_m_combinations:
l_m_combinations.add((l, m))
initial_prob = 1 / m
if initial_prob < IBMModel.MIN_PROB:
warnings.warn(
"A target sentence is too long ("
+ str(m)
+ " words). Results may be less accurate."
)
for j in range(1, m + 1):
for i in range(0, l + 1):
self.distortion_table[j][i][l][m] = initial_prob
# simple initialization, taken from GIZA++
self.fertility_table[0] = defaultdict(lambda: 0.2)
self.fertility_table[1] = defaultdict(lambda: 0.65)
self.fertility_table[2] = defaultdict(lambda: 0.1)
self.fertility_table[3] = defaultdict(lambda: 0.04)
MAX_FERTILITY = 10
initial_fert_prob = 0.01 / (MAX_FERTILITY - 4)
for phi in range(4, MAX_FERTILITY):
self.fertility_table[phi] = defaultdict(lambda: initial_fert_prob)
self.p1 = 0.5
def train(self, parallel_corpus):
counts = Model3Counts()
for aligned_sentence in parallel_corpus:
l = len(aligned_sentence.mots)
m = len(aligned_sentence.words)
# Sample the alignment space
sampled_alignments, best_alignment = self.sample(aligned_sentence)
# Record the most probable alignment
aligned_sentence.alignment = Alignment(
best_alignment.zero_indexed_alignment()
)
# E step (a): Compute normalization factors to weigh counts
total_count = self.prob_of_alignments(sampled_alignments)
# E step (b): Collect counts
for alignment_info in sampled_alignments:
count = self.prob_t_a_given_s(alignment_info)
normalized_count = count / total_count
for j in range(1, m + 1):
counts.update_lexical_translation(
normalized_count, alignment_info, j
)
counts.update_distortion(normalized_count, alignment_info, j, l, m)
counts.update_null_generation(normalized_count, alignment_info)
counts.update_fertility(normalized_count, alignment_info)
# M step: Update probabilities with maximum likelihood estimates
# If any probability is less than MIN_PROB, clamp it to MIN_PROB
existing_alignment_table = self.alignment_table
self.reset_probabilities()
self.alignment_table = existing_alignment_table # don't retrain
self.maximize_lexical_translation_probabilities(counts)
self.maximize_distortion_probabilities(counts)
self.maximize_fertility_probabilities(counts)
self.maximize_null_generation_probabilities(counts)
def maximize_distortion_probabilities(self, counts):
MIN_PROB = IBMModel.MIN_PROB
for j, i_s in counts.distortion.items():
for i, src_sentence_lengths in i_s.items():
for l, trg_sentence_lengths in src_sentence_lengths.items():
for m in trg_sentence_lengths:
estimate = (
counts.distortion[j][i][l][m]
/ counts.distortion_for_any_j[i][l][m]
)
self.distortion_table[j][i][l][m] = max(estimate, MIN_PROB)
def prob_t_a_given_s(self, alignment_info):
"""
Probability of target sentence and an alignment given the
source sentence
"""
src_sentence = alignment_info.src_sentence
trg_sentence = alignment_info.trg_sentence
l = len(src_sentence) - 1 # exclude NULL
m = len(trg_sentence) - 1
p1 = self.p1
p0 = 1 - p1
probability = 1.0
MIN_PROB = IBMModel.MIN_PROB
# Combine NULL insertion probability
null_fertility = alignment_info.fertility_of_i(0)
probability *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility)
if probability < MIN_PROB:
return MIN_PROB
# Compute combination (m - null_fertility) choose null_fertility
for i in range(1, null_fertility + 1):
probability *= (m - null_fertility - i + 1) / i
if probability < MIN_PROB:
return MIN_PROB
# Combine fertility probabilities
for i in range(1, l + 1):
fertility = alignment_info.fertility_of_i(i)
probability *= (
factorial(fertility) * self.fertility_table[fertility][src_sentence[i]]
)
if probability < MIN_PROB:
return MIN_PROB
# Combine lexical and distortion probabilities
for j in range(1, m + 1):
t = trg_sentence[j]
i = alignment_info.alignment[j]
s = src_sentence[i]
probability *= (
self.translation_table[t][s] * self.distortion_table[j][i][l][m]
)
if probability < MIN_PROB:
return MIN_PROB
return probability
class Model3Counts(Counts):
"""
Data object to store counts of various parameters during training.
Includes counts for distortion.
"""
def __init__(self):
super().__init__()
self.distortion = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
)
self.distortion_for_any_j = defaultdict(
lambda: defaultdict(lambda: defaultdict(float))
)
def update_distortion(self, count, alignment_info, j, l, m):
i = alignment_info.alignment[j]
self.distortion[j][i][l][m] += count
self.distortion_for_any_j[i][l][m] += count
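An illustrative aside (not part of the module) on the loop in ``prob_t_a_given_s`` above that multiplies ``(m - null_fertility - i + 1) / i``: it evaluates the binomial coefficient C(m - null_fertility, null_fertility), the number of ways the NULL-generated words can be interleaved among the target positions. The example values below are arbitrary.

from math import comb  # Python 3.8+

m, null_fertility = 7, 2
value = 1.0
for i in range(1, null_fertility + 1):
    value *= (m - null_fertility - i + 1) / i
assert value == comb(m - null_fertility, null_fertility)  # C(5, 2) == 10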

View File

@@ -0,0 +1,490 @@
# Natural Language Toolkit: IBM Model 4
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Translation model that reorders output words based on their type and
distance from other related words in the output sentence.
IBM Model 4 improves the distortion model of Model 3, motivated by the
observation that certain words tend to be re-ordered in a predictable
way relative to one another. For example, <adjective><noun> in English
usually has its order flipped as <noun><adjective> in French.
Model 4 requires words in the source and target vocabularies to be
categorized into classes. This can be linguistically driven, like parts
of speech (adjective, nouns, prepositions, etc). Word classes can also
be obtained by statistical methods. The original IBM Model 4 uses an
information theoretic approach to group words into 50 classes for each
vocabulary.
Terminology
-----------
:Cept:
A source word with non-zero fertility i.e. aligned to one or more
target words.
:Tablet:
The set of target word(s) aligned to a cept.
:Head of cept:
The first word of the tablet of that cept.
:Center of cept:
The average position of the words in that cept's tablet. If the
value is not an integer, the ceiling is taken.
For example, for a tablet with words in positions 2, 5, 6 in the
target sentence, the center of the corresponding cept is
ceil((2 + 5 + 6) / 3) = 5
:Displacement:
For a head word, defined as (position of head word - position of
previous cept's center). Can be positive or negative.
For a non-head word, defined as (position of non-head word -
position of previous word in the same tablet). Always positive,
because successive words in a tablet are assumed to appear to the
right of the previous word.
In contrast to Model 3 which reorders words in a tablet independently of
other words, Model 4 distinguishes between three cases.
1. Words generated by NULL are distributed uniformly.
2. For a head word t, its position is modeled by the probability
d_head(displacement | word_class_s(s),word_class_t(t)),
where s is the previous cept, and word_class_s and word_class_t maps
s and t to a source and target language word class respectively.
3. For a non-head word t, its position is modeled by the probability
d_non_head(displacement | word_class_t(t))
The EM algorithm used in Model 4 is:
:E step: In the training data, collect counts, weighted by prior
probabilities.
- (a) count how many times a source language word is translated
into a target language word
- (b) for a particular word class, count how many times a head
word is located at a particular displacement from the
previous cept's center
- (c) for a particular word class, count how many times a
non-head word is located at a particular displacement from
the previous target word
- (d) count how many times a source word is aligned to phi number
of target words
- (e) count how many times NULL is aligned to a target word
:M step: Estimate new probabilities based on the counts from the E step
Like Model 3, there are too many possible alignments to consider. Thus,
a hill climbing approach is used to sample good candidates.
Notations
---------
:i: Position in the source sentence
Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
:j: Position in the target sentence
Valid values are 1, 2, ..., length of target sentence
:l: Number of words in the source sentence, excluding NULL
:m: Number of words in the target sentence
:s: A word in the source language
:t: A word in the target language
:phi: Fertility, the number of target words produced by a source word
:p1: Probability that a target word produced by a source word is
accompanied by another target word that is aligned to NULL
:p0: 1 - p1
:dj: Displacement, Δj
References
----------
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.
Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""
import warnings
from collections import defaultdict
from math import factorial
from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel3
from nltk.translate.ibm_model import Counts, longest_target_sentence_length
class IBMModel4(IBMModel):
"""
Translation model that reorders output words based on their type and
their distance from other related words in the output sentence
>>> bitext = []
>>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
>>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
>>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
>>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
>>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
>>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
>>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
>>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
>>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))
>>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 }
>>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 }
>>> ibm4 = IBMModel4(bitext, 5, src_classes, trg_classes)
>>> print(round(ibm4.translation_table['buch']['book'], 3))
1.0
>>> print(round(ibm4.translation_table['das']['book'], 3))
0.0
>>> print(round(ibm4.translation_table['ja'][None], 3))
1.0
>>> print(round(ibm4.head_distortion_table[1][0][1], 3))
1.0
>>> print(round(ibm4.head_distortion_table[2][0][1], 3))
0.0
>>> print(round(ibm4.non_head_distortion_table[3][6], 3))
0.5
>>> print(round(ibm4.fertility_table[2]['summarize'], 3))
1.0
>>> print(round(ibm4.fertility_table[1]['book'], 3))
1.0
>>> print(round(ibm4.p1, 3))
0.033
>>> test_sentence = bitext[2]
>>> test_sentence.words
['das', 'buch', 'ist', 'ja', 'klein']
>>> test_sentence.mots
['the', 'book', 'is', 'small']
>>> test_sentence.alignment
Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])
"""
def __init__(
self,
sentence_aligned_corpus,
iterations,
source_word_classes,
target_word_classes,
probability_tables=None,
):
"""
Train on ``sentence_aligned_corpus`` and create a lexical
translation model, distortion models, a fertility model, and a
model for generating NULL-aligned words.
Translation direction is from ``AlignedSent.mots`` to
``AlignedSent.words``.
:param sentence_aligned_corpus: Sentence-aligned parallel corpus
:type sentence_aligned_corpus: list(AlignedSent)
:param iterations: Number of iterations to run training algorithm
:type iterations: int
:param source_word_classes: Lookup table that maps a source word
to its word class, the latter represented by an integer id
:type source_word_classes: dict[str]: int
:param target_word_classes: Lookup table that maps a target word
to its word class, the latter represented by an integer id
:type target_word_classes: dict[str]: int
:param probability_tables: Optional. Use this to pass in custom
probability values. If not specified, probabilities will be
set to a uniform distribution, or some other sensible value.
If specified, all the following entries must be present:
``translation_table``, ``alignment_table``,
``fertility_table``, ``p1``, ``head_distortion_table``,
``non_head_distortion_table``. See ``IBMModel`` and
``IBMModel4`` for the type and purpose of these tables.
:type probability_tables: dict[str]: object
"""
super().__init__(sentence_aligned_corpus)
self.reset_probabilities()
self.src_classes = source_word_classes
self.trg_classes = target_word_classes
if probability_tables is None:
# Get probabilities from IBM model 3
ibm3 = IBMModel3(sentence_aligned_corpus, iterations)
self.translation_table = ibm3.translation_table
self.alignment_table = ibm3.alignment_table
self.fertility_table = ibm3.fertility_table
self.p1 = ibm3.p1
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
self.translation_table = probability_tables["translation_table"]
self.alignment_table = probability_tables["alignment_table"]
self.fertility_table = probability_tables["fertility_table"]
self.p1 = probability_tables["p1"]
self.head_distortion_table = probability_tables["head_distortion_table"]
self.non_head_distortion_table = probability_tables[
"non_head_distortion_table"
]
for n in range(0, iterations):
self.train(sentence_aligned_corpus)
def reset_probabilities(self):
super().reset_probabilities()
self.head_distortion_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
)
"""
dict[int][int][int]: float. Probability(displacement of head
word | word class of previous cept,target word class).
Values accessed as ``head_distortion_table[dj][src_class][trg_class]``.
"""
self.non_head_distortion_table = defaultdict(
lambda: defaultdict(lambda: self.MIN_PROB)
)
"""
dict[int][int]: float. Probability(displacement of non-head
word | target word class).
Values accessed as ``non_head_distortion_table[dj][trg_class]``.
"""
def set_uniform_probabilities(self, sentence_aligned_corpus):
"""
Set distortion probabilities uniformly to
1 / cardinality of displacement values
"""
max_m = longest_target_sentence_length(sentence_aligned_corpus)
# The maximum displacement is m-1, when a word is in the last
# position m of the target sentence and the previously placed
# word is in the first position.
# Conversely, the minimum displacement is -(m-1).
# Thus, the displacement range is (m-1) - (-(m-1)). Note that
# displacement cannot be zero and is not included in the range.
if max_m <= 1:
initial_prob = IBMModel.MIN_PROB
else:
initial_prob = 1 / (2 * (max_m - 1))
if initial_prob < IBMModel.MIN_PROB:
warnings.warn(
"A target sentence is too long ("
+ str(max_m)
+ " words). Results may be less accurate."
)
for dj in range(1, max_m):
self.head_distortion_table[dj] = defaultdict(
lambda: defaultdict(lambda: initial_prob)
)
self.head_distortion_table[-dj] = defaultdict(
lambda: defaultdict(lambda: initial_prob)
)
self.non_head_distortion_table[dj] = defaultdict(lambda: initial_prob)
self.non_head_distortion_table[-dj] = defaultdict(lambda: initial_prob)
def train(self, parallel_corpus):
counts = Model4Counts()
for aligned_sentence in parallel_corpus:
m = len(aligned_sentence.words)
# Sample the alignment space
sampled_alignments, best_alignment = self.sample(aligned_sentence)
# Record the most probable alignment
aligned_sentence.alignment = Alignment(
best_alignment.zero_indexed_alignment()
)
# E step (a): Compute normalization factors to weigh counts
total_count = self.prob_of_alignments(sampled_alignments)
# E step (b): Collect counts
for alignment_info in sampled_alignments:
count = self.prob_t_a_given_s(alignment_info)
normalized_count = count / total_count
for j in range(1, m + 1):
counts.update_lexical_translation(
normalized_count, alignment_info, j
)
counts.update_distortion(
normalized_count,
alignment_info,
j,
self.src_classes,
self.trg_classes,
)
counts.update_null_generation(normalized_count, alignment_info)
counts.update_fertility(normalized_count, alignment_info)
# M step: Update probabilities with maximum likelihood estimates
# If any probability is less than MIN_PROB, clamp it to MIN_PROB
existing_alignment_table = self.alignment_table
self.reset_probabilities()
self.alignment_table = existing_alignment_table # don't retrain
self.maximize_lexical_translation_probabilities(counts)
self.maximize_distortion_probabilities(counts)
self.maximize_fertility_probabilities(counts)
self.maximize_null_generation_probabilities(counts)
def maximize_distortion_probabilities(self, counts):
head_d_table = self.head_distortion_table
for dj, src_classes in counts.head_distortion.items():
for s_cls, trg_classes in src_classes.items():
for t_cls in trg_classes:
estimate = (
counts.head_distortion[dj][s_cls][t_cls]
/ counts.head_distortion_for_any_dj[s_cls][t_cls]
)
head_d_table[dj][s_cls][t_cls] = max(estimate, IBMModel.MIN_PROB)
non_head_d_table = self.non_head_distortion_table
for dj, trg_classes in counts.non_head_distortion.items():
for t_cls in trg_classes:
estimate = (
counts.non_head_distortion[dj][t_cls]
/ counts.non_head_distortion_for_any_dj[t_cls]
)
non_head_d_table[dj][t_cls] = max(estimate, IBMModel.MIN_PROB)
def prob_t_a_given_s(self, alignment_info):
"""
Probability of target sentence and an alignment given the
source sentence
"""
return IBMModel4.model4_prob_t_a_given_s(alignment_info, self)
@staticmethod # exposed for Model 5 to use
def model4_prob_t_a_given_s(alignment_info, ibm_model):
probability = 1.0
MIN_PROB = IBMModel.MIN_PROB
def null_generation_term():
# Binomial distribution: B(m - null_fertility, p1)
value = 1.0
p1 = ibm_model.p1
p0 = 1 - p1
null_fertility = alignment_info.fertility_of_i(0)
m = len(alignment_info.trg_sentence) - 1
value *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility)
if value < MIN_PROB:
return MIN_PROB
# Combination: (m - null_fertility) choose null_fertility
for i in range(1, null_fertility + 1):
value *= (m - null_fertility - i + 1) / i
return value
def fertility_term():
value = 1.0
src_sentence = alignment_info.src_sentence
for i in range(1, len(src_sentence)):
fertility = alignment_info.fertility_of_i(i)
value *= (
factorial(fertility)
* ibm_model.fertility_table[fertility][src_sentence[i]]
)
if value < MIN_PROB:
return MIN_PROB
return value
def lexical_translation_term(j):
t = alignment_info.trg_sentence[j]
i = alignment_info.alignment[j]
s = alignment_info.src_sentence[i]
return ibm_model.translation_table[t][s]
def distortion_term(j):
t = alignment_info.trg_sentence[j]
i = alignment_info.alignment[j]
if i == 0:
# case 1: t is aligned to NULL
return 1.0
if alignment_info.is_head_word(j):
# case 2: t is the first word of a tablet
previous_cept = alignment_info.previous_cept(j)
src_class = None
if previous_cept is not None:
previous_s = alignment_info.src_sentence[previous_cept]
src_class = ibm_model.src_classes[previous_s]
trg_class = ibm_model.trg_classes[t]
dj = j - alignment_info.center_of_cept(previous_cept)
return ibm_model.head_distortion_table[dj][src_class][trg_class]
# case 3: t is a subsequent word of a tablet
previous_position = alignment_info.previous_in_tablet(j)
trg_class = ibm_model.trg_classes[t]
dj = j - previous_position
return ibm_model.non_head_distortion_table[dj][trg_class]
# end nested functions
# Abort computation whenever probability falls below MIN_PROB at
# any point, since MIN_PROB can be considered as zero
probability *= null_generation_term()
if probability < MIN_PROB:
return MIN_PROB
probability *= fertility_term()
if probability < MIN_PROB:
return MIN_PROB
for j in range(1, len(alignment_info.trg_sentence)):
probability *= lexical_translation_term(j)
if probability < MIN_PROB:
return MIN_PROB
probability *= distortion_term(j)
if probability < MIN_PROB:
return MIN_PROB
return probability
class Model4Counts(Counts):
"""
Data object to store counts of various parameters during training.
Includes counts for distortion.
"""
def __init__(self):
super().__init__()
self.head_distortion = defaultdict(
lambda: defaultdict(lambda: defaultdict(float))
)
self.head_distortion_for_any_dj = defaultdict(lambda: defaultdict(float))
self.non_head_distortion = defaultdict(lambda: defaultdict(float))
self.non_head_distortion_for_any_dj = defaultdict(float)
def update_distortion(self, count, alignment_info, j, src_classes, trg_classes):
i = alignment_info.alignment[j]
t = alignment_info.trg_sentence[j]
if i == 0:
# case 1: t is aligned to NULL
pass
elif alignment_info.is_head_word(j):
# case 2: t is the first word of a tablet
previous_cept = alignment_info.previous_cept(j)
if previous_cept is not None:
previous_src_word = alignment_info.src_sentence[previous_cept]
src_class = src_classes[previous_src_word]
else:
src_class = None
trg_class = trg_classes[t]
dj = j - alignment_info.center_of_cept(previous_cept)
self.head_distortion[dj][src_class][trg_class] += count
self.head_distortion_for_any_dj[src_class][trg_class] += count
else:
# case 3: t is a subsequent word of a tablet
previous_j = alignment_info.previous_in_tablet(j)
trg_class = trg_classes[t]
dj = j - previous_j
self.non_head_distortion[dj][trg_class] += count
self.non_head_distortion_for_any_dj[trg_class] += count
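An illustrative sketch (not part of the module) of the center-of-cept and displacement arithmetic described in the Terminology section above, reusing the tablet positions 2, 5, 6 from the docstring example; the head position 7 is an assumed value for illustration.

from math import ceil

previous_tablet = [2, 5, 6]  # target positions in the previous cept's tablet
center = ceil(sum(previous_tablet) / len(previous_tablet))  # ceil(13 / 3) = 5
head_position = 7  # assumed position of the next cept's head word
dj = head_position - center  # displacement dj = 2, the first index into
# head_distortion_table[dj][src_class][trg_class]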

View File

@@ -0,0 +1,661 @@
# Natural Language Toolkit: IBM Model 5
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Translation model that keeps track of vacant positions in the target
sentence to decide where to place translated words.
Translation can be viewed as a process where each word in the source
sentence is stepped through sequentially, generating translated words
for each source word. The target sentence can be viewed as being made
up of ``m`` empty slots initially, which gradually fill up as generated
words are placed in them.
Models 3 and 4 use distortion probabilities to decide how to place
translated words. For simplicity, these models ignore the history of
which slots have already been occupied with translated words.
Consider the placement of the last translated word: there is only one
empty slot left in the target sentence, so the distortion probability
should be 1.0 for that position and 0.0 everywhere else. However, the
distortion probabilities for Models 3 and 4 are set up such that all
positions are under consideration.
IBM Model 5 fixes this deficiency by accounting for occupied slots
during translation. It introduces the vacancy function v(j), the number
of vacancies up to, and including, position j in the target sentence.
Terminology
-----------
:Maximum vacancy:
The number of valid slots that a word can be placed in.
This is not necessarily the same as the number of vacant slots.
For example, if a tablet contains more than one word, the head word
cannot be placed at the last vacant slot because there will be no
space for the other words in the tablet. The number of valid slots
has to take into account the length of the tablet.
Non-head words cannot be placed before the head word, so vacancies
to the left of the head word are ignored.
:Vacancy difference:
For a head word: (v(j) - v(center of previous cept))
Can be positive or negative.
For a non-head word: (v(j) - v(position of previously placed word))
Always positive, because successive words in a tablet are assumed to
appear to the right of the previous word.
Positioning of target words falls under three cases:
1. Words generated by NULL are distributed uniformly
2. For a head word t, its position is modeled by the probability
v_head(dv | max_v,word_class_t(t))
3. For a non-head word t, its position is modeled by the probability
v_non_head(dv | max_v,word_class_t(t))
dv and max_v are defined differently for head and non-head words.
The EM algorithm used in Model 5 is:
:E step: In the training data, collect counts, weighted by prior
probabilities.
- (a) count how many times a source language word is translated
into a target language word
- (b) for a particular word class and maximum vacancy, count how
many times a head word and the previous cept's center have
a particular difference in number of vacancies
- (c) for a particular word class and maximum vacancy, count how
many times a non-head word and the previous target word
have a particular difference in number of vacancies
- (d) count how many times a source word is aligned to phi number
of target words
- (e) count how many times NULL is aligned to a target word
:M step: Estimate new probabilities based on the counts from the E step
Like Model 4, there are too many possible alignments to consider. Thus,
a hill climbing approach is used to sample good candidates. In addition,
pruning is used to weed out unlikely alignments based on Model 4 scores.
Notations
---------
:i: Position in the source sentence
Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
:j: Position in the target sentence
Valid values are 1, 2, ..., length of target sentence
:l: Number of words in the source sentence, excluding NULL
:m: Number of words in the target sentence
:s: A word in the source language
:t: A word in the target language
:phi: Fertility, the number of target words produced by a source word
:p1: Probability that a target word produced by a source word is
accompanied by another target word that is aligned to NULL
:p0: 1 - p1
:max_v: Maximum vacancy
:dv: Vacancy difference, Δv
The definition of v_head here differs from GIZA++, section 4.7 of
[Brown et al., 1993], and [Koehn, 2010]. In the latter cases, v_head is
v_head(v(j) | v(center of previous cept),max_v,word_class(t)).
Here, we follow appendix B of [Brown et al., 1993] and combine v(j) with
v(center of previous cept) to obtain dv:
v_head(v(j) - v(center of previous cept) | max_v,word_class(t)).
References
----------
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.
Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""
import warnings
from collections import defaultdict
from math import factorial
from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel4
from nltk.translate.ibm_model import Counts, longest_target_sentence_length
class IBMModel5(IBMModel):
"""
Translation model that keeps track of vacant positions in the target
sentence to decide where to place translated words
>>> bitext = []
>>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
>>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
>>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
>>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
>>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
>>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
>>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
>>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
>>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))
>>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 }
>>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 }
>>> ibm5 = IBMModel5(bitext, 5, src_classes, trg_classes)
>>> print(round(ibm5.head_vacancy_table[1][1][1], 3))
1.0
>>> print(round(ibm5.head_vacancy_table[2][1][1], 3))
0.0
>>> print(round(ibm5.non_head_vacancy_table[3][3][6], 3))
1.0
>>> print(round(ibm5.fertility_table[2]['summarize'], 3))
1.0
>>> print(round(ibm5.fertility_table[1]['book'], 3))
1.0
>>> print(round(ibm5.p1, 3))
0.033
>>> test_sentence = bitext[2]
>>> test_sentence.words
['das', 'buch', 'ist', 'ja', 'klein']
>>> test_sentence.mots
['the', 'book', 'is', 'small']
>>> test_sentence.alignment
Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])
"""
MIN_SCORE_FACTOR = 0.2
"""
    Alignments with scores below this factor of the best alignment's score
    are pruned during sampling
"""
def __init__(
self,
sentence_aligned_corpus,
iterations,
source_word_classes,
target_word_classes,
probability_tables=None,
):
"""
Train on ``sentence_aligned_corpus`` and create a lexical
translation model, vacancy models, a fertility model, and a
model for generating NULL-aligned words.
Translation direction is from ``AlignedSent.mots`` to
``AlignedSent.words``.
:param sentence_aligned_corpus: Sentence-aligned parallel corpus
:type sentence_aligned_corpus: list(AlignedSent)
:param iterations: Number of iterations to run training algorithm
:type iterations: int
:param source_word_classes: Lookup table that maps a source word
to its word class, the latter represented by an integer id
:type source_word_classes: dict[str]: int
:param target_word_classes: Lookup table that maps a target word
to its word class, the latter represented by an integer id
:type target_word_classes: dict[str]: int
:param probability_tables: Optional. Use this to pass in custom
probability values. If not specified, probabilities will be
set to a uniform distribution, or some other sensible value.
If specified, all the following entries must be present:
``translation_table``, ``alignment_table``,
``fertility_table``, ``p1``, ``head_distortion_table``,
``non_head_distortion_table``, ``head_vacancy_table``,
``non_head_vacancy_table``. See ``IBMModel``, ``IBMModel4``,
and ``IBMModel5`` for the type and purpose of these tables.
:type probability_tables: dict[str]: object
"""
super().__init__(sentence_aligned_corpus)
self.reset_probabilities()
self.src_classes = source_word_classes
self.trg_classes = target_word_classes
if probability_tables is None:
# Get probabilities from IBM model 4
ibm4 = IBMModel4(
sentence_aligned_corpus,
iterations,
source_word_classes,
target_word_classes,
)
self.translation_table = ibm4.translation_table
self.alignment_table = ibm4.alignment_table
self.fertility_table = ibm4.fertility_table
self.p1 = ibm4.p1
self.head_distortion_table = ibm4.head_distortion_table
self.non_head_distortion_table = ibm4.non_head_distortion_table
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
self.translation_table = probability_tables["translation_table"]
self.alignment_table = probability_tables["alignment_table"]
self.fertility_table = probability_tables["fertility_table"]
self.p1 = probability_tables["p1"]
self.head_distortion_table = probability_tables["head_distortion_table"]
self.non_head_distortion_table = probability_tables[
"non_head_distortion_table"
]
self.head_vacancy_table = probability_tables["head_vacancy_table"]
self.non_head_vacancy_table = probability_tables["non_head_vacancy_table"]
for n in range(0, iterations):
self.train(sentence_aligned_corpus)
def reset_probabilities(self):
super().reset_probabilities()
self.head_vacancy_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
)
"""
dict[int][int][int]: float. Probability(vacancy difference |
number of remaining valid positions,target word class).
Values accessed as ``head_vacancy_table[dv][v_max][trg_class]``.
"""
self.non_head_vacancy_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
)
"""
dict[int][int][int]: float. Probability(vacancy difference |
number of remaining valid positions,target word class).
Values accessed as ``non_head_vacancy_table[dv][v_max][trg_class]``.
"""
def set_uniform_probabilities(self, sentence_aligned_corpus):
"""
Set vacancy probabilities uniformly to
1 / cardinality of vacancy difference values
"""
max_m = longest_target_sentence_length(sentence_aligned_corpus)
# The maximum vacancy difference occurs when a word is placed in
# the last available position m of the target sentence and the
# previous word position has no vacancies.
# The minimum is 1-max_v, when a word is placed in the first
# available position and the previous word is placed beyond the
# last available position.
# Thus, the number of possible vacancy difference values is
# (max_v) - (1-max_v) + 1 = 2 * max_v.
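        # For example, max_v = 3 gives dv in {-2, -1, 0, 1, 2, 3}, i.e.
        # 2 * 3 = 6 possible values, so each is initialized to 1 / 6 below.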
if max_m > 0 and (1 / (2 * max_m)) < IBMModel.MIN_PROB:
warnings.warn(
"A target sentence is too long ("
+ str(max_m)
+ " words). Results may be less accurate."
)
for max_v in range(1, max_m + 1):
for dv in range(1, max_m + 1):
initial_prob = 1 / (2 * max_v)
self.head_vacancy_table[dv][max_v] = defaultdict(lambda: initial_prob)
self.head_vacancy_table[-(dv - 1)][max_v] = defaultdict(
lambda: initial_prob
)
self.non_head_vacancy_table[dv][max_v] = defaultdict(
lambda: initial_prob
)
self.non_head_vacancy_table[-(dv - 1)][max_v] = defaultdict(
lambda: initial_prob
)
def train(self, parallel_corpus):
counts = Model5Counts()
for aligned_sentence in parallel_corpus:
l = len(aligned_sentence.mots)
m = len(aligned_sentence.words)
# Sample the alignment space
sampled_alignments, best_alignment = self.sample(aligned_sentence)
# Record the most probable alignment
aligned_sentence.alignment = Alignment(
best_alignment.zero_indexed_alignment()
)
# E step (a): Compute normalization factors to weigh counts
total_count = self.prob_of_alignments(sampled_alignments)
# E step (b): Collect counts
for alignment_info in sampled_alignments:
count = self.prob_t_a_given_s(alignment_info)
normalized_count = count / total_count
for j in range(1, m + 1):
counts.update_lexical_translation(
normalized_count, alignment_info, j
)
slots = Slots(m)
for i in range(1, l + 1):
counts.update_vacancy(
normalized_count, alignment_info, i, self.trg_classes, slots
)
counts.update_null_generation(normalized_count, alignment_info)
counts.update_fertility(normalized_count, alignment_info)
# M step: Update probabilities with maximum likelihood estimates
# If any probability is less than MIN_PROB, clamp it to MIN_PROB
existing_alignment_table = self.alignment_table
self.reset_probabilities()
self.alignment_table = existing_alignment_table # don't retrain
self.maximize_lexical_translation_probabilities(counts)
self.maximize_vacancy_probabilities(counts)
self.maximize_fertility_probabilities(counts)
self.maximize_null_generation_probabilities(counts)
def sample(self, sentence_pair):
"""
Sample the most probable alignments from the entire alignment
space according to Model 4
Note that Model 4 scoring is used instead of Model 5 because the
latter is too expensive to compute.
First, determine the best alignment according to IBM Model 2.
With this initial alignment, use hill climbing to determine the
        best alignment according to IBM Model 4. Add this
alignment and its neighbors to the sample set. Repeat this
process with other initial alignments obtained by pegging an
alignment point. Finally, prune alignments that have
substantially lower Model 4 scores than the best alignment.
:param sentence_pair: Source and target language sentence pair
to generate a sample of alignments from
:type sentence_pair: AlignedSent
:return: A set of best alignments represented by their ``AlignmentInfo``
and the best alignment of the set for convenience
:rtype: set(AlignmentInfo), AlignmentInfo
"""
sampled_alignments, best_alignment = super().sample(sentence_pair)
return self.prune(sampled_alignments), best_alignment
def prune(self, alignment_infos):
"""
Removes alignments from ``alignment_infos`` that have
substantially lower Model 4 scores than the best alignment
:return: Pruned alignments
:rtype: set(AlignmentInfo)
"""
alignments = []
best_score = 0
for alignment_info in alignment_infos:
score = IBMModel4.model4_prob_t_a_given_s(alignment_info, self)
best_score = max(score, best_score)
alignments.append((alignment_info, score))
threshold = IBMModel5.MIN_SCORE_FACTOR * best_score
alignments = [a[0] for a in alignments if a[1] > threshold]
return set(alignments)
def hillclimb(self, alignment_info, j_pegged=None):
"""
Starting from the alignment in ``alignment_info``, look at
neighboring alignments iteratively for the best one, according
to Model 4
Note that Model 4 scoring is used instead of Model 5 because the
latter is too expensive to compute.
There is no guarantee that the best alignment in the alignment
space will be found, because the algorithm might be stuck in a
local maximum.
:param j_pegged: If specified, the search will be constrained to
alignments where ``j_pegged`` remains unchanged
:type j_pegged: int
:return: The best alignment found from hill climbing
:rtype: AlignmentInfo
"""
alignment = alignment_info # alias with shorter name
max_probability = IBMModel4.model4_prob_t_a_given_s(alignment, self)
while True:
old_alignment = alignment
for neighbor_alignment in self.neighboring(alignment, j_pegged):
neighbor_probability = IBMModel4.model4_prob_t_a_given_s(
neighbor_alignment, self
)
if neighbor_probability > max_probability:
alignment = neighbor_alignment
max_probability = neighbor_probability
if alignment == old_alignment:
# Until there are no better alignments
break
alignment.score = max_probability
return alignment
def prob_t_a_given_s(self, alignment_info):
"""
Probability of target sentence and an alignment given the
source sentence
"""
probability = 1.0
MIN_PROB = IBMModel.MIN_PROB
slots = Slots(len(alignment_info.trg_sentence) - 1)
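        # p(t, a | s) factorizes into NULL-generation, fertility, lexical
        # translation and vacancy terms; each factor is computed by one of
        # the nested helper functions below.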
def null_generation_term():
# Binomial distribution: B(m - null_fertility, p1)
value = 1.0
p1 = self.p1
p0 = 1 - p1
null_fertility = alignment_info.fertility_of_i(0)
m = len(alignment_info.trg_sentence) - 1
value *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility)
if value < MIN_PROB:
return MIN_PROB
# Combination: (m - null_fertility) choose null_fertility
for i in range(1, null_fertility + 1):
value *= (m - null_fertility - i + 1) / i
return value
def fertility_term():
value = 1.0
src_sentence = alignment_info.src_sentence
for i in range(1, len(src_sentence)):
fertility = alignment_info.fertility_of_i(i)
value *= (
factorial(fertility)
* self.fertility_table[fertility][src_sentence[i]]
)
if value < MIN_PROB:
return MIN_PROB
return value
def lexical_translation_term(j):
t = alignment_info.trg_sentence[j]
i = alignment_info.alignment[j]
s = alignment_info.src_sentence[i]
return self.translation_table[t][s]
def vacancy_term(i):
value = 1.0
tablet = alignment_info.cepts[i]
tablet_length = len(tablet)
total_vacancies = slots.vacancies_at(len(slots))
# case 1: NULL-aligned words
if tablet_length == 0:
return value
# case 2: head word
j = tablet[0]
previous_cept = alignment_info.previous_cept(j)
previous_center = alignment_info.center_of_cept(previous_cept)
dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center)
max_v = total_vacancies - tablet_length + 1
trg_class = self.trg_classes[alignment_info.trg_sentence[j]]
value *= self.head_vacancy_table[dv][max_v][trg_class]
slots.occupy(j) # mark position as occupied
total_vacancies -= 1
if value < MIN_PROB:
return MIN_PROB
# case 3: non-head words
for k in range(1, tablet_length):
previous_position = tablet[k - 1]
previous_vacancies = slots.vacancies_at(previous_position)
j = tablet[k]
dv = slots.vacancies_at(j) - previous_vacancies
max_v = total_vacancies - tablet_length + k + 1 - previous_vacancies
trg_class = self.trg_classes[alignment_info.trg_sentence[j]]
value *= self.non_head_vacancy_table[dv][max_v][trg_class]
slots.occupy(j) # mark position as occupied
total_vacancies -= 1
if value < MIN_PROB:
return MIN_PROB
return value
# end nested functions
# Abort computation whenever probability falls below MIN_PROB at
# any point, since MIN_PROB can be considered as zero
probability *= null_generation_term()
if probability < MIN_PROB:
return MIN_PROB
probability *= fertility_term()
if probability < MIN_PROB:
return MIN_PROB
for j in range(1, len(alignment_info.trg_sentence)):
probability *= lexical_translation_term(j)
if probability < MIN_PROB:
return MIN_PROB
for i in range(1, len(alignment_info.src_sentence)):
probability *= vacancy_term(i)
if probability < MIN_PROB:
return MIN_PROB
return probability
def maximize_vacancy_probabilities(self, counts):
MIN_PROB = IBMModel.MIN_PROB
head_vacancy_table = self.head_vacancy_table
for dv, max_vs in counts.head_vacancy.items():
for max_v, trg_classes in max_vs.items():
for t_cls in trg_classes:
estimate = (
counts.head_vacancy[dv][max_v][t_cls]
/ counts.head_vacancy_for_any_dv[max_v][t_cls]
)
head_vacancy_table[dv][max_v][t_cls] = max(estimate, MIN_PROB)
non_head_vacancy_table = self.non_head_vacancy_table
for dv, max_vs in counts.non_head_vacancy.items():
for max_v, trg_classes in max_vs.items():
for t_cls in trg_classes:
estimate = (
counts.non_head_vacancy[dv][max_v][t_cls]
/ counts.non_head_vacancy_for_any_dv[max_v][t_cls]
)
non_head_vacancy_table[dv][max_v][t_cls] = max(estimate, MIN_PROB)
class Model5Counts(Counts):
"""
Data object to store counts of various parameters during training.
Includes counts for vacancies.
"""
def __init__(self):
super().__init__()
self.head_vacancy = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
self.head_vacancy_for_any_dv = defaultdict(lambda: defaultdict(float))
self.non_head_vacancy = defaultdict(
lambda: defaultdict(lambda: defaultdict(float))
)
self.non_head_vacancy_for_any_dv = defaultdict(lambda: defaultdict(float))
def update_vacancy(self, count, alignment_info, i, trg_classes, slots):
"""
:param count: Value to add to the vacancy counts
:param alignment_info: Alignment under consideration
:param i: Source word position under consideration
:param trg_classes: Target word classes
:param slots: Vacancy states of the slots in the target sentence.
Output parameter that will be modified as new words are placed
in the target sentence.
"""
tablet = alignment_info.cepts[i]
tablet_length = len(tablet)
total_vacancies = slots.vacancies_at(len(slots))
# case 1: NULL aligned words
if tablet_length == 0:
return # ignore zero fertility words
# case 2: head word
j = tablet[0]
previous_cept = alignment_info.previous_cept(j)
previous_center = alignment_info.center_of_cept(previous_cept)
dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center)
max_v = total_vacancies - tablet_length + 1
trg_class = trg_classes[alignment_info.trg_sentence[j]]
self.head_vacancy[dv][max_v][trg_class] += count
self.head_vacancy_for_any_dv[max_v][trg_class] += count
slots.occupy(j) # mark position as occupied
total_vacancies -= 1
# case 3: non-head words
for k in range(1, tablet_length):
previous_position = tablet[k - 1]
previous_vacancies = slots.vacancies_at(previous_position)
j = tablet[k]
dv = slots.vacancies_at(j) - previous_vacancies
max_v = total_vacancies - tablet_length + k + 1 - previous_vacancies
trg_class = trg_classes[alignment_info.trg_sentence[j]]
self.non_head_vacancy[dv][max_v][trg_class] += count
self.non_head_vacancy_for_any_dv[max_v][trg_class] += count
slots.occupy(j) # mark position as occupied
total_vacancies -= 1
class Slots:
"""
Represents positions in a target sentence. Used to keep track of
which slot (position) is occupied.
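
    A small usage illustration:

    >>> slots = Slots(3)
    >>> slots.occupy(2)
    >>> slots.vacancies_at(3)
    2
    >>> len(slots)
    3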
"""
def __init__(self, target_sentence_length):
self._slots = [False] * (target_sentence_length + 1) # 1-indexed
def occupy(self, position):
"""
        Mark the slot at ``position`` as occupied.
"""
self._slots[position] = True
def vacancies_at(self, position):
"""
:return: Number of vacant slots up to, and including, ``position``
"""
vacancies = 0
for k in range(1, position + 1):
if not self._slots[k]:
vacancies += 1
return vacancies
def __len__(self):
return len(self._slots) - 1 # exclude dummy zeroeth element

View File

@@ -0,0 +1,549 @@
# Natural Language Toolkit: IBM Model Core
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Common methods and classes for all IBM models. See ``IBMModel1``,
``IBMModel2``, ``IBMModel3``, ``IBMModel4``, and ``IBMModel5``
for specific implementations.
The IBM models are a series of generative models that learn lexical
translation probabilities, p(target language word|source language word),
given a sentence-aligned parallel corpus.
The models increase in sophistication from model 1 to 5. Typically, the
output of lower models is used to seed the higher models. All models
use the Expectation-Maximization (EM) algorithm to learn various
probability tables.
Words in a sentence are one-indexed. The first word of a sentence has
position 1, not 0. Index 0 is reserved in the source sentence for the
NULL token. The concept of position does not apply to NULL, but it is
indexed at 0 by convention.
Each target word is aligned to exactly one source word or the NULL
token.
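For example, in the ``AlignmentInfo`` objects used by these models,
``alignment[j]`` holds the source position aligned to target position j,
and a value of 0 means that target word j is aligned to NULL.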
References:
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.
Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""
from bisect import insort_left
from collections import defaultdict
from copy import deepcopy
from math import ceil
def longest_target_sentence_length(sentence_aligned_corpus):
"""
:param sentence_aligned_corpus: Parallel corpus under consideration
:type sentence_aligned_corpus: list(AlignedSent)
:return: Number of words in the longest target language sentence
of ``sentence_aligned_corpus``
"""
max_m = 0
for aligned_sentence in sentence_aligned_corpus:
m = len(aligned_sentence.words)
max_m = max(m, max_m)
return max_m
class IBMModel:
"""
Abstract base class for all IBM models
"""
# Avoid division by zero and precision errors by imposing a minimum
# value for probabilities. Note that this approach is theoretically
# incorrect, since it may create probabilities that sum to more
# than 1. In practice, the contribution of probabilities with MIN_PROB
# is tiny enough that the value of MIN_PROB can be treated as zero.
MIN_PROB = 1.0e-12 # GIZA++ is more liberal and uses 1.0e-7
def __init__(self, sentence_aligned_corpus):
self.init_vocab(sentence_aligned_corpus)
self.reset_probabilities()
def reset_probabilities(self):
self.translation_table = defaultdict(
lambda: defaultdict(lambda: IBMModel.MIN_PROB)
)
"""
dict[str][str]: float. Probability(target word | source word).
Values accessed as ``translation_table[target_word][source_word]``.
"""
self.alignment_table = defaultdict(
lambda: defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: IBMModel.MIN_PROB))
)
)
"""
dict[int][int][int][int]: float. Probability(i | j,l,m).
Values accessed as ``alignment_table[i][j][l][m]``.
Used in model 2 and hill climbing in models 3 and above
"""
self.fertility_table = defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
"""
dict[int][str]: float. Probability(fertility | source word).
Values accessed as ``fertility_table[fertility][source_word]``.
Used in model 3 and higher.
"""
self.p1 = 0.5
"""
Probability that a generated word requires another target word
that is aligned to NULL.
Used in model 3 and higher.
"""
def set_uniform_probabilities(self, sentence_aligned_corpus):
"""
Initialize probability tables to a uniform distribution
Derived classes should implement this accordingly.
"""
pass
def init_vocab(self, sentence_aligned_corpus):
src_vocab = set()
trg_vocab = set()
for aligned_sentence in sentence_aligned_corpus:
trg_vocab.update(aligned_sentence.words)
src_vocab.update(aligned_sentence.mots)
# Add the NULL token
src_vocab.add(None)
self.src_vocab = src_vocab
"""
set(str): All source language words used in training
"""
self.trg_vocab = trg_vocab
"""
set(str): All target language words used in training
"""
def sample(self, sentence_pair):
"""
Sample the most probable alignments from the entire alignment
space
First, determine the best alignment according to IBM Model 2.
With this initial alignment, use hill climbing to determine the
best alignment according to a higher IBM Model. Add this
alignment and its neighbors to the sample set. Repeat this
process with other initial alignments obtained by pegging an
alignment point.
        Hill climbing may get stuck in a local maximum, hence the pegging
and trying out of different alignments.
:param sentence_pair: Source and target language sentence pair
to generate a sample of alignments from
:type sentence_pair: AlignedSent
:return: A set of best alignments represented by their ``AlignmentInfo``
and the best alignment of the set for convenience
:rtype: set(AlignmentInfo), AlignmentInfo
"""
sampled_alignments = set()
l = len(sentence_pair.mots)
m = len(sentence_pair.words)
# Start from the best model 2 alignment
initial_alignment = self.best_model2_alignment(sentence_pair)
potential_alignment = self.hillclimb(initial_alignment)
sampled_alignments.update(self.neighboring(potential_alignment))
best_alignment = potential_alignment
# Start from other model 2 alignments,
# with the constraint that j is aligned (pegged) to i
for j in range(1, m + 1):
for i in range(0, l + 1):
initial_alignment = self.best_model2_alignment(sentence_pair, j, i)
potential_alignment = self.hillclimb(initial_alignment, j)
neighbors = self.neighboring(potential_alignment, j)
sampled_alignments.update(neighbors)
if potential_alignment.score > best_alignment.score:
best_alignment = potential_alignment
return sampled_alignments, best_alignment
def best_model2_alignment(self, sentence_pair, j_pegged=None, i_pegged=0):
"""
Finds the best alignment according to IBM Model 2
Used as a starting point for hill climbing in Models 3 and
above, because it is easier to compute than the best alignments
in higher models
:param sentence_pair: Source and target language sentence pair
to be word-aligned
:type sentence_pair: AlignedSent
:param j_pegged: If specified, the alignment point of j_pegged
will be fixed to i_pegged
:type j_pegged: int
:param i_pegged: Alignment point to j_pegged
:type i_pegged: int
"""
src_sentence = [None] + sentence_pair.mots
trg_sentence = ["UNUSED"] + sentence_pair.words # 1-indexed
l = len(src_sentence) - 1 # exclude NULL
m = len(trg_sentence) - 1
alignment = [0] * (m + 1) # init all alignments to NULL
cepts = [[] for i in range(l + 1)] # init all cepts to empty list
for j in range(1, m + 1):
if j == j_pegged:
# use the pegged alignment instead of searching for best one
best_i = i_pegged
else:
best_i = 0
max_alignment_prob = IBMModel.MIN_PROB
t = trg_sentence[j]
for i in range(0, l + 1):
s = src_sentence[i]
alignment_prob = (
self.translation_table[t][s] * self.alignment_table[i][j][l][m]
)
if alignment_prob >= max_alignment_prob:
max_alignment_prob = alignment_prob
best_i = i
alignment[j] = best_i
cepts[best_i].append(j)
return AlignmentInfo(
tuple(alignment), tuple(src_sentence), tuple(trg_sentence), cepts
)
def hillclimb(self, alignment_info, j_pegged=None):
"""
Starting from the alignment in ``alignment_info``, look at
neighboring alignments iteratively for the best one
There is no guarantee that the best alignment in the alignment
space will be found, because the algorithm might be stuck in a
local maximum.
:param j_pegged: If specified, the search will be constrained to
alignments where ``j_pegged`` remains unchanged
:type j_pegged: int
:return: The best alignment found from hill climbing
:rtype: AlignmentInfo
"""
alignment = alignment_info # alias with shorter name
max_probability = self.prob_t_a_given_s(alignment)
while True:
old_alignment = alignment
for neighbor_alignment in self.neighboring(alignment, j_pegged):
neighbor_probability = self.prob_t_a_given_s(neighbor_alignment)
if neighbor_probability > max_probability:
alignment = neighbor_alignment
max_probability = neighbor_probability
if alignment == old_alignment:
# Until there are no better alignments
break
alignment.score = max_probability
return alignment
def neighboring(self, alignment_info, j_pegged=None):
"""
Determine the neighbors of ``alignment_info``, obtained by
moving or swapping one alignment point
:param j_pegged: If specified, neighbors that have a different
alignment point from j_pegged will not be considered
:type j_pegged: int
        :return: A set of neighboring alignments represented by their
``AlignmentInfo``
:rtype: set(AlignmentInfo)
"""
neighbors = set()
l = len(alignment_info.src_sentence) - 1 # exclude NULL
m = len(alignment_info.trg_sentence) - 1
original_alignment = alignment_info.alignment
original_cepts = alignment_info.cepts
for j in range(1, m + 1):
if j != j_pegged:
# Add alignments that differ by one alignment point
for i in range(0, l + 1):
new_alignment = list(original_alignment)
new_cepts = deepcopy(original_cepts)
old_i = original_alignment[j]
# update alignment
new_alignment[j] = i
# update cepts
insort_left(new_cepts[i], j)
new_cepts[old_i].remove(j)
new_alignment_info = AlignmentInfo(
tuple(new_alignment),
alignment_info.src_sentence,
alignment_info.trg_sentence,
new_cepts,
)
neighbors.add(new_alignment_info)
for j in range(1, m + 1):
if j != j_pegged:
# Add alignments that have two alignment points swapped
for other_j in range(1, m + 1):
if other_j != j_pegged and other_j != j:
new_alignment = list(original_alignment)
new_cepts = deepcopy(original_cepts)
other_i = original_alignment[other_j]
i = original_alignment[j]
# update alignments
new_alignment[j] = other_i
new_alignment[other_j] = i
# update cepts
new_cepts[other_i].remove(other_j)
insort_left(new_cepts[other_i], j)
new_cepts[i].remove(j)
insort_left(new_cepts[i], other_j)
new_alignment_info = AlignmentInfo(
tuple(new_alignment),
alignment_info.src_sentence,
alignment_info.trg_sentence,
new_cepts,
)
neighbors.add(new_alignment_info)
return neighbors
def maximize_lexical_translation_probabilities(self, counts):
for t, src_words in counts.t_given_s.items():
for s in src_words:
estimate = counts.t_given_s[t][s] / counts.any_t_given_s[s]
self.translation_table[t][s] = max(estimate, IBMModel.MIN_PROB)
def maximize_fertility_probabilities(self, counts):
for phi, src_words in counts.fertility.items():
for s in src_words:
estimate = counts.fertility[phi][s] / counts.fertility_for_any_phi[s]
self.fertility_table[phi][s] = max(estimate, IBMModel.MIN_PROB)
def maximize_null_generation_probabilities(self, counts):
p1_estimate = counts.p1 / (counts.p1 + counts.p0)
p1_estimate = max(p1_estimate, IBMModel.MIN_PROB)
# Clip p1 if it is too large, because p0 = 1 - p1 should not be
# smaller than MIN_PROB
self.p1 = min(p1_estimate, 1 - IBMModel.MIN_PROB)
def prob_of_alignments(self, alignments):
probability = 0
for alignment_info in alignments:
probability += self.prob_t_a_given_s(alignment_info)
return probability
def prob_t_a_given_s(self, alignment_info):
"""
Probability of target sentence and an alignment given the
source sentence
All required information is assumed to be in ``alignment_info``
and self.
Derived classes should override this method
"""
return 0.0
class AlignmentInfo:
"""
Helper data object for training IBM Models 3 and up
Read-only. For a source sentence and its counterpart in the target
language, this class holds information about the sentence pair's
alignment, cepts, and fertility.
Warning: Alignments are one-indexed here, in contrast to
nltk.translate.Alignment and AlignedSent, which are zero-indexed
This class is not meant to be used outside of IBM models.
"""
def __init__(self, alignment, src_sentence, trg_sentence, cepts):
if not isinstance(alignment, tuple):
raise TypeError(
"The alignment must be a tuple because it is used "
"to uniquely identify AlignmentInfo objects."
)
self.alignment = alignment
"""
tuple(int): Alignment function. ``alignment[j]`` is the position
in the source sentence that is aligned to the position j in the
target sentence.
"""
self.src_sentence = src_sentence
"""
tuple(str): Source sentence referred to by this object.
Should include NULL token (None) in index 0.
"""
self.trg_sentence = trg_sentence
"""
tuple(str): Target sentence referred to by this object.
Should have a dummy element in index 0 so that the first word
starts from index 1.
"""
self.cepts = cepts
"""
list(list(int)): The positions of the target words, in
ascending order, aligned to a source word position. For example,
cepts[4] = (2, 3, 7) means that words in positions 2, 3 and 7
of the target sentence are aligned to the word in position 4 of
the source sentence
"""
self.score = None
"""
float: Optional. Probability of alignment, as defined by the
IBM model that assesses this alignment
"""
def fertility_of_i(self, i):
"""
Fertility of word in position ``i`` of the source sentence
"""
return len(self.cepts[i])
def is_head_word(self, j):
"""
:return: Whether the word in position ``j`` of the target
sentence is a head word
"""
i = self.alignment[j]
return self.cepts[i][0] == j
def center_of_cept(self, i):
"""
:return: The ceiling of the average positions of the words in
the tablet of cept ``i``, or 0 if ``i`` is None
"""
if i is None:
return 0
average_position = sum(self.cepts[i]) / len(self.cepts[i])
return int(ceil(average_position))
def previous_cept(self, j):
"""
:return: The previous cept of ``j``, or None if ``j`` belongs to
the first cept
"""
i = self.alignment[j]
if i == 0:
raise ValueError(
"Words aligned to NULL cannot have a previous "
"cept because NULL has no position"
)
previous_cept = i - 1
while previous_cept > 0 and self.fertility_of_i(previous_cept) == 0:
previous_cept -= 1
if previous_cept <= 0:
previous_cept = None
return previous_cept
def previous_in_tablet(self, j):
"""
:return: The position of the previous word that is in the same
tablet as ``j``, or None if ``j`` is the first word of the
tablet
"""
i = self.alignment[j]
tablet_position = self.cepts[i].index(j)
if tablet_position == 0:
return None
return self.cepts[i][tablet_position - 1]
def zero_indexed_alignment(self):
"""
:return: Zero-indexed alignment, suitable for use in external
``nltk.translate`` modules like ``nltk.translate.Alignment``
:rtype: list(tuple)
"""
zero_indexed_alignment = []
for j in range(1, len(self.trg_sentence)):
i = self.alignment[j] - 1
if i < 0:
i = None # alignment to NULL token
zero_indexed_alignment.append((j - 1, i))
return zero_indexed_alignment
def __eq__(self, other):
return self.alignment == other.alignment
def __ne__(self, other):
return not self == other
def __hash__(self):
return hash(self.alignment)
class Counts:
"""
Data object to store counts of various parameters during training
"""
def __init__(self):
self.t_given_s = defaultdict(lambda: defaultdict(float))
self.any_t_given_s = defaultdict(float)
self.p0 = 0.0
self.p1 = 0.0
self.fertility = defaultdict(lambda: defaultdict(float))
self.fertility_for_any_phi = defaultdict(float)
def update_lexical_translation(self, count, alignment_info, j):
i = alignment_info.alignment[j]
t = alignment_info.trg_sentence[j]
s = alignment_info.src_sentence[i]
self.t_given_s[t][s] += count
self.any_t_given_s[s] += count
def update_null_generation(self, count, alignment_info):
m = len(alignment_info.trg_sentence) - 1
fertility_of_null = alignment_info.fertility_of_i(0)
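        # Of the m target words, fertility_of_null are generated by NULL;
        # each of them is assumed to accompany one word generated by a real
        # cept, so the remaining m - 2 * fertility_of_null words count
        # towards p0.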
self.p1 += fertility_of_null * count
self.p0 += (m - 2 * fertility_of_null) * count
def update_fertility(self, count, alignment_info):
for i in range(0, len(alignment_info.src_sentence)):
s = alignment_info.src_sentence[i]
phi = alignment_info.fertility_of_i(i)
self.fertility[phi][s] += count
self.fertility_for_any_phi[s] += count

View File

@@ -0,0 +1,332 @@
# Natural Language Toolkit: LEPOR Score
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Ikram Ul Haq (ulhaqi12)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""LEPOR score implementation."""
import math
import re
import sys
from typing import Callable, List, Tuple
import nltk
def length_penalty(reference: List[str], hypothesis: List[str]) -> float:
"""
    This function calculates the length penalty (LP) for the LEPOR metric,
    which is defined to penalize both longer and shorter hypotheses compared
    with the reference translations.
    See Eq. (2) in https://aclanthology.org/C12-2044
    :param reference: Tokenized reference sentence
    :type reference: List[str]
    :param hypothesis: Tokenized hypothesis sentence
    :type hypothesis: List[str]
    :return: Penalty for the difference in length between the reference and hypothesis sentences.
    :rtype: float
"""
ref_len = len(reference)
hyp_len = len(hypothesis)
if ref_len == hyp_len:
return 1
elif ref_len < hyp_len:
return math.exp(1 - (ref_len / hyp_len))
    else:  # i.e. ref_len > hyp_len
return math.exp(1 - (hyp_len / ref_len))
def alignment(ref_tokens: List[str], hyp_tokens: List[str]):
"""
    This function computes the context-dependent n-gram word alignment task that
    takes into account the surrounding context (neighbouring words) of a potential
    word to select a better matching pair between the output and the reference.
    This alignment task is used to compute the n-gram positional difference penalty
    component of the LEPOR score. Generally, the function finds the matching tokens
    between the reference and hypothesis, then finds the indices of the longest
    matching n-grams by checking the left and right unigram window of the matching tokens.
:param ref_tokens: A list of tokens in reference sentence.
:type ref_tokens: List[str]
:param hyp_tokens: A list of tokens in hypothesis sentence.
:type hyp_tokens: List[str]
"""
alignments = []
# Store the reference and hypothesis tokens length.
hyp_len = len(hyp_tokens)
ref_len = len(ref_tokens)
for hyp_index, hyp_token in enumerate(hyp_tokens):
# If no match.
if ref_tokens.count(hyp_token) == 0:
alignments.append(-1)
# If only one match.
elif ref_tokens.count(hyp_token) == 1:
alignments.append(ref_tokens.index(hyp_token))
# Otherwise, compute the multiple possibilities.
else:
# Keeps an index of where the hypothesis token matches the reference.
ref_indexes = [
i for i, ref_token in enumerate(ref_tokens) if ref_token == hyp_token
]
# Iterate through the matched tokens, and check if
# the one token to the left/right also matches.
            is_matched = [False] * len(ref_indexes)
for ind, ref_index in enumerate(ref_indexes):
# The one to the left token also matches.
if (
0 < ref_index - 1 < ref_len
and 0 < hyp_index - 1 < hyp_len
and ref_tokens[ref_index - 1] == hyp_tokens[hyp_index - 1]
):
is_matched[ind] = True
# The one to the right token also matches.
elif (
0 < ref_index + 1 < ref_len
and 0 < hyp_index + 1 < hyp_len
and ref_tokens[ref_index + 1] == hyp_tokens[hyp_index + 1]
):
is_matched[ind] = True
# If the left and right tokens don't match.
else:
is_matched[ind] = False
# Stores the alignments that have matching phrases.
# If there's only a single matched alignment.
if is_matched.count(True) == 1:
alignments.append(ref_indexes[is_matched.index(True)])
# If there's multiple matched alignments that have matching
# tokens in the left/right window, we shift the index of the
# alignment to the right most matching token.
elif is_matched.count(True) > 1:
min_distance = 0
min_index = 0
for match, ref_index in zip(is_matched, ref_indexes):
if match:
distance = abs(hyp_index - ref_index)
if distance > min_distance:
min_distance = distance
min_index = ref_index
alignments.append(min_index)
# If there's no matched alignments,
# we still keep indexes of the matching tokens
# without explicitly checking for the left/right window.
else:
min_distance = 0
min_index = 0
for ref_index in ref_indexes:
distance = abs(hyp_index - ref_index)
if distance > min_distance:
min_distance = distance
min_index = ref_index
alignments.append(min_index)
# The alignments are one indexed to keep track of the ending slice pointer of the matching ngrams.
alignments = [a + 1 for a in alignments if a != -1]
return alignments
def ngram_positional_penalty(
ref_tokens: List[str], hyp_tokens: List[str]
) -> Tuple[float, float]:
"""
This function calculates the n-gram position difference penalty (NPosPenal) described in the LEPOR paper.
    The NPosPenal is the exponential of the negative length-normalized sum of
    position differences between the matched words of the reference and the hypothesis.
:param ref_tokens: A list of words in reference sentence.
:type ref_tokens: List[str]
:param hyp_tokens: A list of words in hypothesis sentence.
:type hyp_tokens: List[str]
:return: A tuple containing two elements:
- NPosPenal: N-gram positional penalty.
- match_count: Count of matched n-grams.
:rtype: tuple
"""
alignments = alignment(ref_tokens, hyp_tokens)
match_count = len(alignments)
# Stores the n-gram position values (difference values) of aligned words
# between output and reference sentences,
# aka |PD| of eq (4) in https://aclanthology.org/C12-2044
pd = []
for i, a in enumerate(alignments):
pd.append(abs((i + 1) / len(hyp_tokens) - a / len(ref_tokens)))
npd = sum(pd) / len(hyp_tokens)
return math.exp(-npd), match_count
def harmonic(
match_count: int,
reference_length: int,
hypothesis_length: int,
alpha: float,
beta: float,
) -> float:
"""
    This function calculates the precision and recall of the matched words and
    combines them into a final score, weighted by the alpha and beta parameters.
:param match_count: Number of words in hypothesis aligned with reference.
:type match_count: int
:param reference_length: Length of the reference sentence
:type reference_length: int
:param hypothesis_length: Length of the hypothesis sentence
:type hypothesis_length: int
    :param alpha: A parameter to set the weight for recall.
    :type alpha: float
    :param beta: A parameter to set the weight for precision.
:type beta: float
:return: Harmonic mean.
:rtype: float
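
    For example, with 2 matched words, a 4-word reference, a 4-word hypothesis
    and alpha = beta = 1, precision and recall are both 0.5:

    >>> round(harmonic(2, 4, 4, 1.0, 1.0), 3)
    0.5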
"""
epsilon = sys.float_info.epsilon
precision = match_count / hypothesis_length
recall = match_count / reference_length
harmonic_score = (alpha + beta) / (
(alpha / (recall + epsilon)) + (beta / (precision + epsilon))
)
return harmonic_score
def sentence_lepor(
references: List[str],
hypothesis: str,
alpha: float = 1.0,
beta: float = 1.0,
tokenizer: Callable[[str], List[str]] = None,
) -> List[float]:
"""
    Calculate the LEPOR score for a sentence, from Han, A. L.-F. (2017).
LEPOR: An Augmented Machine Translation Evaluation Metric. https://arxiv.org/abs/1703.08748v2
>>> hypothesis = 'a bird is on a stone.'
>>> reference1 = 'a bird behind the stone.'
>>> reference2 = 'a bird is on the rock.'
>>> sentence_lepor([reference1, reference2], hypothesis)
[0.7824248013113159, 0.7739937377760259]
:param references: Reference sentences
:type references: list(str)
:param hypothesis: Hypothesis sentence
:type hypothesis: str
    :param alpha: A parameter to set the weight for recall.
    :type alpha: float
    :param beta: A parameter to set the weight for precision.
    :type beta: float
    :param tokenizer: A callable tokenizer that accepts a string and returns a list of tokens.
    :type tokenizer: Callable[[str], List[str]]
:return: The list of Lepor scores for a hypothesis with all references.
:rtype: list(float)
"""
lepor_scores = list()
# Tokenize sentences.
if tokenizer:
hypothesis = tokenizer(hypothesis)
for index, reference in enumerate(references):
references[index] = tokenizer(reference)
else: # If tokenizer is not provided, use the one in NLTK.
hypothesis = nltk.word_tokenize(hypothesis)
for index, reference in enumerate(references):
references[index] = nltk.word_tokenize(reference)
for reference in references:
if len(reference) == 0 or len(hypothesis) == 0:
raise ValueError("One of the sentence is empty. Exit.")
# Calculate the length penalty due to the difference in the length of reference and hypothesis.
lp = length_penalty(reference, hypothesis)
# Calculate the penalty on different positions of same word in translation.
npd, match_count = ngram_positional_penalty(reference, hypothesis)
harmonic_score = harmonic(
match_count, len(reference), len(hypothesis), alpha, beta
)
lepor_scores.append(lp * npd * harmonic_score)
return lepor_scores
def corpus_lepor(
references: List[List[str]],
hypothesis: List[str],
alpha: float = 1.0,
beta: float = 1.0,
tokenizer: Callable[[str], List[str]] = None,
) -> List[List[float]]:
"""
    Calculate the LEPOR score for a list of sentences, from Han, A. L.-F. (2017).
LEPOR: An Augmented Machine Translation Evaluation Metric. https://arxiv.org/abs/1703.08748v2
>>> hypothesis = ['a bird is on a stone.', 'scary crow was not bad.']
>>> references = [['a bird behind the stone.', 'a bird is on the rock'],
... ['scary cow was good.', 'scary crow was elegant.']]
>>> corpus_lepor(references, hypothesis)
[[0.7824248013113159, 0.7931427828105261], [0.5639427891892225, 0.7860963170056643]]
:param references: Reference sentences
:type references: list(list(str))
:param hypothesis: Hypothesis sentences
:type hypothesis: list(str)
    :param alpha: A parameter to set the weight for recall.
    :type alpha: float
    :param beta: A parameter to set the weight for precision.
    :type beta: float
    :param tokenizer: A callable tokenizer that accepts a string and returns a list of tokens.
    :type tokenizer: Callable[[str], List[str]]
:return: The Lepor score. Returns a list for all sentences
:rtype: list(list(float))
"""
if len(references) == 0 or len(hypothesis) == 0:
raise ValueError("There is an Empty list. Exit.")
assert len(references) == len(hypothesis), (
"The number of hypothesis and their reference(s) should be the " "same "
)
lepor_scores = list()
for reference_sen, hypothesis_sen in zip(references, hypothesis):
# Calculate Lepor for each sentence separately and append in a list.
lepor_scores.append(
sentence_lepor(reference_sen, hypothesis_sen, alpha, beta, tokenizer)
)
return lepor_scores

View File

@@ -0,0 +1,409 @@
# Natural Language Toolkit: Machine Translation
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Uday Krishna <udaykrishna5@gmail.com>
# Contributor: Tom Aarsen
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from itertools import chain, product
from typing import Callable, Iterable, List, Tuple
from nltk.corpus import WordNetCorpusReader, wordnet
from nltk.stem.api import StemmerI
from nltk.stem.porter import PorterStemmer
def _generate_enums(
hypothesis: Iterable[str],
reference: Iterable[str],
preprocess: Callable[[str], str] = str.lower,
) -> Tuple[List[Tuple[int, str]], List[Tuple[int, str]]]:
"""
Takes in pre-tokenized inputs for hypothesis and reference and returns
enumerated word lists for each of them
:param hypothesis: pre-tokenized hypothesis
:param reference: pre-tokenized reference
    :param preprocess: preprocessing method (default str.lower)
:return: enumerated words list
"""
if isinstance(hypothesis, str):
raise TypeError(
f'"hypothesis" expects pre-tokenized hypothesis (Iterable[str]): {hypothesis}'
)
if isinstance(reference, str):
raise TypeError(
f'"reference" expects pre-tokenized reference (Iterable[str]): {reference}'
)
enum_hypothesis_list = list(enumerate(map(preprocess, hypothesis)))
enum_reference_list = list(enumerate(map(preprocess, reference)))
return enum_hypothesis_list, enum_reference_list
def exact_match(
hypothesis: Iterable[str], reference: Iterable[str]
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
"""
matches exact words in hypothesis and reference
and returns a word mapping based on the enumerated
word id between hypothesis and reference
:param hypothesis: pre-tokenized hypothesis
:param reference: pre-tokenized reference
:return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
enumerated unmatched reference tuples
"""
enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
return _match_enums(enum_hypothesis_list, enum_reference_list)
def _match_enums(
enum_hypothesis_list: List[Tuple[int, str]],
enum_reference_list: List[Tuple[int, str]],
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
"""
matches exact words in hypothesis and reference and returns
a word mapping between enum_hypothesis_list and enum_reference_list
based on the enumerated word id.
:param enum_hypothesis_list: enumerated hypothesis list
:param enum_reference_list: enumerated reference list
:return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
enumerated unmatched reference tuples
"""
word_match = []
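    # Iterate in reverse so that popping matched items below does not shift
    # the indices of the items that have not been visited yet.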
for i in range(len(enum_hypothesis_list))[::-1]:
for j in range(len(enum_reference_list))[::-1]:
if enum_hypothesis_list[i][1] == enum_reference_list[j][1]:
word_match.append(
(enum_hypothesis_list[i][0], enum_reference_list[j][0])
)
enum_hypothesis_list.pop(i)
enum_reference_list.pop(j)
break
return word_match, enum_hypothesis_list, enum_reference_list
def _enum_stem_match(
enum_hypothesis_list: List[Tuple[int, str]],
enum_reference_list: List[Tuple[int, str]],
stemmer: StemmerI = PorterStemmer(),
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
"""
Stems each word and matches them in hypothesis and reference
and returns a word mapping between enum_hypothesis_list and
enum_reference_list based on the enumerated word id. The function also
    returns an enumerated list of unmatched words for hypothesis and reference.
:param enum_hypothesis_list: enumerated hypothesis list
:param enum_reference_list: enumerated reference list
:param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
:return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
enumerated unmatched reference tuples
"""
stemmed_enum_hypothesis_list = [
(word_pair[0], stemmer.stem(word_pair[1])) for word_pair in enum_hypothesis_list
]
stemmed_enum_reference_list = [
(word_pair[0], stemmer.stem(word_pair[1])) for word_pair in enum_reference_list
]
return _match_enums(stemmed_enum_hypothesis_list, stemmed_enum_reference_list)
def stem_match(
hypothesis: Iterable[str],
reference: Iterable[str],
stemmer: StemmerI = PorterStemmer(),
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
"""
Stems each word and matches them in hypothesis and reference
and returns a word mapping between hypothesis and reference
:param hypothesis: pre-tokenized hypothesis
:param reference: pre-tokenized reference
:param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
:return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
enumerated unmatched reference tuples
"""
enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
return _enum_stem_match(enum_hypothesis_list, enum_reference_list, stemmer=stemmer)
def _enum_wordnetsyn_match(
enum_hypothesis_list: List[Tuple[int, str]],
enum_reference_list: List[Tuple[int, str]],
wordnet: WordNetCorpusReader = wordnet,
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
"""
Matches each word in reference to a word in hypothesis
if any synonym of a hypothesis word is the exact match
to the reference word.
:param enum_hypothesis_list: enumerated hypothesis list
:param enum_reference_list: enumerated reference list
:param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
"""
word_match = []
for i in range(len(enum_hypothesis_list))[::-1]:
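        # Collect every single-word lemma name (names containing "_" are
        # multiword lemmas and are skipped) from all synsets of the hypothesis
        # word, plus the hypothesis word itself.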
hypothesis_syns = set(
chain.from_iterable(
(
lemma.name()
for lemma in synset.lemmas()
if lemma.name().find("_") < 0
)
for synset in wordnet.synsets(enum_hypothesis_list[i][1])
)
).union({enum_hypothesis_list[i][1]})
for j in range(len(enum_reference_list))[::-1]:
if enum_reference_list[j][1] in hypothesis_syns:
word_match.append(
(enum_hypothesis_list[i][0], enum_reference_list[j][0])
)
enum_hypothesis_list.pop(i)
enum_reference_list.pop(j)
break
return word_match, enum_hypothesis_list, enum_reference_list
def wordnetsyn_match(
hypothesis: Iterable[str],
reference: Iterable[str],
wordnet: WordNetCorpusReader = wordnet,
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
"""
Matches each word in reference to a word in hypothesis if any synonym
of a hypothesis word is the exact match to the reference word.
:param hypothesis: pre-tokenized hypothesis
:param reference: pre-tokenized reference
:param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
:return: list of mapped tuples
"""
enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
return _enum_wordnetsyn_match(
enum_hypothesis_list, enum_reference_list, wordnet=wordnet
)
def _enum_align_words(
enum_hypothesis_list: List[Tuple[int, str]],
enum_reference_list: List[Tuple[int, str]],
stemmer: StemmerI = PorterStemmer(),
wordnet: WordNetCorpusReader = wordnet,
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
"""
    Aligns/matches words in the hypothesis to the reference by sequentially
    applying exact match, stemmed match and wordnet-based synonym match.
    In case there are multiple matches, the match with the least number
    of crossings is chosen. Takes enumerated lists as input instead of
    string input.
:param enum_hypothesis_list: enumerated hypothesis list
:param enum_reference_list: enumerated reference list
:param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
:param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
:return: sorted list of matched tuples, unmatched hypothesis list,
unmatched reference list
"""
exact_matches, enum_hypothesis_list, enum_reference_list = _match_enums(
enum_hypothesis_list, enum_reference_list
)
stem_matches, enum_hypothesis_list, enum_reference_list = _enum_stem_match(
enum_hypothesis_list, enum_reference_list, stemmer=stemmer
)
wns_matches, enum_hypothesis_list, enum_reference_list = _enum_wordnetsyn_match(
enum_hypothesis_list, enum_reference_list, wordnet=wordnet
)
return (
sorted(
exact_matches + stem_matches + wns_matches, key=lambda wordpair: wordpair[0]
),
enum_hypothesis_list,
enum_reference_list,
)
def align_words(
hypothesis: Iterable[str],
reference: Iterable[str],
stemmer: StemmerI = PorterStemmer(),
wordnet: WordNetCorpusReader = wordnet,
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
"""
Aligns/matches words in the hypothesis to reference by sequentially
applying exact match, stemmed match and wordnet based synonym match.
    In case there are multiple matches, the match with the least number
    of crossings is chosen.
:param hypothesis: pre-tokenized hypothesis
:param reference: pre-tokenized reference
:param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
:param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
:return: sorted list of matched tuples, unmatched hypothesis list, unmatched reference list
"""
enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
return _enum_align_words(
enum_hypothesis_list, enum_reference_list, stemmer=stemmer, wordnet=wordnet
)
def _count_chunks(matches: List[Tuple[int, int]]) -> int:
"""
Counts the fewest possible number of chunks such that matched unigrams
of each chunk are adjacent to each other. This is used to calculate the
fragmentation part of the metric.
:param matches: list containing a mapping of matched words (output of align_words)
:return: Number of chunks a sentence is divided into post alignment
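
    For example, the matches [(0, 0), (1, 1), (3, 2)] form two chunks: (0, 0)
    and (1, 1) are adjacent in both sentences, while (3, 2) starts a new chunk.

    >>> _count_chunks([(0, 0), (1, 1), (3, 2)])
    2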
"""
i = 0
chunks = 1
while i < len(matches) - 1:
if (matches[i + 1][0] == matches[i][0] + 1) and (
matches[i + 1][1] == matches[i][1] + 1
):
i += 1
continue
i += 1
chunks += 1
return chunks
def single_meteor_score(
reference: Iterable[str],
hypothesis: Iterable[str],
preprocess: Callable[[str], str] = str.lower,
stemmer: StemmerI = PorterStemmer(),
wordnet: WordNetCorpusReader = wordnet,
alpha: float = 0.9,
beta: float = 3.0,
gamma: float = 0.5,
) -> float:
"""
Calculates METEOR score for single hypothesis and reference as per
"Meteor: An Automatic Metric for MT Evaluation with HighLevels of
Correlation with Human Judgments" by Alon Lavie and Abhaya Agarwal,
in Proceedings of ACL.
https://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf
>>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that', 'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the', 'party']
>>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that', 'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands']
>>> round(single_meteor_score(reference1, hypothesis1),4)
0.6944
    If there are no word matches during the alignment, the method returns the
score as 0. We can safely return a zero instead of raising a
division by zero error as no match usually implies a bad translation.
>>> round(single_meteor_score(['this', 'is', 'a', 'cat'], ['non', 'matching', 'hypothesis']),4)
0.0
:param reference: pre-tokenized reference
:param hypothesis: pre-tokenized hypothesis
:param preprocess: preprocessing function (default str.lower)
:param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
:param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
:param alpha: parameter for controlling relative weights of precision and recall.
    :param beta: parameter for controlling shape of penalty as a
        function of fragmentation.
:param gamma: relative weight assigned to fragmentation penalty.
:return: The sentence-level METEOR score.
"""
enum_hypothesis, enum_reference = _generate_enums(
hypothesis, reference, preprocess=preprocess
)
translation_length = len(enum_hypothesis)
reference_length = len(enum_reference)
matches, _, _ = _enum_align_words(
enum_hypothesis, enum_reference, stemmer=stemmer, wordnet=wordnet
)
matches_count = len(matches)
try:
precision = float(matches_count) / translation_length
recall = float(matches_count) / reference_length
fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
chunk_count = float(_count_chunks(matches))
frag_frac = chunk_count / matches_count
except ZeroDivisionError:
return 0.0
penalty = gamma * frag_frac**beta
return (1 - penalty) * fmean
def meteor_score(
references: Iterable[Iterable[str]],
hypothesis: Iterable[str],
preprocess: Callable[[str], str] = str.lower,
stemmer: StemmerI = PorterStemmer(),
wordnet: WordNetCorpusReader = wordnet,
alpha: float = 0.9,
beta: float = 3.0,
gamma: float = 0.5,
) -> float:
"""
Calculates METEOR score for hypothesis with multiple references as
described in "Meteor: An Automatic Metric for MT Evaluation with
    High Levels of Correlation with Human Judgments" by Alon Lavie and
Abhaya Agarwal, in Proceedings of ACL.
https://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf
In case of multiple references the best score is chosen. This method
iterates over single_meteor_score and picks the best pair among all
the references for a given hypothesis
>>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that', 'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the', 'party']
>>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', 'forever', 'hearing', 'the', 'activity', 'guidebook', 'that', 'party', 'direct']
>>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that', 'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands']
>>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', 'guarantees', 'the', 'military', 'forces', 'always', 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
>>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 'army', 'always', 'to', 'heed', 'the', 'directions', 'of', 'the', 'party']
>>> round(meteor_score([reference1, reference2, reference3], hypothesis1),4)
0.6944
If no words match during the alignment, the method returns a score of 0.
We can safely return zero instead of raising a division-by-zero error,
as no match usually implies a bad translation.
>>> round(meteor_score([['this', 'is', 'a', 'cat']], ['non', 'matching', 'hypothesis']),4)
0.0
:param references: pre-tokenized reference sentences
:param hypothesis: a pre-tokenized hypothesis sentence
:param preprocess: preprocessing function (default str.lower)
:param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
:param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
:param alpha: parameter for controlling relative weights of precision and recall.
:param beta: parameter for controlling shape of penalty as a function
of fragmentation.
:param gamma: relative weight assigned to fragmentation penalty.
:return: The sentence-level METEOR score.
"""
return max(
single_meteor_score(
reference,
hypothesis,
preprocess=preprocess,
stemmer=stemmer,
wordnet=wordnet,
alpha=alpha,
beta=beta,
gamma=gamma,
)
for reference in references
)

View File

@@ -0,0 +1,41 @@
# Natural Language Toolkit: Translation metrics
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Will Zhang <wilzzha@gmail.com>
# Guan Gui <ggui@student.unimelb.edu.au>
# Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
def alignment_error_rate(reference, hypothesis, possible=None):
"""
Return the Alignment Error Rate (AER) of an alignment
with respect to a "gold standard" reference alignment.
Return an error rate between 0.0 (perfect alignment) and 1.0 (no
alignment).
>>> from nltk.translate import Alignment
>>> ref = Alignment([(0, 0), (1, 1), (2, 2)])
>>> test = Alignment([(0, 0), (1, 2), (2, 1)])
>>> alignment_error_rate(ref, test) # doctest: +ELLIPSIS
0.6666666666666667
:type reference: Alignment
:param reference: A gold standard alignment (sure alignments)
:type hypothesis: Alignment
:param hypothesis: A hypothesis alignment (aka. candidate alignments)
:type possible: Alignment or None
:param possible: A gold standard reference of possible alignments
(defaults to *reference* if None)
:rtype: float or None
"""
if possible is None:
possible = reference
else:
assert reference.issubset(possible) # sanity check
return 1.0 - (len(hypothesis & reference) + len(hypothesis & possible)) / float(
len(hypothesis) + len(reference)
)
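# Illustrative check (not part of the module) of the AER formula above,
# using plain Python sets in place of Alignment objects; the alignments
# mirror the doctest and the "_"-prefixed names exist only for this sketch.
_sure = {(0, 0), (1, 1), (2, 2)} # S: sure gold alignments
_possible = _sure # P: possible alignments default to S
_hyp = {(0, 0), (1, 2), (2, 1)} # A: hypothesis alignments
_aer = 1.0 - (len(_hyp & _sure) + len(_hyp & _possible)) / float(len(_hyp) + len(_sure))
assert round(_aer, 4) == 0.6667 # matches the doctest above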

View File

@@ -0,0 +1,195 @@
# Natural Language Toolkit: NIST Score
#
# Copyright (C) 2001-2025 NLTK Project
# Authors:
# Contributors:
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""NIST score implementation."""
import fractions
import math
from collections import Counter
from nltk.util import ngrams
def sentence_nist(references, hypothesis, n=5):
"""
Calculate NIST score from
George Doddington. 2002. "Automatic evaluation of machine translation quality
using n-gram co-occurrence statistics." Proceedings of HLT.
Morgan Kaufmann Publishers Inc. https://dl.acm.org/citation.cfm?id=1289189.1289273
DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU
score. The official script used by NIST to compute BLEU and NIST score is
mteval-14.pl. The main differences are:
- BLEU uses geometric mean of the ngram overlaps, NIST uses arithmetic mean.
- NIST has a different brevity penalty
- NIST score from mteval-14.pl has a self-contained tokenizer
Note: mteval-14.pl includes a smoothing function for the BLEU score that is NOT
used in the NIST score computation.
>>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
... 'ensures', 'that', 'the', 'military', 'always',
... 'obeys', 'the', 'commands', 'of', 'the', 'party']
>>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
... 'forever', 'hearing', 'the', 'activity', 'guidebook',
... 'that', 'party', 'direct']
>>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
... 'ensures', 'that', 'the', 'military', 'will', 'forever',
... 'heed', 'Party', 'commands']
>>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
... 'guarantees', 'the', 'military', 'forces', 'always',
... 'being', 'under', 'the', 'command', 'of', 'the',
... 'Party']
>>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
... 'army', 'always', 'to', 'heed', 'the', 'directions',
... 'of', 'the', 'party']
>>> sentence_nist([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
3.3709...
>>> sentence_nist([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS
1.4619...
:param references: reference sentences
:type references: list(list(str))
:param hypothesis: a hypothesis sentence
:type hypothesis: list(str)
:param n: highest n-gram order
:type n: int
"""
return corpus_nist([references], [hypothesis], n)
def corpus_nist(list_of_references, hypotheses, n=5):
"""
Calculate a single corpus-level NIST score (aka. system-level NIST) for all
the hypotheses and their respective references.
:param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
:type references: list(list(list(str)))
:param hypotheses: a list of hypothesis sentences
:type hypotheses: list(list(str))
:param n: highest n-gram order
:type n: int
"""
# Before proceeding to compute NIST, perform sanity checks.
assert len(list_of_references) == len(
hypotheses
), "The number of hypotheses and their reference(s) should be the same"
# Collect the ngram counts from the reference sentences.
ngram_freq = Counter()
total_reference_words = 0
for (
references
) in list_of_references: # For each source sent, there's a list of reference sents.
for reference in references:
# For each order of ngram, count the ngram occurrences.
for i in range(1, n + 1):
ngram_freq.update(ngrams(reference, i))
total_reference_words += len(reference)
# Compute the information weights based on the reference sentences.
# Eqn 2 in Doddington (2002):
# Info(w_1 ... w_n) = log_2 [ (# of occurrences of w_1 ... w_n-1) / (# of occurrences of w_1 ... w_n) ]
information_weights = {}
for _ngram in ngram_freq: # w_1 ... w_n
_mgram = _ngram[:-1] # w_1 ... w_n-1
# From https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v13a.pl#L546
# it's computed as such:
# denominator = ngram_freq[_mgram] if _mgram and _mgram in ngram_freq else total_reference_words
# information_weights[_ngram] = -1 * math.log(ngram_freq[_ngram]/denominator) / math.log(2)
#
# Mathematically, it's equivalent to our implementation:
if _mgram and _mgram in ngram_freq:
numerator = ngram_freq[_mgram]
else:
numerator = total_reference_words
information_weights[_ngram] = math.log(numerator / ngram_freq[_ngram], 2)
# Micro-average.
nist_precision_numerator_per_ngram = Counter()
nist_precision_denominator_per_ngram = Counter()
l_ref, l_sys = 0, 0
# For each order of ngram.
for i in range(1, n + 1):
# Iterate through each hypothesis and their corresponding references.
for references, hypothesis in zip(list_of_references, hypotheses):
hyp_len = len(hypothesis)
# Find reference with the best NIST score.
nist_score_per_ref = []
for reference in references:
_ref_len = len(reference)
# Counter of ngrams in hypothesis.
hyp_ngrams = (
Counter(ngrams(hypothesis, i))
if len(hypothesis) >= i
else Counter()
)
ref_ngrams = (
Counter(ngrams(reference, i)) if len(reference) >= i else Counter()
)
ngram_overlaps = hyp_ngrams & ref_ngrams
# Precision part of the score in Eqn 3
_numerator = sum(
information_weights[_ngram] * count
for _ngram, count in ngram_overlaps.items()
)
_denominator = sum(hyp_ngrams.values())
_precision = 0 if _denominator == 0 else _numerator / _denominator
nist_score_per_ref.append(
(_precision, _numerator, _denominator, _ref_len)
)
# Best reference.
precision, numerator, denominator, ref_len = max(nist_score_per_ref)
nist_precision_numerator_per_ngram[i] += numerator
nist_precision_denominator_per_ngram[i] += denominator
l_ref += ref_len
l_sys += hyp_len
# Final NIST micro-average mean aggregation.
nist_precision = 0
for i in nist_precision_numerator_per_ngram:
precision = (
nist_precision_numerator_per_ngram[i]
/ nist_precision_denominator_per_ngram[i]
)
nist_precision += precision
# Eqn 3 in Doddington(2002)
return nist_precision * nist_length_penalty(l_ref, l_sys)
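# Worked example (illustrative, hypothetical counts) of the information
# weights computed above (Eqn 2 in Doddington, 2002): an ngram seen 2 times
# whose (n-1)-gram prefix is seen 8 times in the references is worth
# log2(8 / 2) = 2 bits.
assert round(math.log(8 / 2, 2), 12) == 2.0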
def nist_length_penalty(ref_len, hyp_len):
"""
Calculates the NIST length penalty, from Eq. 3 in Doddington (2002)
penalty = exp( beta * log( min( len(hyp)/len(ref) , 1.0 )))
where,
`beta` is chosen to make the brevity penalty factor = 0.5 when the
no. of words in the system output (hyp) is 2/3 of the average
no. of words in the reference translation (ref)
The NIST penalty differs from BLEU's in that it minimizes the impact of
small variations in translation length on the score.
See Fig. 4 in Doddington (2002)
"""
ratio = hyp_len / ref_len
if 0 < ratio < 1:
ratio_x, score_x = 1.5, 0.5
beta = math.log(score_x) / math.log(ratio_x) ** 2
return math.exp(beta * math.log(ratio) ** 2)
else: # ratio <= 0 or ratio >= 1
return max(min(ratio, 1.0), 0.0)
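# Illustrative sanity checks (not part of the module): by construction of
# beta above, a hypothesis that is 2/3 of the reference length is penalized
# by a factor of 0.5, while a hypothesis longer than the reference is not
# penalized at all.
assert round(nist_length_penalty(3, 2), 6) == 0.5
assert nist_length_penalty(3, 6) == 1.0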

View File

@@ -0,0 +1,193 @@
# Natural Language Toolkit: Phrase Extraction Algorithm
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Liling Tan, Fredrik Hedman, Petra Barancikova
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
def extract(
f_start,
f_end,
e_start,
e_end,
alignment,
f_aligned,
srctext,
trgtext,
srclen,
trglen,
max_phrase_length,
):
"""
This function checks for alignment point consistency and extracts
phrases using the chunk of consistent phrases.
A phrase pair (e, f) is consistent with an alignment A if and only if:
(i) No English words in the phrase pair are aligned to words outside it.
∀ e_i ∈ e: (e_i, f_j) ∈ A ⇒ f_j ∈ f
(ii) No Foreign words in the phrase pair are aligned to words outside it.
∀ f_j ∈ f: (e_i, f_j) ∈ A ⇒ e_i ∈ e
(iii) The phrase pair contains at least one alignment point.
∃ e_i ∈ e, f_j ∈ f such that (e_i, f_j) ∈ A
:type f_start: int
:param f_start: Starting index of the possible foreign language phrases
:type f_end: int
:param f_end: End index of the possible foreign language phrases
:type e_start: int
:param e_start: Starting index of the possible source language phrases
:type e_end: int
:param e_end: End index of the possible source language phrases
:type srctext: list
:param srctext: The source language tokens, a list of string.
:type trgtext: list
:param trgtext: The target language tokens, a list of string.
:type srclen: int
:param srclen: The number of tokens in the source language tokens.
:type trglen: int
:param trglen: The number of tokens in the target language tokens.
"""
if f_end < 0: # 0-based indexing.
return {}
# Check if alignment points are consistent.
for e, f in alignment:
if (f_start <= f <= f_end) and (e < e_start or e > e_end):
return {}
# Add phrase pairs (incl. additional unaligned f)
phrases = set()
fs = f_start
while True:
fe = min(f_end, f_start + max_phrase_length - 1)
while True:
# add phrase pair ([e_start, e_end], [fs, fe]) to set E
# Need to +1 in range to include the end-point.
src_phrase = " ".join(srctext[e_start : e_end + 1])
trg_phrase = " ".join(trgtext[fs : fe + 1])
# Include more data for later ordering.
phrases.add(((e_start, e_end + 1), (fs, fe + 1), src_phrase, trg_phrase))
fe += 1
if fe in f_aligned or fe >= trglen:
break
fs -= 1
if fs in f_aligned or fs < 0:
break
return phrases
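# Illustrative check (not part of the algorithm) of the consistency test in
# the loop above, for a hypothetical alignment: the target span [1, 3] is
# consistent with the source span [1, 1] but not with [0, 0], because the
# point (1, 1) would then link a covered target word to an uncovered source
# word. The "_"-prefixed names exist only for this sketch.
_algn = [(0, 0), (1, 1), (1, 2), (1, 3)]
_consistent_with_1_1 = all(1 <= e <= 1 for e, f in _algn if 1 <= f <= 3)
_consistent_with_0_0 = all(0 <= e <= 0 for e, f in _algn if 1 <= f <= 3)
assert _consistent_with_1_1 and not _consistent_with_0_0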
def phrase_extraction(srctext, trgtext, alignment, max_phrase_length=0):
"""
Phrase extraction algorithm extracts all consistent phrase pairs from
a word-aligned sentence pair.
The idea is to loop over all possible source language (e) phrases and find
the minimal foreign phrase (f) that matches each of them. Matching is done
by identifying all alignment points for the source phrase and finding the
shortest foreign phrase that includes all the foreign counterparts for the
source words.
In short, a phrase alignment has to
(a) contain all alignment points for all covered words
(b) contain at least one alignment point
>>> srctext = "michael assumes that he will stay in the house"
>>> trgtext = "michael geht davon aus , dass er im haus bleibt"
>>> alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9),
... (5,9), (6,7), (7,7), (8,8)]
>>> phrases = phrase_extraction(srctext, trgtext, alignment)
>>> for i in sorted(phrases):
... print(i)
...
((0, 1), (0, 1), 'michael', 'michael')
((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus')
((0, 2), (0, 5), 'michael assumes', 'michael geht davon aus ,')
((0, 3), (0, 6), 'michael assumes that', 'michael geht davon aus , dass')
((0, 4), (0, 7), 'michael assumes that he', 'michael geht davon aus , dass er')
((0, 9), (0, 10), 'michael assumes that he will stay in the house', 'michael geht davon aus , dass er im haus bleibt')
((1, 2), (1, 4), 'assumes', 'geht davon aus')
((1, 2), (1, 5), 'assumes', 'geht davon aus ,')
((1, 3), (1, 6), 'assumes that', 'geht davon aus , dass')
((1, 4), (1, 7), 'assumes that he', 'geht davon aus , dass er')
((1, 9), (1, 10), 'assumes that he will stay in the house', 'geht davon aus , dass er im haus bleibt')
((2, 3), (4, 6), 'that', ', dass')
((2, 3), (5, 6), 'that', 'dass')
((2, 4), (4, 7), 'that he', ', dass er')
((2, 4), (5, 7), 'that he', 'dass er')
((2, 9), (4, 10), 'that he will stay in the house', ', dass er im haus bleibt')
((2, 9), (5, 10), 'that he will stay in the house', 'dass er im haus bleibt')
((3, 4), (6, 7), 'he', 'er')
((3, 9), (6, 10), 'he will stay in the house', 'er im haus bleibt')
((4, 6), (9, 10), 'will stay', 'bleibt')
((4, 9), (7, 10), 'will stay in the house', 'im haus bleibt')
((6, 8), (7, 8), 'in the', 'im')
((6, 9), (7, 9), 'in the house', 'im haus')
((8, 9), (8, 9), 'house', 'haus')
:type srctext: str
:param srctext: The sentence string from the source language.
:type trgtext: str
:param trgtext: The sentence string from the target language.
:type alignment: list(tuple)
:param alignment: The word alignment outputs as list of tuples, where
the first elements of tuples are the source words' indices and
second elements are the target words' indices. This is also the output
format of nltk.translate.ibm1
:rtype: list(tuple)
:return: A list of tuples, each element in a list is a phrase and each
phrase is a tuple made up of (i) its source location, (ii) its target
location, (iii) the source phrase and (iv) the target phrase. The phrase
list of tuples represents all the possible phrases extracted from the
word alignments.
:type max_phrase_length: int
:param max_phrase_length: maximal phrase length, if 0 or not specified
it is set to the length of the longer sentence (srctext or trgtext).
"""
srctext = srctext.split() # e
trgtext = trgtext.split() # f
srclen = len(srctext) # len(e)
trglen = len(trgtext) # len(f)
# Keep an index of which target words are aligned.
f_aligned = [j for _, j in alignment]
max_phrase_length = max_phrase_length or max(srclen, trglen)
# set of phrase pairs BP
bp = set()
for e_start in range(srclen):
max_idx = min(srclen, e_start + max_phrase_length)
for e_end in range(e_start, max_idx):
# // find the minimally matching foreign phrase
# (f start , f end ) = ( length(f), 0 )
# f_start ∈ [0, len(f) - 1]; f_end ∈ [0, len(f) - 1]
f_start, f_end = trglen - 1, -1 # 0-based indexing
for e, f in alignment:
if e_start <= e <= e_end:
f_start = min(f, f_start)
f_end = max(f, f_end)
# add extract (f start , f end , e start , e end ) to set BP
phrases = extract(
f_start,
f_end,
e_start,
e_end,
alignment,
f_aligned,
srctext,
trgtext,
srclen,
trglen,
max_phrase_length,
)
if phrases:
bp.update(phrases)
return bp
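# Illustrative trace (not part of the algorithm) of the "minimally matching
# foreign phrase" step above, using the doctest alignment: for the source
# span covering only "assumes" (e_start = e_end = 1), the aligned target
# indices are 1, 2 and 3, so the minimal foreign span is (1, 3), i.e.
# "geht davon aus" before extension over unaligned words.
_doctest_alignment = [(0, 0), (1, 1), (1, 2), (1, 3), (2, 5), (3, 6), (4, 9),
(5, 9), (6, 7), (7, 7), (8, 8)]
_f_points = [f for e, f in _doctest_alignment if 1 <= e <= 1]
assert (min(_f_points), max(_f_points)) == (1, 3)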

View File

@@ -0,0 +1,330 @@
# Natural Language Toolkit: RIBES Score
#
# Copyright (C) 2001-2025 NLTK Project
# Contributors: Katsuhito Sudoh, Liling Tan, Kasramvd, J.F.Sebastian
# Mark Byers, ekhumoro, P. Ortiz
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
""" RIBES score implementation """
import math
from itertools import islice
from nltk.util import choose, ngrams
def sentence_ribes(references, hypothesis, alpha=0.25, beta=0.10):
"""
The RIBES (Rank-based Intuitive Bilingual Evaluation Score) from
Hideki Isozaki, Tsutomu Hirao, Kevin Duh, Katsuhito Sudoh and
Hajime Tsukada. 2010. "Automatic Evaluation of Translation Quality for
Distant Language Pairs". In Proceedings of EMNLP.
https://www.aclweb.org/anthology/D/D10/D10-1092.pdf
The generic RIBES score used in shared tasks, e.g. the Workshop on
Asian Translation (WAT), uses the following RIBES calculation:
RIBES = kendall_tau * (p1**alpha) * (bp**beta)
Please note that this re-implementation differs from the official
RIBES implementation: although it emulates the results described
in the original paper, the official RIBES script contains further
optimizations.
Users are encouraged to use the official RIBES script instead of this
implementation when evaluating their machine translation systems. Refer
to https://www.kecl.ntt.co.jp/icl/lirg/ribes/ for the official script.
:param references: a list of reference sentences
:type references: list(list(str))
:param hypothesis: a hypothesis sentence
:type hypothesis: list(str)
:param alpha: hyperparameter used as a prior for the unigram precision.
:type alpha: float
:param beta: hyperparameter used as a prior for the brevity penalty.
:type beta: float
:return: The best ribes score from one of the references.
:rtype: float
"""
best_ribes = -1.0
# Calculates RIBES for each reference and returns the best score.
for reference in references:
# Collects the *worder* from the ranked correlation alignments.
worder = word_rank_alignment(reference, hypothesis)
nkt = kendall_tau(worder)
# Calculates the brevity penalty
bp = min(1.0, math.exp(1.0 - len(reference) / len(hypothesis)))
# Calculates the unigram precision, *p1*
p1 = len(worder) / len(hypothesis)
_ribes = nkt * (p1**alpha) * (bp**beta)
if _ribes > best_ribes: # Keeps the best score.
best_ribes = _ribes
return best_ribes
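# Worked example (illustrative, hypothetical values) of the combination
# above: a normalized Kendall's tau of 0.8, unigram precision p1 = 0.75 and
# brevity penalty bp = 0.9 give, with the default alpha=0.25 and beta=0.10,
# RIBES = 0.8 * 0.75**0.25 * 0.9**0.10, roughly 0.737.
_ribes_example = 0.8 * (0.75**0.25) * (0.9**0.10)
assert 0.73 < _ribes_example < 0.74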
def corpus_ribes(list_of_references, hypotheses, alpha=0.25, beta=0.10):
"""
This function "calculates RIBES for a system output (hypothesis) with
multiple references, and returns 'best' score among multi-references and
individual scores. The scores are corpus-wise, i.e., averaged by the number
of sentences." (cf. RIBES version 1.03.1 code).
Different from BLEU's micro-average precision, RIBES calculates the
macro-average precision by averaging the best RIBES score for each pair of
hypothesis and its corresponding references
>>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
... 'ensures', 'that', 'the', 'military', 'always',
... 'obeys', 'the', 'commands', 'of', 'the', 'party']
>>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
... 'ensures', 'that', 'the', 'military', 'will', 'forever',
... 'heed', 'Party', 'commands']
>>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
... 'guarantees', 'the', 'military', 'forces', 'always',
... 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
>>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
... 'army', 'always', 'to', 'heed', 'the', 'directions',
... 'of', 'the', 'party']
>>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
... 'interested', 'in', 'world', 'history']
>>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
... 'because', 'he', 'read', 'the', 'book']
>>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
>>> hypotheses = [hyp1, hyp2]
>>> round(corpus_ribes(list_of_references, hypotheses),4)
0.3597
:param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
:type references: list(list(list(str)))
:param hypotheses: a list of hypothesis sentences
:type hypotheses: list(list(str))
:param alpha: hyperparameter used as a prior for the unigram precision.
:type alpha: float
:param beta: hyperparameter used as a prior for the brevity penalty.
:type beta: float
:return: The corpus-level RIBES score, averaged over all hypotheses.
:rtype: float
"""
corpus_best_ribes = 0.0
# Iterate through each hypothesis and their corresponding references.
for references, hypothesis in zip(list_of_references, hypotheses):
corpus_best_ribes += sentence_ribes(references, hypothesis, alpha, beta)
return corpus_best_ribes / len(hypotheses)
def position_of_ngram(ngram, sentence):
"""
This function returns the position of the first instance of the ngram
appearing in a sentence.
Note that one could also use strings as follows, but the code is a little
convoluted with type casting back and forth:
char_pos = ' '.join(sent)[:' '.join(sent).index(' '.join(ngram))]
word_pos = char_pos.count(' ')
Another way to conceive this is:
return next(i for i, ng in enumerate(ngrams(sentence, len(ngram)))
if ng == ngram)
:param ngram: The ngram that needs to be searched
:type ngram: tuple
:param sentence: The list of tokens to search from.
:type sentence: list(str)
"""
# Iterates through the ngrams in sentence.
for i, sublist in enumerate(ngrams(sentence, len(ngram))):
# Returns the index of the word when ngram matches.
if ngram == sublist:
return i
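# Illustrative call (hypothetical sentence, not part of the module): the
# bigram ('a', 'cat') first occurs starting at word index 4.
assert position_of_ngram(("a", "cat"), ["the", "cat", "sat", "on", "a", "cat"]) == 4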
def word_rank_alignment(reference, hypothesis, character_based=False):
"""
This is the word rank alignment algorithm described in the paper to produce
the *worder* list, i.e. a list of word indices of the hypothesis word orders
w.r.t. the list of reference words.
Below is the (H0, R0) example from the Isozaki et al. 2010 paper; note that
the examples are indexed from 1 in the paper but the results here are indexed from 0:
>>> ref = str('he was interested in world history because he '
... 'read the book').split()
>>> hyp = str('he read the book because he was interested in world '
... 'history').split()
>>> word_rank_alignment(ref, hyp)
[7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
The (H1, R1) example from the paper, note the 0th index:
>>> ref = 'John hit Bob yesterday'.split()
>>> hyp = 'Bob hit John yesterday'.split()
>>> word_rank_alignment(ref, hyp)
[2, 1, 0, 3]
Here is the (H2, R2) example from the paper, note the 0th index here too:
>>> ref = 'the boy read the book'.split()
>>> hyp = 'the book was read by the boy'.split()
>>> word_rank_alignment(ref, hyp)
[3, 4, 2, 0, 1]
:param reference: a reference sentence
:type reference: list(str)
:param hypothesis: a hypothesis sentence
:type hypothesis: list(str)
"""
worder = []
hyp_len = len(hypothesis)
# Stores a list of possible ngrams from the reference sentence.
# This is used for matching context window later in the algorithm.
ref_ngrams = []
hyp_ngrams = []
for n in range(1, len(reference) + 1):
for ng in ngrams(reference, n):
ref_ngrams.append(ng)
for ng in ngrams(hypothesis, n):
hyp_ngrams.append(ng)
for i, h_word in enumerate(hypothesis):
# If word is not in the reference, continue.
if h_word not in reference:
continue
# If we can determine one-to-one word correspondence for unigrams that
# only appear once in both the reference and hypothesis.
elif hypothesis.count(h_word) == reference.count(h_word) == 1:
worder.append(reference.index(h_word))
else:
max_window_size = max(i, hyp_len - i + 1)
for window in range(1, max_window_size):
if i + window < hyp_len: # If searching the right context is possible.
# Retrieve the right context window.
right_context_ngram = tuple(islice(hypothesis, i, i + window + 1))
num_times_in_ref = ref_ngrams.count(right_context_ngram)
num_times_in_hyp = hyp_ngrams.count(right_context_ngram)
# If ngram appears only once in both ref and hyp.
if num_times_in_ref == num_times_in_hyp == 1:
# Find the position of ngram that matched the reference.
pos = position_of_ngram(right_context_ngram, reference)
worder.append(pos) # Add the positions of the ngram.
break
if window <= i: # If searching the left context is possible.
# Retrieve the left context window.
left_context_ngram = tuple(islice(hypothesis, i - window, i + 1))
num_times_in_ref = ref_ngrams.count(left_context_ngram)
num_times_in_hyp = hyp_ngrams.count(left_context_ngram)
if num_times_in_ref == num_times_in_hyp == 1:
# Find the position of ngram that matched the reference.
pos = position_of_ngram(left_context_ngram, reference)
# Add the positions of the ngram.
worder.append(pos + len(left_context_ngram) - 1)
break
return worder
def find_increasing_sequences(worder):
"""
Given the *worder* list, this function groups monotonic +1 sequences.
>>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
>>> list(find_increasing_sequences(worder))
[(7, 8, 9, 10), (0, 1, 2, 3, 4, 5)]
:param worder: The worder list output from word_rank_alignment
:type worder: list(int)
"""
items = iter(worder)
a, b = None, next(items, None)
result = [b]
while b is not None:
a, b = b, next(items, None)
if b is not None and a + 1 == b:
result.append(b)
else:
if len(result) > 1:
yield tuple(result)
result = [b]
def kendall_tau(worder, normalize=True):
"""
Calculates the Kendall's Tau correlation coefficient given the *worder*
list of word alignments from word_rank_alignment(), using the formula:
tau = 2 * num_increasing_pairs / num_possible_pairs - 1
Note that the number of increasing pairs can be discontinuous in the *worder*
list, and each increasing sequence contributes choose(len(seq), 2)
increasing pairs, e.g.
>>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
>>> number_possible_pairs = choose(len(worder), 2)
>>> round(kendall_tau(worder, normalize=False),3)
-0.236
>>> round(kendall_tau(worder),3)
0.382
:param worder: The worder list output from word_rank_alignment
:type worder: list(int)
:param normalize: Flag to indicate normalization to between 0.0 and 1.0.
:type normalize: boolean
:return: The Kendall's Tau correlation coefficient.
:rtype: float
"""
worder_len = len(worder)
# With worder_len < 2, `choose(worder_len, 2)` will be 0.
# As we divide by this, it will give a ZeroDivisionError.
# To avoid this, we can just return the lowest possible score.
if worder_len < 2:
tau = -1
else:
# Extract the groups of increasing/monotonic sequences.
increasing_sequences = find_increasing_sequences(worder)
# Calculate no. of increasing_pairs in *worder* list.
num_increasing_pairs = sum(choose(len(seq), 2) for seq in increasing_sequences)
# Calculate no. of possible pairs.
num_possible_pairs = choose(worder_len, 2)
# Kendall's Tau computation.
tau = 2 * num_increasing_pairs / num_possible_pairs - 1
if normalize: # If normalized, the tau output falls between 0.0 and 1.0
return (tau + 1) / 2
else: # Otherwise, the tau output falls between -1.0 and +1.0
return tau
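# Worked check (illustrative) of the pair counting above for the docstring
# worder [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]: its increasing runs of lengths
# 4 and 6 contribute choose(4, 2) + choose(6, 2) = 6 + 15 = 21 increasing
# pairs out of choose(11, 2) = 55 possible pairs, so
# tau = 2 * 21 / 55 - 1 = -0.236 (unnormalized), as in the doctest.
assert round(2 * (6 + 15) / 55 - 1, 3) == -0.236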
def spearman_rho(worder, normalize=True):
"""
Calculates the Spearman's Rho correlation coefficient given the *worder*
list of word alignment from word_rank_alignment(), using the formula:
rho = 1 - sum(d**2) / choose(len(worder)+1, 3)
where d is the difference between each position in the *worder* list and
its original word index in the reference sentence.
Using the (H0,R0) and (H5, R5) example from the paper
>>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
>>> round(spearman_rho(worder, normalize=False), 3)
-0.591
>>> round(spearman_rho(worder), 3)
0.205
:param worder: The worder list output from word_rank_alignment
:type worder: list(int)
"""
worder_len = len(worder)
sum_d_square = sum((wi - i) ** 2 for wi, i in zip(worder, range(worder_len)))
rho = 1 - sum_d_square / choose(worder_len + 1, 3)
if normalize: # If normalized, the rho output falls between 0.0 and 1.0
return (rho + 1) / 2
else: # Otherwise, the rho output falls between -1.0 and +1.0
return rho
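# Worked check (illustrative) of the formula above for the same docstring
# worder: the squared rank differences are 7**2 four times, 2**2 once and
# (-5)**2 six times, summing to 350, and choose(12, 3) = 220, so
# rho = 1 - 350 / 220 = -0.591 (unnormalized), as in the doctest.
assert round(1 - 350 / 220, 3) == -0.591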

View File

@@ -0,0 +1,515 @@
# Natural Language Toolkit: Stack decoder
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A decoder that uses stacks to implement phrase-based translation.
In phrase-based translation, the source sentence is segmented into
phrases of one or more words, and translations for those phrases are
used to build the target sentence.
Hypothesis data structures are used to keep track of the source words
translated so far and the partial output. A hypothesis can be expanded
by selecting an untranslated phrase, looking up its translation in a
phrase table, and appending that translation to the partial output.
Translation is complete when a hypothesis covers all source words.
The search space is huge because the source sentence can be segmented
in different ways, the source phrases can be selected in any order,
and there could be multiple translations for the same source phrase in
the phrase table. To make decoding tractable, stacks are used to limit
the number of candidate hypotheses by doing histogram and/or threshold
pruning.
Hypotheses with the same number of words translated are placed in the
same stack. In histogram pruning, each stack has a size limit, and
the hypothesis with the lowest score is removed when the stack is full.
In threshold pruning, hypotheses that score below a certain threshold
of the best hypothesis in that stack are removed.
Hypothesis scoring can include various factors such as phrase
translation probability, language model probability, length of
translation, cost of remaining words to be translated, and so on.
References:
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.
"""
import warnings
from collections import defaultdict
from math import log
class StackDecoder:
"""
Phrase-based stack decoder for machine translation
>>> from nltk.translate import PhraseTable
>>> phrase_table = PhraseTable()
>>> phrase_table.add(('niemand',), ('nobody',), log(0.8))
>>> phrase_table.add(('niemand',), ('no', 'one'), log(0.2))
>>> phrase_table.add(('erwartet',), ('expects',), log(0.8))
>>> phrase_table.add(('erwartet',), ('expecting',), log(0.2))
>>> phrase_table.add(('niemand', 'erwartet'), ('one', 'does', 'not', 'expect'), log(0.1))
>>> phrase_table.add(('die', 'spanische', 'inquisition'), ('the', 'spanish', 'inquisition'), log(0.8))
>>> phrase_table.add(('!',), ('!',), log(0.8))
>>> # nltk.model should be used here once it is implemented
>>> from collections import defaultdict
>>> language_prob = defaultdict(lambda: -999.0)
>>> language_prob[('nobody',)] = log(0.5)
>>> language_prob[('expects',)] = log(0.4)
>>> language_prob[('the', 'spanish', 'inquisition')] = log(0.2)
>>> language_prob[('!',)] = log(0.1)
>>> language_model = type('',(object,),{'probability_change': lambda self, context, phrase: language_prob[phrase], 'probability': lambda self, phrase: language_prob[phrase]})()
>>> stack_decoder = StackDecoder(phrase_table, language_model)
>>> stack_decoder.translate(['niemand', 'erwartet', 'die', 'spanische', 'inquisition', '!'])
['nobody', 'expects', 'the', 'spanish', 'inquisition', '!']
"""
def __init__(self, phrase_table, language_model):
"""
:param phrase_table: Table of translations for source language
phrases and the log probabilities for those translations.
:type phrase_table: PhraseTable
:param language_model: Target language model. Must define a
``probability_change`` method that calculates the change in
log probability of a sentence, if a given string is appended
to it.
This interface is experimental and will likely be replaced
with nltk.model once it is implemented.
:type language_model: object
"""
self.phrase_table = phrase_table
self.language_model = language_model
self.word_penalty = 0.0
"""
float: Influences the translation length exponentially.
If positive, shorter translations are preferred.
If negative, longer translations are preferred.
If zero, no penalty is applied.
"""
self.beam_threshold = 0.0
"""
float: Hypotheses that score below this factor of the best
hypothesis in a stack are dropped from consideration.
Value between 0.0 and 1.0.
"""
self.stack_size = 100
"""
int: Maximum number of hypotheses to consider in a stack.
Higher values increase the likelihood of a good translation,
but increase processing time.
"""
self.__distortion_factor = 0.5
self.__compute_log_distortion()
@property
def distortion_factor(self):
"""
float: Amount of reordering of source phrases.
Lower values favour monotone translation, suitable when
word order is similar for both source and target languages.
Value between 0.0 and 1.0. Default 0.5.
"""
return self.__distortion_factor
@distortion_factor.setter
def distortion_factor(self, d):
self.__distortion_factor = d
self.__compute_log_distortion()
def __compute_log_distortion(self):
# cache log(distortion_factor) so we don't have to recompute it
# when scoring hypotheses
if self.__distortion_factor == 0.0:
self.__log_distortion_factor = log(1e-9) # 1e-9 is almost zero
else:
self.__log_distortion_factor = log(self.__distortion_factor)
def translate(self, src_sentence):
"""
:param src_sentence: Sentence to be translated
:type src_sentence: list(str)
:return: Translated sentence
:rtype: list(str)
"""
sentence = tuple(src_sentence) # prevent accidental modification
sentence_length = len(sentence)
stacks = [
_Stack(self.stack_size, self.beam_threshold)
for _ in range(0, sentence_length + 1)
]
empty_hypothesis = _Hypothesis()
stacks[0].push(empty_hypothesis)
all_phrases = self.find_all_src_phrases(sentence)
future_score_table = self.compute_future_scores(sentence)
for stack in stacks:
for hypothesis in stack:
possible_expansions = StackDecoder.valid_phrases(
all_phrases, hypothesis
)
for src_phrase_span in possible_expansions:
src_phrase = sentence[src_phrase_span[0] : src_phrase_span[1]]
for translation_option in self.phrase_table.translations_for(
src_phrase
):
raw_score = self.expansion_score(
hypothesis, translation_option, src_phrase_span
)
new_hypothesis = _Hypothesis(
raw_score=raw_score,
src_phrase_span=src_phrase_span,
trg_phrase=translation_option.trg_phrase,
previous=hypothesis,
)
new_hypothesis.future_score = self.future_score(
new_hypothesis, future_score_table, sentence_length
)
total_words = new_hypothesis.total_translated_words()
stacks[total_words].push(new_hypothesis)
if not stacks[sentence_length]:
warnings.warn(
"Unable to translate all words. "
"The source sentence contains words not in "
"the phrase table"
)
# Instead of returning empty output, perhaps a partial
# translation could be returned
return []
best_hypothesis = stacks[sentence_length].best()
return best_hypothesis.translation_so_far()
def find_all_src_phrases(self, src_sentence):
"""
Finds all subsequences in src_sentence that have a phrase
translation in the translation table
:type src_sentence: tuple(str)
:return: Subsequences that have a phrase translation,
represented as a table of lists of end positions.
For example, if result[2] is [5, 6, 9], then there are
three phrases starting from position 2 in ``src_sentence``,
ending at positions 5, 6, and 9 exclusive. The list of
ending positions is in ascending order.
:rtype: list(list(int))
"""
sentence_length = len(src_sentence)
phrase_indices = [[] for _ in src_sentence]
for start in range(0, sentence_length):
for end in range(start + 1, sentence_length + 1):
potential_phrase = src_sentence[start:end]
if potential_phrase in self.phrase_table:
phrase_indices[start].append(end)
return phrase_indices
def compute_future_scores(self, src_sentence):
"""
Determines the approximate scores for translating every
subsequence in ``src_sentence``
Future scores can be used as a look-ahead to determine the
difficulty of translating the remaining parts of a src_sentence.
:type src_sentence: tuple(str)
:return: Scores of subsequences referenced by their start and
end positions. For example, result[2][5] is the score of the
subsequence covering positions 2, 3, and 4.
:rtype: dict(int: (dict(int): float))
"""
scores = defaultdict(lambda: defaultdict(lambda: float("-inf")))
for seq_length in range(1, len(src_sentence) + 1):
for start in range(0, len(src_sentence) - seq_length + 1):
end = start + seq_length
phrase = src_sentence[start:end]
if phrase in self.phrase_table:
score = self.phrase_table.translations_for(phrase)[
0
].log_prob # pick best (first) translation
# Warning: API of language_model is subject to change
score += self.language_model.probability(phrase)
scores[start][end] = score
# check if a better score can be obtained by combining
# two child subsequences
for mid in range(start + 1, end):
combined_score = scores[start][mid] + scores[mid][end]
if combined_score > scores[start][end]:
scores[start][end] = combined_score
return scores
def future_score(self, hypothesis, future_score_table, sentence_length):
"""
Determines the approximate score for translating the
untranslated words in ``hypothesis``
"""
score = 0.0
for span in hypothesis.untranslated_spans(sentence_length):
score += future_score_table[span[0]][span[1]]
return score
def expansion_score(self, hypothesis, translation_option, src_phrase_span):
"""
Calculate the score of expanding ``hypothesis`` with
``translation_option``
:param hypothesis: Hypothesis being expanded
:type hypothesis: _Hypothesis
:param translation_option: Information about the proposed expansion
:type translation_option: PhraseTableEntry
:param src_phrase_span: Word position span of the source phrase
:type src_phrase_span: tuple(int, int)
"""
score = hypothesis.raw_score
score += translation_option.log_prob
# The API of language_model is subject to change; it could accept
# a string, a list of words, and/or some other type
score += self.language_model.probability_change(
hypothesis, translation_option.trg_phrase
)
score += self.distortion_score(hypothesis, src_phrase_span)
score -= self.word_penalty * len(translation_option.trg_phrase)
return score
def distortion_score(self, hypothesis, next_src_phrase_span):
if not hypothesis.src_phrase_span:
return 0.0
next_src_phrase_start = next_src_phrase_span[0]
prev_src_phrase_end = hypothesis.src_phrase_span[1]
distortion_distance = next_src_phrase_start - prev_src_phrase_end
return abs(distortion_distance) * self.__log_distortion_factor
@staticmethod
def valid_phrases(all_phrases_from, hypothesis):
"""
Extract phrases from ``all_phrases_from`` that contain words
that have not been translated by ``hypothesis``
:param all_phrases_from: Phrases represented by their spans, in
the same format as the return value of
``find_all_src_phrases``
:type all_phrases_from: list(list(int))
:type hypothesis: _Hypothesis
:return: A list of phrases, represented by their spans, that
cover untranslated positions.
:rtype: list(tuple(int, int))
"""
untranslated_spans = hypothesis.untranslated_spans(len(all_phrases_from))
valid_phrases = []
for available_span in untranslated_spans:
start = available_span[0]
available_end = available_span[1]
while start < available_end:
for phrase_end in all_phrases_from[start]:
if phrase_end > available_end:
# Subsequent elements in all_phrases_from[start]
# will also be > available_end, since the
# elements are in ascending order
break
valid_phrases.append((start, phrase_end))
start += 1
return valid_phrases
class _Hypothesis:
"""
Partial solution to a translation.
Records the word positions of the phrase being translated, its
translation, raw score, and the cost of the untranslated parts of
the sentence. When the next phrase is selected to build upon the
partial solution, a new _Hypothesis object is created, with a back
pointer to the previous hypothesis.
To find out which words have been translated so far, look at the
``src_phrase_span`` in the hypothesis chain. Similarly, the
translation output can be found by traversing up the chain.
"""
def __init__(
self,
raw_score=0.0,
src_phrase_span=(),
trg_phrase=(),
previous=None,
future_score=0.0,
):
"""
:param raw_score: Likelihood of hypothesis so far.
Higher is better. Does not account for untranslated words.
:type raw_score: float
:param src_phrase_span: Span of word positions covered by the
source phrase in this hypothesis expansion. For example,
(2, 5) means that the phrase is from the second word up to,
but not including the fifth word in the source sentence.
:type src_phrase_span: tuple(int)
:param trg_phrase: Translation of the source phrase in this
hypothesis expansion
:type trg_phrase: tuple(str)
:param previous: Previous hypothesis before expansion to this one
:type previous: _Hypothesis
:param future_score: Approximate score for translating the
remaining words not covered by this hypothesis. Higher means
that the remaining words are easier to translate.
:type future_score: float
"""
self.raw_score = raw_score
self.src_phrase_span = src_phrase_span
self.trg_phrase = trg_phrase
self.previous = previous
self.future_score = future_score
def score(self):
"""
Overall score of hypothesis after accounting for local and
global features
"""
return self.raw_score + self.future_score
def untranslated_spans(self, sentence_length):
"""
Starting from each untranslated word, find the longest
continuous span of untranslated positions
:param sentence_length: Length of source sentence being
translated by the hypothesis
:type sentence_length: int
:rtype: list(tuple(int, int))
"""
translated_positions = self.translated_positions()
translated_positions.sort()
translated_positions.append(sentence_length) # add sentinel position
untranslated_spans = []
start = 0
# each untranslated span must end in one of the translated_positions
for end in translated_positions:
if start < end:
untranslated_spans.append((start, end))
start = end + 1
return untranslated_spans
def translated_positions(self):
"""
List of positions in the source sentence of words already
translated. The list is not sorted.
:rtype: list(int)
"""
translated_positions = []
current_hypothesis = self
while current_hypothesis.previous is not None:
translated_span = current_hypothesis.src_phrase_span
translated_positions.extend(range(translated_span[0], translated_span[1]))
current_hypothesis = current_hypothesis.previous
return translated_positions
def total_translated_words(self):
return len(self.translated_positions())
def translation_so_far(self):
translation = []
self.__build_translation(self, translation)
return translation
def __build_translation(self, hypothesis, output):
if hypothesis.previous is None:
return
self.__build_translation(hypothesis.previous, output)
output.extend(hypothesis.trg_phrase)
class _Stack:
"""
Collection of _Hypothesis objects
"""
def __init__(self, max_size=100, beam_threshold=0.0):
"""
:param beam_threshold: Hypotheses that score less than this
factor of the best hypothesis are discarded from the stack.
Value must be between 0.0 and 1.0.
:type beam_threshold: float
"""
self.max_size = max_size
self.items = []
if beam_threshold == 0.0:
self.__log_beam_threshold = float("-inf")
else:
self.__log_beam_threshold = log(beam_threshold)
def push(self, hypothesis):
"""
Add ``hypothesis`` to the stack.
Removes lowest scoring hypothesis if the stack is full.
After insertion, hypotheses that score less than
``beam_threshold`` times the score of the best hypothesis
are removed.
"""
self.items.append(hypothesis)
self.items.sort(key=lambda h: h.score(), reverse=True)
while len(self.items) > self.max_size:
self.items.pop()
self.threshold_prune()
def threshold_prune(self):
if not self.items:
return
# log(score * beam_threshold) = log(score) + log(beam_threshold)
threshold = self.items[0].score() + self.__log_beam_threshold
for hypothesis in reversed(self.items):
if hypothesis.score() < threshold:
self.items.pop()
else:
break
def best(self):
"""
:return: Hypothesis with the highest score in the stack
:rtype: _Hypothesis
"""
if self.items:
return self.items[0]
return None
def __iter__(self):
return iter(self.items)
def __contains__(self, hypothesis):
return hypothesis in self.items
def __bool__(self):
return len(self.items) != 0
__nonzero__ = __bool__
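# Illustrative sketch (not part of the public API): how the internal _Stack
# applies histogram pruning. The raw scores are hypothetical log
# probabilities; with max_size=2, pushing a third hypothesis evicts the
# lowest-scoring one. The "_"-prefixed names exist only for this sketch.
_stack = _Stack(max_size=2)
_stack.push(_Hypothesis(raw_score=-1.0))
_stack.push(_Hypothesis(raw_score=-2.0))
_stack.push(_Hypothesis(raw_score=-0.5))
assert _stack.best().score() == -0.5 and len(list(_stack)) == 2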