updates
Backend/venv/lib/python3.12/site-packages/nltk/translate/__init__.py (new file, 33 lines)
@@ -0,0 +1,33 @@
# Natural Language Toolkit: Machine Translation
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>, Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Experimental features for machine translation.
These interfaces are prone to change.

isort:skip_file
"""

from nltk.translate.api import AlignedSent, Alignment, PhraseTable
from nltk.translate.ibm_model import IBMModel
from nltk.translate.ibm1 import IBMModel1
from nltk.translate.ibm2 import IBMModel2
from nltk.translate.ibm3 import IBMModel3
from nltk.translate.ibm4 import IBMModel4
from nltk.translate.ibm5 import IBMModel5
from nltk.translate.bleu_score import sentence_bleu as bleu
from nltk.translate.ribes_score import sentence_ribes as ribes
from nltk.translate.meteor_score import meteor_score as meteor
from nltk.translate.metrics import alignment_error_rate
from nltk.translate.stack_decoder import StackDecoder
from nltk.translate.nist_score import sentence_nist as nist
from nltk.translate.chrf_score import sentence_chrf as chrf
from nltk.translate.gale_church import trace
from nltk.translate.gdfa import grow_diag_final_and
from nltk.translate.gleu_score import sentence_gleu as gleu
from nltk.translate.phrase_based import extract
from nltk.translate.lepor import sentence_lepor as lepor, corpus_lepor
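
For orientation, a minimal sketch of the re-exported entry points in use
(the example sentences are made up; ``bleu`` is the ``sentence_bleu`` alias
imported above):

    from nltk.translate import Alignment, bleu

    reference = "the cat is on the mat".split()
    hypothesis = "the cat sat on the mat".split()
    # Bigram BLEU; the default 4-gram weights would score 0 here, since this
    # short hypothesis shares no 4-gram with the reference.
    score = bleu([reference], hypothesis, weights=(0.5, 0.5))
    Alignment.fromstring("0-0 1-1 2-2 3-3 4-4 5-5")  # giza-style index pairs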
Backend/venv/lib/python3.12/site-packages/nltk/translate/api.py (new file, 335 lines)
@@ -0,0 +1,335 @@
# Natural Language Toolkit: API for alignment and translation objects
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Will Zhang <wilzzha@gmail.com>
#         Guan Gui <ggui@student.unimelb.edu.au>
#         Steven Bird <stevenbird1@gmail.com>
#         Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import subprocess
from collections import namedtuple


class AlignedSent:
    """
    Return an aligned sentence object, which encapsulates two sentences
    along with an ``Alignment`` between them.

    Typically used in machine translation to represent a sentence and
    its translation.

    >>> from nltk.translate import AlignedSent, Alignment
    >>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'],
    ...     ['the', 'house', 'is', 'small'], Alignment.fromstring('0-3 1-2 2-0 3-1'))
    >>> algnsent.words
    ['klein', 'ist', 'das', 'Haus']
    >>> algnsent.mots
    ['the', 'house', 'is', 'small']
    >>> algnsent.alignment
    Alignment([(0, 3), (1, 2), (2, 0), (3, 1)])
    >>> from nltk.corpus import comtrans
    >>> print(comtrans.aligned_sents()[54])
    <AlignedSent: 'Weshalb also sollten...' -> 'So why should EU arm...'>
    >>> print(comtrans.aligned_sents()[54].alignment)
    0-0 0-1 1-0 2-2 3-4 3-5 4-7 5-8 6-3 7-9 8-9 9-10 9-11 10-12 11-6 12-6 13-13

    :param words: Words in the target language sentence
    :type words: list(str)
    :param mots: Words in the source language sentence
    :type mots: list(str)
    :param alignment: Word-level alignments between ``words`` and ``mots``.
        Each alignment is represented as a 2-tuple (words_index, mots_index).
    :type alignment: Alignment
    """

    def __init__(self, words, mots, alignment=None):
        self._words = words
        self._mots = mots
        if alignment is None:
            self.alignment = Alignment([])
        else:
            assert type(alignment) is Alignment
            self.alignment = alignment

    @property
    def words(self):
        return self._words

    @property
    def mots(self):
        return self._mots

    def _get_alignment(self):
        return self._alignment

    def _set_alignment(self, alignment):
        _check_alignment(len(self.words), len(self.mots), alignment)
        self._alignment = alignment

    alignment = property(_get_alignment, _set_alignment)

    def __repr__(self):
        """
        Return a string representation for this ``AlignedSent``.

        :rtype: str
        """
        words = "[%s]" % (", ".join("'%s'" % w for w in self._words))
        mots = "[%s]" % (", ".join("'%s'" % w for w in self._mots))

        return f"AlignedSent({words}, {mots}, {self._alignment!r})"

    def _to_dot(self):
        """
        Dot representation of the aligned sentence
        """
        s = "graph align {\n"
        s += "node[shape=plaintext]\n"

        # Declare node
        s += "".join([f'"{w}_source" [label="{w}"] \n' for w in self._words])
        s += "".join([f'"{w}_target" [label="{w}"] \n' for w in self._mots])

        # Alignment
        s += "".join(
            [
                f'"{self._words[u]}_source" -- "{self._mots[v]}_target" \n'
                for u, v in self._alignment
            ]
        )

        # Connect the source words
        for i in range(len(self._words) - 1):
            s += '"{}_source" -- "{}_source" [style=invis]\n'.format(
                self._words[i],
                self._words[i + 1],
            )

        # Connect the target words
        for i in range(len(self._mots) - 1):
            s += '"{}_target" -- "{}_target" [style=invis]\n'.format(
                self._mots[i],
                self._mots[i + 1],
            )

        # Put it in the same rank
        s += "{rank = same; %s}\n" % (" ".join('"%s_source"' % w for w in self._words))
        s += "{rank = same; %s}\n" % (" ".join('"%s_target"' % w for w in self._mots))

        s += "}"

        return s

    def _repr_svg_(self):
        """
        IPython magic: show the SVG representation of this ``AlignedSent``.
"""
|
||||
dot_string = self._to_dot().encode("utf8")
|
||||
output_format = "svg"
|
||||
try:
|
||||
process = subprocess.Popen(
|
||||
["dot", "-T%s" % output_format],
|
||||
stdin=subprocess.PIPE,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
except OSError as e:
|
||||
raise Exception("Cannot find the dot binary from Graphviz package") from e
|
||||
out, err = process.communicate(dot_string)
|
||||
|
||||
return out.decode("utf8")
|
||||
|
||||
def __str__(self):
|
||||
"""
|
||||
Return a human-readable string representation for this ``AlignedSent``.
|
||||
|
||||
:rtype: str
|
||||
"""
|
||||
source = " ".join(self._words)[:20] + "..."
|
||||
target = " ".join(self._mots)[:20] + "..."
|
||||
return f"<AlignedSent: '{source}' -> '{target}'>"
|
||||
|
||||
def invert(self):
|
||||
"""
|
||||
Return the aligned sentence pair, reversing the directionality
|
||||
|
||||
:rtype: AlignedSent
|
||||
"""
|
||||
return AlignedSent(self._mots, self._words, self._alignment.invert())
|
||||
|
||||
|
||||
class Alignment(frozenset):
|
||||
"""
|
||||
A storage class for representing alignment between two sequences, s1, s2.
|
||||
In general, an alignment is a set of tuples of the form (i, j, ...)
|
||||
representing an alignment between the i-th element of s1 and the
|
||||
j-th element of s2. Tuples are extensible (they might contain
|
||||
additional data, such as a boolean to indicate sure vs possible alignments).
|
||||
|
||||
>>> from nltk.translate import Alignment
|
||||
>>> a = Alignment([(0, 0), (0, 1), (1, 2), (2, 2)])
|
||||
>>> a.invert()
|
||||
Alignment([(0, 0), (1, 0), (2, 1), (2, 2)])
|
||||
>>> print(a.invert())
|
||||
0-0 1-0 2-1 2-2
|
||||
>>> a[0]
|
||||
[(0, 1), (0, 0)]
|
||||
>>> a.invert()[2]
|
||||
[(2, 1), (2, 2)]
|
||||
>>> b = Alignment([(0, 0), (0, 1)])
|
||||
>>> b.issubset(a)
|
||||
True
|
||||
>>> c = Alignment.fromstring('0-0 0-1')
|
||||
>>> b == c
|
||||
True
|
||||
"""
|
||||
|
||||
def __new__(cls, pairs):
|
||||
self = frozenset.__new__(cls, pairs)
|
||||
self._len = max(p[0] for p in self) if self != frozenset([]) else 0
|
||||
self._index = None
|
||||
return self
|
||||
|
||||
@classmethod
|
||||
def fromstring(cls, s):
|
||||
"""
|
||||
Read a giza-formatted string and return an Alignment object.
|
||||
|
||||
>>> Alignment.fromstring('0-0 2-1 9-2 21-3 10-4 7-5')
|
||||
Alignment([(0, 0), (2, 1), (7, 5), (9, 2), (10, 4), (21, 3)])
|
||||
|
||||
:type s: str
|
||||
:param s: the positional alignments in giza format
|
||||
:rtype: Alignment
|
||||
:return: An Alignment object corresponding to the string representation ``s``.
|
||||
"""
|
||||
|
||||
return Alignment([_giza2pair(a) for a in s.split()])
|
||||
|
||||
def __getitem__(self, key):
|
||||
"""
|
||||
Look up the alignments that map from a given index or slice.
|
||||
"""
|
||||
if not self._index:
|
||||
self._build_index()
|
||||
return self._index.__getitem__(key)
|
||||
|
||||
def invert(self):
|
||||
"""
|
||||
Return an Alignment object, being the inverted mapping.
|
||||
"""
|
||||
return Alignment(((p[1], p[0]) + p[2:]) for p in self)
|
||||
|
||||
def range(self, positions=None):
|
||||
"""
|
||||
Work out the range of the mapping from the given positions.
|
||||
If no positions are specified, compute the range of the entire mapping.
|
||||
"""
|
||||
image = set()
|
||||
if not self._index:
|
||||
self._build_index()
|
||||
if not positions:
|
||||
positions = list(range(len(self._index)))
|
||||
for p in positions:
|
||||
image.update(f for _, f in self._index[p])
|
||||
return sorted(image)
|
||||
|
||||
def __repr__(self):
|
||||
"""
|
||||
Produce a Giza-formatted string representing the alignment.
|
||||
"""
|
||||
return "Alignment(%r)" % sorted(self)
|
||||
|
||||
def __str__(self):
|
||||
"""
|
||||
Produce a Giza-formatted string representing the alignment.
|
||||
"""
|
||||
return " ".join("%d-%d" % p[:2] for p in sorted(self))
|
||||
|
||||
def _build_index(self):
|
||||
"""
|
||||
Build a list self._index such that self._index[i] is a list
|
||||
of the alignments originating from word i.
|
||||
"""
|
||||
self._index = [[] for _ in range(self._len + 1)]
|
||||
for p in self:
|
||||
self._index[p[0]].append(p)
|
||||
|
||||
|
||||
def _giza2pair(pair_string):
|
||||
i, j = pair_string.split("-")
|
||||
return int(i), int(j)
|
||||
|
||||
|
||||
def _naacl2pair(pair_string):
|
||||
i, j, p = pair_string.split("-")
|
||||
return int(i), int(j)
|
||||
|
||||
|
||||
def _check_alignment(num_words, num_mots, alignment):
|
||||
"""
|
||||
Check whether the alignments are legal.
|
||||
|
||||
:param num_words: the number of source language words
|
||||
:type num_words: int
|
||||
:param num_mots: the number of target language words
|
||||
:type num_mots: int
|
||||
:param alignment: alignment to be checked
|
||||
:type alignment: Alignment
|
||||
:raise IndexError: if alignment falls outside the sentence
|
||||
"""
|
||||
|
||||
assert type(alignment) is Alignment
|
||||
|
||||
if not all(0 <= pair[0] < num_words for pair in alignment):
|
||||
raise IndexError("Alignment is outside boundary of words")
|
||||
if not all(pair[1] is None or 0 <= pair[1] < num_mots for pair in alignment):
|
||||
raise IndexError("Alignment is outside boundary of mots")
|
||||
|
||||
|
||||
PhraseTableEntry = namedtuple("PhraseTableEntry", ["trg_phrase", "log_prob"])
|
||||
|
||||
|
||||
class PhraseTable:
|
||||
"""
|
||||
In-memory store of translations for a given phrase, and the log
|
||||
probability of the those translations
|
||||
"""
|
||||
|
||||
    def __init__(self):
        self.src_phrases = dict()

    def translations_for(self, src_phrase):
        """
        Get the translations for a source language phrase.

        :param src_phrase: Source language phrase of interest
        :type src_phrase: tuple(str)

        :return: A list of target language phrases that are translations
            of ``src_phrase``, ordered in decreasing order of
            likelihood. Each list element is a tuple of the target
            phrase and its log probability.
        :rtype: list(PhraseTableEntry)
        """
        return self.src_phrases[src_phrase]

    def add(self, src_phrase, trg_phrase, log_prob):
        """
        :type src_phrase: tuple(str)
        :type trg_phrase: tuple(str)

        :param log_prob: Log probability that given ``src_phrase``,
            ``trg_phrase`` is its translation
        :type log_prob: float
        """
        entry = PhraseTableEntry(trg_phrase=trg_phrase, log_prob=log_prob)
        if src_phrase not in self.src_phrases:
            self.src_phrases[src_phrase] = []
        self.src_phrases[src_phrase].append(entry)
        self.src_phrases[src_phrase].sort(key=lambda e: e.log_prob, reverse=True)

    def __contains__(self, src_phrase):
        return src_phrase in self.src_phrases
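
A minimal usage sketch for the phrase table above (phrases and log
probabilities are illustrative values only):

    table = PhraseTable()
    table.add(("das", "Haus"), ("the", "house"), log_prob=-0.2)
    table.add(("das", "Haus"), ("the", "home"), log_prob=-1.6)
    ("das", "Haus") in table  # True, via __contains__
    table.translations_for(("das", "Haus"))
    # [PhraseTableEntry(trg_phrase=('the', 'house'), log_prob=-0.2),
    #  PhraseTableEntry(trg_phrase=('the', 'home'), log_prob=-1.6)]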
Backend/venv/lib/python3.12/site-packages/nltk/translate/bleu_score.py (new file, 714 lines)
@@ -0,0 +1,714 @@
# Natural Language Toolkit: BLEU Score
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""BLEU score implementation."""
import math
import sys
import warnings
from collections import Counter
from fractions import Fraction as _Fraction

from nltk.util import ngrams


class Fraction(_Fraction):
    """Fraction with _normalize=False support for 3.12"""

    def __new__(cls, numerator=0, denominator=None, _normalize=False):
        if sys.version_info >= (3, 12):
            self = super().__new__(cls, numerator, denominator)
        else:
            self = super().__new__(cls, numerator, denominator, _normalize=_normalize)
        self._normalize = _normalize
        self._original_numerator = numerator
        self._original_denominator = denominator
        return self

    @property
    def numerator(self):
        if not self._normalize:
            return self._original_numerator
        return super().numerator

    @property
    def denominator(self):
        if not self._normalize:
            return self._original_denominator
        return super().denominator

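
Why this shim exists: corpus_bleu() below sums the raw numerators and
denominators of each segment's precision before dividing, so the counts must
survive un-reduced. A quick illustration with toy values:

    assert Fraction(2, 4, _normalize=False).numerator == 2  # kept as 2/4
    assert Fraction(2, 4, _normalize=True).numerator == 1   # reduced to 1/2
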
def sentence_bleu(
    references,
    hypothesis,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
):
    """
    Calculate BLEU score (Bilingual Evaluation Understudy) from
    Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
    "BLEU: a method for automatic evaluation of machine translation."
    In Proceedings of ACL. https://www.aclweb.org/anthology/P02-1040.pdf

    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...                'ensures', 'that', 'the', 'military', 'always',
    ...                'obeys', 'the', 'commands', 'of', 'the', 'party']

    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...                'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...                'that', 'party', 'direct']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...               'heed', 'Party', 'commands']

    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']

    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']

    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
    0.5045...

    If there is no n-gram overlap for any order of n-grams, BLEU returns the
    value 0. This is because the precision for the order of n-grams without
    overlap is 0, and the geometric mean in the final BLEU score computation
    multiplies the 0 with the precision of other n-grams. This results in 0
    (independently of the precision of the other n-gram orders). The following
    example has zero 3-gram and 4-gram overlaps:

    >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2), 4) # doctest: +ELLIPSIS
    0.0

    To avoid this harsh behaviour when no ngram overlaps are found, a smoothing
    function can be used.

    >>> chencherry = SmoothingFunction()
    >>> sentence_bleu([reference1, reference2, reference3], hypothesis2,
    ...     smoothing_function=chencherry.method1) # doctest: +ELLIPSIS
    0.0370...

    The default BLEU calculates a score for up to 4-grams using uniform
    weights (this is called BLEU-4). To evaluate your translations with
    higher/lower order ngrams, use customized weights. E.g. when accounting
    for up to 5-grams with uniform weights (this is called BLEU-5) use:

    >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.)
    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
    0.3920...

    Multiple BLEU scores can be computed at once, by supplying a list of weights.
    E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use:

    >>> weights = [
    ...     (1./2., 1./2.),
    ...     (1./3., 1./3., 1./3.),
    ...     (1./4., 1./4., 1./4., 1./4.)
    ... ]
    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
    [0.7453..., 0.6240..., 0.5045...]

    :param references: reference sentences
    :type references: list(list(str))
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights)
    :type weights: tuple(float) / list(tuple(float))
    :param smoothing_function:
    :type smoothing_function: SmoothingFunction
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :return: The sentence-level BLEU score. Returns a list if multiple weights were supplied.
    :rtype: float / list(float)
    """
    return corpus_bleu(
        [references], [hypothesis], weights, smoothing_function, auto_reweigh
    )


def corpus_bleu(
    list_of_references,
    hypotheses,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
):
    """
    Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
    the hypotheses and their respective references.

    Instead of averaging the sentence level BLEU scores (i.e. macro-average
    precision), the original BLEU metric (Papineni et al. 2002) accounts for
    the micro-average precision (i.e. summing the numerators and denominators
    for each hypothesis-reference(s) pair before the division).

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
    0.5920...

    The example below shows that corpus_bleu() is different from averaging
    sentence_bleu() for hypotheses:

    >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
    >>> score2 = sentence_bleu([ref2a], hyp2)
    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
    0.6223...

    Custom weights may be supplied to fine-tune the BLEU score further.
    A tuple of float weights for unigrams, bigrams, trigrams and so on can be given.

    >>> weights = (0.1, 0.3, 0.5, 0.1)
    >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
    0.5818...

    This particular weight gave extra value to trigrams.
    Furthermore, multiple weights can be given, resulting in multiple BLEU scores.

    >>> weights = [
    ...     (0.5, 0.5),
    ...     (0.333, 0.333, 0.334),
    ...     (0.25, 0.25, 0.25, 0.25),
    ...     (0.2, 0.2, 0.2, 0.2, 0.2)
    ... ]
    >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
    [0.8242..., 0.7067..., 0.5920..., 0.4719...]

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights)
    :type weights: tuple(float) / list(tuple(float))
    :param smoothing_function:
    :type smoothing_function: SmoothingFunction
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :return: The corpus-level BLEU score.
    :rtype: float
    """
    # Before proceeding to compute BLEU, perform sanity checks.

    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), (
        "The number of hypotheses and their reference(s) should be the same"
    )

    try:
        weights[0][0]
    except TypeError:
        weights = [weights]
    max_weight_length = max(len(weight) for weight in weights)

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i in range(1, max_weight_length + 1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Collects the various precision values for the different ngram orders.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i in range(1, max_weight_length + 1)
    ]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0 if len(weights) == 1 else [0] * len(weights)

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    # it tries to retain the Fraction object as much as the
    # smoothing method allows.
    p_n = smoothing_function(
        p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
    )

    bleu_scores = []
    for weight in weights:
        # Uniformly re-weighting based on maximum hypothesis lengths if largest
        # order of n-grams < 4 and weights is set at default.
        if auto_reweigh:
            if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25):
                weight = (1 / hyp_lengths,) * hyp_lengths

        s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0)
        s = bp * math.exp(math.fsum(s))
        bleu_scores.append(s)
    return bleu_scores[0] if len(weights) == 1 else bleu_scores

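
For a single weight vector, the final loop above is the textbook BLEU
combination BLEU = BP * exp(sum_n w_n * log p_n). A sketch of just that step,
with toy brevity-penalty and precision values (not taken from any corpus):

    import math

    bp, ws, p_n_vals = 0.92, (0.25, 0.25, 0.25, 0.25), [0.9, 0.6, 0.4, 0.2]
    bleu_val = bp * math.exp(math.fsum(w * math.log(p) for w, p in zip(ws, p_n_vals)))
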
def modified_precision(references, hypothesis, n):
    """
    Calculate modified ngram precision.

    The normal precision method may lead to some wrong translations with
    high precision, e.g., a translation in which a word of the reference
    repeats several times has very high precision.

    This function only returns the Fraction object that contains the numerator
    and denominator necessary to calculate the corpus-level precision.
    To calculate the modified precision for a single pair of hypothesis and
    references, cast the Fraction object into a float.

    The famous "the the the ... " example shows that you can get BLEU precision
    by duplicating high frequency words.

    >>> reference1 = 'the cat is on the mat'.split()
    >>> reference2 = 'there is a cat on the mat'.split()
    >>> hypothesis1 = 'the the the the the the the'.split()
    >>> references = [reference1, reference2]
    >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
    0.2857...

    In the modified n-gram precision, a reference word will be considered
    exhausted after a matching hypothesis word is identified, e.g.

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will',
    ...               'forever', 'heed', 'Party', 'commands']
    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']
    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']
    >>> hypothesis = 'of the'.split()
    >>> references = [reference1, reference2, reference3]
    >>> float(modified_precision(references, hypothesis, n=1))
    1.0
    >>> float(modified_precision(references, hypothesis, n=2))
    1.0

    An example of a normal machine translation hypothesis:

    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...                'ensures', 'that', 'the', 'military', 'always',
    ...                'obeys', 'the', 'commands', 'of', 'the', 'party']

    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...                'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...                'that', 'party', 'direct']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will',
    ...               'forever', 'heed', 'Party', 'commands']

    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']

    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']
    >>> references = [reference1, reference2, reference3]
    >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
    0.9444...
    >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS
    0.5714...
    >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS
    0.5882352941176471
    >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
    0.07692...


    :param references: A list of reference translations.
    :type references: list(list(str))
    :param hypothesis: A hypothesis translation.
    :type hypothesis: list(str)
    :param n: The ngram order.
    :type n: int
    :return: BLEU's modified precision for the nth order ngram.
    :rtype: Fraction
    """
    # Extracts all ngrams in hypothesis
    # Set an empty Counter if hypothesis is empty.
    counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
    # Extract a union of references' counts.
    # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
    max_counts = {}
    for reference in references:
        reference_counts = (
            Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
        )
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

    # Assigns the intersection between hypothesis and references' counts.
    clipped_counts = {
        ngram: min(count, max_counts[ngram]) for ngram, count in counts.items()
    }

    numerator = sum(clipped_counts.values())
    # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
    # Usually this happens when the ngram order is > len(reference).
    denominator = max(1, sum(counts.values()))

    return Fraction(numerator, denominator, _normalize=False)

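
Tracing the "the the the ..." doctest above through this code: the hypothesis
contributes a unigram count of 7 for ('the',), the best reference allows at
most 2, so the clipped numerator is 2 and the precision is 2/7 (the 0.2857...
shown earlier):

    counts = {("the",): 7}
    max_counts = {("the",): 2}
    clipped = min(counts[("the",)], max_counts[("the",)])  # 2 -> precision 2/7
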
def closest_ref_length(references, hyp_len):
    """
    This function finds the reference that is the closest length to the
    hypothesis. The closest reference length is referred to as the *r* variable
    from the brevity penalty formula in Papineni et al. (2002).

    :param references: A list of reference translations.
    :type references: list(list(str))
    :param hyp_len: The length of the hypothesis.
    :type hyp_len: int
    :return: The length of the reference that's closest to the hypothesis.
    :rtype: int
    """
    ref_lens = (len(reference) for reference in references)
    closest_ref_len = min(
        ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len)
    )
    return closest_ref_len


def brevity_penalty(closest_ref_len, hyp_len):
    """
    Calculate brevity penalty.

    As the modified n-gram precision still has the problem from the short
    length sentence, brevity penalty is used to modify the overall BLEU
    score according to length.

    An example from the paper. There are three references with length 12, 15
    and 17. And a concise hypothesis of the length 12. The brevity penalty is 1.

    >>> reference1 = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
    >>> reference2 = list('aaaaaaaaaaaaaaa')   # i.e. ['a'] * 15
    >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17
    >>> hypothesis = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
    >>> references = [reference1, reference2, reference3]
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len)
    1.0

    In case a hypothesis translation is shorter than the references, penalty is
    applied.

    >>> references = [['a'] * 28, ['a'] * 28]
    >>> hypothesis = ['a'] * 12
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len)
    0.2635971381157267

    The length of the closest reference is used to compute the penalty. If the
    length of a hypothesis is 12, and the reference lengths are 13 and 2, the
    penalty is applied because the hypothesis length (12) is less than the
    closest reference length (13).

    >>> references = [['a'] * 13, ['a'] * 2]
    >>> hypothesis = ['a'] * 12
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
    0.9200...

    The brevity penalty doesn't depend on reference order. More importantly,
    when two reference sentences are at the same distance, the shortest
    reference sentence length is used.

    >>> references = [['a'] * 13, ['a'] * 11]
    >>> hypothesis = ['a'] * 12
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> bp1 = brevity_penalty(closest_ref_len, hyp_len)
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len)
    >>> bp2 = brevity_penalty(closest_ref_len, hyp_len)
    >>> bp1 == bp2 == 1
    True

    A test example from mteval-v13a.pl (starting from line 705):

    >>> references = [['a'] * 11, ['a'] * 8]
    >>> hypothesis = ['a'] * 7
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
    0.8668...

    >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
    >>> hypothesis = ['a'] * 7
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len)
    1.0

    :param hyp_len: The length of the hypothesis for a single sentence OR the
        sum of all the hypotheses' lengths for a corpus
    :type hyp_len: int
    :param closest_ref_len: The length of the closest reference for a single
        hypothesis OR the sum of all the closest references for every hypotheses.
    :type closest_ref_len: int
    :return: BLEU's brevity penalty.
    :rtype: float
    """
    if hyp_len > closest_ref_len:
        return 1
    # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0
    elif hyp_len == 0:
        return 0
    else:
        return math.exp(1 - closest_ref_len / hyp_len)

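
In formula form, with c the hypothesis length and r the closest reference
length, the function above computes BP = 1 if c > r, 0 if c == 0, and
exp(1 - r/c) otherwise. The 0.2635... doctest is exactly exp(1 - 28/12):

    import math

    math.exp(1 - 28 / 12)  # 0.2635971381157267, matching the doctest
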
class SmoothingFunction:
    """
    This is an implementation of the smoothing techniques
    for segment-level BLEU scores that was presented in
    Boxing Chen and Colin Cherry (2014) A Systematic Comparison of
    Smoothing Techniques for Sentence-Level BLEU. In WMT14.
    http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
    """

    def __init__(self, epsilon=0.1, alpha=5, k=5):
        """
        This will initialize the parameters required for the various smoothing
        techniques; the default values are set to the numbers used in the
        experiments from Chen and Cherry (2014).

        >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
        ...                'that', 'the', 'military', 'always', 'obeys', 'the',
        ...                'commands', 'of', 'the', 'party']
        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
        ...               'that', 'the', 'military', 'will', 'forever', 'heed',
        ...               'Party', 'commands']

        >>> chencherry = SmoothingFunction()
        >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
        0.4452...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
        0.4905...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
        0.4135...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
        0.4905...

        :param epsilon: the epsilon value used in method 1
        :type epsilon: float
        :param alpha: the alpha value used in method 6
        :type alpha: int
        :param k: the k value used in method 4
        :type k: int
        """
        self.epsilon = epsilon
        self.alpha = alpha
        self.k = k

    def method0(self, p_n, *args, **kwargs):
        """
        No smoothing.
        """
        p_n_new = []
        for i, p_i in enumerate(p_n):
            if p_i.numerator != 0:
                p_n_new.append(p_i)
            else:
                _msg = str(
                    "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
                    "Therefore the BLEU score evaluates to 0, independently of\n"
                    "how many N-gram overlaps of lower order it contains.\n"
                    "Consider using lower n-gram order or use "
                    "SmoothingFunction()"
                ).format(i + 1)
                warnings.warn(_msg)
                # When numerator==0 (whether or not denominator==0), the
                # precision score for this order is 0 or undefined. Because
                # BLEU's geometric mean is computed in logarithm space, we
                # return sys.float_info.min instead, so that
                # math.log(sys.float_info.min) yields an effectively-zero
                # precision score.
                p_n_new.append(sys.float_info.min)
        return p_n_new

    def method1(self, p_n, *args, **kwargs):
        """
        Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
        """
        return [
            (
                (p_i.numerator + self.epsilon) / p_i.denominator
                if p_i.numerator == 0
                else p_i
            )
            for p_i in p_n
        ]

    def method2(self, p_n, *args, **kwargs):
        """
        Smoothing method 2: Add 1 to both numerator and denominator from
        Chin-Yew Lin and Franz Josef Och (2004) ORANGE: a Method for
        Evaluating Automatic Evaluation Metrics for Machine Translation.
        In COLING 2004.
        """
        return [
            (
                Fraction(p_n[i].numerator + 1, p_n[i].denominator + 1, _normalize=False)
                if i != 0
                else p_n[0]
            )
            for i in range(len(p_n))
        ]

    def method3(self, p_n, *args, **kwargs):
        """
        Smoothing method 3: NIST geometric sequence smoothing
        The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
        precision score whose matching n-gram count is null.
        k is 1 for the first 'n' value for which the n-gram match count is null.

        For example, if the text contains:

        - one 2-gram match
        - and (consequently) two 1-gram matches

        the n-gram count for each individual precision score would be:

        - n=1  =>  prec_count = 2     (two unigrams)
        - n=2  =>  prec_count = 1     (one bigram)
        - n=3  =>  prec_count = 1/2   (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
        - n=4  =>  prec_count = 1/4   (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
        """
        incvnt = 1  # From the mteval-v13a.pl, it's referred to as k.
        for i, p_i in enumerate(p_n):
            if p_i.numerator == 0:
                p_n[i] = 1 / (2**incvnt * p_i.denominator)
                incvnt += 1
        return p_n

    def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 4:
        Shorter translations may have inflated precision values due to having
        smaller denominators; therefore, we give them proportionally
        smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
        suggest dividing by 1/ln(len(T)), where T is the length of the translation.
        """
        incvnt = 1
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        for i, p_i in enumerate(p_n):
            if p_i.numerator == 0 and hyp_len > 1:
                # incvnt = i + 1 * self.k / math.log(
                #     hyp_len
                # )  # Note that this K is different from the K from NIST.
                # p_n[i] = incvnt / p_i.denominator
                numerator = 1 / (2**incvnt * self.k / math.log(hyp_len))
                p_n[i] = numerator / p_i.denominator
                incvnt += 1
        return p_n

    def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 5:
        The matched counts for similar values of n should be similar. To
        calculate the n-gram matched count, it averages the n−1, n and n+1 gram
        matched counts.
        """
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        m = {}
        # Requires a precision value for an additional ngram order.
        p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
        m[-1] = p_n[0] + 1
        for i, p_i in enumerate(p_n):
            p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3
            m[i] = p_n[i]
        return p_n

    def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 6:
        Interpolates the maximum likelihood estimate of the precision *p_n* with
        a prior estimate *pi0*. The prior is estimated by assuming that the ratio
        between pn and pn−1 will be the same as that between pn−1 and pn−2; from
        Gao and He (2013) Training MRF-Based Phrase Translation Models using
        Gradient Ascent. In NAACL.
        """
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        # This smoothing only works when p_1 and p_2 are non-zero.
        # Raise an error with an appropriate message when the input is too short
        # to use this smoothing technique.
        assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
        for i, p_i in enumerate(p_n):
            if i in [0, 1]:  # Skips the first 2 orders of ngrams.
                continue
            else:
                pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
                # No. of ngrams in translation that matches the reference.
                m = p_i.numerator
                # No. of ngrams in translation.
                l = sum(1 for _ in ngrams(hypothesis, i + 1))
                # Calculates the interpolated precision.
                p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
        return p_n

    def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 7:
        Interpolates methods 4 and 5.
        """
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        p_n = self.method4(p_n, references, hypothesis, hyp_len)
        p_n = self.method5(p_n, references, hypothesis, hyp_len)
        return p_n
Backend/venv/lib/python3.12/site-packages/nltk/translate/chrf_score.py (new file, 221 lines)
@@ -0,0 +1,221 @@
# Natural Language Toolkit: ChrF score
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Maja Popovic
# Contributors: Liling Tan, Aleš Tamchyna (Memsource)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

""" ChrF score implementation """
import re
from collections import Counter, defaultdict

from nltk.util import ngrams


def sentence_chrf(
    reference, hypothesis, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
):
    """
    Calculates the sentence level CHRF (Character n-gram F-score) described in
    - Maja Popovic. 2015. CHRF: Character n-gram F-score for Automatic MT Evaluation.
      In Proceedings of the 10th Workshop on Machine Translation.
      https://www.statmt.org/wmt15/pdf/WMT49.pdf
    - Maja Popovic. 2016. CHRF Deconstructed: β Parameters and n-gram Weights.
      In Proceedings of the 1st Conference on Machine Translation.
      https://www.statmt.org/wmt16/pdf/W16-2341.pdf

    This implementation of CHRF only supports a single reference at the moment.

    For details not reported in the paper, consult Maja Popovic's original
    implementation: https://github.com/m-popovic/chrF

    The code should output results equivalent to running CHRF++ with the
    following options: -nw 0 -b 3

    An example from the original BLEU paper
    https://www.aclweb.org/anthology/P02-1040.pdf

    >>> ref1 = str('It is a guide to action that ensures that the military '
    ...            'will forever heed Party commands').split()
    >>> hyp1 = str('It is a guide to action which ensures that the military '
    ...            'always obeys the commands of the party').split()
    >>> hyp2 = str('It is to insure the troops forever hearing the activity '
    ...            'guidebook that party direct').split()
    >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
    0.6349...
    >>> sentence_chrf(ref1, hyp2) # doctest: +ELLIPSIS
    0.3330...

    The infamous "the the the ... " example

    >>> ref = 'the cat is on the mat'.split()
    >>> hyp = 'the the the the the the the'.split()
    >>> sentence_chrf(ref, hyp) # doctest: +ELLIPSIS
    0.1468...

    An example to show that this function allows users to use strings instead of
    tokens, i.e. list(str) as inputs.

    >>> ref1 = str('It is a guide to action that ensures that the military '
    ...            'will forever heed Party commands')
    >>> hyp1 = str('It is a guide to action which ensures that the military '
    ...            'always obeys the commands of the party')
    >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
    0.6349...
    >>> type(ref1) == type(hyp1) == str
    True
    >>> sentence_chrf(ref1.split(), hyp1.split()) # doctest: +ELLIPSIS
    0.6349...

    To skip the unigrams and only use 2- to 3-grams:

    >>> sentence_chrf(ref1, hyp1, min_len=2, max_len=3) # doctest: +ELLIPSIS
    0.6617...

    :param reference: reference sentence
    :type reference: list(str) / str
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str) / str
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :param beta: the parameter to assign more importance to recall over precision
    :type beta: float
    :param ignore_whitespace: ignore whitespace characters in scoring
    :type ignore_whitespace: bool
    :return: the sentence level CHRF score.
    :rtype: float
    """
    return corpus_chrf(
        [reference],
        [hypothesis],
        min_len,
        max_len,
        beta=beta,
        ignore_whitespace=ignore_whitespace,
    )


def _preprocess(sent, ignore_whitespace):
    if type(sent) != str:
        # turn list of tokens into a string
        sent = " ".join(sent)

    if ignore_whitespace:
        sent = re.sub(r"\s+", "", sent)
    return sent

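
With ignore_whitespace=True (the default), scoring runs over a whitespace-free
character stream. A quick check of the helper on toy inputs:

    _preprocess(["the", "cat"], True)   # 'thecat'  (tokens joined, spaces dropped)
    _preprocess("the cat", False)       # 'the cat' (string kept as-is)
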
def chrf_precision_recall_fscore_support(
    reference, hypothesis, n, beta=3.0, epsilon=1e-16
):
    """
    This function computes the precision, recall and fscore from the ngram
    overlaps. It returns the `support` which is the true positive score.

    By underspecifying the input type, the function is agnostic as to how
    it computes the ngrams and simply takes whichever elements are in the
    list; they could be either tokens or characters.

    :param reference: The reference sentence.
    :type reference: list
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: list
    :param n: Extract up to the n-th order ngrams
    :type n: int
    :param beta: The parameter to assign more importance to recall over precision.
    :type beta: float
    :param epsilon: The fallback value if the hypothesis or reference is empty.
    :type epsilon: float
    :return: Returns the precision, recall and f-score and support (true positive).
    :rtype: tuple(float)
    """
    ref_ngrams = Counter(ngrams(reference, n))
    hyp_ngrams = Counter(ngrams(hypothesis, n))

    # calculate the number of ngram matches
    overlap_ngrams = ref_ngrams & hyp_ngrams
    tp = sum(overlap_ngrams.values())  # True positives.
    tpfp = sum(hyp_ngrams.values())  # True positives + False positives.
    tpfn = sum(ref_ngrams.values())  # True positives + False negatives.

    try:
        prec = tp / tpfp  # precision
        rec = tp / tpfn  # recall
        factor = beta**2
        fscore = (1 + factor) * (prec * rec) / (factor * prec + rec)
    except ZeroDivisionError:
        prec = rec = fscore = epsilon
    return prec, rec, fscore, tp

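
The fscore line above is the standard F_beta: (1 + beta^2) * P * R /
(beta^2 * P + R). With CHRF's default beta = 3, recall is weighted nine times
as heavily as precision; a toy check with assumed values P = 0.5, R = 1.0:

    prec, rec, beta = 0.5, 1.0, 3.0
    (1 + beta**2) * prec * rec / (beta**2 * prec + rec)  # 0.909..., recall-dominated
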
def corpus_chrf(
    references, hypotheses, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
):
    """
    Calculates the corpus level CHRF (Character n-gram F-score); it is the
    macro-averaged value of the sentence/segment level CHRF score.

    This implementation of CHRF only supports a single reference at the moment.

    >>> ref1 = str('It is a guide to action that ensures that the military '
    ...            'will forever heed Party commands').split()
    >>> ref2 = str('It is the guiding principle which guarantees the military '
    ...            'forces always being under the command of the Party').split()
    >>>
    >>> hyp1 = str('It is a guide to action which ensures that the military '
    ...            'always obeys the commands of the party').split()
    >>> hyp2 = str('It is to insure the troops forever hearing the activity '
    ...            'guidebook that party direct')
    >>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS
    0.3910...

    :param references: a corpus of list of reference sentences, w.r.t. hypotheses
    :type references: list(list(str))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :param beta: the parameter to assign more importance to recall over precision
    :type beta: float
    :param ignore_whitespace: ignore whitespace characters in scoring
    :type ignore_whitespace: bool
    :return: the corpus level CHRF score.
    :rtype: float
    """

    assert len(references) == len(
        hypotheses
    ), "The number of hypotheses and their references should be the same"
    num_sents = len(hypotheses)

    # Keep f-scores for each n-gram order separate
    ngram_fscores = defaultdict(list)

    # Iterate through each hypothesis and their corresponding references.
    for reference, hypothesis in zip(references, hypotheses):
        # preprocess both reference and hypothesis
        reference = _preprocess(reference, ignore_whitespace)
        hypothesis = _preprocess(hypothesis, ignore_whitespace)

        # Calculate f-scores for each sentence and for each n-gram order
        # separately.
        for n in range(min_len, max_len + 1):
            # Compute the precision, recall, fscore and support.
            prec, rec, fscore, tp = chrf_precision_recall_fscore_support(
                reference, hypothesis, n, beta=beta
            )
            ngram_fscores[n].append(fscore)

    # how many n-gram sizes
    num_ngram_sizes = len(ngram_fscores)

    # sum of f-scores over all sentences for each n-gram order
    total_scores = [sum(fscores) for n, fscores in ngram_fscores.items()]

    # macro-average over n-gram orders and over all sentences
    return (sum(total_scores) / num_ngram_sizes) / num_sents
Backend/venv/lib/python3.12/site-packages/nltk/translate/gale_church.py (new file, 263 lines)
@@ -0,0 +1,263 @@
# Natural Language Toolkit: Gale-Church Aligner
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Torsten Marek <marek@ifi.uzh.ch>
# Contributor: Cassidy Laidlaw, Liling Tan
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""

A port of the Gale-Church Aligner.

Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora.
https://aclweb.org/anthology/J93-1004.pdf

"""

import math

try:
    from norm import logsf as norm_logsf
    from scipy.stats import norm
except ImportError:

    def erfcc(x):
        """Complementary error function."""
        z = abs(x)
        t = 1 / (1 + 0.5 * z)
        r = t * math.exp(
            -z * z
            - 1.26551223
            + t
            * (
                1.00002368
                + t
                * (
                    0.37409196
                    + t
                    * (
                        0.09678418
                        + t
                        * (
                            -0.18628806
                            + t
                            * (
                                0.27886807
                                + t
                                * (
                                    -1.13520398
                                    + t
                                    * (1.48851587 + t * (-0.82215223 + t * 0.17087277))
                                )
                            )
                        )
                    )
                )
            )
        )
        if x >= 0.0:
            return r
        else:
            return 2.0 - r

    def norm_cdf(x):
        """Return the area under the normal distribution from M{-∞..x}."""
        return 1 - 0.5 * erfcc(x / math.sqrt(2))

    def norm_logsf(x):
        try:
            return math.log(1 - norm_cdf(x))
        except ValueError:
            return float("-inf")

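
The fallback above approximates scipy's norm.logsf (the log survival function
of the standard normal) via the Numerical-Recipes-style erfcc polynomial, so
it is only accurate to roughly 1e-7. A sanity check under these definitions:

    import math

    abs(norm_logsf(0) - math.log(0.5)) < 1e-6  # survival at the mean is 1/2
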
LOG2 = math.log(2)


class LanguageIndependent:
    # These are the language-independent probabilities and parameters
    # given in Gale & Church

    # for the computation, l_1 is always the language with less characters
    PRIORS = {
        (1, 0): 0.0099,
        (0, 1): 0.0099,
        (1, 1): 0.89,
        (2, 1): 0.089,
        (1, 2): 0.089,
        (2, 2): 0.011,
    }

    AVERAGE_CHARACTERS = 1
    VARIANCE_CHARACTERS = 6.8


def trace(backlinks, source_sents_lens, target_sents_lens):
    """
    Traverse the alignment cost from the tracebacks and retrieve
    appropriate sentence pairs.

    :param backlinks: A dictionary where the key is the alignment points and value is the cost (referencing the LanguageIndependent.PRIORS)
    :type backlinks: dict
    :param source_sents_lens: A list of source sentences' lengths
    :type source_sents_lens: list(int)
    :param target_sents_lens: A list of target sentences' lengths
    :type target_sents_lens: list(int)
    """
    links = []
    position = (len(source_sents_lens), len(target_sents_lens))
    while position != (0, 0) and all(p >= 0 for p in position):
        try:
            s, t = backlinks[position]
        except TypeError:
            position = (position[0] - 1, position[1] - 1)
            continue
        for i in range(s):
            for j in range(t):
                links.append((position[0] - i - 1, position[1] - j - 1))
        position = (position[0] - s, position[1] - t)

    return links[::-1]


def align_log_prob(i, j, source_sents, target_sents, alignment, params):
|
||||
"""Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]}
|
||||
being aligned with a specific C{alignment}.
|
||||
|
||||
@param i: The offset of the source sentence.
|
||||
@param j: The offset of the target sentence.
|
||||
@param source_sents: The list of source sentence lengths.
|
||||
@param target_sents: The list of target sentence lengths.
|
||||
@param alignment: The alignment type, a tuple of two integers.
|
||||
@param params: The sentence alignment parameters.
|
||||
|
||||
@returns: The log probability of a specific alignment between the two sentences, given the parameters.
|
||||
"""
|
||||
l_s = sum(source_sents[i - offset - 1] for offset in range(alignment[0]))
|
||||
l_t = sum(target_sents[j - offset - 1] for offset in range(alignment[1]))
|
||||
try:
|
||||
# actually, the paper says l_s * params.VARIANCE_CHARACTERS, this is based on the C
|
||||
# reference implementation. With l_s in the denominator, insertions are impossible.
|
||||
m = (l_s + l_t / params.AVERAGE_CHARACTERS) / 2
|
||||
delta = (l_s * params.AVERAGE_CHARACTERS - l_t) / math.sqrt(
|
||||
m * params.VARIANCE_CHARACTERS
|
||||
)
|
||||
except ZeroDivisionError:
|
||||
return float("-inf")
|
||||
|
||||
return -(LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment]))
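
# Note on conventions: align_log_prob() returns a *negative* log probability,
# i.e. a cost, so the dynamic program in align_blocks() below minimizes the
# accumulated cost while keeping only the most recent rows of its table.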


def align_blocks(source_sents_lens, target_sents_lens, params=LanguageIndependent):
    """Return the sentence alignment of two text blocks (usually paragraphs).

    >>> align_blocks([5,5,5], [7,7,7])
    [(0, 0), (1, 1), (2, 2)]
    >>> align_blocks([10,5,5], [12,20])
    [(0, 0), (1, 1), (2, 1)]
    >>> align_blocks([12,20], [10,5,5])
    [(0, 0), (1, 1), (1, 2)]
    >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12])
    [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)]

    @param source_sents_lens: The list of source sentence lengths.
    @param target_sents_lens: The list of target sentence lengths.
    @param params: the sentence alignment parameters.
    @return: The sentence alignments, a list of index pairs.
    """

    alignment_types = list(params.PRIORS.keys())

    # there are always three rows in the history (with the last of them being filled)
    D = [[]]

    backlinks = {}

    for i in range(len(source_sents_lens) + 1):
        for j in range(len(target_sents_lens) + 1):
            min_dist = float("inf")
            min_align = None
            for a in alignment_types:
                prev_i = -1 - a[0]
                prev_j = j - a[1]
                if prev_i < -len(D) or prev_j < 0:
                    continue
                p = D[prev_i][prev_j] + align_log_prob(
                    i, j, source_sents_lens, target_sents_lens, a, params
                )
                if p < min_dist:
                    min_dist = p
                    min_align = a

            if min_dist == float("inf"):
                min_dist = 0

            backlinks[(i, j)] = min_align
            D[-1].append(min_dist)

        if len(D) > 2:
            D.pop(0)
        D.append([])

    return trace(backlinks, source_sents_lens, target_sents_lens)


def align_texts(source_blocks, target_blocks, params=LanguageIndependent):
    """Creates the sentence alignment of two texts.

    Texts can consist of several blocks. Block boundaries cannot be crossed by sentence
    alignment links.

    Each block consists of a list that contains the lengths (in characters) of the sentences
    in this block.

    @param source_blocks: The list of blocks in the source text.
    @param target_blocks: The list of blocks in the target text.
    @param params: the sentence alignment parameters.

    @returns: A list of sentence alignment lists
    """
    if len(source_blocks) != len(target_blocks):
        raise ValueError(
            "Source and target texts do not have the same number of blocks."
        )

    return [
        align_blocks(source_block, target_block, params)
        for source_block, target_block in zip(source_blocks, target_blocks)
    ]
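
# Illustrative usage (lengths taken from the align_blocks doctest above):
# align_texts([[10, 5, 5]], [[12, 20]]) returns [[(0, 0), (1, 1), (2, 1)]],
# one alignment list per block pair.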


# File I/O functions; may belong in a corpus reader


def split_at(it, split_value):
    """Splits an iterator C{it} at values of C{split_value}.

    Each instance of C{split_value} is swallowed. The iterator produces
    subiterators which need to be consumed fully before the next subiterator
    can be used.
    """

    def _chunk_iterator(first):
        v = first
        while v != split_value:
            yield v
            try:
                v = next(it)
            except StopIteration:
                # underlying iterator exhausted; end this chunk
                return

    while True:
        try:
            yield _chunk_iterator(next(it))
        except StopIteration:
            # no more chunks; Python 3 generators must return instead of
            # leaking StopIteration
            return


def parse_token_stream(stream, soft_delimiter, hard_delimiter):
    """Parses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens)
    and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function.
    """
    return [
        [
            sum(len(token) for token in sentence_it)
            for sentence_it in split_at(block_it, soft_delimiter)
        ]
        for block_it in split_at(stream, hard_delimiter)
    ]
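
# Illustrative sketch: with "." as the soft (sentence) delimiter and "|" as
# the hard (block) delimiter,
#     parse_token_stream(iter(["12345", ".", "123", ".", "|", "1234", "."]),
#                        ".", "|")
# yields [[5, 3], [4]]: per-sentence character counts, grouped by block.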
138
Backend/venv/lib/python3.12/site-packages/nltk/translate/gdfa.py
Normal file
@@ -0,0 +1,138 @@
# Natural Language Toolkit: GDFA word alignment symmetrization
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Liling Tan
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from collections import defaultdict


def grow_diag_final_and(srclen, trglen, e2f, f2e):
    """
    Symmetrizes the source-to-target and target-to-source word alignment
    outputs using the grow-diag-final-and (GDFA) heuristic (Koehn, 2005).

    Step 1: Find the intersection of the bidirectional alignment.

    Step 2: Search for additional neighbor alignment points to be added, given
            these criteria: (i) neighbor alignment points are not in the
            intersection and (ii) neighbor alignments are in the union.

    Step 3: Add all other alignment points that are not in the intersection, not in
            the neighboring alignments that met the criteria but in the original
            forward/backward alignment outputs.

    >>> forw = ('0-0 2-1 9-2 21-3 10-4 7-5 11-6 9-7 12-8 1-9 3-10 '
    ...         '4-11 17-12 17-13 25-14 13-15 24-16 11-17 28-18')
    >>> back = ('0-0 1-9 2-9 3-10 4-11 5-12 6-6 7-5 8-6 9-7 10-4 '
    ...         '11-6 12-8 13-12 15-12 17-13 18-13 19-12 20-13 '
    ...         '21-3 22-12 23-14 24-17 25-15 26-17 27-18 28-18')
    >>> srctext = ("この よう な ハロー 白色 わい 星 の L 関数 "
    ...            "は L と 共 に 不連続 に 増加 する こと が "
    ...            "期待 さ れる こと を 示し た 。")
    >>> trgtext = ("Therefore , we expect that the luminosity function "
    ...            "of such halo white dwarfs increases discontinuously "
    ...            "with the luminosity .")
    >>> srclen = len(srctext.split())
    >>> trglen = len(trgtext.split())
    >>>
    >>> gdfa = grow_diag_final_and(srclen, trglen, forw, back)
    >>> gdfa == sorted(set([(28, 18), (6, 6), (24, 17), (2, 1), (15, 12), (13, 12),
    ...         (2, 9), (3, 10), (26, 17), (25, 15), (8, 6), (9, 7), (20,
    ...         13), (18, 13), (0, 0), (10, 4), (13, 15), (23, 14), (7, 5),
    ...         (25, 14), (1, 9), (17, 13), (4, 11), (11, 17), (9, 2), (22,
    ...         12), (27, 18), (24, 16), (21, 3), (19, 12), (17, 12), (5,
    ...         12), (11, 6), (12, 8)]))
    True

    References:
    Koehn, P., A. Axelrod, A. Birch, C. Callison, M. Osborne, and D. Talbot.
    2005. Edinburgh System Description for the 2005 IWSLT Speech
    Translation Evaluation. In MT Eval Workshop.

    :type srclen: int
    :param srclen: the number of tokens in the source language
    :type trglen: int
    :param trglen: the number of tokens in the target language
    :type e2f: str
    :param e2f: the forward word alignment outputs from source-to-target
                language (in pharaoh output format)
    :type f2e: str
    :param f2e: the backward word alignment outputs from target-to-source
                language (in pharaoh output format)
    :rtype: list(tuple(int))
    :return: the symmetrized alignment points from the GDFA algorithm
    """

    # Converts pharaoh text format into list of tuples.
    e2f = [tuple(map(int, a.split("-"))) for a in e2f.split()]
    f2e = [tuple(map(int, a.split("-"))) for a in f2e.split()]

    neighbors = [(-1, 0), (0, -1), (1, 0), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1)]
    alignment = set(e2f).intersection(set(f2e))  # Find the intersection.
    union = set(e2f).union(set(f2e))

    # *aligned* is used to check if neighbors are aligned in grow_diag()
    aligned = defaultdict(set)
    for i, j in alignment:
        aligned["e"].add(i)
        aligned["f"].add(j)

    def grow_diag():
        """
        Search for the neighbor points and add them to the intersected
        alignment points if criteria are met.
        """
        prev_len = len(alignment) - 1
        # iterate until no new points added
        while prev_len < len(alignment):
            no_new_points = True
            # for english word e = 0 ... en
            for e in range(srclen):
                # for foreign word f = 0 ... fn
                for f in range(trglen):
                    # if ( e aligned with f)
                    if (e, f) in alignment:
                        # for each neighboring point (e-new, f-new)
                        for neighbor in neighbors:
                            neighbor = tuple(i + j for i, j in zip((e, f), neighbor))
                            e_new, f_new = neighbor
                            # if ( ( e-new not aligned and f-new not aligned)
                            # and (e-new, f-new in union(e2f, f2e) )
                            if (
                                e_new not in aligned and f_new not in aligned
                            ) and neighbor in union:
                                alignment.add(neighbor)
                                aligned["e"].add(e_new)
                                aligned["f"].add(f_new)
                                prev_len += 1
                                no_new_points = False
            # iterate until no new points added
            if no_new_points:
                break

    def final_and(a):
        """
        Adds remaining points that are not in the intersection, not in the
        neighboring alignments but in the original *e2f* and *f2e* alignments
        """
        # for english word e = 0 ... en
        for e_new in range(srclen):
            # for foreign word f = 0 ... fn
            for f_new in range(trglen):
                # if ( ( e-new not aligned and f-new not aligned)
                # and (e-new, f-new in union(e2f, f2e) )
                if (
                    e_new not in aligned
                    and f_new not in aligned
                    and (e_new, f_new) in union
                ):
                    alignment.add((e_new, f_new))
                    aligned["e"].add(e_new)
                    aligned["f"].add(f_new)

    grow_diag()
    final_and(e2f)
    final_and(f2e)
    return sorted(alignment)
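
# Minimal illustrative call (toy Pharaoh-format strings):
# grow_diag_final_and(2, 2, "0-0 1-1", "0-0") starts from the intersection
# {(0, 0)}, grows toward the union, and returns [(0, 0), (1, 1)].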
@@ -0,0 +1,190 @@
# Natural Language Toolkit: GLEU Score
#
# Copyright (C) 2001-2025 NLTK Project
# Authors:
# Contributors: Mike Schuster, Michael Wayne Goodman, Liling Tan
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""GLEU score implementation."""

from collections import Counter

from nltk.util import everygrams, ngrams


def sentence_gleu(references, hypothesis, min_len=1, max_len=4):
    """
    Calculates the sentence level GLEU (Google-BLEU) score described in

        Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi,
        Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey,
        Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser,
        Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens,
        George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith,
        Jason Riesa, Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes,
        Jeffrey Dean. (2016) Google’s Neural Machine Translation System:
        Bridging the Gap between Human and Machine Translation.
        eprint arXiv:1609.08144. https://arxiv.org/pdf/1609.08144v2.pdf
        Retrieved on 27 Oct 2016.

    From Wu et al. (2016):
        "The BLEU score has some undesirable properties when used for single
         sentences, as it was designed to be a corpus measure. We therefore
         use a slightly different score for our RL experiments which we call
         the 'GLEU score'. For the GLEU score, we record all sub-sequences of
         1, 2, 3 or 4 tokens in output and target sequence (n-grams). We then
         compute a recall, which is the ratio of the number of matching n-grams
         to the number of total n-grams in the target (ground truth) sequence,
         and a precision, which is the ratio of the number of matching n-grams
         to the number of total n-grams in the generated output sequence. Then
         GLEU score is simply the minimum of recall and precision. This GLEU
         score's range is always between 0 (no matches) and 1 (all match) and
         it is symmetrical when switching output and target. According to
         our experiments, GLEU score correlates quite well with the BLEU
         metric on a corpus level but does not have its drawbacks for our per
         sentence reward objective."

    Note: The initial implementation only allowed a single reference, but now
          a list of references is required (which is consistent with
          bleu_score.sentence_bleu()).

    The infamous "the the the ... " example

    >>> ref = 'the cat is on the mat'.split()
    >>> hyp = 'the the the the the the the'.split()
    >>> sentence_gleu([ref], hyp) # doctest: +ELLIPSIS
    0.0909...

    An example to evaluate normal machine translation outputs

    >>> ref1 = str('It is a guide to action that ensures that the military '
    ...            'will forever heed Party commands').split()
    >>> hyp1 = str('It is a guide to action which ensures that the military '
    ...            'always obeys the commands of the party').split()
    >>> hyp2 = str('It is to insure the troops forever hearing the activity '
    ...            'guidebook that party direct').split()
    >>> sentence_gleu([ref1], hyp1) # doctest: +ELLIPSIS
    0.4393...
    >>> sentence_gleu([ref1], hyp2) # doctest: +ELLIPSIS
    0.1206...

    :param references: a list of reference sentences
    :type references: list(list(str))
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :return: the sentence level GLEU score.
    :rtype: float
    """
    return corpus_gleu([references], [hypothesis], min_len=min_len, max_len=max_len)
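
# Worked numbers for the "the the the ..." doctest above: the hypothesis
# (7 tokens) contributes 7 + 6 + 5 + 4 = 22 n-grams of orders 1-4 and the
# reference (6 tokens) contributes 6 + 5 + 4 + 3 = 18; only two unigram
# "the" tokens match, so the score is 2 / max(22, 18) = 2 / 22 = 0.0909...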


def corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4):
    """
    Calculate a single corpus-level GLEU score (aka. system-level GLEU) for all
    the hypotheses and their respective references.

    Instead of averaging the sentence level GLEU scores (i.e. macro-average
    precision), Wu et al. (2016) sum up the matching tokens and the max of
    hypothesis and reference tokens for each sentence, then compute using the
    aggregate values.

    From Mike Schuster (via email):
        "For the corpus, we just add up the two statistics n_match and
         n_all = max(n_all_output, n_all_target) for all sentences, then
         calculate gleu_score = n_match / n_all, so it is not just a mean of
         the sentence gleu scores (in our case, longer sentences count more,
         which I think makes sense as they are more difficult to translate)."

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_gleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
    0.5673...

    The example below shows that corpus_gleu() is different from averaging
    sentence_gleu() for hypotheses

    >>> score1 = sentence_gleu([ref1a], hyp1)
    >>> score2 = sentence_gleu([ref2a], hyp2)
    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
    0.6144...

    :param list_of_references: a list of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :return: The corpus-level GLEU score.
    :rtype: float
    """
    # sanity check
    assert len(list_of_references) == len(
        hypotheses
    ), "The number of hypotheses and their reference(s) should be the same"

    # sum matches and max-token-lengths over all sentences
    corpus_n_match = 0
    corpus_n_all = 0

    for references, hypothesis in zip(list_of_references, hypotheses):
        hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
        tpfp = sum(hyp_ngrams.values())  # True positives + False positives.

        hyp_counts = []
        for reference in references:
            ref_ngrams = Counter(everygrams(reference, min_len, max_len))
            tpfn = sum(ref_ngrams.values())  # True positives + False negatives.

            overlap_ngrams = ref_ngrams & hyp_ngrams
            tp = sum(overlap_ngrams.values())  # True positives.

            # While GLEU is defined as the minimum of precision and
            # recall, we can reduce the number of division operations by one by
            # instead finding the maximum of the denominators for the precision
            # and recall formulae, since the numerators are the same:
            #     precision = tp / tpfp
            #     recall = tp / tpfn
            #     gleu_score = min(precision, recall) == tp / max(tpfp, tpfn)
            n_all = max(tpfp, tpfn)

            if n_all > 0:
                hyp_counts.append((tp, n_all))

        # use the reference yielding the highest score
        if hyp_counts:
            n_match, n_all = max(hyp_counts, key=lambda hc: hc[0] / hc[1])
            corpus_n_match += n_match
            corpus_n_all += n_all

    # corner case: empty corpus or empty references---don't divide by zero!
    if corpus_n_all == 0:
        gleu_score = 0.0
    else:
        gleu_score = corpus_n_match / corpus_n_all

    return gleu_score
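
# Aggregation sketch with hypothetical per-sentence statistics: given
# (n_match, n_all) pairs of (9, 12) and (5, 20), the corpus score is
# (9 + 5) / (12 + 20) = 0.4375, whereas the mean of the per-sentence scores
# would be (0.75 + 0.25) / 2 = 0.5 -- longer sentences carry more weight.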
251
Backend/venv/lib/python3.12/site-packages/nltk/translate/ibm1.py
Normal file
@@ -0,0 +1,251 @@
# Natural Language Toolkit: IBM Model 1
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Chin Yee Lee <c.lee32@student.unimelb.edu.au>
#         Hengfeng Li <hengfeng12345@gmail.com>
#         Ruxin Hou <r.hou@student.unimelb.edu.au>
#         Calvin Tanujaya Lim <c.tanujayalim@gmail.com>
# Based on earlier version by:
#         Will Zhang <wilzzha@gmail.com>
#         Guan Gui <ggui@student.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Lexical translation model that ignores word order.

In IBM Model 1, word order is ignored for simplicity. As long as the
word alignments are equivalent, it doesn't matter where the word occurs
in the source or target sentence. Thus, the following three alignments
are equally likely::

    Source: je mange du jambon
    Target: i eat some ham
    Alignment: (0,0) (1,1) (2,2) (3,3)

    Source: je mange du jambon
    Target: some ham eat i
    Alignment: (0,2) (1,3) (2,1) (3,0)

    Source: du jambon je mange
    Target: eat i some ham
    Alignment: (0,3) (1,2) (2,0) (3,1)

Note that an alignment is represented here as
(word_index_in_target, word_index_in_source).

The EM algorithm used in Model 1 is:

:E step: In the training data, count how many times a source language
         word is translated into a target language word, weighted by
         the prior probability of the translation.

:M step: Estimate the new probability of translation based on the
         counts from the Expectation step.

Notations
---------

:i: Position in the source sentence
    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
:j: Position in the target sentence
    Valid values are 1, 2, ..., length of target sentence
:s: A word in the source language
:t: A word in the target language

References
----------

Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""

import warnings
from collections import defaultdict

from nltk.translate import AlignedSent, Alignment, IBMModel
from nltk.translate.ibm_model import Counts


class IBMModel1(IBMModel):
    """
    Lexical translation model that ignores word order

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))

    >>> ibm1 = IBMModel1(bitext, 5)

    >>> print(round(ibm1.translation_table['buch']['book'], 3))
    0.889
    >>> print(round(ibm1.translation_table['das']['book'], 3))
    0.062
    >>> print(round(ibm1.translation_table['buch'][None], 3))
    0.113
    >>> print(round(ibm1.translation_table['ja'][None], 3))
    0.073

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])

    """

    def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, the following entry must be present:
            ``translation_table``.
            See ``IBMModel`` for the type and purpose of this table.
        :type probability_tables: dict[str]: object
        """
        super().__init__(sentence_aligned_corpus)

        if probability_tables is None:
            self.set_uniform_probabilities(sentence_aligned_corpus)
        else:
            # Set user-defined probabilities
            self.translation_table = probability_tables["translation_table"]

        for n in range(0, iterations):
            self.train(sentence_aligned_corpus)

        self.align_all(sentence_aligned_corpus)

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        initial_prob = 1 / len(self.trg_vocab)
        if initial_prob < IBMModel.MIN_PROB:
            warnings.warn(
                "Target language vocabulary is too large ("
                + str(len(self.trg_vocab))
                + " words). "
                "Results may be less accurate."
            )

        for t in self.trg_vocab:
            self.translation_table[t] = defaultdict(lambda: initial_prob)

    def train(self, parallel_corpus):
        counts = Counts()
        for aligned_sentence in parallel_corpus:
            trg_sentence = aligned_sentence.words
            src_sentence = [None] + aligned_sentence.mots

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_all_alignments(src_sentence, trg_sentence)

            # E step (b): Collect counts
            for t in trg_sentence:
                for s in src_sentence:
                    count = self.prob_alignment_point(s, t)
                    normalized_count = count / total_count[t]
                    counts.t_given_s[t][s] += normalized_count
                    counts.any_t_given_s[s] += normalized_count

        # M step: Update probabilities with maximum likelihood estimate
        self.maximize_lexical_translation_probabilities(counts)

    def prob_all_alignments(self, src_sentence, trg_sentence):
        """
        Computes the probability of all possible word alignments,
        expressed as a marginal distribution over target words t

        Each entry in the return value represents the contribution to
        the total alignment probability by the target word t.

        To obtain probability(alignment | src_sentence, trg_sentence),
        simply sum the entries in the return value.

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        """
        alignment_prob_for_t = defaultdict(float)
        for t in trg_sentence:
            for s in src_sentence:
                alignment_prob_for_t[t] += self.prob_alignment_point(s, t)
        return alignment_prob_for_t

    def prob_alignment_point(self, s, t):
        """
        Probability that word ``t`` in the target sentence is aligned to
        word ``s`` in the source sentence
        """
        return self.translation_table[t][s]

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence
        """
        prob = 1.0

        for j, i in enumerate(alignment_info.alignment):
            if j == 0:
                continue  # skip the dummy zeroth element
            trg_word = alignment_info.trg_sentence[j]
            src_word = alignment_info.src_sentence[i]
            prob *= self.translation_table[trg_word][src_word]

        return max(prob, IBMModel.MIN_PROB)
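    # e.g. for a two-word target sentence with alignment (dummy, i1, i2),
    # this evaluates to t(t1 | s_i1) * t(t2 | s_i2), floored at
    # IBMModel.MIN_PROB to avoid returning zero.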

    def align_all(self, parallel_corpus):
        for sentence_pair in parallel_corpus:
            self.align(sentence_pair)

    def align(self, sentence_pair):
        """
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are zero-
        indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        """
        best_alignment = []

        for j, trg_word in enumerate(sentence_pair.words):
            # Initialize trg_word to align with the NULL token
            best_prob = max(self.translation_table[trg_word][None], IBMModel.MIN_PROB)
            best_alignment_point = None
            for i, src_word in enumerate(sentence_pair.mots):
                align_prob = self.translation_table[trg_word][src_word]
                if align_prob >= best_prob:  # prefer newer word in case of tie
                    best_prob = align_prob
                    best_alignment_point = i

            best_alignment.append((j, best_alignment_point))

        sentence_pair.alignment = Alignment(best_alignment)
319
Backend/venv/lib/python3.12/site-packages/nltk/translate/ibm2.py
Normal file
@@ -0,0 +1,319 @@
# Natural Language Toolkit: IBM Model 2
#
# Copyright (C) 2001-2013 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Lexical translation model that considers word order.

IBM Model 2 improves on Model 1 by accounting for word order.
An alignment probability is introduced, a(i | j,l,m), which predicts
a source word position, given its aligned target word's position.

The EM algorithm used in Model 2 is:

:E step: In the training data, collect counts, weighted by prior
         probabilities.

         - (a) count how many times a source language word is translated
               into a target language word
         - (b) count how many times a particular position in the source
               sentence is aligned to a particular position in the target
               sentence

:M step: Estimate new probabilities based on the counts from the E step

Notations
---------

:i: Position in the source sentence
    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
:j: Position in the target sentence
    Valid values are 1, 2, ..., length of target sentence
:l: Number of words in the source sentence, excluding NULL
:m: Number of words in the target sentence
:s: A word in the source language
:t: A word in the target language

References
----------

Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""

import warnings
from collections import defaultdict

from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel1
from nltk.translate.ibm_model import Counts


class IBMModel2(IBMModel):
    """
    Lexical translation model that considers word order

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))

    >>> ibm2 = IBMModel2(bitext, 5)

    >>> print(round(ibm2.translation_table['buch']['book'], 3))
    1.0
    >>> print(round(ibm2.translation_table['das']['book'], 3))
    0.0
    >>> print(round(ibm2.translation_table['buch'][None], 3))
    0.0
    >>> print(round(ibm2.translation_table['ja'][None], 3))
    0.0

    >>> print(round(ibm2.alignment_table[1][1][2][2], 3))
    0.939
    >>> print(round(ibm2.alignment_table[1][2][2][2], 3))
    0.0
    >>> print(round(ibm2.alignment_table[2][2][4][5], 3))
    1.0

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])

    """

    def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model and an alignment model.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, all the following entries must be present:
            ``translation_table``, ``alignment_table``.
            See ``IBMModel`` for the type and purpose of these tables.
        :type probability_tables: dict[str]: object
        """
        super().__init__(sentence_aligned_corpus)

        if probability_tables is None:
            # Get translation probabilities from IBM Model 1
            # Run more iterations of training for Model 1, since it is
            # faster than Model 2
            ibm1 = IBMModel1(sentence_aligned_corpus, 2 * iterations)
            self.translation_table = ibm1.translation_table
            self.set_uniform_probabilities(sentence_aligned_corpus)
        else:
            # Set user-defined probabilities
            self.translation_table = probability_tables["translation_table"]
            self.alignment_table = probability_tables["alignment_table"]

        for n in range(0, iterations):
            self.train(sentence_aligned_corpus)

        self.align_all(sentence_aligned_corpus)

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        # a(i | j,l,m) = 1 / (l+1) for all i, j, l, m
        l_m_combinations = set()
        for aligned_sentence in sentence_aligned_corpus:
            l = len(aligned_sentence.mots)
            m = len(aligned_sentence.words)
            if (l, m) not in l_m_combinations:
                l_m_combinations.add((l, m))
                initial_prob = 1 / (l + 1)
                if initial_prob < IBMModel.MIN_PROB:
                    warnings.warn(
                        "A source sentence is too long ("
                        + str(l)
                        + " words). Results may be less accurate."
                    )

                for i in range(0, l + 1):
                    for j in range(1, m + 1):
                        self.alignment_table[i][j][l][m] = initial_prob

    def train(self, parallel_corpus):
        counts = Model2Counts()
        for aligned_sentence in parallel_corpus:
            src_sentence = [None] + aligned_sentence.mots
            trg_sentence = ["UNUSED"] + aligned_sentence.words  # 1-indexed
            l = len(aligned_sentence.mots)
            m = len(aligned_sentence.words)

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_all_alignments(src_sentence, trg_sentence)

            # E step (b): Collect counts
            for j in range(1, m + 1):
                t = trg_sentence[j]
                for i in range(0, l + 1):
                    s = src_sentence[i]
                    count = self.prob_alignment_point(i, j, src_sentence, trg_sentence)
                    normalized_count = count / total_count[t]

                    counts.update_lexical_translation(normalized_count, s, t)
                    counts.update_alignment(normalized_count, i, j, l, m)

        # M step: Update probabilities with maximum likelihood estimates
        self.maximize_lexical_translation_probabilities(counts)
        self.maximize_alignment_probabilities(counts)

    def maximize_alignment_probabilities(self, counts):
        MIN_PROB = IBMModel.MIN_PROB
        for i, j_s in counts.alignment.items():
            for j, src_sentence_lengths in j_s.items():
                for l, trg_sentence_lengths in src_sentence_lengths.items():
                    for m in trg_sentence_lengths:
                        estimate = (
                            counts.alignment[i][j][l][m]
                            / counts.alignment_for_any_i[j][l][m]
                        )
                        self.alignment_table[i][j][l][m] = max(estimate, MIN_PROB)

    def prob_all_alignments(self, src_sentence, trg_sentence):
        """
        Computes the probability of all possible word alignments,
        expressed as a marginal distribution over target words t

        Each entry in the return value represents the contribution to
        the total alignment probability by the target word t.

        To obtain probability(alignment | src_sentence, trg_sentence),
        simply sum the entries in the return value.

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        """
        alignment_prob_for_t = defaultdict(float)
        for j in range(1, len(trg_sentence)):
            t = trg_sentence[j]
            for i in range(0, len(src_sentence)):
                alignment_prob_for_t[t] += self.prob_alignment_point(
                    i, j, src_sentence, trg_sentence
                )
        return alignment_prob_for_t

    def prob_alignment_point(self, i, j, src_sentence, trg_sentence):
        """
        Probability that position j in ``trg_sentence`` is aligned to
        position i in the ``src_sentence``
        """
        l = len(src_sentence) - 1
        m = len(trg_sentence) - 1
        s = src_sentence[i]
        t = trg_sentence[j]
        return self.translation_table[t][s] * self.alignment_table[i][j][l][m]

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence
        """
        prob = 1.0
        l = len(alignment_info.src_sentence) - 1
        m = len(alignment_info.trg_sentence) - 1

        for j, i in enumerate(alignment_info.alignment):
            if j == 0:
                continue  # skip the dummy zeroth element
            trg_word = alignment_info.trg_sentence[j]
            src_word = alignment_info.src_sentence[i]
            prob *= (
                self.translation_table[trg_word][src_word]
                * self.alignment_table[i][j][l][m]
            )

        return max(prob, IBMModel.MIN_PROB)

    def align_all(self, parallel_corpus):
        for sentence_pair in parallel_corpus:
            self.align(sentence_pair)

    def align(self, sentence_pair):
        """
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are zero-
        indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        """
        best_alignment = []

        l = len(sentence_pair.mots)
        m = len(sentence_pair.words)

        for j, trg_word in enumerate(sentence_pair.words):
            # Initialize trg_word to align with the NULL token
            best_prob = (
                self.translation_table[trg_word][None]
                * self.alignment_table[0][j + 1][l][m]
            )
            best_prob = max(best_prob, IBMModel.MIN_PROB)
            best_alignment_point = None
            for i, src_word in enumerate(sentence_pair.mots):
                align_prob = (
                    self.translation_table[trg_word][src_word]
                    * self.alignment_table[i + 1][j + 1][l][m]
                )
                if align_prob >= best_prob:
                    best_prob = align_prob
                    best_alignment_point = i

            best_alignment.append((j, best_alignment_point))

        sentence_pair.alignment = Alignment(best_alignment)


class Model2Counts(Counts):
    """
    Data object to store counts of various parameters during training.
    Includes counts for alignment.
    """

    def __init__(self):
        super().__init__()
        self.alignment = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
        )
        self.alignment_for_any_i = defaultdict(
            lambda: defaultdict(lambda: defaultdict(float))
        )

    def update_lexical_translation(self, count, s, t):
        self.t_given_s[t][s] += count
        self.any_t_given_s[s] += count

    def update_alignment(self, count, i, j, l, m):
        self.alignment[i][j][l][m] += count
        self.alignment_for_any_i[j][l][m] += count
346
Backend/venv/lib/python3.12/site-packages/nltk/translate/ibm3.py
Normal file
@@ -0,0 +1,346 @@
# Natural Language Toolkit: IBM Model 3
#
# Copyright (C) 2001-2013 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Translation model that considers how a word can be aligned to
multiple words in another language.

IBM Model 3 improves on Model 2 by directly modeling the phenomenon
where a word in one language may be translated into zero or more words
in another. This is expressed by the fertility probability,
n(phi | source word).

If a source word translates into more than one word, it is possible to
generate sentences that have the same alignment in multiple ways. This
is modeled by a distortion step. The distortion probability, d(j|i,l,m),
predicts a target word position, given its aligned source word's
position. The distortion probability replaces the alignment probability
of Model 2.

The fertility probability is not applicable for NULL. Target words that
align to NULL are assumed to be distributed uniformly in the target
sentence. The existence of these words is modeled by p1, the probability
that a target word produced by a real source word requires another
target word that is produced by NULL.

The EM algorithm used in Model 3 is:

:E step: In the training data, collect counts, weighted by prior
         probabilities.

         - (a) count how many times a source language word is translated
               into a target language word
         - (b) count how many times a particular position in the target
               sentence is aligned to a particular position in the source
               sentence
         - (c) count how many times a source word is aligned to phi number
               of target words
         - (d) count how many times NULL is aligned to a target word

:M step: Estimate new probabilities based on the counts from the E step

Because there are too many possible alignments, only the most probable
ones are considered. First, the best alignment is determined using prior
probabilities. Then, a hill climbing approach is used to find other good
candidates.

Notations
---------

:i: Position in the source sentence
    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
:j: Position in the target sentence
    Valid values are 1, 2, ..., length of target sentence
:l: Number of words in the source sentence, excluding NULL
:m: Number of words in the target sentence
:s: A word in the source language
:t: A word in the target language
:phi: Fertility, the number of target words produced by a source word
:p1: Probability that a target word produced by a source word is
     accompanied by another target word that is aligned to NULL
:p0: 1 - p1

References
----------

Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""

import warnings
from collections import defaultdict
from math import factorial

from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel2
from nltk.translate.ibm_model import Counts


class IBMModel3(IBMModel):
    """
    Translation model that considers how a word can be aligned to
    multiple words in another language

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
    >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
    >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))

    >>> ibm3 = IBMModel3(bitext, 5)

    >>> print(round(ibm3.translation_table['buch']['book'], 3))
    1.0
    >>> print(round(ibm3.translation_table['das']['book'], 3))
    0.0
    >>> print(round(ibm3.translation_table['ja'][None], 3))
    1.0

    >>> print(round(ibm3.distortion_table[1][1][2][2], 3))
    1.0
    >>> print(round(ibm3.distortion_table[1][2][2][2], 3))
    0.0
    >>> print(round(ibm3.distortion_table[2][2][4][5], 3))
    0.75

    >>> print(round(ibm3.fertility_table[2]['summarize'], 3))
    1.0
    >>> print(round(ibm3.fertility_table[1]['book'], 3))
    1.0

    >>> print(round(ibm3.p1, 3))
    0.054

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])

    """

    def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model, a distortion model, a fertility model, and a
        model for generating NULL-aligned words.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, all the following entries must be present:
            ``translation_table``, ``alignment_table``,
            ``fertility_table``, ``p1``, ``distortion_table``.
            See ``IBMModel`` for the type and purpose of these tables.
        :type probability_tables: dict[str]: object
        """
        super().__init__(sentence_aligned_corpus)
        self.reset_probabilities()

        if probability_tables is None:
            # Get translation and alignment probabilities from IBM Model 2
            ibm2 = IBMModel2(sentence_aligned_corpus, iterations)
            self.translation_table = ibm2.translation_table
            self.alignment_table = ibm2.alignment_table
            self.set_uniform_probabilities(sentence_aligned_corpus)
        else:
            # Set user-defined probabilities
            self.translation_table = probability_tables["translation_table"]
            self.alignment_table = probability_tables["alignment_table"]
            self.fertility_table = probability_tables["fertility_table"]
            self.p1 = probability_tables["p1"]
            self.distortion_table = probability_tables["distortion_table"]

        for n in range(0, iterations):
            self.train(sentence_aligned_corpus)

    def reset_probabilities(self):
        super().reset_probabilities()
        self.distortion_table = defaultdict(
            lambda: defaultdict(
                lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
            )
        )
        """
        dict[int][int][int][int]: float. Probability(j | i,l,m).
        Values accessed as ``distortion_table[j][i][l][m]``.
        """

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        # d(j | i,l,m) = 1 / m for all i, j, l, m
        l_m_combinations = set()
        for aligned_sentence in sentence_aligned_corpus:
            l = len(aligned_sentence.mots)
            m = len(aligned_sentence.words)
            if (l, m) not in l_m_combinations:
                l_m_combinations.add((l, m))
                initial_prob = 1 / m
                if initial_prob < IBMModel.MIN_PROB:
                    warnings.warn(
                        "A target sentence is too long ("
                        + str(m)
                        + " words). Results may be less accurate."
                    )
                for j in range(1, m + 1):
                    for i in range(0, l + 1):
                        self.distortion_table[j][i][l][m] = initial_prob

        # simple initialization, taken from GIZA++
        self.fertility_table[0] = defaultdict(lambda: 0.2)
        self.fertility_table[1] = defaultdict(lambda: 0.65)
        self.fertility_table[2] = defaultdict(lambda: 0.1)
        self.fertility_table[3] = defaultdict(lambda: 0.04)
        MAX_FERTILITY = 10
        initial_fert_prob = 0.01 / (MAX_FERTILITY - 4)
        for phi in range(4, MAX_FERTILITY):
            self.fertility_table[phi] = defaultdict(lambda: initial_fert_prob)

        self.p1 = 0.5

    def train(self, parallel_corpus):
        counts = Model3Counts()
        for aligned_sentence in parallel_corpus:
            l = len(aligned_sentence.mots)
            m = len(aligned_sentence.words)

            # Sample the alignment space
            sampled_alignments, best_alignment = self.sample(aligned_sentence)
            # Record the most probable alignment
            aligned_sentence.alignment = Alignment(
                best_alignment.zero_indexed_alignment()
            )

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_of_alignments(sampled_alignments)

            # E step (b): Collect counts
            for alignment_info in sampled_alignments:
                count = self.prob_t_a_given_s(alignment_info)
                normalized_count = count / total_count

                for j in range(1, m + 1):
                    counts.update_lexical_translation(
                        normalized_count, alignment_info, j
                    )
                    counts.update_distortion(normalized_count, alignment_info, j, l, m)

                counts.update_null_generation(normalized_count, alignment_info)
                counts.update_fertility(normalized_count, alignment_info)

        # M step: Update probabilities with maximum likelihood estimates
        # If any probability is less than MIN_PROB, clamp it to MIN_PROB
        existing_alignment_table = self.alignment_table
        self.reset_probabilities()
        self.alignment_table = existing_alignment_table  # don't retrain

        self.maximize_lexical_translation_probabilities(counts)
        self.maximize_distortion_probabilities(counts)
        self.maximize_fertility_probabilities(counts)
        self.maximize_null_generation_probabilities(counts)

    def maximize_distortion_probabilities(self, counts):
        MIN_PROB = IBMModel.MIN_PROB
        for j, i_s in counts.distortion.items():
            for i, src_sentence_lengths in i_s.items():
                for l, trg_sentence_lengths in src_sentence_lengths.items():
                    for m in trg_sentence_lengths:
                        estimate = (
                            counts.distortion[j][i][l][m]
                            / counts.distortion_for_any_j[i][l][m]
                        )
                        self.distortion_table[j][i][l][m] = max(estimate, MIN_PROB)

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence
        """
        src_sentence = alignment_info.src_sentence
        trg_sentence = alignment_info.trg_sentence
        l = len(src_sentence) - 1  # exclude NULL
        m = len(trg_sentence) - 1
        p1 = self.p1
        p0 = 1 - p1

        probability = 1.0
        MIN_PROB = IBMModel.MIN_PROB

        # Combine NULL insertion probability
        null_fertility = alignment_info.fertility_of_i(0)
        probability *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility)
        if probability < MIN_PROB:
            return MIN_PROB

        # Compute combination (m - null_fertility) choose null_fertility
        for i in range(1, null_fertility + 1):
            probability *= (m - null_fertility - i + 1) / i
            if probability < MIN_PROB:
                return MIN_PROB

        # Combine fertility probabilities
        for i in range(1, l + 1):
            fertility = alignment_info.fertility_of_i(i)
            probability *= (
                factorial(fertility) * self.fertility_table[fertility][src_sentence[i]]
            )
            if probability < MIN_PROB:
                return MIN_PROB

        # Combine lexical and distortion probabilities
        for j in range(1, m + 1):
            t = trg_sentence[j]
            i = alignment_info.alignment[j]
            s = src_sentence[i]

            probability *= (
                self.translation_table[t][s] * self.distortion_table[j][i][l][m]
            )
            if probability < MIN_PROB:
                return MIN_PROB

        return probability


class Model3Counts(Counts):
    """
    Data object to store counts of various parameters during training.
    Includes counts for distortion.
    """

    def __init__(self):
        super().__init__()
        self.distortion = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
        )
        self.distortion_for_any_j = defaultdict(
            lambda: defaultdict(lambda: defaultdict(float))
        )

    def update_distortion(self, count, alignment_info, j, l, m):
        i = alignment_info.alignment[j]
        self.distortion[j][i][l][m] += count
        self.distortion_for_any_j[i][l][m] += count
490
Backend/venv/lib/python3.12/site-packages/nltk/translate/ibm4.py
Normal file
@@ -0,0 +1,490 @@
# Natural Language Toolkit: IBM Model 4
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Translation model that reorders output words based on their type and
distance from other related words in the output sentence.

IBM Model 4 improves the distortion model of Model 3, motivated by the
observation that certain words tend to be re-ordered in a predictable
way relative to one another. For example, <adjective><noun> in English
usually has its order flipped as <noun><adjective> in French.

Model 4 requires words in the source and target vocabularies to be
categorized into classes. This can be linguistically driven, like parts
of speech (adjective, nouns, prepositions, etc). Word classes can also
be obtained by statistical methods. The original IBM Model 4 uses an
information theoretic approach to group words into 50 classes for each
vocabulary.

Terminology
-----------

:Cept:
    A source word with non-zero fertility, i.e. aligned to one or more
    target words.
:Tablet:
    The set of target word(s) aligned to a cept.
:Head of cept:
    The first word of the tablet of that cept.
:Center of cept:
    The average position of the words in that cept's tablet. If the
    value is not an integer, the ceiling is taken.
    For example, for a tablet with words in positions 2, 5, 6 in the
    target sentence, the center of the corresponding cept is
    ceil((2 + 5 + 6) / 3) = 5
:Displacement:
    For a head word, defined as (position of head word - position of
    previous cept's center). Can be positive or negative.
    For a non-head word, defined as (position of non-head word -
    position of previous word in the same tablet). Always positive,
    because successive words in a tablet are assumed to appear to the
    right of the previous word.

In contrast to Model 3 which reorders words in a tablet independently of
other words, Model 4 distinguishes between three cases.

1. Words generated by NULL are distributed uniformly.
2. For a head word t, its position is modeled by the probability
   d_head(displacement | word_class_s(s),word_class_t(t)),
   where s is the previous cept, and word_class_s and word_class_t map
   s and t to a source and target language word class respectively.
3. For a non-head word t, its position is modeled by the probability
   d_non_head(displacement | word_class_t(t))

The EM algorithm used in Model 4 is:

:E step: In the training data, collect counts, weighted by prior
         probabilities.

         - (a) count how many times a source language word is translated
               into a target language word
         - (b) for a particular word class, count how many times a head
               word is located at a particular displacement from the
               previous cept's center
         - (c) for a particular word class, count how many times a
               non-head word is located at a particular displacement from
               the previous target word
         - (d) count how many times a source word is aligned to phi number
               of target words
         - (e) count how many times NULL is aligned to a target word

:M step: Estimate new probabilities based on the counts from the E step

Like Model 3, there are too many possible alignments to consider. Thus,
a hill climbing approach is used to sample good candidates.

Notations
---------

:i: Position in the source sentence
    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
:j: Position in the target sentence
    Valid values are 1, 2, ..., length of target sentence
:l: Number of words in the source sentence, excluding NULL
:m: Number of words in the target sentence
:s: A word in the source language
:t: A word in the target language
:phi: Fertility, the number of target words produced by a source word
:p1: Probability that a target word produced by a source word is
     accompanied by another target word that is aligned to NULL
:p0: 1 - p1
:dj: Displacement, Δj

References
----------

Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""

import warnings
from collections import defaultdict
from math import factorial

from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel3
from nltk.translate.ibm_model import Counts, longest_target_sentence_length
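
# Illustrative sketch (added; not part of the NLTK source): computing the
# center of a cept and the two kinds of displacement described in the
# docstring above. The tablet positions and the previous center are made up.
#
#     from math import ceil
#
#     tablet = [2, 5, 6]                        # target positions of one cept
#     center = ceil(sum(tablet) / len(tablet))  # -> 5
#     previous_center = 3                       # assumed center of previous cept
#     dj_head = tablet[0] - previous_center     # head displacement: 2 - 3 = -1
#     dj_non_head = tablet[1] - tablet[0]       # non-head displacement: 5 - 2 = 3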


class IBMModel4(IBMModel):
    """
    Translation model that reorders output words based on their type and
    their distance from other related words in the output sentence

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
    >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
    >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))
    >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 }
    >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 }

    >>> ibm4 = IBMModel4(bitext, 5, src_classes, trg_classes)

    >>> print(round(ibm4.translation_table['buch']['book'], 3))
    1.0
    >>> print(round(ibm4.translation_table['das']['book'], 3))
    0.0
    >>> print(round(ibm4.translation_table['ja'][None], 3))
    1.0

    >>> print(round(ibm4.head_distortion_table[1][0][1], 3))
    1.0
    >>> print(round(ibm4.head_distortion_table[2][0][1], 3))
    0.0
    >>> print(round(ibm4.non_head_distortion_table[3][6], 3))
    0.5

    >>> print(round(ibm4.fertility_table[2]['summarize'], 3))
    1.0
    >>> print(round(ibm4.fertility_table[1]['book'], 3))
    1.0

    >>> print(round(ibm4.p1, 3))
    0.033

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])

    """

    def __init__(
        self,
        sentence_aligned_corpus,
        iterations,
        source_word_classes,
        target_word_classes,
        probability_tables=None,
    ):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model, distortion models, a fertility model, and a
        model for generating NULL-aligned words.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param source_word_classes: Lookup table that maps a source word
            to its word class, the latter represented by an integer id
        :type source_word_classes: dict[str]: int

        :param target_word_classes: Lookup table that maps a target word
            to its word class, the latter represented by an integer id
        :type target_word_classes: dict[str]: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, all the following entries must be present:
            ``translation_table``, ``alignment_table``,
            ``fertility_table``, ``p1``, ``head_distortion_table``,
            ``non_head_distortion_table``. See ``IBMModel`` and
            ``IBMModel4`` for the type and purpose of these tables.
        :type probability_tables: dict[str]: object
        """
        super().__init__(sentence_aligned_corpus)
        self.reset_probabilities()
        self.src_classes = source_word_classes
        self.trg_classes = target_word_classes

        if probability_tables is None:
            # Get probabilities from IBM model 3
            ibm3 = IBMModel3(sentence_aligned_corpus, iterations)
            self.translation_table = ibm3.translation_table
            self.alignment_table = ibm3.alignment_table
            self.fertility_table = ibm3.fertility_table
            self.p1 = ibm3.p1
            self.set_uniform_probabilities(sentence_aligned_corpus)
        else:
            # Set user-defined probabilities
            self.translation_table = probability_tables["translation_table"]
            self.alignment_table = probability_tables["alignment_table"]
            self.fertility_table = probability_tables["fertility_table"]
            self.p1 = probability_tables["p1"]
            self.head_distortion_table = probability_tables["head_distortion_table"]
            self.non_head_distortion_table = probability_tables[
                "non_head_distortion_table"
            ]

        for n in range(0, iterations):
            self.train(sentence_aligned_corpus)
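
    # Illustrative sketch (added; not part of the NLTK source): seeding
    # IBMModel4 with custom tables instead of letting it train IBM Model 3
    # first. ``bitext``, ``src_classes``, ``trg_classes``, ``head_table`` and
    # ``non_head_table`` are assumed to exist; all values are hypothetical.
    #
    #     tables = {
    #         "translation_table": ibm3.translation_table,
    #         "alignment_table": ibm3.alignment_table,
    #         "fertility_table": ibm3.fertility_table,
    #         "p1": ibm3.p1,
    #         "head_distortion_table": head_table,
    #         "non_head_distortion_table": non_head_table,
    #     }
    #     ibm4 = IBMModel4(bitext, 0, src_classes, trg_classes, tables)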

    def reset_probabilities(self):
        super().reset_probabilities()
        self.head_distortion_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
        )
        """
        dict[int][int][int]: float. Probability(displacement of head
        word | word class of previous cept,target word class).
        Values accessed as ``head_distortion_table[dj][src_class][trg_class]``.
        """

        self.non_head_distortion_table = defaultdict(
            lambda: defaultdict(lambda: self.MIN_PROB)
        )
        """
        dict[int][int]: float. Probability(displacement of non-head
        word | target word class).
        Values accessed as ``non_head_distortion_table[dj][trg_class]``.
        """

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        """
        Set distortion probabilities uniformly to
        1 / cardinality of displacement values
        """
        max_m = longest_target_sentence_length(sentence_aligned_corpus)

        # The maximum displacement is m-1, when a word is in the last
        # position m of the target sentence and the previously placed
        # word is in the first position.
        # Conversely, the minimum displacement is -(m-1).
        # Thus, the displacement range is (m-1) - (-(m-1)). Note that
        # displacement cannot be zero and is not included in the range.
        if max_m <= 1:
            initial_prob = IBMModel.MIN_PROB
        else:
            initial_prob = 1 / (2 * (max_m - 1))
        if initial_prob < IBMModel.MIN_PROB:
            warnings.warn(
                "A target sentence is too long ("
                + str(max_m)
                + " words). Results may be less accurate."
            )

        for dj in range(1, max_m):
            self.head_distortion_table[dj] = defaultdict(
                lambda: defaultdict(lambda: initial_prob)
            )
            self.head_distortion_table[-dj] = defaultdict(
                lambda: defaultdict(lambda: initial_prob)
            )
            self.non_head_distortion_table[dj] = defaultdict(lambda: initial_prob)
            self.non_head_distortion_table[-dj] = defaultdict(lambda: initial_prob)

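    # Worked example (added; not part of the NLTK source): if the longest
    # target sentence has max_m = 4 words, displacements range over
    # {-3, -2, -1, 1, 2, 3}, so each of the 2 * (4 - 1) = 6 values gets an
    # initial probability of 1/6.
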
    def train(self, parallel_corpus):
        counts = Model4Counts()
        for aligned_sentence in parallel_corpus:
            m = len(aligned_sentence.words)

            # Sample the alignment space
            sampled_alignments, best_alignment = self.sample(aligned_sentence)
            # Record the most probable alignment
            aligned_sentence.alignment = Alignment(
                best_alignment.zero_indexed_alignment()
            )

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_of_alignments(sampled_alignments)

            # E step (b): Collect counts
            for alignment_info in sampled_alignments:
                count = self.prob_t_a_given_s(alignment_info)
                normalized_count = count / total_count

                for j in range(1, m + 1):
                    counts.update_lexical_translation(
                        normalized_count, alignment_info, j
                    )
                    counts.update_distortion(
                        normalized_count,
                        alignment_info,
                        j,
                        self.src_classes,
                        self.trg_classes,
                    )

                counts.update_null_generation(normalized_count, alignment_info)
                counts.update_fertility(normalized_count, alignment_info)

        # M step: Update probabilities with maximum likelihood estimates
        # If any probability is less than MIN_PROB, clamp it to MIN_PROB
        existing_alignment_table = self.alignment_table
        self.reset_probabilities()
        self.alignment_table = existing_alignment_table  # don't retrain

        self.maximize_lexical_translation_probabilities(counts)
        self.maximize_distortion_probabilities(counts)
        self.maximize_fertility_probabilities(counts)
        self.maximize_null_generation_probabilities(counts)

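    # Note (added; not part of the NLTK source): normalized_count above is
    # p(t, a | s) divided by the total probability of all sampled alignments,
    # so each sampled alignment contributes a soft count in proportion to its
    # relative likelihood, i.e. EM restricted to the sampled alignment space.
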
    def maximize_distortion_probabilities(self, counts):
        head_d_table = self.head_distortion_table
        for dj, src_classes in counts.head_distortion.items():
            for s_cls, trg_classes in src_classes.items():
                for t_cls in trg_classes:
                    estimate = (
                        counts.head_distortion[dj][s_cls][t_cls]
                        / counts.head_distortion_for_any_dj[s_cls][t_cls]
                    )
                    head_d_table[dj][s_cls][t_cls] = max(estimate, IBMModel.MIN_PROB)

        non_head_d_table = self.non_head_distortion_table
        for dj, trg_classes in counts.non_head_distortion.items():
            for t_cls in trg_classes:
                estimate = (
                    counts.non_head_distortion[dj][t_cls]
                    / counts.non_head_distortion_for_any_dj[t_cls]
                )
                non_head_d_table[dj][t_cls] = max(estimate, IBMModel.MIN_PROB)

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence
        """
        return IBMModel4.model4_prob_t_a_given_s(alignment_info, self)

    @staticmethod  # exposed for Model 5 to use
    def model4_prob_t_a_given_s(alignment_info, ibm_model):
        probability = 1.0
        MIN_PROB = IBMModel.MIN_PROB

        def null_generation_term():
            # Binomial distribution: B(m - null_fertility, p1)
            value = 1.0
            p1 = ibm_model.p1
            p0 = 1 - p1
            null_fertility = alignment_info.fertility_of_i(0)
            m = len(alignment_info.trg_sentence) - 1
            value *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility)
            if value < MIN_PROB:
                return MIN_PROB

            # Combination: (m - null_fertility) choose null_fertility
            for i in range(1, null_fertility + 1):
                value *= (m - null_fertility - i + 1) / i
            return value

        def fertility_term():
            value = 1.0
            src_sentence = alignment_info.src_sentence
            for i in range(1, len(src_sentence)):
                fertility = alignment_info.fertility_of_i(i)
                value *= (
                    factorial(fertility)
                    * ibm_model.fertility_table[fertility][src_sentence[i]]
                )
                if value < MIN_PROB:
                    return MIN_PROB
            return value

        def lexical_translation_term(j):
            t = alignment_info.trg_sentence[j]
            i = alignment_info.alignment[j]
            s = alignment_info.src_sentence[i]
            return ibm_model.translation_table[t][s]

        def distortion_term(j):
            t = alignment_info.trg_sentence[j]
            i = alignment_info.alignment[j]
            if i == 0:
                # case 1: t is aligned to NULL
                return 1.0
            if alignment_info.is_head_word(j):
                # case 2: t is the first word of a tablet
                previous_cept = alignment_info.previous_cept(j)
                src_class = None
                if previous_cept is not None:
                    previous_s = alignment_info.src_sentence[previous_cept]
                    src_class = ibm_model.src_classes[previous_s]
                trg_class = ibm_model.trg_classes[t]
                dj = j - alignment_info.center_of_cept(previous_cept)
                return ibm_model.head_distortion_table[dj][src_class][trg_class]

            # case 3: t is a subsequent word of a tablet
            previous_position = alignment_info.previous_in_tablet(j)
            trg_class = ibm_model.trg_classes[t]
            dj = j - previous_position
            return ibm_model.non_head_distortion_table[dj][trg_class]

        # end nested functions

        # Abort computation whenever probability falls below MIN_PROB at
        # any point, since MIN_PROB can be considered as zero
        probability *= null_generation_term()
        if probability < MIN_PROB:
            return MIN_PROB

        probability *= fertility_term()
        if probability < MIN_PROB:
            return MIN_PROB

        for j in range(1, len(alignment_info.trg_sentence)):
            probability *= lexical_translation_term(j)
            if probability < MIN_PROB:
                return MIN_PROB

            probability *= distortion_term(j)
            if probability < MIN_PROB:
                return MIN_PROB

        return probability

class Model4Counts(Counts):
    """
    Data object to store counts of various parameters during training.
    Includes counts for distortion.
    """

    def __init__(self):
        super().__init__()
        self.head_distortion = defaultdict(
            lambda: defaultdict(lambda: defaultdict(float))
        )
        self.head_distortion_for_any_dj = defaultdict(lambda: defaultdict(float))
        self.non_head_distortion = defaultdict(lambda: defaultdict(float))
        self.non_head_distortion_for_any_dj = defaultdict(float)

    def update_distortion(self, count, alignment_info, j, src_classes, trg_classes):
        i = alignment_info.alignment[j]
        t = alignment_info.trg_sentence[j]
        if i == 0:
            # case 1: t is aligned to NULL
            pass
        elif alignment_info.is_head_word(j):
            # case 2: t is the first word of a tablet
            previous_cept = alignment_info.previous_cept(j)
            if previous_cept is not None:
                previous_src_word = alignment_info.src_sentence[previous_cept]
                src_class = src_classes[previous_src_word]
            else:
                src_class = None
            trg_class = trg_classes[t]
            dj = j - alignment_info.center_of_cept(previous_cept)
            self.head_distortion[dj][src_class][trg_class] += count
            self.head_distortion_for_any_dj[src_class][trg_class] += count
        else:
            # case 3: t is a subsequent word of a tablet
            previous_j = alignment_info.previous_in_tablet(j)
            trg_class = trg_classes[t]
            dj = j - previous_j
            self.non_head_distortion[dj][trg_class] += count
            self.non_head_distortion_for_any_dj[trg_class] += count
661 Backend/venv/lib/python3.12/site-packages/nltk/translate/ibm5.py Normal file
@@ -0,0 +1,661 @@
# Natural Language Toolkit: IBM Model 5
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Translation model that keeps track of vacant positions in the target
sentence to decide where to place translated words.

Translation can be viewed as a process where each word in the source
sentence is stepped through sequentially, generating translated words
for each source word. The target sentence can be viewed as being made
up of ``m`` empty slots initially, which gradually fill up as generated
words are placed in them.

Models 3 and 4 use distortion probabilities to decide how to place
translated words. For simplicity, these models ignore the history of
which slots have already been occupied with translated words.
Consider the placement of the last translated word: there is only one
empty slot left in the target sentence, so the distortion probability
should be 1.0 for that position and 0.0 everywhere else. However, the
distortion probabilities for Models 3 and 4 are set up such that all
positions are under consideration.

IBM Model 5 fixes this deficiency by accounting for occupied slots
during translation. It introduces the vacancy function v(j), the number
of vacancies up to, and including, position j in the target sentence.

Terminology
-----------

:Maximum vacancy:
    The number of valid slots that a word can be placed in.
    This is not necessarily the same as the number of vacant slots.
    For example, if a tablet contains more than one word, the head word
    cannot be placed at the last vacant slot because there will be no
    space for the other words in the tablet. The number of valid slots
    has to take into account the length of the tablet.
    Non-head words cannot be placed before the head word, so vacancies
    to the left of the head word are ignored.
:Vacancy difference:
    For a head word: (v(j) - v(center of previous cept))
    Can be positive or negative.
    For a non-head word: (v(j) - v(position of previously placed word))
    Always positive, because successive words in a tablet are assumed to
    appear to the right of the previous word.

Positioning of target words falls under three cases:

1. Words generated by NULL are distributed uniformly
2. For a head word t, its position is modeled by the probability
   v_head(dv | max_v,word_class_t(t))
3. For a non-head word t, its position is modeled by the probability
   v_non_head(dv | max_v,word_class_t(t))

dv and max_v are defined differently for head and non-head words.

The EM algorithm used in Model 5 is:

:E step: In the training data, collect counts, weighted by prior
         probabilities.

         - (a) count how many times a source language word is translated
               into a target language word
         - (b) for a particular word class and maximum vacancy, count how
               many times a head word and the previous cept's center have
               a particular difference in number of vacancies
         - (c) for a particular word class and maximum vacancy, count how
               many times a non-head word and the previous target word
               have a particular difference in number of vacancies
         - (d) count how many times a source word is aligned to phi number
               of target words
         - (e) count how many times NULL is aligned to a target word

:M step: Estimate new probabilities based on the counts from the E step

Like Model 4, there are too many possible alignments to consider. Thus,
a hill climbing approach is used to sample good candidates. In addition,
pruning is used to weed out unlikely alignments based on Model 4 scores.

Notations
---------

:i: Position in the source sentence
    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
:j: Position in the target sentence
    Valid values are 1, 2, ..., length of target sentence
:l: Number of words in the source sentence, excluding NULL
:m: Number of words in the target sentence
:s: A word in the source language
:t: A word in the target language
:phi: Fertility, the number of target words produced by a source word
:p1: Probability that a target word produced by a source word is
     accompanied by another target word that is aligned to NULL
:p0: 1 - p1
:max_v: Maximum vacancy
:dv: Vacancy difference, Δv

The definition of v_head here differs from GIZA++, section 4.7 of
[Brown et al., 1993], and [Koehn, 2010]. In the latter cases, v_head is
v_head(v(j) | v(center of previous cept),max_v,word_class(t)).

Here, we follow appendix B of [Brown et al., 1993] and combine v(j) with
v(center of previous cept) to obtain dv:
v_head(v(j) - v(center of previous cept) | max_v,word_class(t)).

References
----------

Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""

import warnings
from collections import defaultdict
from math import factorial

from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel4
from nltk.translate.ibm_model import Counts, longest_target_sentence_length
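
# Illustrative sketch (added; not part of the NLTK source): the vacancy
# function v(j) over a 5-slot target sentence, using the ``Slots`` helper
# defined at the bottom of this module. The occupied positions are made up.
#
#     slots = Slots(5)
#     slots.occupy(2)
#     slots.occupy(4)
#     slots.vacancies_at(3)  # -> 2, since positions 1 and 3 are vacant
#     slots.vacancies_at(5)  # -> 3, since positions 1, 3 and 5 are vacant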


class IBMModel5(IBMModel):
    """
    Translation model that keeps track of vacant positions in the target
    sentence to decide where to place translated words

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
    >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
    >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))
    >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 }
    >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 }

    >>> ibm5 = IBMModel5(bitext, 5, src_classes, trg_classes)

    >>> print(round(ibm5.head_vacancy_table[1][1][1], 3))
    1.0
    >>> print(round(ibm5.head_vacancy_table[2][1][1], 3))
    0.0
    >>> print(round(ibm5.non_head_vacancy_table[3][3][6], 3))
    1.0

    >>> print(round(ibm5.fertility_table[2]['summarize'], 3))
    1.0
    >>> print(round(ibm5.fertility_table[1]['book'], 3))
    1.0

    >>> print(round(ibm5.p1, 3))
    0.033

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])

    """

    MIN_SCORE_FACTOR = 0.2
    """
    Alignments with scores below this factor are pruned during sampling
    """

    def __init__(
        self,
        sentence_aligned_corpus,
        iterations,
        source_word_classes,
        target_word_classes,
        probability_tables=None,
    ):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model, vacancy models, a fertility model, and a
        model for generating NULL-aligned words.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param source_word_classes: Lookup table that maps a source word
            to its word class, the latter represented by an integer id
        :type source_word_classes: dict[str]: int

        :param target_word_classes: Lookup table that maps a target word
            to its word class, the latter represented by an integer id
        :type target_word_classes: dict[str]: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, all the following entries must be present:
            ``translation_table``, ``alignment_table``,
            ``fertility_table``, ``p1``, ``head_distortion_table``,
            ``non_head_distortion_table``, ``head_vacancy_table``,
            ``non_head_vacancy_table``. See ``IBMModel``, ``IBMModel4``,
            and ``IBMModel5`` for the type and purpose of these tables.
        :type probability_tables: dict[str]: object
        """
        super().__init__(sentence_aligned_corpus)
        self.reset_probabilities()
        self.src_classes = source_word_classes
        self.trg_classes = target_word_classes

        if probability_tables is None:
            # Get probabilities from IBM model 4
            ibm4 = IBMModel4(
                sentence_aligned_corpus,
                iterations,
                source_word_classes,
                target_word_classes,
            )
            self.translation_table = ibm4.translation_table
            self.alignment_table = ibm4.alignment_table
            self.fertility_table = ibm4.fertility_table
            self.p1 = ibm4.p1
            self.head_distortion_table = ibm4.head_distortion_table
            self.non_head_distortion_table = ibm4.non_head_distortion_table
            self.set_uniform_probabilities(sentence_aligned_corpus)
        else:
            # Set user-defined probabilities
            self.translation_table = probability_tables["translation_table"]
            self.alignment_table = probability_tables["alignment_table"]
            self.fertility_table = probability_tables["fertility_table"]
            self.p1 = probability_tables["p1"]
            self.head_distortion_table = probability_tables["head_distortion_table"]
            self.non_head_distortion_table = probability_tables[
                "non_head_distortion_table"
            ]
            self.head_vacancy_table = probability_tables["head_vacancy_table"]
            self.non_head_vacancy_table = probability_tables["non_head_vacancy_table"]

        for n in range(0, iterations):
            self.train(sentence_aligned_corpus)

    def reset_probabilities(self):
        super().reset_probabilities()
        self.head_vacancy_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
        )
        """
        dict[int][int][int]: float. Probability(vacancy difference |
        number of remaining valid positions,target word class).
        Values accessed as ``head_vacancy_table[dv][max_v][trg_class]``.
        """

        self.non_head_vacancy_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
        )
        """
        dict[int][int][int]: float. Probability(vacancy difference |
        number of remaining valid positions,target word class).
        Values accessed as ``non_head_vacancy_table[dv][max_v][trg_class]``.
        """

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        """
        Set vacancy probabilities uniformly to
        1 / cardinality of vacancy difference values
        """
        max_m = longest_target_sentence_length(sentence_aligned_corpus)

        # The maximum vacancy difference occurs when a word is placed in
        # the last available position m of the target sentence and the
        # previous word position has no vacancies.
        # The minimum is 1-max_v, when a word is placed in the first
        # available position and the previous word is placed beyond the
        # last available position.
        # Thus, the number of possible vacancy difference values is
        # (max_v) - (1-max_v) + 1 = 2 * max_v.
        if max_m > 0 and (1 / (2 * max_m)) < IBMModel.MIN_PROB:
            warnings.warn(
                "A target sentence is too long ("
                + str(max_m)
                + " words). Results may be less accurate."
            )

        for max_v in range(1, max_m + 1):
            for dv in range(1, max_m + 1):
                initial_prob = 1 / (2 * max_v)
                # Bind initial_prob at definition time (default argument)
                # so every default factory keeps its own max_v's value
                self.head_vacancy_table[dv][max_v] = defaultdict(
                    lambda prob=initial_prob: prob
                )
                self.head_vacancy_table[-(dv - 1)][max_v] = defaultdict(
                    lambda prob=initial_prob: prob
                )
                self.non_head_vacancy_table[dv][max_v] = defaultdict(
                    lambda prob=initial_prob: prob
                )
                self.non_head_vacancy_table[-(dv - 1)][max_v] = defaultdict(
                    lambda prob=initial_prob: prob
                )

    def train(self, parallel_corpus):
        counts = Model5Counts()
        for aligned_sentence in parallel_corpus:
            l = len(aligned_sentence.mots)
            m = len(aligned_sentence.words)

            # Sample the alignment space
            sampled_alignments, best_alignment = self.sample(aligned_sentence)
            # Record the most probable alignment
            aligned_sentence.alignment = Alignment(
                best_alignment.zero_indexed_alignment()
            )

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_of_alignments(sampled_alignments)

            # E step (b): Collect counts
            for alignment_info in sampled_alignments:
                count = self.prob_t_a_given_s(alignment_info)
                normalized_count = count / total_count

                for j in range(1, m + 1):
                    counts.update_lexical_translation(
                        normalized_count, alignment_info, j
                    )

                slots = Slots(m)
                for i in range(1, l + 1):
                    counts.update_vacancy(
                        normalized_count, alignment_info, i, self.trg_classes, slots
                    )

                counts.update_null_generation(normalized_count, alignment_info)
                counts.update_fertility(normalized_count, alignment_info)

        # M step: Update probabilities with maximum likelihood estimates
        # If any probability is less than MIN_PROB, clamp it to MIN_PROB
        existing_alignment_table = self.alignment_table
        self.reset_probabilities()
        self.alignment_table = existing_alignment_table  # don't retrain

        self.maximize_lexical_translation_probabilities(counts)
        self.maximize_vacancy_probabilities(counts)
        self.maximize_fertility_probabilities(counts)
        self.maximize_null_generation_probabilities(counts)

    def sample(self, sentence_pair):
        """
        Sample the most probable alignments from the entire alignment
        space according to Model 4

        Note that Model 4 scoring is used instead of Model 5 because the
        latter is too expensive to compute.

        First, determine the best alignment according to IBM Model 2.
        With this initial alignment, use hill climbing to determine the
        best alignment according to IBM Model 4. Add this
        alignment and its neighbors to the sample set. Repeat this
        process with other initial alignments obtained by pegging an
        alignment point. Finally, prune alignments that have
        substantially lower Model 4 scores than the best alignment.

        :param sentence_pair: Source and target language sentence pair
            to generate a sample of alignments from
        :type sentence_pair: AlignedSent

        :return: A set of best alignments represented by their ``AlignmentInfo``
            and the best alignment of the set for convenience
        :rtype: set(AlignmentInfo), AlignmentInfo
        """
        sampled_alignments, best_alignment = super().sample(sentence_pair)
        return self.prune(sampled_alignments), best_alignment

    def prune(self, alignment_infos):
        """
        Removes alignments from ``alignment_infos`` that have
        substantially lower Model 4 scores than the best alignment

        :return: Pruned alignments
        :rtype: set(AlignmentInfo)
        """
        alignments = []
        best_score = 0

        for alignment_info in alignment_infos:
            score = IBMModel4.model4_prob_t_a_given_s(alignment_info, self)
            best_score = max(score, best_score)
            alignments.append((alignment_info, score))

        threshold = IBMModel5.MIN_SCORE_FACTOR * best_score
        alignments = [a[0] for a in alignments if a[1] > threshold]
        return set(alignments)

    def hillclimb(self, alignment_info, j_pegged=None):
        """
        Starting from the alignment in ``alignment_info``, look at
        neighboring alignments iteratively for the best one, according
        to Model 4

        Note that Model 4 scoring is used instead of Model 5 because the
        latter is too expensive to compute.

        There is no guarantee that the best alignment in the alignment
        space will be found, because the algorithm might be stuck in a
        local maximum.

        :param j_pegged: If specified, the search will be constrained to
            alignments where ``j_pegged`` remains unchanged
        :type j_pegged: int

        :return: The best alignment found from hill climbing
        :rtype: AlignmentInfo
        """
        alignment = alignment_info  # alias with shorter name
        max_probability = IBMModel4.model4_prob_t_a_given_s(alignment, self)

        while True:
            old_alignment = alignment
            for neighbor_alignment in self.neighboring(alignment, j_pegged):
                neighbor_probability = IBMModel4.model4_prob_t_a_given_s(
                    neighbor_alignment, self
                )

                if neighbor_probability > max_probability:
                    alignment = neighbor_alignment
                    max_probability = neighbor_probability

            if alignment == old_alignment:
                # Until there are no better alignments
                break

        alignment.score = max_probability
        return alignment

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence
        """
        probability = 1.0
        MIN_PROB = IBMModel.MIN_PROB
        slots = Slots(len(alignment_info.trg_sentence) - 1)

        def null_generation_term():
            # Binomial distribution: B(m - null_fertility, p1)
            value = 1.0
            p1 = self.p1
            p0 = 1 - p1
            null_fertility = alignment_info.fertility_of_i(0)
            m = len(alignment_info.trg_sentence) - 1
            value *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility)
            if value < MIN_PROB:
                return MIN_PROB

            # Combination: (m - null_fertility) choose null_fertility
            for i in range(1, null_fertility + 1):
                value *= (m - null_fertility - i + 1) / i
            return value

        def fertility_term():
            value = 1.0
            src_sentence = alignment_info.src_sentence
            for i in range(1, len(src_sentence)):
                fertility = alignment_info.fertility_of_i(i)
                value *= (
                    factorial(fertility)
                    * self.fertility_table[fertility][src_sentence[i]]
                )
                if value < MIN_PROB:
                    return MIN_PROB
            return value

        def lexical_translation_term(j):
            t = alignment_info.trg_sentence[j]
            i = alignment_info.alignment[j]
            s = alignment_info.src_sentence[i]
            return self.translation_table[t][s]

        def vacancy_term(i):
            value = 1.0
            tablet = alignment_info.cepts[i]
            tablet_length = len(tablet)
            total_vacancies = slots.vacancies_at(len(slots))

            # case 1: NULL-aligned words
            if tablet_length == 0:
                return value

            # case 2: head word
            j = tablet[0]
            previous_cept = alignment_info.previous_cept(j)
            previous_center = alignment_info.center_of_cept(previous_cept)
            dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center)
            max_v = total_vacancies - tablet_length + 1
            trg_class = self.trg_classes[alignment_info.trg_sentence[j]]
            value *= self.head_vacancy_table[dv][max_v][trg_class]
            slots.occupy(j)  # mark position as occupied
            total_vacancies -= 1
            if value < MIN_PROB:
                return MIN_PROB

            # case 3: non-head words
            for k in range(1, tablet_length):
                previous_position = tablet[k - 1]
                previous_vacancies = slots.vacancies_at(previous_position)
                j = tablet[k]
                dv = slots.vacancies_at(j) - previous_vacancies
                max_v = total_vacancies - tablet_length + k + 1 - previous_vacancies
                trg_class = self.trg_classes[alignment_info.trg_sentence[j]]
                value *= self.non_head_vacancy_table[dv][max_v][trg_class]
                slots.occupy(j)  # mark position as occupied
                total_vacancies -= 1
                if value < MIN_PROB:
                    return MIN_PROB

            return value

        # end nested functions

        # Abort computation whenever probability falls below MIN_PROB at
        # any point, since MIN_PROB can be considered as zero
        probability *= null_generation_term()
        if probability < MIN_PROB:
            return MIN_PROB

        probability *= fertility_term()
        if probability < MIN_PROB:
            return MIN_PROB

        for j in range(1, len(alignment_info.trg_sentence)):
            probability *= lexical_translation_term(j)
            if probability < MIN_PROB:
                return MIN_PROB

        for i in range(1, len(alignment_info.src_sentence)):
            probability *= vacancy_term(i)
            if probability < MIN_PROB:
                return MIN_PROB

        return probability
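
    # Note (added; not part of the NLTK source): ``vacancy_term`` mutates
    # ``slots`` as it places words, so the loop above must evaluate it for
    # i = 1, 2, ..., l in increasing source order.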

    def maximize_vacancy_probabilities(self, counts):
        MIN_PROB = IBMModel.MIN_PROB
        head_vacancy_table = self.head_vacancy_table
        for dv, max_vs in counts.head_vacancy.items():
            for max_v, trg_classes in max_vs.items():
                for t_cls in trg_classes:
                    estimate = (
                        counts.head_vacancy[dv][max_v][t_cls]
                        / counts.head_vacancy_for_any_dv[max_v][t_cls]
                    )
                    head_vacancy_table[dv][max_v][t_cls] = max(estimate, MIN_PROB)

        non_head_vacancy_table = self.non_head_vacancy_table
        for dv, max_vs in counts.non_head_vacancy.items():
            for max_v, trg_classes in max_vs.items():
                for t_cls in trg_classes:
                    estimate = (
                        counts.non_head_vacancy[dv][max_v][t_cls]
                        / counts.non_head_vacancy_for_any_dv[max_v][t_cls]
                    )
                    non_head_vacancy_table[dv][max_v][t_cls] = max(estimate, MIN_PROB)

class Model5Counts(Counts):
    """
    Data object to store counts of various parameters during training.
    Includes counts for vacancies.
    """

    def __init__(self):
        super().__init__()
        self.head_vacancy = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
        self.head_vacancy_for_any_dv = defaultdict(lambda: defaultdict(float))
        self.non_head_vacancy = defaultdict(
            lambda: defaultdict(lambda: defaultdict(float))
        )
        self.non_head_vacancy_for_any_dv = defaultdict(lambda: defaultdict(float))

    def update_vacancy(self, count, alignment_info, i, trg_classes, slots):
        """
        :param count: Value to add to the vacancy counts
        :param alignment_info: Alignment under consideration
        :param i: Source word position under consideration
        :param trg_classes: Target word classes
        :param slots: Vacancy states of the slots in the target sentence.
            Output parameter that will be modified as new words are placed
            in the target sentence.
        """
        tablet = alignment_info.cepts[i]
        tablet_length = len(tablet)
        total_vacancies = slots.vacancies_at(len(slots))

        # case 1: NULL-aligned words
        if tablet_length == 0:
            return  # ignore zero fertility words

        # case 2: head word
        j = tablet[0]
        previous_cept = alignment_info.previous_cept(j)
        previous_center = alignment_info.center_of_cept(previous_cept)
        dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center)
        max_v = total_vacancies - tablet_length + 1
        trg_class = trg_classes[alignment_info.trg_sentence[j]]
        self.head_vacancy[dv][max_v][trg_class] += count
        self.head_vacancy_for_any_dv[max_v][trg_class] += count
        slots.occupy(j)  # mark position as occupied
        total_vacancies -= 1

        # case 3: non-head words
        for k in range(1, tablet_length):
            previous_position = tablet[k - 1]
            previous_vacancies = slots.vacancies_at(previous_position)
            j = tablet[k]
            dv = slots.vacancies_at(j) - previous_vacancies
            max_v = total_vacancies - tablet_length + k + 1 - previous_vacancies
            trg_class = trg_classes[alignment_info.trg_sentence[j]]
            self.non_head_vacancy[dv][max_v][trg_class] += count
            self.non_head_vacancy_for_any_dv[max_v][trg_class] += count
            slots.occupy(j)  # mark position as occupied
            total_vacancies -= 1


class Slots:
    """
    Represents positions in a target sentence. Used to keep track of
    which slot (position) is occupied.
    """

    def __init__(self, target_sentence_length):
        self._slots = [False] * (target_sentence_length + 1)  # 1-indexed

    def occupy(self, position):
        """
        Mark the slot at ``position`` as occupied
        """
        self._slots[position] = True

    def vacancies_at(self, position):
        """
        :return: Number of vacant slots up to, and including, ``position``
        """
        vacancies = 0
        for k in range(1, position + 1):
            if not self._slots[k]:
                vacancies += 1
        return vacancies

    def __len__(self):
        return len(self._slots) - 1  # exclude dummy zeroth element
@@ -0,0 +1,549 @@
# Natural Language Toolkit: IBM Model Core
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Common methods and classes for all IBM models. See ``IBMModel1``,
``IBMModel2``, ``IBMModel3``, ``IBMModel4``, and ``IBMModel5``
for specific implementations.

The IBM models are a series of generative models that learn lexical
translation probabilities, p(target language word|source language word),
given a sentence-aligned parallel corpus.

The models increase in sophistication from model 1 to 5. Typically, the
output of lower models is used to seed the higher models. All models
use the Expectation-Maximization (EM) algorithm to learn various
probability tables.

Words in a sentence are one-indexed. The first word of a sentence has
position 1, not 0. Index 0 is reserved in the source sentence for the
NULL token. The concept of position does not apply to NULL, but it is
indexed at 0 by convention.

Each target word is aligned to exactly one source word or the NULL
token.

References:
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""

from bisect import insort_left
from collections import defaultdict
from copy import deepcopy
from math import ceil
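
# Illustrative sketch (added; not part of the NLTK source): the 1-indexed
# sentence convention used throughout these models. The words are made up.
#
#     src_sentence = (None, 'the', 'house')     # index 0 is the NULL token
#     trg_sentence = ('UNUSED', 'das', 'haus')  # index 0 is a dummy
#     alignment = (0, 1, 2)   # alignment[j] = i; alignment[0] is unused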


def longest_target_sentence_length(sentence_aligned_corpus):
    """
    :param sentence_aligned_corpus: Parallel corpus under consideration
    :type sentence_aligned_corpus: list(AlignedSent)
    :return: Number of words in the longest target language sentence
        of ``sentence_aligned_corpus``
    """
    max_m = 0
    for aligned_sentence in sentence_aligned_corpus:
        m = len(aligned_sentence.words)
        max_m = max(m, max_m)
    return max_m


class IBMModel:
    """
    Abstract base class for all IBM models
    """

    # Avoid division by zero and precision errors by imposing a minimum
    # value for probabilities. Note that this approach is theoretically
    # incorrect, since it may create probabilities that sum to more
    # than 1. In practice, the contribution of probabilities with MIN_PROB
    # is tiny enough that the value of MIN_PROB can be treated as zero.
    MIN_PROB = 1.0e-12  # GIZA++ is more liberal and uses 1.0e-7

    def __init__(self, sentence_aligned_corpus):
        self.init_vocab(sentence_aligned_corpus)
        self.reset_probabilities()

    def reset_probabilities(self):
        self.translation_table = defaultdict(
            lambda: defaultdict(lambda: IBMModel.MIN_PROB)
        )
        """
        dict[str][str]: float. Probability(target word | source word).
        Values accessed as ``translation_table[target_word][source_word]``.
        """

        self.alignment_table = defaultdict(
            lambda: defaultdict(
                lambda: defaultdict(lambda: defaultdict(lambda: IBMModel.MIN_PROB))
            )
        )
        """
        dict[int][int][int][int]: float. Probability(i | j,l,m).
        Values accessed as ``alignment_table[i][j][l][m]``.
        Used in model 2 and hill climbing in models 3 and above
        """

        self.fertility_table = defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
        """
        dict[int][str]: float. Probability(fertility | source word).
        Values accessed as ``fertility_table[fertility][source_word]``.
        Used in model 3 and higher.
        """

        self.p1 = 0.5
        """
        Probability that a generated word requires another target word
        that is aligned to NULL.
        Used in model 3 and higher.
        """
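
    # Illustrative sketch (added; not part of the NLTK source): because the
    # tables are nested defaultdicts, unseen events silently fall back to
    # MIN_PROB. The words shown are made up.
    #
    #     model.translation_table['haus']['window']  # -> 1.0e-12 (MIN_PROB)
    #     model.translation_table['haus']['house'] = 0.8  # learned estimate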

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        """
        Initialize probability tables to a uniform distribution

        Derived classes should implement this accordingly.
        """
        pass

    def init_vocab(self, sentence_aligned_corpus):
        src_vocab = set()
        trg_vocab = set()
        for aligned_sentence in sentence_aligned_corpus:
            trg_vocab.update(aligned_sentence.words)
            src_vocab.update(aligned_sentence.mots)
        # Add the NULL token
        src_vocab.add(None)

        self.src_vocab = src_vocab
        """
        set(str): All source language words used in training
        """

        self.trg_vocab = trg_vocab
        """
        set(str): All target language words used in training
        """

    def sample(self, sentence_pair):
        """
        Sample the most probable alignments from the entire alignment
        space

        First, determine the best alignment according to IBM Model 2.
        With this initial alignment, use hill climbing to determine the
        best alignment according to a higher IBM Model. Add this
        alignment and its neighbors to the sample set. Repeat this
        process with other initial alignments obtained by pegging an
        alignment point.

        Hill climbing may be stuck in a local maximum, hence the pegging
        and trying out of different alignments.

        :param sentence_pair: Source and target language sentence pair
            to generate a sample of alignments from
        :type sentence_pair: AlignedSent

        :return: A set of best alignments represented by their ``AlignmentInfo``
            and the best alignment of the set for convenience
        :rtype: set(AlignmentInfo), AlignmentInfo
        """
        sampled_alignments = set()
        l = len(sentence_pair.mots)
        m = len(sentence_pair.words)

        # Start from the best model 2 alignment
        initial_alignment = self.best_model2_alignment(sentence_pair)
        potential_alignment = self.hillclimb(initial_alignment)
        sampled_alignments.update(self.neighboring(potential_alignment))
        best_alignment = potential_alignment

        # Start from other model 2 alignments,
        # with the constraint that j is aligned (pegged) to i
        for j in range(1, m + 1):
            for i in range(0, l + 1):
                initial_alignment = self.best_model2_alignment(sentence_pair, j, i)
                potential_alignment = self.hillclimb(initial_alignment, j)
                neighbors = self.neighboring(potential_alignment, j)
                sampled_alignments.update(neighbors)
                if potential_alignment.score > best_alignment.score:
                    best_alignment = potential_alignment

        return sampled_alignments, best_alignment

    def best_model2_alignment(self, sentence_pair, j_pegged=None, i_pegged=0):
        """
        Finds the best alignment according to IBM Model 2

        Used as a starting point for hill climbing in Models 3 and
        above, because it is easier to compute than the best alignments
        in higher models

        :param sentence_pair: Source and target language sentence pair
            to be word-aligned
        :type sentence_pair: AlignedSent

        :param j_pegged: If specified, the alignment point of j_pegged
            will be fixed to i_pegged
        :type j_pegged: int

        :param i_pegged: Alignment point to j_pegged
        :type i_pegged: int
        """
        src_sentence = [None] + sentence_pair.mots
        trg_sentence = ["UNUSED"] + sentence_pair.words  # 1-indexed

        l = len(src_sentence) - 1  # exclude NULL
        m = len(trg_sentence) - 1

        alignment = [0] * (m + 1)  # init all alignments to NULL
        cepts = [[] for i in range(l + 1)]  # init all cepts to empty list

        for j in range(1, m + 1):
            if j == j_pegged:
                # use the pegged alignment instead of searching for best one
                best_i = i_pegged
            else:
                best_i = 0
                max_alignment_prob = IBMModel.MIN_PROB
                t = trg_sentence[j]

                for i in range(0, l + 1):
                    s = src_sentence[i]
                    alignment_prob = (
                        self.translation_table[t][s] * self.alignment_table[i][j][l][m]
                    )

                    if alignment_prob >= max_alignment_prob:
                        max_alignment_prob = alignment_prob
                        best_i = i

            alignment[j] = best_i
            cepts[best_i].append(j)

        return AlignmentInfo(
            tuple(alignment), tuple(src_sentence), tuple(trg_sentence), cepts
        )

    def hillclimb(self, alignment_info, j_pegged=None):
        """
        Starting from the alignment in ``alignment_info``, look at
        neighboring alignments iteratively for the best one

        There is no guarantee that the best alignment in the alignment
        space will be found, because the algorithm might be stuck in a
        local maximum.

        :param j_pegged: If specified, the search will be constrained to
            alignments where ``j_pegged`` remains unchanged
        :type j_pegged: int

        :return: The best alignment found from hill climbing
        :rtype: AlignmentInfo
        """
        alignment = alignment_info  # alias with shorter name
        max_probability = self.prob_t_a_given_s(alignment)

        while True:
            old_alignment = alignment
            for neighbor_alignment in self.neighboring(alignment, j_pegged):
                neighbor_probability = self.prob_t_a_given_s(neighbor_alignment)

                if neighbor_probability > max_probability:
                    alignment = neighbor_alignment
                    max_probability = neighbor_probability

            if alignment == old_alignment:
                # Until there are no better alignments
                break

        alignment.score = max_probability
        return alignment

    def neighboring(self, alignment_info, j_pegged=None):
        """
        Determine the neighbors of ``alignment_info``, obtained by
        moving or swapping one alignment point

        :param j_pegged: If specified, neighbors that have a different
            alignment point from j_pegged will not be considered
        :type j_pegged: int

        :return: A set of neighboring alignments represented by their
            ``AlignmentInfo``
        :rtype: set(AlignmentInfo)
        """
        neighbors = set()

        l = len(alignment_info.src_sentence) - 1  # exclude NULL
        m = len(alignment_info.trg_sentence) - 1
        original_alignment = alignment_info.alignment
        original_cepts = alignment_info.cepts

        for j in range(1, m + 1):
            if j != j_pegged:
                # Add alignments that differ by one alignment point
                for i in range(0, l + 1):
                    new_alignment = list(original_alignment)
                    new_cepts = deepcopy(original_cepts)
                    old_i = original_alignment[j]

                    # update alignment
                    new_alignment[j] = i

                    # update cepts
                    insort_left(new_cepts[i], j)
                    new_cepts[old_i].remove(j)

                    new_alignment_info = AlignmentInfo(
                        tuple(new_alignment),
                        alignment_info.src_sentence,
                        alignment_info.trg_sentence,
                        new_cepts,
                    )
                    neighbors.add(new_alignment_info)

        for j in range(1, m + 1):
            if j != j_pegged:
                # Add alignments that have two alignment points swapped
                for other_j in range(1, m + 1):
                    if other_j != j_pegged and other_j != j:
                        new_alignment = list(original_alignment)
                        new_cepts = deepcopy(original_cepts)
                        other_i = original_alignment[other_j]
                        i = original_alignment[j]

                        # update alignments
                        new_alignment[j] = other_i
                        new_alignment[other_j] = i

                        # update cepts
                        new_cepts[other_i].remove(other_j)
                        insort_left(new_cepts[other_i], j)
                        new_cepts[i].remove(j)
                        insort_left(new_cepts[i], other_j)

                        new_alignment_info = AlignmentInfo(
                            tuple(new_alignment),
                            alignment_info.src_sentence,
                            alignment_info.trg_sentence,
                            new_cepts,
                        )
                        neighbors.add(new_alignment_info)

        return neighbors

def maximize_lexical_translation_probabilities(self, counts):
|
||||
for t, src_words in counts.t_given_s.items():
|
||||
for s in src_words:
|
||||
estimate = counts.t_given_s[t][s] / counts.any_t_given_s[s]
|
||||
self.translation_table[t][s] = max(estimate, IBMModel.MIN_PROB)
|
||||
|
||||
def maximize_fertility_probabilities(self, counts):
|
||||
for phi, src_words in counts.fertility.items():
|
||||
for s in src_words:
|
||||
estimate = counts.fertility[phi][s] / counts.fertility_for_any_phi[s]
|
||||
self.fertility_table[phi][s] = max(estimate, IBMModel.MIN_PROB)
|
||||
|
||||
def maximize_null_generation_probabilities(self, counts):
|
||||
p1_estimate = counts.p1 / (counts.p1 + counts.p0)
|
||||
p1_estimate = max(p1_estimate, IBMModel.MIN_PROB)
|
||||
# Clip p1 if it is too large, because p0 = 1 - p1 should not be
|
||||
# smaller than MIN_PROB
|
||||
self.p1 = min(p1_estimate, 1 - IBMModel.MIN_PROB)
|
||||
|
||||
def prob_of_alignments(self, alignments):
|
||||
probability = 0
|
||||
for alignment_info in alignments:
|
||||
probability += self.prob_t_a_given_s(alignment_info)
|
||||
return probability
|
||||
|
||||
def prob_t_a_given_s(self, alignment_info):
|
||||
"""
|
||||
Probability of target sentence and an alignment given the
|
||||
source sentence
|
||||
|
||||
All required information is assumed to be in ``alignment_info``
|
||||
and self.
|
||||
|
||||
Derived classes should override this method
|
||||
"""
|
||||
return 0.0
|
||||
|
||||
|
||||
class AlignmentInfo:
|
||||
"""
|
||||
Helper data object for training IBM Models 3 and up
|
||||
|
||||
Read-only. For a source sentence and its counterpart in the target
|
||||
language, this class holds information about the sentence pair's
|
||||
alignment, cepts, and fertility.
|
||||
|
||||
Warning: Alignments are one-indexed here, in contrast to
|
||||
nltk.translate.Alignment and AlignedSent, which are zero-indexed
|
||||
This class is not meant to be used outside of IBM models.
|
||||
"""
|
||||
|
||||
def __init__(self, alignment, src_sentence, trg_sentence, cepts):
|
||||
if not isinstance(alignment, tuple):
|
||||
raise TypeError(
|
||||
"The alignment must be a tuple because it is used "
|
||||
"to uniquely identify AlignmentInfo objects."
|
||||
)
|
||||
|
||||
self.alignment = alignment
|
||||
"""
|
||||
tuple(int): Alignment function. ``alignment[j]`` is the position
|
||||
in the source sentence that is aligned to the position j in the
|
||||
target sentence.
|
||||
"""
|
||||
|
||||
self.src_sentence = src_sentence
|
||||
"""
|
||||
tuple(str): Source sentence referred to by this object.
|
||||
Should include NULL token (None) in index 0.
|
||||
"""
|
||||
|
||||
self.trg_sentence = trg_sentence
|
||||
"""
|
||||
tuple(str): Target sentence referred to by this object.
|
||||
Should have a dummy element in index 0 so that the first word
|
||||
starts from index 1.
|
||||
"""
|
||||
|
||||
self.cepts = cepts
|
||||
"""
|
||||
list(list(int)): The positions of the target words, in
|
||||
ascending order, aligned to a source word position. For example,
|
||||
cepts[4] = (2, 3, 7) means that words in positions 2, 3 and 7
|
||||
of the target sentence are aligned to the word in position 4 of
|
||||
the source sentence
|
||||
"""
|
||||
|
||||
self.score = None
|
||||
"""
|
||||
float: Optional. Probability of alignment, as defined by the
|
||||
IBM model that assesses this alignment
|
||||
"""
|
||||
|
||||
def fertility_of_i(self, i):
|
||||
"""
|
||||
Fertility of word in position ``i`` of the source sentence
|
||||
"""
|
||||
return len(self.cepts[i])
|
||||
|
||||
def is_head_word(self, j):
|
||||
"""
|
||||
:return: Whether the word in position ``j`` of the target
|
||||
sentence is a head word
|
||||
"""
|
||||
i = self.alignment[j]
|
||||
return self.cepts[i][0] == j
|
||||
|
||||
def center_of_cept(self, i):
|
||||
"""
|
||||
:return: The ceiling of the average positions of the words in
|
||||
the tablet of cept ``i``, or 0 if ``i`` is None
|
||||
"""
|
||||
if i is None:
|
||||
return 0
|
||||
|
||||
average_position = sum(self.cepts[i]) / len(self.cepts[i])
|
||||
return int(ceil(average_position))
|
||||
|
||||
def previous_cept(self, j):
|
||||
"""
|
||||
:return: The previous cept of ``j``, or None if ``j`` belongs to
|
||||
the first cept
|
||||
"""
|
||||
i = self.alignment[j]
|
||||
if i == 0:
|
||||
raise ValueError(
|
||||
"Words aligned to NULL cannot have a previous "
|
||||
"cept because NULL has no position"
|
||||
)
|
||||
previous_cept = i - 1
|
||||
while previous_cept > 0 and self.fertility_of_i(previous_cept) == 0:
|
||||
previous_cept -= 1
|
||||
|
||||
if previous_cept <= 0:
|
||||
previous_cept = None
|
||||
return previous_cept
|
||||
|
||||
def previous_in_tablet(self, j):
|
||||
"""
|
||||
:return: The position of the previous word that is in the same
|
||||
tablet as ``j``, or None if ``j`` is the first word of the
|
||||
tablet
|
||||
"""
|
||||
i = self.alignment[j]
|
||||
tablet_position = self.cepts[i].index(j)
|
||||
if tablet_position == 0:
|
||||
return None
|
||||
return self.cepts[i][tablet_position - 1]
|
||||
|
||||
def zero_indexed_alignment(self):
|
||||
"""
|
||||
:return: Zero-indexed alignment, suitable for use in external
|
||||
``nltk.translate`` modules like ``nltk.translate.Alignment``
|
||||
:rtype: list(tuple)
|
||||
"""
|
||||
zero_indexed_alignment = []
|
||||
for j in range(1, len(self.trg_sentence)):
|
||||
i = self.alignment[j] - 1
|
||||
if i < 0:
|
||||
i = None # alignment to NULL token
|
||||
zero_indexed_alignment.append((j - 1, i))
|
||||
return zero_indexed_alignment
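
    # Illustrative sketch (not part of the original module, and this class is
    # not meant for external use): a 1-indexed alignment over
    # src = (None, 'ich', 'sehe') and trg = ('UNUSED', 'i', 'see'),
    # aligning 'i' to 'ich' and 'see' to 'sehe', converted to the
    # zero-indexed form used by nltk.translate.Alignment:
    #
    #     >>> info = AlignmentInfo((0, 1, 2), (None, 'ich', 'sehe'),
    #     ...                      ('UNUSED', 'i', 'see'), [[], [1], [2]])
    #     >>> info.zero_indexed_alignment()
    #     [(0, 0), (1, 1)]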

    def __eq__(self, other):
        return self.alignment == other.alignment

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        return hash(self.alignment)


class Counts:
    """
    Data object to store counts of various parameters during training
    """

    def __init__(self):
        self.t_given_s = defaultdict(lambda: defaultdict(float))
        self.any_t_given_s = defaultdict(float)
        self.p0 = 0.0
        self.p1 = 0.0
        self.fertility = defaultdict(lambda: defaultdict(float))
        self.fertility_for_any_phi = defaultdict(float)

    def update_lexical_translation(self, count, alignment_info, j):
        i = alignment_info.alignment[j]
        t = alignment_info.trg_sentence[j]
        s = alignment_info.src_sentence[i]
        self.t_given_s[t][s] += count
        self.any_t_given_s[s] += count

    def update_null_generation(self, count, alignment_info):
        m = len(alignment_info.trg_sentence) - 1
        fertility_of_null = alignment_info.fertility_of_i(0)
        self.p1 += fertility_of_null * count
        self.p0 += (m - 2 * fertility_of_null) * count

    def update_fertility(self, count, alignment_info):
        for i in range(0, len(alignment_info.src_sentence)):
            s = alignment_info.src_sentence[i]
            phi = alignment_info.fertility_of_i(i)
            self.fertility[phi][s] += count
            self.fertility_for_any_phi[s] += count
@@ -0,0 +1,332 @@
# Natural Language Toolkit: LEPOR Score
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Ikram Ul Haq (ulhaqi12)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""LEPOR score implementation."""

import math
import sys
from typing import Callable, List, Tuple

import nltk


def length_penalty(reference: List[str], hypothesis: List[str]) -> float:
    """
    This function calculates the length penalty (LP) for the LEPOR metric,
    which is defined to embrace the penalty for both longer and shorter
    hypotheses compared with the reference translations.
    Refer to Eq. (2) in https://aclanthology.org/C12-2044

    :param reference: Reference sentence tokens
    :type reference: List[str]
    :param hypothesis: Hypothesis sentence tokens
    :type hypothesis: List[str]

    :return: Penalty for the difference in length between the reference
        and hypothesis sentences.
    :rtype: float
    """

    ref_len = len(reference)
    hyp_len = len(hypothesis)

    if ref_len == hyp_len:
        return 1
    elif ref_len < hyp_len:
        return math.exp(1 - (ref_len / hyp_len))
    else:  # i.e. ref_len > hyp_len
        return math.exp(1 - (hyp_len / ref_len))


def alignment(ref_tokens: List[str], hyp_tokens: List[str]):
    """
    This function computes the context-dependent n-gram word alignment task,
    which takes into account the surrounding context (neighbouring words) of
    a potential word in order to select better matching pairs between the
    output and the reference.

    This alignment task is used to compute the n-gram positional difference
    penalty component of the LEPOR score. Generally, the function finds the
    matching tokens between the reference and hypothesis, then finds the
    indices of the longest matching n-grams by checking the left and right
    unigram window of each matching token.

    :param ref_tokens: A list of tokens in the reference sentence.
    :type ref_tokens: List[str]
    :param hyp_tokens: A list of tokens in the hypothesis sentence.
    :type hyp_tokens: List[str]
    """
    alignments = []

    # Store the reference and hypothesis tokens length.
    hyp_len = len(hyp_tokens)
    ref_len = len(ref_tokens)

    for hyp_index, hyp_token in enumerate(hyp_tokens):
        # If no match.
        if ref_tokens.count(hyp_token) == 0:
            alignments.append(-1)
        # If only one match.
        elif ref_tokens.count(hyp_token) == 1:
            alignments.append(ref_tokens.index(hyp_token))
        # Otherwise, compute the multiple possibilities.
        else:
            # Keeps an index of where the hypothesis token matches the reference.
            ref_indexes = [
                i for i, ref_token in enumerate(ref_tokens) if ref_token == hyp_token
            ]

            # Iterate through the matched tokens, and check if
            # the one token to the left/right also matches.
            is_matched = []
            for ind, ref_index in enumerate(ref_indexes):
                # The token one to the left also matches.
                if (
                    0 < ref_index - 1 < ref_len
                    and 0 < hyp_index - 1 < hyp_len
                    and ref_tokens[ref_index - 1] == hyp_tokens[hyp_index - 1]
                ):
                    is_matched.append(True)
                # The token one to the right also matches.
                elif (
                    0 < ref_index + 1 < ref_len
                    and 0 < hyp_index + 1 < hyp_len
                    and ref_tokens[ref_index + 1] == hyp_tokens[hyp_index + 1]
                ):
                    is_matched.append(True)
                # Neither the left nor the right token matches.
                else:
                    is_matched.append(False)

            # Stores the alignments that have matching phrases.
            # If there's only a single matched alignment.
            if is_matched.count(True) == 1:
                alignments.append(ref_indexes[is_matched.index(True)])
            # If there are multiple matched alignments that have matching
            # tokens in the left/right window, shift the index of the
            # alignment to the farthest matching token.
            elif is_matched.count(True) > 1:
                min_distance = 0
                min_index = 0
                for match, ref_index in zip(is_matched, ref_indexes):
                    if match:
                        distance = abs(hyp_index - ref_index)
                        if distance > min_distance:
                            min_distance = distance
                            min_index = ref_index
                alignments.append(min_index)
            # If there are no matched alignments, still keep indexes of
            # the matching tokens without explicitly checking the
            # left/right window.
            else:
                min_distance = 0
                min_index = 0
                for ref_index in ref_indexes:
                    distance = abs(hyp_index - ref_index)
                    if distance > min_distance:
                        min_distance = distance
                        min_index = ref_index
                alignments.append(min_index)

    # The alignments are one-indexed to keep track of the ending slice
    # pointer of the matching n-grams.
    alignments = [a + 1 for a in alignments if a != -1]
    return alignments


def ngram_positional_penalty(
    ref_tokens: List[str], hyp_tokens: List[str]
) -> Tuple[float, int]:
    """
    This function calculates the n-gram position difference penalty (NPosPenal)
    described in the LEPOR paper. The NPosPenal is an exponential of the
    length-normalized n-gram matches between the reference and the hypothesis.

    :param ref_tokens: A list of words in the reference sentence.
    :type ref_tokens: List[str]
    :param hyp_tokens: A list of words in the hypothesis sentence.
    :type hyp_tokens: List[str]

    :return: A tuple containing two elements:
        - NPosPenal: N-gram positional penalty.
        - match_count: Count of matched n-grams.
    :rtype: tuple
    """

    alignments = alignment(ref_tokens, hyp_tokens)
    match_count = len(alignments)

    # Stores the n-gram position values (difference values) of aligned words
    # between output and reference sentences,
    # aka |PD| of Eq. (4) in https://aclanthology.org/C12-2044
    pd = []
    for i, a in enumerate(alignments):
        pd.append(abs((i + 1) / len(hyp_tokens) - a / len(ref_tokens)))

    npd = sum(pd) / len(hyp_tokens)
    return math.exp(-npd), match_count
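
# Worked example (illustrative, not part of the original module): every
# hypothesis token occurs exactly once in the reference, so the one-indexed
# alignment is [1, 3, 2], |PD| = 0 + 1/3 + 1/3, and NPosPenal = exp(-2/9):
#
#     >>> npd, matches = ngram_positional_penalty(['a', 'b', 'c'], ['a', 'c', 'b'])
#     >>> round(npd, 4), matches
#     (0.8007, 3)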


def harmonic(
    match_count: int,
    reference_length: int,
    hypothesis_length: int,
    alpha: float,
    beta: float,
) -> float:
    """
    This function calculates the precision and recall of the matched words
    and combines them into a final score, weighting them with the alpha and
    beta parameters.

    :param match_count: Number of words in the hypothesis aligned with the reference.
    :type match_count: int
    :param reference_length: Length of the reference sentence
    :type reference_length: int
    :param hypothesis_length: Length of the hypothesis sentence
    :type hypothesis_length: int
    :param alpha: A parameter to set the weight for recall.
    :type alpha: float
    :param beta: A parameter to set the weight for precision.
    :type beta: float

    :return: Harmonic mean.
    :rtype: float
    """

    epsilon = sys.float_info.epsilon

    precision = match_count / hypothesis_length
    recall = match_count / reference_length

    harmonic_score = (alpha + beta) / (
        (alpha / (recall + epsilon)) + (beta / (precision + epsilon))
    )

    return harmonic_score
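
# Worked example (illustrative, not part of the original module): with
# alpha = beta = 1 this reduces to the ordinary F1 harmonic mean of
# precision and recall. For 3 matches, a 4-token reference and a 5-token
# hypothesis: precision = 3/5, recall = 3/4, score = 2 / (4/3 + 5/3) = 2/3.
#
#     >>> round(harmonic(3, 4, 5, alpha=1.0, beta=1.0), 4)
#     0.6667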


def sentence_lepor(
    references: List[str],
    hypothesis: str,
    alpha: float = 1.0,
    beta: float = 1.0,
    tokenizer: Callable[[str], List[str]] = None,
) -> List[float]:
    """
    Calculate the LEPOR score for a sentence, from Han, A. L.-F. (2017).
    LEPOR: An Augmented Machine Translation Evaluation Metric. https://arxiv.org/abs/1703.08748v2

    >>> hypothesis = 'a bird is on a stone.'

    >>> reference1 = 'a bird behind the stone.'
    >>> reference2 = 'a bird is on the rock.'

    >>> sentence_lepor([reference1, reference2], hypothesis)
    [0.7824248013113159, 0.7739937377760259]

    :param references: Reference sentences
    :type references: list(str)
    :param hypothesis: Hypothesis sentence
    :type hypothesis: str
    :param alpha: A parameter to set the weight for recall.
    :type alpha: float
    :param beta: A parameter to set the weight for precision.
    :type beta: float
    :param tokenizer: A callable tokenizer that accepts a string and returns a list of tokens.
    :type tokenizer: Callable[[str], List[str]]

    :return: The list of LEPOR scores for the hypothesis against all references.
    :rtype: list(float)

    """

    lepor_scores = list()

    # Tokenize sentences.
    if tokenizer:
        hypothesis = tokenizer(hypothesis)
        for index, reference in enumerate(references):
            references[index] = tokenizer(reference)

    else:  # If a tokenizer is not provided, use the one in NLTK.
        hypothesis = nltk.word_tokenize(hypothesis)
        for index, reference in enumerate(references):
            references[index] = nltk.word_tokenize(reference)

    for reference in references:
        if len(reference) == 0 or len(hypothesis) == 0:
            raise ValueError("One of the sentences is empty.")

        # Calculate the length penalty due to the difference in the lengths
        # of the reference and hypothesis.
        lp = length_penalty(reference, hypothesis)

        # Calculate the penalty on different positions of the same word in the translation.
        npd, match_count = ngram_positional_penalty(reference, hypothesis)

        harmonic_score = harmonic(
            match_count, len(reference), len(hypothesis), alpha, beta
        )

        lepor_scores.append(lp * npd * harmonic_score)

    return lepor_scores


def corpus_lepor(
    references: List[List[str]],
    hypothesis: List[str],
    alpha: float = 1.0,
    beta: float = 1.0,
    tokenizer: Callable[[str], List[str]] = None,
) -> List[List[float]]:
    """
    Calculate the LEPOR score for a list of sentences, from Han, A. L.-F. (2017).
    LEPOR: An Augmented Machine Translation Evaluation Metric. https://arxiv.org/abs/1703.08748v2

    >>> hypothesis = ['a bird is on a stone.', 'scary crow was not bad.']

    >>> references = [['a bird behind the stone.', 'a bird is on the rock'],
    ...               ['scary cow was good.', 'scary crow was elegant.']]

    >>> corpus_lepor(references, hypothesis)
    [[0.7824248013113159, 0.7931427828105261], [0.5639427891892225, 0.7860963170056643]]


    :param references: Reference sentences
    :type references: list(list(str))
    :param hypothesis: Hypothesis sentences
    :type hypothesis: list(str)
    :param alpha: A parameter to set the weight for recall.
    :type alpha: float
    :param beta: A parameter to set the weight for precision.
    :type beta: float
    :param tokenizer: A callable tokenizer that accepts a string and returns a list of tokens.
    :type tokenizer: Callable[[str], List[str]]

    :return: The LEPOR scores, as a list of lists over all sentences.
    :rtype: list(list(float))

    """

    if len(references) == 0 or len(hypothesis) == 0:
        raise ValueError("One of the input lists is empty.")

    assert len(references) == len(
        hypothesis
    ), "The number of hypotheses and their reference(s) should be the same"

    lepor_scores = list()

    for reference_sen, hypothesis_sen in zip(references, hypothesis):
        # Calculate LEPOR for each sentence separately and append to a list.
        lepor_scores.append(
            sentence_lepor(reference_sen, hypothesis_sen, alpha, beta, tokenizer)
        )

    return lepor_scores
@@ -0,0 +1,409 @@
# Natural Language Toolkit: Machine Translation
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Uday Krishna <udaykrishna5@gmail.com>
# Contributor: Tom Aarsen
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT


from itertools import chain, product
from typing import Callable, Iterable, List, Tuple

from nltk.corpus import WordNetCorpusReader, wordnet
from nltk.stem.api import StemmerI
from nltk.stem.porter import PorterStemmer


def _generate_enums(
    hypothesis: Iterable[str],
    reference: Iterable[str],
    preprocess: Callable[[str], str] = str.lower,
) -> Tuple[List[Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Takes in pre-tokenized inputs for hypothesis and reference and returns
    enumerated word lists for each of them.

    :param hypothesis: pre-tokenized hypothesis
    :param reference: pre-tokenized reference
    :param preprocess: preprocessing method (default str.lower)
    :return: enumerated words list
    """
    if isinstance(hypothesis, str):
        raise TypeError(
            f'"hypothesis" expects pre-tokenized hypothesis (Iterable[str]): {hypothesis}'
        )

    if isinstance(reference, str):
        raise TypeError(
            f'"reference" expects pre-tokenized reference (Iterable[str]): {reference}'
        )

    enum_hypothesis_list = list(enumerate(map(preprocess, hypothesis)))
    enum_reference_list = list(enumerate(map(preprocess, reference)))
    return enum_hypothesis_list, enum_reference_list


def exact_match(
    hypothesis: Iterable[str], reference: Iterable[str]
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Matches exact words in the hypothesis and reference
    and returns a word mapping based on the enumerated
    word id between hypothesis and reference.

    :param hypothesis: pre-tokenized hypothesis
    :param reference: pre-tokenized reference
    :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
             enumerated unmatched reference tuples
    """
    enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
    return _match_enums(enum_hypothesis_list, enum_reference_list)


def _match_enums(
    enum_hypothesis_list: List[Tuple[int, str]],
    enum_reference_list: List[Tuple[int, str]],
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Matches exact words in the hypothesis and reference and returns
    a word mapping between enum_hypothesis_list and enum_reference_list
    based on the enumerated word id.

    :param enum_hypothesis_list: enumerated hypothesis list
    :param enum_reference_list: enumerated reference list
    :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
             enumerated unmatched reference tuples
    """
    word_match = []
    for i in range(len(enum_hypothesis_list))[::-1]:
        for j in range(len(enum_reference_list))[::-1]:
            if enum_hypothesis_list[i][1] == enum_reference_list[j][1]:
                word_match.append(
                    (enum_hypothesis_list[i][0], enum_reference_list[j][0])
                )
                enum_hypothesis_list.pop(i)
                enum_reference_list.pop(j)
                break
    return word_match, enum_hypothesis_list, enum_reference_list
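
# Illustrative example (not part of the original module): "cat" matches and
# is popped from both lists; the unmatched entries are returned so that the
# later stem and WordNet stages can try them.
#
#     >>> _match_enums([(0, 'the'), (1, 'cat')], [(0, 'a'), (1, 'cat')])
#     ([(1, 1)], [(0, 'the')], [(0, 'a')])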


def _enum_stem_match(
    enum_hypothesis_list: List[Tuple[int, str]],
    enum_reference_list: List[Tuple[int, str]],
    stemmer: StemmerI = PorterStemmer(),
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Stems each word and matches them in hypothesis and reference
    and returns a word mapping between enum_hypothesis_list and
    enum_reference_list based on the enumerated word id. The function also
    returns an enumerated list of unmatched words for hypothesis and reference.

    :param enum_hypothesis_list: enumerated hypothesis list
    :param enum_reference_list: enumerated reference list
    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
    :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
             enumerated unmatched reference tuples
    """
    stemmed_enum_hypothesis_list = [
        (word_pair[0], stemmer.stem(word_pair[1])) for word_pair in enum_hypothesis_list
    ]

    stemmed_enum_reference_list = [
        (word_pair[0], stemmer.stem(word_pair[1])) for word_pair in enum_reference_list
    ]

    return _match_enums(stemmed_enum_hypothesis_list, stemmed_enum_reference_list)


def stem_match(
    hypothesis: Iterable[str],
    reference: Iterable[str],
    stemmer: StemmerI = PorterStemmer(),
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Stems each word and matches them in hypothesis and reference
    and returns a word mapping between hypothesis and reference.

    :param hypothesis: pre-tokenized hypothesis
    :param reference: pre-tokenized reference
    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
    :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
             enumerated unmatched reference tuples
    """
    enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
    return _enum_stem_match(enum_hypothesis_list, enum_reference_list, stemmer=stemmer)
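
# Illustrative example (not part of the original module): "running" and
# "runs" both stem to "run" under the Porter stemmer and therefore match;
# note that the unmatched remainders are returned in their *stemmed* form
# ("quickly" -> "quickli"):
#
#     >>> stem_match(['running', 'quickly'], ['runs', 'quick'])
#     ([(0, 0)], [(1, 'quickli')], [(1, 'quick')])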


def _enum_wordnetsyn_match(
    enum_hypothesis_list: List[Tuple[int, str]],
    enum_reference_list: List[Tuple[int, str]],
    wordnet: WordNetCorpusReader = wordnet,
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Matches each word in the reference to a word in the hypothesis
    if any synonym of a hypothesis word is the exact match
    to the reference word.

    :param enum_hypothesis_list: enumerated hypothesis list
    :param enum_reference_list: enumerated reference list
    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
    """
    word_match = []
    for i in range(len(enum_hypothesis_list))[::-1]:
        hypothesis_syns = set(
            chain.from_iterable(
                (
                    lemma.name()
                    for lemma in synset.lemmas()
                    if lemma.name().find("_") < 0
                )
                for synset in wordnet.synsets(enum_hypothesis_list[i][1])
            )
        ).union({enum_hypothesis_list[i][1]})
        for j in range(len(enum_reference_list))[::-1]:
            if enum_reference_list[j][1] in hypothesis_syns:
                word_match.append(
                    (enum_hypothesis_list[i][0], enum_reference_list[j][0])
                )
                enum_hypothesis_list.pop(i)
                enum_reference_list.pop(j)
                break
    return word_match, enum_hypothesis_list, enum_reference_list


def wordnetsyn_match(
    hypothesis: Iterable[str],
    reference: Iterable[str],
    wordnet: WordNetCorpusReader = wordnet,
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Matches each word in the reference to a word in the hypothesis if any
    synonym of a hypothesis word is the exact match to the reference word.

    :param hypothesis: pre-tokenized hypothesis
    :param reference: pre-tokenized reference
    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
    :return: list of mapped tuples
    """
    enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
    return _enum_wordnetsyn_match(
        enum_hypothesis_list, enum_reference_list, wordnet=wordnet
    )


def _enum_align_words(
    enum_hypothesis_list: List[Tuple[int, str]],
    enum_reference_list: List[Tuple[int, str]],
    stemmer: StemmerI = PorterStemmer(),
    wordnet: WordNetCorpusReader = wordnet,
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Aligns/matches words in the hypothesis to the reference by sequentially
    applying exact match, stemmed match and WordNet-based synonym match.
    In case there are multiple matches, the match which has the least number
    of crossings is chosen. Takes enumerated lists as input instead of
    string input.

    :param enum_hypothesis_list: enumerated hypothesis list
    :param enum_reference_list: enumerated reference list
    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
    :return: sorted list of matched tuples, unmatched hypothesis list,
             unmatched reference list
    """
    exact_matches, enum_hypothesis_list, enum_reference_list = _match_enums(
        enum_hypothesis_list, enum_reference_list
    )

    stem_matches, enum_hypothesis_list, enum_reference_list = _enum_stem_match(
        enum_hypothesis_list, enum_reference_list, stemmer=stemmer
    )

    wns_matches, enum_hypothesis_list, enum_reference_list = _enum_wordnetsyn_match(
        enum_hypothesis_list, enum_reference_list, wordnet=wordnet
    )

    return (
        sorted(
            exact_matches + stem_matches + wns_matches, key=lambda wordpair: wordpair[0]
        ),
        enum_hypothesis_list,
        enum_reference_list,
    )


def align_words(
    hypothesis: Iterable[str],
    reference: Iterable[str],
    stemmer: StemmerI = PorterStemmer(),
    wordnet: WordNetCorpusReader = wordnet,
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Aligns/matches words in the hypothesis to the reference by sequentially
    applying exact match, stemmed match and WordNet-based synonym match.
    In case there are multiple matches, the match which has the least number
    of crossings is chosen.

    :param hypothesis: pre-tokenized hypothesis
    :param reference: pre-tokenized reference
    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
    :return: sorted list of matched tuples, unmatched hypothesis list, unmatched reference list
    """
    enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
    return _enum_align_words(
        enum_hypothesis_list, enum_reference_list, stemmer=stemmer, wordnet=wordnet
    )


def _count_chunks(matches: List[Tuple[int, int]]) -> int:
    """
    Counts the fewest possible number of chunks such that matched unigrams
    of each chunk are adjacent to each other. This is used to calculate the
    fragmentation part of the metric.

    :param matches: list containing a mapping of matched words (output of align_words)
    :return: Number of chunks a sentence is divided into post alignment
    """
    i = 0
    chunks = 1
    while i < len(matches) - 1:
        if (matches[i + 1][0] == matches[i][0] + 1) and (
            matches[i + 1][1] == matches[i][1] + 1
        ):
            i += 1
            continue
        i += 1
        chunks += 1
    return chunks
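
# Illustrative example (not part of the original module): the matches
# (0, 0) and (1, 1) are mutually adjacent and form one chunk, while (3, 5)
# starts a second chunk:
#
#     >>> _count_chunks([(0, 0), (1, 1), (3, 5)])
#     2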


def single_meteor_score(
    reference: Iterable[str],
    hypothesis: Iterable[str],
    preprocess: Callable[[str], str] = str.lower,
    stemmer: StemmerI = PorterStemmer(),
    wordnet: WordNetCorpusReader = wordnet,
    alpha: float = 0.9,
    beta: float = 3.0,
    gamma: float = 0.5,
) -> float:
    """
    Calculates the METEOR score for a single hypothesis and reference as per
    "Meteor: An Automatic Metric for MT Evaluation with High Levels of
    Correlation with Human Judgments" by Alon Lavie and Abhaya Agarwal,
    in Proceedings of ACL.
    https://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf


    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that', 'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the', 'party']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that', 'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands']


    >>> round(single_meteor_score(reference1, hypothesis1),4)
    0.6944

    If no words match during the alignment, the method returns the
    score as 0. We can safely return a zero instead of raising a
    division by zero error, as no match usually implies a bad translation.

    >>> round(single_meteor_score(['this', 'is', 'a', 'cat'], ['non', 'matching', 'hypothesis']),4)
    0.0

    :param reference: pre-tokenized reference
    :param hypothesis: pre-tokenized hypothesis
    :param preprocess: preprocessing function (default str.lower)
    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
    :param alpha: parameter for controlling relative weights of precision and recall.
    :param beta: parameter for controlling shape of penalty as a
                 function of fragmentation.
    :param gamma: relative weight assigned to fragmentation penalty.
    :return: The sentence-level METEOR score.
    """
    enum_hypothesis, enum_reference = _generate_enums(
        hypothesis, reference, preprocess=preprocess
    )
    translation_length = len(enum_hypothesis)
    reference_length = len(enum_reference)
    matches, _, _ = _enum_align_words(
        enum_hypothesis, enum_reference, stemmer=stemmer, wordnet=wordnet
    )
    matches_count = len(matches)
    try:
        precision = float(matches_count) / translation_length
        recall = float(matches_count) / reference_length
        fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
        chunk_count = float(_count_chunks(matches))
        frag_frac = chunk_count / matches_count
    except ZeroDivisionError:
        return 0.0
    penalty = gamma * frag_frac**beta
    return (1 - penalty) * fmean


def meteor_score(
    references: Iterable[Iterable[str]],
    hypothesis: Iterable[str],
    preprocess: Callable[[str], str] = str.lower,
    stemmer: StemmerI = PorterStemmer(),
    wordnet: WordNetCorpusReader = wordnet,
    alpha: float = 0.9,
    beta: float = 3.0,
    gamma: float = 0.5,
) -> float:
    """
    Calculates the METEOR score for a hypothesis with multiple references as
    described in "Meteor: An Automatic Metric for MT Evaluation with
    High Levels of Correlation with Human Judgments" by Alon Lavie and
    Abhaya Agarwal, in Proceedings of ACL.
    https://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf


    In case of multiple references, the best score is chosen. This method
    iterates over single_meteor_score and picks the best pair among all
    the references for a given hypothesis.

    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that', 'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', 'forever', 'hearing', 'the', 'activity', 'guidebook', 'that', 'party', 'direct']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that', 'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands']
    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', 'guarantees', 'the', 'military', 'forces', 'always', 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 'army', 'always', 'to', 'heed', 'the', 'directions', 'of', 'the', 'party']

    >>> round(meteor_score([reference1, reference2, reference3], hypothesis1),4)
    0.6944

    If no words match during the alignment, the method returns the
    score as 0. We can safely return a zero instead of raising a
    division by zero error, as no match usually implies a bad translation.

    >>> round(meteor_score([['this', 'is', 'a', 'cat']], ['non', 'matching', 'hypothesis']),4)
    0.0

    :param references: pre-tokenized reference sentences
    :param hypothesis: a pre-tokenized hypothesis sentence
    :param preprocess: preprocessing function (default str.lower)
    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
    :param alpha: parameter for controlling relative weights of precision and recall.
    :param beta: parameter for controlling shape of penalty as a
                 function of fragmentation.
    :param gamma: relative weight assigned to fragmentation penalty.
    :return: The sentence-level METEOR score.
    """
    return max(
        single_meteor_score(
            reference,
            hypothesis,
            preprocess=preprocess,
            stemmer=stemmer,
            wordnet=wordnet,
            alpha=alpha,
            beta=beta,
            gamma=gamma,
        )
        for reference in references
    )
@@ -0,0 +1,41 @@
# Natural Language Toolkit: Translation metrics
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Will Zhang <wilzzha@gmail.com>
#         Guan Gui <ggui@student.unimelb.edu.au>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT


def alignment_error_rate(reference, hypothesis, possible=None):
    """
    Return the Alignment Error Rate (AER) of an alignment
    with respect to a "gold standard" reference alignment.
    Return an error rate between 0.0 (perfect alignment) and 1.0 (no
    alignment).

    >>> from nltk.translate import Alignment
    >>> ref = Alignment([(0, 0), (1, 1), (2, 2)])
    >>> test = Alignment([(0, 0), (1, 2), (2, 1)])
    >>> alignment_error_rate(ref, test) # doctest: +ELLIPSIS
    0.6666666666666667

    :type reference: Alignment
    :param reference: A gold standard alignment (sure alignments)
    :type hypothesis: Alignment
    :param hypothesis: A hypothesis alignment (aka. candidate alignments)
    :type possible: Alignment or None
    :param possible: A gold standard reference of possible alignments
        (defaults to *reference* if None)
    :rtype: float or None
    """

    if possible is None:
        possible = reference
    else:
        assert reference.issubset(possible)  # sanity check

    return 1.0 - (len(hypothesis & reference) + len(hypothesis & possible)) / float(
        len(hypothesis) + len(reference)
    )
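
# Illustrative example (not part of the original module): with a larger set
# of "possible" alignments, hypothesised points that are merely possible
# (rather than sure) are not penalised as heavily. Here (1, 2) and (2, 1)
# are possible, so AER drops from 2/3 to 1/3:
#
#     >>> possible = Alignment([(0, 0), (1, 1), (1, 2), (2, 1), (2, 2)])
#     >>> round(alignment_error_rate(ref, test, possible), 4)
#     0.3333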
@@ -0,0 +1,195 @@
# Natural Language Toolkit: NIST Score
#
# Copyright (C) 2001-2025 NLTK Project
# Authors:
# Contributors:
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""NIST score implementation."""

import math
from collections import Counter

from nltk.util import ngrams


def sentence_nist(references, hypothesis, n=5):
    """
    Calculate NIST score from
    George Doddington. 2002. "Automatic evaluation of machine translation quality
    using n-gram co-occurrence statistics." Proceedings of HLT.
    Morgan Kaufmann Publishers Inc. https://dl.acm.org/citation.cfm?id=1289189.1289273

    DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU
    score. The official script used by NIST to compute BLEU and NIST score is
    mteval-14.pl. The main differences are:

     - BLEU uses the geometric mean of the ngram overlaps, NIST uses the arithmetic mean.
     - NIST has a different brevity penalty
     - NIST score from mteval-14.pl has a self-contained tokenizer

    Note: mteval-14.pl includes a smoothing function for the BLEU score that is NOT
    used in the NIST score computation.

    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...               'ensures', 'that', 'the', 'military', 'always',
    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']

    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...               'that', 'party', 'direct']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...               'heed', 'Party', 'commands']

    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']

    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']

    >>> sentence_nist([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
    3.3709...

    >>> sentence_nist([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS
    1.4619...

    :param references: reference sentences
    :type references: list(list(str))
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param n: highest n-gram order
    :type n: int
    """
    return corpus_nist([references], [hypothesis], n)


def corpus_nist(list_of_references, hypotheses, n=5):
    """
    Calculate a single corpus-level NIST score (aka. system-level NIST) for all
    the hypotheses and their respective references.

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param n: highest n-gram order
    :type n: int
    """
    # Before proceeding to compute NIST, perform sanity checks.
    assert len(list_of_references) == len(
        hypotheses
    ), "The number of hypotheses and their reference(s) should be the same"

    # Collect the ngram counts from the reference sentences.
    ngram_freq = Counter()
    total_reference_words = 0
    for (
        references
    ) in list_of_references:  # For each source sent, there's a list of reference sents.
        for reference in references:
            # For each order of ngram, count the ngram occurrences.
            for i in range(1, n + 1):
                ngram_freq.update(ngrams(reference, i))
            total_reference_words += len(reference)

    # Compute the information weights based on the reference sentences.
    # Eqn 2 in Doddington (2002):
    # Info(w_1 ... w_n) = log_2 [ (# of occurrences of w_1 ... w_n-1) / (# of occurrences of w_1 ... w_n) ]
    information_weights = {}
    for _ngram in ngram_freq:  # w_1 ... w_n
        _mgram = _ngram[:-1]  # w_1 ... w_n-1
        # From https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v13a.pl#L546
        # it's computed as such:
        #     denominator = ngram_freq[_mgram] if _mgram and _mgram in ngram_freq \
        #                   else total_reference_words
        #     information_weights[_ngram] = -1 * math.log(ngram_freq[_ngram]/denominator) / math.log(2)
        #
        # Mathematically, that is equivalent to our implementation:
        if _mgram and _mgram in ngram_freq:
            numerator = ngram_freq[_mgram]
        else:
            numerator = total_reference_words
        information_weights[_ngram] = math.log(numerator / ngram_freq[_ngram], 2)

    # Micro-average.
    nist_precision_numerator_per_ngram = Counter()
    nist_precision_denominator_per_ngram = Counter()
    l_ref, l_sys = 0, 0
    # For each order of ngram.
    for i in range(1, n + 1):
        # Iterate through each hypothesis and their corresponding references.
        for references, hypothesis in zip(list_of_references, hypotheses):
            hyp_len = len(hypothesis)

            # Find the reference with the best NIST score.
            nist_score_per_ref = []
            for reference in references:
                _ref_len = len(reference)
                # Counter of ngrams in hypothesis.
                hyp_ngrams = (
                    Counter(ngrams(hypothesis, i))
                    if len(hypothesis) >= i
                    else Counter()
                )
                ref_ngrams = (
                    Counter(ngrams(reference, i)) if len(reference) >= i else Counter()
                )
                ngram_overlaps = hyp_ngrams & ref_ngrams
                # Precision part of the score in Eqn 3
                _numerator = sum(
                    information_weights[_ngram] * count
                    for _ngram, count in ngram_overlaps.items()
                )
                _denominator = sum(hyp_ngrams.values())
                _precision = 0 if _denominator == 0 else _numerator / _denominator
                nist_score_per_ref.append(
                    (_precision, _numerator, _denominator, _ref_len)
                )
            # Best reference.
            precision, numerator, denominator, ref_len = max(nist_score_per_ref)
            nist_precision_numerator_per_ngram[i] += numerator
            nist_precision_denominator_per_ngram[i] += denominator
            l_ref += ref_len
            l_sys += hyp_len

    # Final NIST micro-average mean aggregation.
    nist_precision = 0
    for i in nist_precision_numerator_per_ngram:
        precision = (
            nist_precision_numerator_per_ngram[i]
            / nist_precision_denominator_per_ngram[i]
        )
        nist_precision += precision
    # Eqn 3 in Doddington (2002)
    return nist_precision * nist_length_penalty(l_ref, l_sys)


def nist_length_penalty(ref_len, hyp_len):
    """
    Calculates the NIST length penalty, from Eq. 3 in Doddington (2002):

        penalty = exp( beta * log( min( len(hyp)/len(ref), 1.0 )) ** 2 )

    where `beta` is chosen to make the brevity penalty factor = 0.5 when the
    no. of words in the system output (hyp) is 2/3 of the average
    no. of words in the reference translation (ref).

    The NIST penalty differs from BLEU's in that it minimizes the impact on
    the score of small variations in the length of a translation.
    See Fig. 4 in Doddington (2002).
    """
    ratio = hyp_len / ref_len
    if 0 < ratio < 1:
        ratio_x, score_x = 1.5, 0.5
        beta = math.log(score_x) / math.log(ratio_x) ** 2
        return math.exp(beta * math.log(ratio) ** 2)
    else:  # ratio <= 0 or ratio >= 1
        return max(min(ratio, 1.0), 0.0)
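
# Sanity check of the calibration described in the docstring (illustrative,
# not part of the original module): with hyp_len/ref_len = 2/3 the penalty
# is exactly 0.5, since log(2/3)**2 == log(3/2)**2 and beta is fitted so
# that exp(beta * log(1.5)**2) == 0.5.
#
#     >>> round(nist_length_penalty(ref_len=3, hyp_len=2), 4)
#     0.5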
|
||||
@@ -0,0 +1,193 @@
|
||||
# Natural Language Toolkit: Phrase Extraction Algorithm
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Authors: Liling Tan, Fredrik Hedman, Petra Barancikova
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
|
||||
def extract(
|
||||
f_start,
|
||||
f_end,
|
||||
e_start,
|
||||
e_end,
|
||||
alignment,
|
||||
f_aligned,
|
||||
srctext,
|
||||
trgtext,
|
||||
srclen,
|
||||
trglen,
|
||||
max_phrase_length,
|
||||
):
|
||||
"""
|
||||
This function checks for alignment point consistency and extracts
|
||||
phrases using the chunk of consistent phrases.
|
||||
|
||||
A phrase pair (e, f ) is consistent with an alignment A if and only if:
|
||||
|
||||
(i) No English words in the phrase pair are aligned to words outside it.
|
||||
|
||||
∀e i ∈ e, (e i , f j ) ∈ A ⇒ f j ∈ f
|
||||
|
||||
(ii) No Foreign words in the phrase pair are aligned to words outside it.
|
||||
|
||||
∀f j ∈ f , (e i , f j ) ∈ A ⇒ e i ∈ e
|
||||
|
||||
(iii) The phrase pair contains at least one alignment point.
|
||||
|
||||
∃e i ∈ e ̄ , f j ∈ f ̄ s.t. (e i , f j ) ∈ A
|
||||
|
||||
:type f_start: int
|
||||
:param f_start: Starting index of the possible foreign language phrases
|
||||
:type f_end: int
|
||||
:param f_end: End index of the possible foreign language phrases
|
||||
:type e_start: int
|
||||
:param e_start: Starting index of the possible source language phrases
|
||||
:type e_end: int
|
||||
:param e_end: End index of the possible source language phrases
|
||||
:type srctext: list
|
||||
:param srctext: The source language tokens, a list of string.
|
||||
:type trgtext: list
|
||||
:param trgtext: The target language tokens, a list of string.
|
||||
:type srclen: int
|
||||
:param srclen: The number of tokens in the source language tokens.
|
||||
:type trglen: int
|
||||
:param trglen: The number of tokens in the target language tokens.
|
||||
"""
|
||||
|
||||
if f_end < 0: # 0-based indexing.
|
||||
return {}
|
||||
# Check if alignment points are consistent.
|
||||
for e, f in alignment:
|
||||
if (f_start <= f <= f_end) and (e < e_start or e > e_end):
|
||||
return {}
|
||||
|
||||
# Add phrase pairs (incl. additional unaligned f)
|
||||
phrases = set()
|
||||
fs = f_start
|
||||
while True:
|
||||
fe = min(f_end, f_start + max_phrase_length - 1)
|
||||
while True:
|
||||
# add phrase pair ([e_start, e_end], [fs, fe]) to set E
|
||||
# Need to +1 in range to include the end-point.
|
||||
src_phrase = " ".join(srctext[e_start : e_end + 1])
|
||||
trg_phrase = " ".join(trgtext[fs : fe + 1])
|
||||
# Include more data for later ordering.
|
||||
phrases.add(((e_start, e_end + 1), (fs, fe + 1), src_phrase, trg_phrase))
|
||||
fe += 1
|
||||
if fe in f_aligned or fe >= trglen:
|
||||
break
|
||||
fs -= 1
|
||||
if fs in f_aligned or fs < 0:
|
||||
break
|
||||
return phrases
|
||||
|
||||
|
||||
def phrase_extraction(srctext, trgtext, alignment, max_phrase_length=0):
|
||||
"""
|
||||
Phrase extraction algorithm extracts all consistent phrase pairs from
|
||||
a word-aligned sentence pair.
|
||||
|
||||
The idea is to loop over all possible source language (e) phrases and find
|
||||
the minimal foreign phrase (f) that matches each of them. Matching is done
|
||||
by identifying all alignment points for the source phrase and finding the
|
||||
shortest foreign phrase that includes all the foreign counterparts for the
|
||||
source words.
|
||||
|
||||
In short, a phrase alignment has to
|
||||
(a) contain all alignment points for all covered words
|
||||
(b) contain at least one alignment point
|
||||
|
||||
>>> srctext = "michael assumes that he will stay in the house"
|
||||
>>> trgtext = "michael geht davon aus , dass er im haus bleibt"
|
||||
>>> alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9),
|
||||
... (5,9), (6,7), (7,7), (8,8)]
|
||||
>>> phrases = phrase_extraction(srctext, trgtext, alignment)
|
||||
>>> for i in sorted(phrases):
|
||||
... print(i)
|
||||
...
|
||||
((0, 1), (0, 1), 'michael', 'michael')
|
||||
((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus')
|
||||
((0, 2), (0, 5), 'michael assumes', 'michael geht davon aus ,')
|
||||
((0, 3), (0, 6), 'michael assumes that', 'michael geht davon aus , dass')
|
||||
((0, 4), (0, 7), 'michael assumes that he', 'michael geht davon aus , dass er')
|
||||
((0, 9), (0, 10), 'michael assumes that he will stay in the house', 'michael geht davon aus , dass er im haus bleibt')
|
||||
((1, 2), (1, 4), 'assumes', 'geht davon aus')
|
||||
((1, 2), (1, 5), 'assumes', 'geht davon aus ,')
|
||||
((1, 3), (1, 6), 'assumes that', 'geht davon aus , dass')
|
||||
((1, 4), (1, 7), 'assumes that he', 'geht davon aus , dass er')
|
||||
((1, 9), (1, 10), 'assumes that he will stay in the house', 'geht davon aus , dass er im haus bleibt')
|
||||
((2, 3), (4, 6), 'that', ', dass')
|
||||
((2, 3), (5, 6), 'that', 'dass')
|
||||
((2, 4), (4, 7), 'that he', ', dass er')
|
||||
((2, 4), (5, 7), 'that he', 'dass er')
|
||||
((2, 9), (4, 10), 'that he will stay in the house', ', dass er im haus bleibt')
|
||||
((2, 9), (5, 10), 'that he will stay in the house', 'dass er im haus bleibt')
|
||||
((3, 4), (6, 7), 'he', 'er')
|
||||
((3, 9), (6, 10), 'he will stay in the house', 'er im haus bleibt')
|
||||
((4, 6), (9, 10), 'will stay', 'bleibt')
|
||||
((4, 9), (7, 10), 'will stay in the house', 'im haus bleibt')
|
||||
((6, 8), (7, 8), 'in the', 'im')
|
||||
((6, 9), (7, 9), 'in the house', 'im haus')
|
||||
((8, 9), (8, 9), 'house', 'haus')
|
||||
|
||||
:type srctext: str
|
||||
:param srctext: The sentence string from the source language.
|
||||
:type trgtext: str
|
||||
:param trgtext: The sentence string from the target language.
|
||||
:type alignment: list(tuple)
|
||||
:param alignment: The word alignment outputs as list of tuples, where
|
||||
the first elements of tuples are the source words' indices and
|
||||
second elements are the target words' indices. This is also the output
|
||||
format of nltk.translate.ibm1
|
||||
:rtype: list(tuple)
|
||||
:return: A list of tuples, each element in a list is a phrase and each
|
||||
phrase is a tuple made up of (i) its source location, (ii) its target
|
||||
location, (iii) the source phrase and (iii) the target phrase. The phrase
|
||||
list of tuples represents all the possible phrases extracted from the
|
||||
word alignments.
|
||||
:type max_phrase_length: int
|
||||
:param max_phrase_length: maximal phrase length, if 0 or not specified
|
||||
it is set to a length of the longer sentence (srctext or trgtext).
|
||||
"""
|
||||

    srctext = srctext.split()  # e
    trgtext = trgtext.split()  # f
    srclen = len(srctext)  # len(e)
    trglen = len(trgtext)  # len(f)
    # Keeps an index of which source/target words are aligned.
    f_aligned = [j for _, j in alignment]
    max_phrase_length = max_phrase_length or max(srclen, trglen)

    # set of phrase pairs BP
    bp = set()

    for e_start in range(srclen):
        max_idx = min(srclen, e_start + max_phrase_length)
        for e_end in range(e_start, max_idx):
            # // find the minimally matching foreign phrase
            # (f start , f end ) = ( length(f), 0 )
            # f_start ∈ [0, len(f) - 1]; f_end ∈ [0, len(f) - 1]
            f_start, f_end = trglen - 1, -1  # 0-based indexing

            for e, f in alignment:
                if e_start <= e <= e_end:
                    f_start = min(f, f_start)
                    f_end = max(f, f_end)
            # add extract (f start , f end , e start , e end ) to set BP
            phrases = extract(
                f_start,
                f_end,
                e_start,
                e_end,
                alignment,
                f_aligned,
                srctext,
                trgtext,
                srclen,
                trglen,
                max_phrase_length,
            )
            if phrases:
                bp.update(phrases)
    return bp
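
# A usage sketch for the extraction loop above (editor's example; it assumes
# the enclosing function is this module's ``phrase_extraction``, whose
# parameters are documented in the docstring above):
#
#     >>> srctext = "michael assumes that he will stay in the house"
#     >>> trgtext = "michael geht davon aus , dass er im haus bleibt"
#     >>> alignment = [(0, 0), (1, 1), (1, 2), (1, 3), (2, 5), (3, 6),
#     ...              (4, 9), (5, 9), (6, 7), (7, 7), (8, 8)]
#     >>> phrases = phrase_extraction(srctext, trgtext, alignment)
#     >>> (('michael assumes that', 'michael geht davon aus , dass') in
#     ...  {(src, trg) for _, _, src, trg in phrases})
#     True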
@@ -0,0 +1,330 @@
# Natural Language Toolkit: RIBES Score
#
# Copyright (C) 2001-2025 NLTK Project
# Contributors: Katsuhito Sudoh, Liling Tan, Kasramvd, J.F.Sebastian
#               Mark Byers, ekhumoro, P. Ortiz
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
""" RIBES score implementation """

import math
from itertools import islice

from nltk.util import choose, ngrams


def sentence_ribes(references, hypothesis, alpha=0.25, beta=0.10):
    """
    The RIBES (Rank-based Intuitive Bilingual Evaluation Score) from
    Hideki Isozaki, Tsutomu Hirao, Kevin Duh, Katsuhito Sudoh and
    Hajime Tsukada. 2010. "Automatic Evaluation of Translation Quality for
    Distant Language Pairs". In Proceedings of EMNLP.
    https://www.aclweb.org/anthology/D/D10/D10-1092.pdf

    The generic RIBES score used in shared tasks, e.g. the Workshop for
    Asian Translation (WAT), uses the following calculation:

        RIBES = kendall_tau * (p1**alpha) * (bp**beta)

    Please note that this re-implementation differs from the official
    RIBES implementation: although it emulates the results as described
    in the original paper, there are further optimizations implemented
    in the official RIBES script.

    Users are encouraged to use the official RIBES script instead of this
    implementation when evaluating their machine translation systems. Refer
    to https://www.kecl.ntt.co.jp/icl/lirg/ribes/ for the official script.

    :param references: a list of reference sentences
    :type references: list(list(str))
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param alpha: hyperparameter used as a prior for the unigram precision.
    :type alpha: float
    :param beta: hyperparameter used as a prior for the brevity penalty.
    :type beta: float
    :return: The best RIBES score from one of the references.
    :rtype: float
    """
    best_ribes = -1.0
    # Calculates RIBES for each reference and returns the best score.
    for reference in references:
        # Collects the *worder* from the ranked correlation alignments.
        worder = word_rank_alignment(reference, hypothesis)
        nkt = kendall_tau(worder)

        # Calculates the brevity penalty
        bp = min(1.0, math.exp(1.0 - len(reference) / len(hypothesis)))

        # Calculates the unigram precision, *p1*
        p1 = len(worder) / len(hypothesis)

        _ribes = nkt * (p1**alpha) * (bp**beta)

        if _ribes > best_ribes:  # Keeps the best score.
            best_ribes = _ribes

    return best_ribes
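
# For example (editor's sketch), with the (H0, R0) sentence pair used in the
# doctests further below, the worder list is
# [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5], and both the brevity penalty and the
# unigram precision are 1.0, so the score reduces to the normalized
# Kendall's tau:
#
#     >>> ref = ('he was interested in world history because he '
#     ...        'read the book').split()
#     >>> hyp = ('he read the book because he was interested in world '
#     ...        'history').split()
#     >>> round(sentence_ribes([ref], hyp), 3)
#     0.382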


def corpus_ribes(list_of_references, hypotheses, alpha=0.25, beta=0.10):
    """
    This function "calculates RIBES for a system output (hypothesis) with
    multiple references, and returns "best" score among multi-references and
    individual scores. The scores are corpus-wise, i.e., averaged by the number
    of sentences." (c.f. RIBES version 1.03.1 code).

    Different from BLEU's micro-average precision, RIBES calculates the
    macro-average precision by averaging the best RIBES score for each pair of
    hypothesis and its corresponding references.

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...     'ensures', 'that', 'the', 'military', 'always',
    ...     'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...     'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...     'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...     'guarantees', 'the', 'military', 'forces', 'always',
    ...     'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...     'army', 'always', 'to', 'heed', 'the', 'directions',
    ...     'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...     'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...     'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> round(corpus_ribes(list_of_references, hypotheses), 4)
    0.3597

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param alpha: hyperparameter used as a prior for the unigram precision.
    :type alpha: float
    :param beta: hyperparameter used as a prior for the brevity penalty.
    :type beta: float
    :return: The average of the best RIBES scores, one per hypothesis.
    :rtype: float
    """
    corpus_best_ribes = 0.0
    # Iterate through each hypothesis and its corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        corpus_best_ribes += sentence_ribes(references, hypothesis, alpha, beta)
    return corpus_best_ribes / len(hypotheses)


def position_of_ngram(ngram, sentence):
    """
    This function returns the position of the first instance of the ngram
    appearing in a sentence.

    Note that one could also use strings as follows, but the code is a little
    convoluted with the type casting back and forth:

        char_pos = ' '.join(sent)[:' '.join(sent).index(' '.join(ngram))]
        word_pos = char_pos.count(' ')

    Another way to conceive of this is:

        return next(i for i, ng in enumerate(ngrams(sentence, len(ngram)))
                    if ng == ngram)

    :param ngram: The ngram that needs to be searched for
    :type ngram: tuple
    :param sentence: The list of tokens to search in.
    :type sentence: list(str)
    :return: The word position of the first match, or None if there is no match.
    :rtype: int or None
    """
    # Iterates through the ngrams in the sentence.
    for i, sublist in enumerate(ngrams(sentence, len(ngram))):
        # Returns the index of the word when the ngram matches.
        if ngram == sublist:
            return i
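
# For example (editor's sketch):
#
#     >>> position_of_ngram(('the', 'book'), 'he read the book'.split())
#     2
#
# If the ngram does not occur, the loop falls through and None is returned.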


def word_rank_alignment(reference, hypothesis, character_based=False):
    """
    This is the word rank alignment algorithm described in the paper to produce
    the *worder* list, i.e. a list of word indices of the hypothesis word order
    w.r.t. the list of reference words.

    Below is the (H0, R0) example from the Isozaki et al. 2010 paper; note that
    the examples are indexed from 1 in the paper, but the results here are
    indexed from 0:

    >>> ref = str('he was interested in world history because he '
    ...           'read the book').split()
    >>> hyp = str('he read the book because he was interested in world '
    ...           'history').split()
    >>> word_rank_alignment(ref, hyp)
    [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]

    The (H1, R1) example from the paper, note the 0th index:

    >>> ref = 'John hit Bob yesterday'.split()
    >>> hyp = 'Bob hit John yesterday'.split()
    >>> word_rank_alignment(ref, hyp)
    [2, 1, 0, 3]

    Here is the (H2, R2) example from the paper, note the 0th index here too:

    >>> ref = 'the boy read the book'.split()
    >>> hyp = 'the book was read by the boy'.split()
    >>> word_rank_alignment(ref, hyp)
    [3, 4, 2, 0, 1]

    :param reference: a reference sentence
    :type reference: list(str)
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    """
    worder = []
    hyp_len = len(hypothesis)
    # Stores a list of possible ngrams from the reference sentence.
    # This is used for matching context windows later in the algorithm.
    ref_ngrams = []
    hyp_ngrams = []
    for n in range(1, len(reference) + 1):
        for ng in ngrams(reference, n):
            ref_ngrams.append(ng)
        for ng in ngrams(hypothesis, n):
            hyp_ngrams.append(ng)
    for i, h_word in enumerate(hypothesis):
        # If the word is not in the reference, continue.
        if h_word not in reference:
            continue
        # If we can determine a one-to-one word correspondence for unigrams
        # that only appear once in both the reference and the hypothesis.
        elif hypothesis.count(h_word) == reference.count(h_word) == 1:
            worder.append(reference.index(h_word))
        else:
            max_window_size = max(i, hyp_len - i + 1)
            for window in range(1, max_window_size):
                if i + window < hyp_len:  # If searching the right context is possible.
                    # Retrieve the right context window.
                    right_context_ngram = tuple(islice(hypothesis, i, i + window + 1))
                    num_times_in_ref = ref_ngrams.count(right_context_ngram)
                    num_times_in_hyp = hyp_ngrams.count(right_context_ngram)
                    # If the ngram appears only once in both ref and hyp.
                    if num_times_in_ref == num_times_in_hyp == 1:
                        # Find the position of the ngram that matched the reference.
                        pos = position_of_ngram(right_context_ngram, reference)
                        worder.append(pos)  # Add the position of the ngram.
                        break
                if window <= i:  # If searching the left context is possible.
                    # Retrieve the left context window.
                    left_context_ngram = tuple(islice(hypothesis, i - window, i + 1))
                    num_times_in_ref = ref_ngrams.count(left_context_ngram)
                    num_times_in_hyp = hyp_ngrams.count(left_context_ngram)
                    if num_times_in_ref == num_times_in_hyp == 1:
                        # Find the position of the ngram that matched the reference.
                        pos = position_of_ngram(left_context_ngram, reference)
                        # The last word of the ngram is the aligned word.
                        worder.append(pos + len(left_context_ngram) - 1)
                        break
    return worder


def find_increasing_sequences(worder):
    """
    Given the *worder* list, this function groups monotonic +1 sequences.

    >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
    >>> list(find_increasing_sequences(worder))
    [(7, 8, 9, 10), (0, 1, 2, 3, 4, 5)]

    :param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
    """
    items = iter(worder)
    a, b = None, next(items, None)
    result = [b]
    while b is not None:
        a, b = b, next(items, None)
        if b is not None and a + 1 == b:
            result.append(b)
        else:
            if len(result) > 1:
                yield tuple(result)
            result = [b]


def kendall_tau(worder, normalize=True):
    """
    Calculates the Kendall's Tau correlation coefficient given the *worder*
    list of word alignments from word_rank_alignment(), using the formula:

        tau = 2 * num_increasing_pairs / num_possible_pairs - 1

    Note that the increasing pairs can be discontinuous in the *worder* list
    and each increasing sequence contributes choose(len(seq), 2) increasing
    pairs, e.g.

    >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
    >>> number_possible_pairs = choose(len(worder), 2)
    >>> round(kendall_tau(worder, normalize=False), 3)
    -0.236
    >>> round(kendall_tau(worder), 3)
    0.382

    :param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
    :param normalize: Flag to indicate normalization to between 0.0 and 1.0.
    :type normalize: boolean
    :return: The Kendall's Tau correlation coefficient.
    :rtype: float
    """
    worder_len = len(worder)
    # With worder_len < 2, `choose(worder_len, 2)` will be 0.
    # As we divide by this, it will give a ZeroDivisionError.
    # To avoid this, we can just return the lowest possible score.
    if worder_len < 2:
        tau = -1
    else:
        # Extract the groups of increasing/monotonic sequences.
        increasing_sequences = find_increasing_sequences(worder)
        # Calculate the no. of increasing pairs in the *worder* list.
        num_increasing_pairs = sum(choose(len(seq), 2) for seq in increasing_sequences)
        # Calculate the no. of possible pairs.
        num_possible_pairs = choose(worder_len, 2)
        # Kendall's Tau computation.
        tau = 2 * num_increasing_pairs / num_possible_pairs - 1
    if normalize:  # If normalized, the tau output falls between 0.0 and 1.0.
        return (tau + 1) / 2
    else:  # Otherwise, the tau output falls between -1.0 and +1.0.
        return tau


def spearman_rho(worder, normalize=True):
    """
    Calculates the Spearman's Rho correlation coefficient given the *worder*
    list of word alignments from word_rank_alignment(), using the formula:

        rho = 1 - sum(d**2) / choose(len(worder)+1, 3)

    where d is the difference between the *worder* indices and the original
    word indices from the reference sentence.

    Using the (H0, R0) and (H5, R5) example from the paper:

    >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
    >>> round(spearman_rho(worder, normalize=False), 3)
    -0.591
    >>> round(spearman_rho(worder), 3)
    0.205

    :param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
    """
    worder_len = len(worder)
    sum_d_square = sum((wi - i) ** 2 for wi, i in zip(worder, range(worder_len)))
    rho = 1 - sum_d_square / choose(worder_len + 1, 3)

    if normalize:  # If normalized, the rho output falls between 0.0 and 1.0.
        return (rho + 1) / 2
    else:  # Otherwise, the rho output falls between -1.0 and +1.0.
        return rho
@@ -0,0 +1,515 @@
# Natural Language Toolkit: Stack decoder
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
A decoder that uses stacks to implement phrase-based translation.

In phrase-based translation, the source sentence is segmented into
phrases of one or more words, and translations for those phrases are
used to build the target sentence.

Hypothesis data structures are used to keep track of the source words
translated so far and the partial output. A hypothesis can be expanded
by selecting an untranslated phrase, looking up its translation in a
phrase table, and appending that translation to the partial output.
Translation is complete when a hypothesis covers all source words.

The search space is huge because the source sentence can be segmented
in different ways, the source phrases can be selected in any order,
and there could be multiple translations for the same source phrase in
the phrase table. To make decoding tractable, stacks are used to limit
the number of candidate hypotheses by doing histogram and/or threshold
pruning.

Hypotheses with the same number of words translated are placed in the
same stack. In histogram pruning, each stack has a size limit, and
the hypothesis with the lowest score is removed when the stack is full.
In threshold pruning, hypotheses that score below a certain threshold
of the best hypothesis in that stack are removed. Both strategies are
illustrated in the example following this docstring.

Hypothesis scoring can include various factors such as phrase
translation probability, language model probability, length of
translation, cost of remaining words to be translated, and so on.


References:
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.
"""

import warnings
from collections import defaultdict
from math import log


class StackDecoder:
    """
    Phrase-based stack decoder for machine translation

    >>> from nltk.translate import PhraseTable
    >>> phrase_table = PhraseTable()
    >>> phrase_table.add(('niemand',), ('nobody',), log(0.8))
    >>> phrase_table.add(('niemand',), ('no', 'one'), log(0.2))
    >>> phrase_table.add(('erwartet',), ('expects',), log(0.8))
    >>> phrase_table.add(('erwartet',), ('expecting',), log(0.2))
    >>> phrase_table.add(('niemand', 'erwartet'), ('one', 'does', 'not', 'expect'), log(0.1))
    >>> phrase_table.add(('die', 'spanische', 'inquisition'), ('the', 'spanish', 'inquisition'), log(0.8))
    >>> phrase_table.add(('!',), ('!',), log(0.8))

    >>> # nltk.model should be used here once it is implemented
    >>> from collections import defaultdict
    >>> language_prob = defaultdict(lambda: -999.0)
    >>> language_prob[('nobody',)] = log(0.5)
    >>> language_prob[('expects',)] = log(0.4)
    >>> language_prob[('the', 'spanish', 'inquisition')] = log(0.2)
    >>> language_prob[('!',)] = log(0.1)
    >>> language_model = type('',(object,),{'probability_change': lambda self, context, phrase: language_prob[phrase], 'probability': lambda self, phrase: language_prob[phrase]})()

    >>> stack_decoder = StackDecoder(phrase_table, language_model)

    >>> stack_decoder.translate(['niemand', 'erwartet', 'die', 'spanische', 'inquisition', '!'])
    ['nobody', 'expects', 'the', 'spanish', 'inquisition', '!']

    """

    def __init__(self, phrase_table, language_model):
        """
        :param phrase_table: Table of translations for source language
            phrases and the log probabilities for those translations.
        :type phrase_table: PhraseTable

        :param language_model: Target language model. Must define a
            ``probability_change`` method that calculates the change in
            log probability of a sentence, if a given string is appended
            to it.
            This interface is experimental and will likely be replaced
            with nltk.model once it is implemented.
        :type language_model: object
        """
        self.phrase_table = phrase_table
        self.language_model = language_model

        self.word_penalty = 0.0
        """
        float: Influences the translation length exponentially.
        If positive, shorter translations are preferred.
        If negative, longer translations are preferred.
        If zero, no penalty is applied.
        """

        self.beam_threshold = 0.0
        """
        float: Hypotheses that score below this factor of the best
        hypothesis in a stack are dropped from consideration.
        Value between 0.0 and 1.0.
        """

        self.stack_size = 100
        """
        int: Maximum number of hypotheses to consider in a stack.
        Higher values increase the likelihood of a good translation,
        but increase processing time.
        """

        self.__distortion_factor = 0.5
        self.__compute_log_distortion()

    @property
    def distortion_factor(self):
        """
        float: Amount of reordering of source phrases.
        Lower values favour monotone translation, suitable when
        word order is similar for both source and target languages.
        Value between 0.0 and 1.0. Default 0.5.
        """
        return self.__distortion_factor

    @distortion_factor.setter
    def distortion_factor(self, d):
        self.__distortion_factor = d
        self.__compute_log_distortion()

    def __compute_log_distortion(self):
        # cache log(distortion_factor) so we don't have to recompute it
        # when scoring hypotheses
        if self.__distortion_factor == 0.0:
            self.__log_distortion_factor = log(1e-9)  # 1e-9 is almost zero
        else:
            self.__log_distortion_factor = log(self.__distortion_factor)

    def translate(self, src_sentence):
        """
        :param src_sentence: Sentence to be translated
        :type src_sentence: list(str)

        :return: Translated sentence
        :rtype: list(str)
        """
        sentence = tuple(src_sentence)  # prevent accidental modification
        sentence_length = len(sentence)
        stacks = [
            _Stack(self.stack_size, self.beam_threshold)
            for _ in range(0, sentence_length + 1)
        ]
        empty_hypothesis = _Hypothesis()
        stacks[0].push(empty_hypothesis)

        all_phrases = self.find_all_src_phrases(sentence)
        future_score_table = self.compute_future_scores(sentence)
        for stack in stacks:
            for hypothesis in stack:
                possible_expansions = StackDecoder.valid_phrases(
                    all_phrases, hypothesis
                )
                for src_phrase_span in possible_expansions:
                    src_phrase = sentence[src_phrase_span[0] : src_phrase_span[1]]
                    for translation_option in self.phrase_table.translations_for(
                        src_phrase
                    ):
                        raw_score = self.expansion_score(
                            hypothesis, translation_option, src_phrase_span
                        )
                        new_hypothesis = _Hypothesis(
                            raw_score=raw_score,
                            src_phrase_span=src_phrase_span,
                            trg_phrase=translation_option.trg_phrase,
                            previous=hypothesis,
                        )
                        new_hypothesis.future_score = self.future_score(
                            new_hypothesis, future_score_table, sentence_length
                        )
                        total_words = new_hypothesis.total_translated_words()
                        stacks[total_words].push(new_hypothesis)

        if not stacks[sentence_length]:
            warnings.warn(
                "Unable to translate all words. "
                "The source sentence contains words not in "
                "the phrase table"
            )
            # Instead of returning empty output, perhaps a partial
            # translation could be returned
            return []

        best_hypothesis = stacks[sentence_length].best()
        return best_hypothesis.translation_so_far()

    def find_all_src_phrases(self, src_sentence):
        """
        Finds all subsequences in src_sentence that have a phrase
        translation in the translation table

        :type src_sentence: tuple(str)

        :return: Subsequences that have a phrase translation,
            represented as a table of lists of end positions.
            For example, if result[2] is [5, 6, 9], then there are
            three phrases starting from position 2 in ``src_sentence``,
            ending at positions 5, 6, and 9 exclusive. The list of
            ending positions is in ascending order.
        :rtype: list(list(int))
        """
        sentence_length = len(src_sentence)
        phrase_indices = [[] for _ in src_sentence]
        for start in range(0, sentence_length):
            for end in range(start + 1, sentence_length + 1):
                potential_phrase = src_sentence[start:end]
                if potential_phrase in self.phrase_table:
                    phrase_indices[start].append(end)
        return phrase_indices

    def compute_future_scores(self, src_sentence):
        """
        Determines the approximate scores for translating every
        subsequence in ``src_sentence``

        Future scores can be used as a look-ahead to determine the
        difficulty of translating the remaining parts of a src_sentence.

        :type src_sentence: tuple(str)

        :return: Scores of subsequences referenced by their start and
            end positions. For example, result[2][5] is the score of the
            subsequence covering positions 2, 3, and 4.
        :rtype: dict(int: (dict(int): float))
        """
        scores = defaultdict(lambda: defaultdict(lambda: float("-inf")))
        for seq_length in range(1, len(src_sentence) + 1):
            for start in range(0, len(src_sentence) - seq_length + 1):
                end = start + seq_length
                phrase = src_sentence[start:end]
                if phrase in self.phrase_table:
                    score = self.phrase_table.translations_for(phrase)[
                        0
                    ].log_prob  # pick best (first) translation
                    # Warning: API of language_model is subject to change
                    score += self.language_model.probability(phrase)
                    scores[start][end] = score

                # check if a better score can be obtained by combining
                # two child subsequences
                for mid in range(start + 1, end):
                    combined_score = scores[start][mid] + scores[mid][end]
                    if combined_score > scores[start][end]:
                        scores[start][end] = combined_score
        return scores
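
    # The combination step above is ordinary dynamic programming over the
    # log-domain scores (editor's sketch with made-up numbers):
    #
    #     >>> from collections import defaultdict
    #     >>> scores = defaultdict(lambda: defaultdict(lambda: float("-inf")))
    #     >>> scores[0][2], scores[2][5] = -1.5, -2.0
    #     >>> max(scores[0][5], scores[0][2] + scores[2][5])
    #     -3.5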

    def future_score(self, hypothesis, future_score_table, sentence_length):
        """
        Determines the approximate score for translating the
        untranslated words in ``hypothesis``
        """
        score = 0.0
        for span in hypothesis.untranslated_spans(sentence_length):
            score += future_score_table[span[0]][span[1]]
        return score

    def expansion_score(self, hypothesis, translation_option, src_phrase_span):
        """
        Calculate the score of expanding ``hypothesis`` with
        ``translation_option``

        :param hypothesis: Hypothesis being expanded
        :type hypothesis: _Hypothesis

        :param translation_option: Information about the proposed expansion
        :type translation_option: PhraseTableEntry

        :param src_phrase_span: Word position span of the source phrase
        :type src_phrase_span: tuple(int, int)
        """
        score = hypothesis.raw_score
        score += translation_option.log_prob
        # The API of language_model is subject to change; it could accept
        # a string, a list of words, and/or some other type
        score += self.language_model.probability_change(
            hypothesis, translation_option.trg_phrase
        )
        score += self.distortion_score(hypothesis, src_phrase_span)
        score -= self.word_penalty * len(translation_option.trg_phrase)
        return score

    def distortion_score(self, hypothesis, next_src_phrase_span):
        if not hypothesis.src_phrase_span:
            return 0.0
        next_src_phrase_start = next_src_phrase_span[0]
        prev_src_phrase_end = hypothesis.src_phrase_span[1]
        distortion_distance = next_src_phrase_start - prev_src_phrase_end
        return abs(distortion_distance) * self.__log_distortion_factor
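
    # For example, with the default distortion_factor of 0.5, expanding a
    # hypothesis with a source phrase that starts two words away from the
    # end of the previously translated phrase costs
    # 2 * log(0.5) ~= -1.386 in log-probability.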

    @staticmethod
    def valid_phrases(all_phrases_from, hypothesis):
        """
        Extract phrases from ``all_phrases_from`` that contain words
        that have not been translated by ``hypothesis``

        :param all_phrases_from: Phrases represented by their spans, in
            the same format as the return value of
            ``find_all_src_phrases``
        :type all_phrases_from: list(list(int))

        :type hypothesis: _Hypothesis

        :return: A list of phrases, represented by their spans, that
            cover untranslated positions.
        :rtype: list(tuple(int, int))
        """
        untranslated_spans = hypothesis.untranslated_spans(len(all_phrases_from))
        valid_phrases = []
        for available_span in untranslated_spans:
            start = available_span[0]
            available_end = available_span[1]
            while start < available_end:
                for phrase_end in all_phrases_from[start]:
                    if phrase_end > available_end:
                        # Subsequent elements in all_phrases_from[start]
                        # will also be > available_end, since the
                        # elements are in ascending order
                        break
                    valid_phrases.append((start, phrase_end))
                start += 1
        return valid_phrases


class _Hypothesis:
    """
    Partial solution to a translation.

    Records the word positions of the phrase being translated, its
    translation, raw score, and the cost of the untranslated parts of
    the sentence. When the next phrase is selected to build upon the
    partial solution, a new _Hypothesis object is created, with a back
    pointer to the previous hypothesis.

    To find out which words have been translated so far, look at the
    ``src_phrase_span`` in the hypothesis chain. Similarly, the
    translation output can be found by traversing up the chain.
    """

    def __init__(
        self,
        raw_score=0.0,
        src_phrase_span=(),
        trg_phrase=(),
        previous=None,
        future_score=0.0,
    ):
        """
        :param raw_score: Likelihood of hypothesis so far.
            Higher is better. Does not account for untranslated words.
        :type raw_score: float

        :param src_phrase_span: Span of word positions covered by the
            source phrase in this hypothesis expansion. For example,
            (2, 5) means that the phrase is from the second word up to,
            but not including, the fifth word in the source sentence.
        :type src_phrase_span: tuple(int)

        :param trg_phrase: Translation of the source phrase in this
            hypothesis expansion
        :type trg_phrase: tuple(str)

        :param previous: Previous hypothesis before expansion to this one
        :type previous: _Hypothesis

        :param future_score: Approximate score for translating the
            remaining words not covered by this hypothesis. Higher means
            that the remaining words are easier to translate.
        :type future_score: float
        """
        self.raw_score = raw_score
        self.src_phrase_span = src_phrase_span
        self.trg_phrase = trg_phrase
        self.previous = previous
        self.future_score = future_score

    def score(self):
        """
        Overall score of hypothesis after accounting for local and
        global features
        """
        return self.raw_score + self.future_score

    def untranslated_spans(self, sentence_length):
        """
        Starting from each untranslated word, find the longest
        continuous span of untranslated positions

        :param sentence_length: Length of source sentence being
            translated by the hypothesis
        :type sentence_length: int

        :rtype: list(tuple(int, int))
        """
        translated_positions = self.translated_positions()
        translated_positions.sort()
        translated_positions.append(sentence_length)  # add sentinel position

        untranslated_spans = []
        start = 0
        # each untranslated span must end in one of the translated_positions
        for end in translated_positions:
            if start < end:
                untranslated_spans.append((start, end))
            start = end + 1

        return untranslated_spans
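
    # For example (editor's sketch), a hypothesis chain that has translated
    # only positions 2 and 3 of a six-word sentence leaves two untranslated
    # spans:
    #
    #     >>> h = _Hypothesis(src_phrase_span=(2, 4), previous=_Hypothesis())
    #     >>> h.untranslated_spans(6)
    #     [(0, 2), (4, 6)]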

    def translated_positions(self):
        """
        List of positions in the source sentence of words already
        translated. The list is not sorted.

        :rtype: list(int)
        """
        translated_positions = []
        current_hypothesis = self
        while current_hypothesis.previous is not None:
            translated_span = current_hypothesis.src_phrase_span
            translated_positions.extend(range(translated_span[0], translated_span[1]))
            current_hypothesis = current_hypothesis.previous
        return translated_positions

    def total_translated_words(self):
        return len(self.translated_positions())

    def translation_so_far(self):
        translation = []
        self.__build_translation(self, translation)
        return translation

    def __build_translation(self, hypothesis, output):
        if hypothesis.previous is None:
            return
        self.__build_translation(hypothesis.previous, output)
        output.extend(hypothesis.trg_phrase)


class _Stack:
    """
    Collection of _Hypothesis objects
    """

    def __init__(self, max_size=100, beam_threshold=0.0):
        """
        :param beam_threshold: Hypotheses that score less than this
            factor of the best hypothesis are discarded from the stack.
            Value must be between 0.0 and 1.0.
        :type beam_threshold: float
        """
        self.max_size = max_size
        self.items = []

        if beam_threshold == 0.0:
            self.__log_beam_threshold = float("-inf")
        else:
            self.__log_beam_threshold = log(beam_threshold)

    def push(self, hypothesis):
        """
        Add ``hypothesis`` to the stack.
        Removes the lowest scoring hypothesis if the stack is full.
        After insertion, hypotheses that score less than
        ``beam_threshold`` times the score of the best hypothesis
        are removed.
        """
        self.items.append(hypothesis)
        self.items.sort(key=lambda h: h.score(), reverse=True)
        while len(self.items) > self.max_size:
            self.items.pop()
        self.threshold_prune()

    def threshold_prune(self):
        if not self.items:
            return
        # log(score * beam_threshold) = log(score) + log(beam_threshold)
        threshold = self.items[0].score() + self.__log_beam_threshold
        for hypothesis in reversed(self.items):
            if hypothesis.score() < threshold:
                self.items.pop()
            else:
                break

    def best(self):
        """
        :return: Hypothesis with the highest score in the stack
        :rtype: _Hypothesis
        """
        if self.items:
            return self.items[0]
        return None

    def __iter__(self):
        return iter(self.items)

    def __contains__(self, hypothesis):
        return hypothesis in self.items

    def __bool__(self):
        return len(self.items) != 0

    __nonzero__ = __bool__