This commit is contained in:
Iliyan Angelov
2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions


@@ -0,0 +1,416 @@
"""
Tests for BLEU translation evaluation metric
"""
import unittest
import numpy as np
from nltk.data import find
from nltk.translate.bleu_score import (
SmoothingFunction,
brevity_penalty,
closest_ref_length,
corpus_bleu,
modified_precision,
sentence_bleu,
)
class TestBLEU(unittest.TestCase):
def test_modified_precision(self):
"""
Examples from the original BLEU paper
https://www.aclweb.org/anthology/P02-1040.pdf
"""
# Example 1: the "the*" example.
# Reference sentences.
ref1 = "the cat is on the mat".split()
ref2 = "there is a cat on the mat".split()
# Hypothesis sentence(s).
hyp1 = "the the the the the the the".split()
references = [ref1, ref2]
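# The clipped count of "the" is 2 (its maximum count in any single reference) and the
# hypothesis has 7 tokens, so the modified unigram precision is 2/7 ≈ 0.2857; no
# hypothesis bigram occurs in any reference, so the bigram precision is 0.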
# Testing modified unigram precision.
hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
assert round(hyp1_unigram_precision, 4) == 0.2857
# With assertAlmostEqual at 4 place precision.
self.assertAlmostEqual(hyp1_unigram_precision, 0.28571428, places=4)
# Testing modified bigram precision.
assert float(modified_precision(references, hyp1, n=2)) == 0.0
# Example 2: the "of the" example.
# Reference sentences
ref1 = str(
"It is a guide to action that ensures that the military "
"will forever heed Party commands"
).split()
ref2 = str(
"It is the guiding principle which guarantees the military "
"forces always being under the command of the Party"
).split()
ref3 = str(
"It is the practical guide for the army always to heed "
"the directions of the party"
).split()
# Hypothesis sentence(s).
hyp1 = "of the".split()
references = [ref1, ref2, ref3]
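# Both unigrams ("of", "the") and the only bigram ("of the") occur in ref2, so the
# modified unigram and bigram precisions are both 1.0 (2/2 and 1/1).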
# Testing modified unigram precision.
assert float(modified_precision(references, hyp1, n=1)) == 1.0
# Testing modified bigram precision.
assert float(modified_precision(references, hyp1, n=2)) == 1.0
# Example 3: Proper MT outputs.
hyp1 = str(
"It is a guide to action which ensures that the military "
"always obeys the commands of the party"
).split()
hyp2 = str(
"It is to insure the troops forever hearing the activity "
"guidebook that party direct"
).split()
references = [ref1, ref2, ref3]
# Unigram precision.
hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
hyp2_unigram_precision = float(modified_precision(references, hyp2, n=1))
# Test unigram precision with assertAlmostEqual at 4 place precision.
self.assertAlmostEqual(hyp1_unigram_precision, 0.94444444, places=4)
self.assertAlmostEqual(hyp2_unigram_precision, 0.57142857, places=4)
# Test unigram precision with rounding.
assert round(hyp1_unigram_precision, 4) == 0.9444
assert round(hyp2_unigram_precision, 4) == 0.5714
# Bigram precision
hyp1_bigram_precision = float(modified_precision(references, hyp1, n=2))
hyp2_bigram_precision = float(modified_precision(references, hyp2, n=2))
# Test bigram precision with assertAlmostEqual at 4 place precision.
self.assertAlmostEqual(hyp1_bigram_precision, 0.58823529, places=4)
self.assertAlmostEqual(hyp2_bigram_precision, 0.07692307, places=4)
# Test bigram precision with rounding.
assert round(hyp1_bigram_precision, 4) == 0.5882
assert round(hyp2_bigram_precision, 4) == 0.0769
def test_brevity_penalty(self):
# Test case from brevity_penalty_closest function in mteval-v13a.pl.
# Same test cases as in the doctest in nltk.translate.bleu_score.py
references = [["a"] * 11, ["a"] * 8]
hypothesis = ["a"] * 7
hyp_len = len(hypothesis)
closest_ref_len = closest_ref_length(references, hyp_len)
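# The reference lengths are 11 and 8; the closest to the 7-token hypothesis is 8,
# so the brevity penalty is exp(1 - 8/7) ≈ 0.8669.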
self.assertAlmostEqual(
brevity_penalty(closest_ref_len, hyp_len), 0.8669, places=4
)
references = [["a"] * 11, ["a"] * 8, ["a"] * 6, ["a"] * 7]
hypothesis = ["a"] * 7
hyp_len = len(hypothesis)
closest_ref_len = closest_ref_length(references, hyp_len)
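# A reference of length 7 matches the hypothesis length exactly, so the brevity penalty is 1.0.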
assert brevity_penalty(closest_ref_len, hyp_len) == 1.0
def test_zero_matches(self):
# Test case where there are 0 matches.
references = ["The candidate has no alignment to any of the references".split()]
hypothesis = "John loves Mary".split()
# Test BLEU for n-gram orders 1 up to len(hypothesis) - 1.
for n in range(1, len(hypothesis)):
weights = (1.0 / n,) * n # Uniform weights.
assert sentence_bleu(references, hypothesis, weights) == 0
def test_full_matches(self):
# Test case where there is a 100% match.
references = ["John loves Mary".split()]
hypothesis = "John loves Mary".split()
# Test BLEU for n-gram orders 1 up to len(hypothesis) - 1.
for n in range(1, len(hypothesis)):
weights = (1.0 / n,) * n # Uniform weights.
assert sentence_bleu(references, hypothesis, weights) == 1.0
def test_partial_matches_hypothesis_longer_than_reference(self):
references = ["John loves Mary".split()]
hypothesis = "John loves Mary who loves Mike".split()
# Since no 4-gram matches were found, the result should be zero
# exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
# Checks that the warning has been raised because len(reference) < 4.
try:
self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
except AttributeError:
pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
# @unittest.skip("Skipping fringe cases for BLEU.")
class TestBLEUFringeCases(unittest.TestCase):
def test_case_where_n_is_bigger_than_hypothesis_length(self):
# Test BLEU to nth order of n-grams, where n > len(hypothesis).
references = ["John loves Mary ?".split()]
hypothesis = "John loves Mary".split()
n = len(hypothesis) + 1
weights = (1.0 / n,) * n # Uniform weights.
# Since no n-gram matches were found, the result should be zero
# exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
self.assertAlmostEqual(
sentence_bleu(references, hypothesis, weights), 0.0, places=4
)
# Checks that the warning has been raised because len(hypothesis) < 4.
try:
self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
except AttributeError:
pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
# Test case where n > len(hypothesis) and also n > len(reference);
# it is the special case where reference == hypothesis.
references = ["John loves Mary".split()]
hypothesis = "John loves Mary".split()
# Since no 4-gram matches were found, the result should be zero
# exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
self.assertAlmostEqual(
sentence_bleu(references, hypothesis, weights), 0.0, places=4
)
def test_empty_hypothesis(self):
# Test case where the hypothesis is empty.
references = ["The candidate has no alignment to any of the references".split()]
hypothesis = []
assert sentence_bleu(references, hypothesis) == 0
def test_length_one_hypothesis(self):
# Test case where the hypothesis has length 1, using smoothing method 4.
references = ["The candidate has no alignment to any of the references".split()]
hypothesis = ["Foo"]
method4 = SmoothingFunction().method4
try:
sentence_bleu(references, hypothesis, smoothing_function=method4)
except ValueError:
pass # A length-1 hypothesis may raise a ValueError in smoothing method 4.
def test_empty_references(self):
# Test case where the reference is empty.
references = [[]]
hypothesis = "John loves Mary".split()
assert sentence_bleu(references, hypothesis) == 0
def test_empty_references_and_hypothesis(self):
# Test case where both the references and the hypothesis are empty.
references = [[]]
hypothesis = []
assert sentence_bleu(references, hypothesis) == 0
def test_reference_or_hypothesis_shorter_than_fourgrams(self):
# Test case where the length of reference or hypothesis
# is shorter than 4.
references = ["let it go".split()]
hypothesis = "let go it".split()
# Checks that the score for this hypothesis and reference is 0.0
# exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
# Checks that the warning has been raised.
try:
self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
except AttributeError:
pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
def test_numpy_weights(self):
# Test case where there are 0 matches.
references = ["The candidate has no alignment to any of the references".split()]
hypothesis = "John loves Mary".split()
weights = np.array([0.25] * 4)
assert sentence_bleu(references, hypothesis, weights) == 0
class TestBLEUvsMteval13a(unittest.TestCase):
def test_corpus_bleu(self):
ref_file = find("models/wmt15_eval/ref.ru")
hyp_file = find("models/wmt15_eval/google.ru")
mteval_output_file = find("models/wmt15_eval/mteval-13a.output")
# Reads the BLEU scores from the `mteval-13a.output` file.
# The order of the list corresponds to the order of the ngrams.
with open(mteval_output_file) as mteval_fin:
# The scores are located on the second-to-last line of the file.
# The first and last items on that line are the score label and system name.
mteval_bleu_scores = map(float, mteval_fin.readlines()[-2].split()[1:-1])
with open(ref_file, encoding="utf8") as ref_fin:
with open(hyp_file, encoding="utf8") as hyp_fin:
# Whitespace tokenize the file.
# Note: split() automatically strips whitespace.
hypothesis = list(map(lambda x: x.split(), hyp_fin))
# Note that the corpus_bleu input is a list of lists of references.
references = list(map(lambda x: [x.split()], ref_fin))
# Without smoothing.
for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
nltk_bleu = corpus_bleu(
references, hypothesis, weights=(1.0 / i,) * i
)
# Check that the BLEU score difference is less than 0.005.
# Note: this is an approximate comparison; although a difference of
# +/- 0.01 BLEU might be "statistically significant", it does not
# necessarily reflect a difference in actual translation quality.
assert abs(mteval_bleu - nltk_bleu) < 0.005
# With the same smoothing method used in mteval-v13a.pl
chencherry = SmoothingFunction()
for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
nltk_bleu = corpus_bleu(
references,
hypothesis,
weights=(1.0 / i,) * i,
smoothing_function=chencherry.method3,
)
assert abs(mteval_bleu - nltk_bleu) < 0.005
class TestBLEUWithBadSentence(unittest.TestCase):
def test_corpus_bleu_with_bad_sentence(self):
hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
ref = str(
"Their tasks include changing a pump on the faulty stokehold ."
"Likewise , two species that are very similar in morphology "
"were distinguished using genetics ."
)
references = [[ref.split()]]
hypotheses = [hyp.split()]
try: # Check that the warning is raised since the count of 2-gram overlaps is 0.
with self.assertWarns(UserWarning):
# Verify that the BLEU score is 0.0 since the count of 2-gram overlaps is 0.
self.assertAlmostEqual(
corpus_bleu(references, hypotheses), 0.0, places=4
)
except (
AttributeError
): # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.0, places=4)
class TestBLEUWithMultipleWeights(unittest.TestCase):
def test_corpus_bleu_with_multiple_weights(self):
hyp1 = [
"It",
"is",
"a",
"guide",
"to",
"action",
"which",
"ensures",
"that",
"the",
"military",
"always",
"obeys",
"the",
"commands",
"of",
"the",
"party",
]
ref1a = [
"It",
"is",
"a",
"guide",
"to",
"action",
"that",
"ensures",
"that",
"the",
"military",
"will",
"forever",
"heed",
"Party",
"commands",
]
ref1b = [
"It",
"is",
"the",
"guiding",
"principle",
"which",
"guarantees",
"the",
"military",
"forces",
"always",
"being",
"under",
"the",
"command",
"of",
"the",
"Party",
]
ref1c = [
"It",
"is",
"the",
"practical",
"guide",
"for",
"the",
"army",
"always",
"to",
"heed",
"the",
"directions",
"of",
"the",
"party",
]
hyp2 = [
"he",
"read",
"the",
"book",
"because",
"he",
"was",
"interested",
"in",
"world",
"history",
]
ref2a = [
"he",
"was",
"interested",
"in",
"world",
"history",
"because",
"he",
"read",
"the",
"book",
]
weight_1 = (1, 0, 0, 0)
weight_2 = (0.25, 0.25, 0.25, 0.25)
weight_3 = (0, 0, 0, 0, 1)
bleu_scores = corpus_bleu(
list_of_references=[[ref1a, ref1b, ref1c], [ref2a]],
hypotheses=[hyp1, hyp2],
weights=[weight_1, weight_2, weight_3],
)
assert bleu_scores[0] == corpus_bleu(
[[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_1
)
assert bleu_scores[1] == corpus_bleu(
[[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_2
)
assert bleu_scores[2] == corpus_bleu(
[[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_3
)


@@ -0,0 +1,154 @@
"""
Tests GDFA alignments
"""
import unittest
from nltk.translate.gdfa import grow_diag_final_and
class TestGDFA(unittest.TestCase):
def test_from_eflomal_outputs(self):
"""
Testing GDFA with the first 10 eflomal outputs from issue #1829
https://github.com/nltk/nltk/issues/1829
"""
# Input.
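# forwards/backwards are the two one-directional word alignments to be symmetrized;
# each string is a whitespace-separated list of Pharaoh-style "source-target" index pairs.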
forwards = [
"0-0 1-2",
"0-0 1-1",
"0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 7-8 9-9 10-10 9-11 11-12 12-13 13-14",
"0-0 1-1 1-2 2-3 3-4 4-5 4-6 5-7 6-8 8-9 9-10",
"0-0 14-1 15-2 16-3 20-5 21-6 22-7 5-8 6-9 7-10 8-11 9-12 10-13 11-14 12-15 13-16 14-17 17-18 18-19 19-20 20-21 23-22 24-23 25-24 26-25 27-27 28-28 29-29 30-30 31-31",
"0-0 1-1 0-2 2-3",
"0-0 2-2 4-4",
"0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-20",
"3-0 4-1 6-2 5-3 6-4 7-5 8-6 9-7 10-8 11-9 16-10 9-12 10-13 12-14",
"1-0",
]
backwards = [
"0-0 1-2",
"0-0 1-1",
"0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 9-8 10-10 11-12 12-11 13-13",
"0-0 1-2 2-3 3-4 4-6 6-8 7-5 8-7 9-8",
"0-0 1-8 2-9 3-10 4-11 5-12 6-11 8-13 9-14 10-15 11-16 12-17 13-18 14-19 15-20 16-21 17-22 18-23 19-24 20-29 21-30 22-31 23-2 24-3 25-4 26-5 27-5 28-6 29-7 30-28 31-31",
"0-0 1-1 2-3",
"0-0 1-1 2-3 4-4",
"0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-16 21-18",
"0-0 1-1 3-2 4-1 5-3 6-4 7-5 8-6 9-7 10-8 11-9 12-8 13-9 14-8 15-9 16-10",
"1-0",
]
source_lens = [2, 3, 3, 15, 11, 33, 4, 6, 23, 18]
target_lens = [2, 4, 3, 16, 12, 33, 5, 6, 22, 16]
# Expected Output.
expected = [
[(0, 0), (1, 2)],
[(0, 0), (1, 1)],
[
(0, 0),
(2, 1),
(3, 2),
(4, 3),
(5, 4),
(6, 5),
(7, 6),
(8, 7),
(10, 10),
(11, 12),
],
[
(0, 0),
(1, 1),
(1, 2),
(2, 3),
(3, 4),
(4, 5),
(4, 6),
(5, 7),
(6, 8),
(7, 5),
(8, 7),
(8, 9),
(9, 8),
(9, 10),
],
[
(0, 0),
(1, 8),
(2, 9),
(3, 10),
(4, 11),
(5, 8),
(6, 9),
(6, 11),
(7, 10),
(8, 11),
(31, 31),
],
[(0, 0), (0, 2), (1, 1), (2, 3)],
[(0, 0), (1, 1), (2, 2), (2, 3), (4, 4)],
[
(0, 0),
(1, 1),
(2, 3),
(3, 4),
(5, 5),
(7, 6),
(8, 7),
(9, 8),
(10, 9),
(11, 10),
(12, 11),
(13, 12),
(14, 13),
(15, 14),
(16, 16),
(17, 17),
(18, 18),
(19, 19),
],
[
(0, 0),
(1, 1),
(3, 0),
(3, 2),
(4, 1),
(5, 3),
(6, 2),
(6, 4),
(7, 5),
(8, 6),
(9, 7),
(9, 12),
(10, 8),
(10, 13),
(11, 9),
(12, 8),
(12, 14),
(13, 9),
(14, 8),
(15, 9),
(16, 10),
],
[(1, 0)],
[
(0, 0),
(1, 1),
(3, 2),
(4, 3),
(5, 4),
(6, 5),
(7, 6),
(9, 10),
(10, 12),
(11, 13),
(12, 14),
(13, 15),
],
]
# Iterate through all 10 examples and check for expected outputs.
for fw, bw, src_len, trg_len, expect in zip(
forwards, backwards, source_lens, target_lens, expected
):
self.assertListEqual(expect, grow_diag_final_and(src_len, trg_len, fw, bw))


@@ -0,0 +1,73 @@
"""
Tests for IBM Model 1 training methods
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent, IBMModel, IBMModel1
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel1(unittest.TestCase):
def test_set_uniform_translation_probabilities(self):
# arrange
corpus = [
AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]),
AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]),
]
model1 = IBMModel1(corpus, 0)
# act
model1.set_uniform_probabilities(corpus)
# assert
# expected_prob = 1.0 / (target vocab size + 1)
self.assertEqual(model1.translation_table["ham"]["eier"], 1.0 / 3)
self.assertEqual(model1.translation_table["eggs"][None], 1.0 / 3)
def test_set_uniform_translation_probabilities_of_non_domain_values(self):
# arrange
corpus = [
AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]),
AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]),
]
model1 = IBMModel1(corpus, 0)
# act
model1.set_uniform_probabilities(corpus)
# assert
# examine target words that are not in the training data domain
self.assertEqual(model1.translation_table["parrot"]["eier"], IBMModel.MIN_PROB)
def test_prob_t_a_given_s(self):
# arrange
src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"]
trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"]
corpus = [AlignedSent(trg_sentence, src_sentence)]
alignment_info = AlignmentInfo(
(0, 1, 4, 0, 2, 5, 5),
[None] + src_sentence,
["UNUSED"] + trg_sentence,
None,
)
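# alignment[j] is the source position aligned to target word j (index 0 is unused);
# here 'i'->'ich', 'love'->'gern', 'to'->NULL, 'eat'->'esse',
# 'smoked' and 'ham'->'räucherschinken', matching the translation table below.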
translation_table = defaultdict(lambda: defaultdict(float))
translation_table["i"]["ich"] = 0.98
translation_table["love"]["gern"] = 0.98
translation_table["to"][None] = 0.98
translation_table["eat"]["esse"] = 0.98
translation_table["smoked"]["räucherschinken"] = 0.98
translation_table["ham"]["räucherschinken"] = 0.98
model1 = IBMModel1(corpus, 0)
model1.translation_table = translation_table
# act
probability = model1.prob_t_a_given_s(alignment_info)
# assert
lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
expected_probability = lexical_translation
self.assertEqual(round(probability, 4), round(expected_probability, 4))


@@ -0,0 +1,86 @@
"""
Tests for IBM Model 2 training methods
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent, IBMModel, IBMModel2
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel2(unittest.TestCase):
def test_set_uniform_alignment_probabilities(self):
# arrange
corpus = [
AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]),
AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]),
]
model2 = IBMModel2(corpus, 0)
# act
model2.set_uniform_probabilities(corpus)
# assert
# expected_prob = 1.0 / (length of source sentence + 1)
self.assertEqual(model2.alignment_table[0][1][3][2], 1.0 / 4)
self.assertEqual(model2.alignment_table[2][4][2][4], 1.0 / 3)
def test_set_uniform_alignment_probabilities_of_non_domain_values(self):
# arrange
corpus = [
AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]),
AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]),
]
model2 = IBMModel2(corpus, 0)
# act
model2.set_uniform_probabilities(corpus)
# assert
# examine i and j values that are not in the training data domain
self.assertEqual(model2.alignment_table[99][1][3][2], IBMModel.MIN_PROB)
self.assertEqual(model2.alignment_table[2][99][2][4], IBMModel.MIN_PROB)
def test_prob_t_a_given_s(self):
# arrange
src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"]
trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"]
corpus = [AlignedSent(trg_sentence, src_sentence)]
alignment_info = AlignmentInfo(
(0, 1, 4, 0, 2, 5, 5),
[None] + src_sentence,
["UNUSED"] + trg_sentence,
None,
)
translation_table = defaultdict(lambda: defaultdict(float))
translation_table["i"]["ich"] = 0.98
translation_table["love"]["gern"] = 0.98
translation_table["to"][None] = 0.98
translation_table["eat"]["esse"] = 0.98
translation_table["smoked"]["räucherschinken"] = 0.98
translation_table["ham"]["räucherschinken"] = 0.98
alignment_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
)
alignment_table[0][3][5][6] = 0.97 # None -> to
alignment_table[1][1][5][6] = 0.97 # ich -> i
alignment_table[2][4][5][6] = 0.97 # esse -> eat
alignment_table[4][2][5][6] = 0.97 # gern -> love
alignment_table[5][5][5][6] = 0.96 # räucherschinken -> smoked
alignment_table[5][6][5][6] = 0.96 # räucherschinken -> ham
model2 = IBMModel2(corpus, 0)
model2.translation_table = translation_table
model2.alignment_table = alignment_table
# act
probability = model2.prob_t_a_given_s(alignment_info)
# assert
lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
alignment = 0.97 * 0.97 * 0.97 * 0.97 * 0.96 * 0.96
expected_probability = lexical_translation * alignment
self.assertEqual(round(probability, 4), round(expected_probability, 4))


@@ -0,0 +1,105 @@
"""
Tests for IBM Model 3 training methods
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent, IBMModel, IBMModel3
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel3(unittest.TestCase):
def test_set_uniform_distortion_probabilities(self):
# arrange
corpus = [
AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]),
AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]),
]
model3 = IBMModel3(corpus, 0)
# act
model3.set_uniform_probabilities(corpus)
# assert
# expected_prob = 1.0 / length of target sentence
self.assertEqual(model3.distortion_table[1][0][3][2], 1.0 / 2)
self.assertEqual(model3.distortion_table[4][2][2][4], 1.0 / 4)
def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
# arrange
corpus = [
AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]),
AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]),
]
model3 = IBMModel3(corpus, 0)
# act
model3.set_uniform_probabilities(corpus)
# assert
# examine i and j values that are not in the training data domain
self.assertEqual(model3.distortion_table[0][0][3][2], IBMModel.MIN_PROB)
self.assertEqual(model3.distortion_table[9][2][2][4], IBMModel.MIN_PROB)
self.assertEqual(model3.distortion_table[2][9][2][4], IBMModel.MIN_PROB)
def test_prob_t_a_given_s(self):
# arrange
src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"]
trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"]
corpus = [AlignedSent(trg_sentence, src_sentence)]
alignment_info = AlignmentInfo(
(0, 1, 4, 0, 2, 5, 5),
[None] + src_sentence,
["UNUSED"] + trg_sentence,
[[3], [1], [4], [], [2], [5, 6]],
)
distortion_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
)
distortion_table[1][1][5][6] = 0.97 # i -> ich
distortion_table[2][4][5][6] = 0.97 # love -> gern
distortion_table[3][0][5][6] = 0.97 # to -> NULL
distortion_table[4][2][5][6] = 0.97 # eat -> esse
distortion_table[5][5][5][6] = 0.97 # smoked -> räucherschinken
distortion_table[6][5][5][6] = 0.97 # ham -> räucherschinken
translation_table = defaultdict(lambda: defaultdict(float))
translation_table["i"]["ich"] = 0.98
translation_table["love"]["gern"] = 0.98
translation_table["to"][None] = 0.98
translation_table["eat"]["esse"] = 0.98
translation_table["smoked"]["räucherschinken"] = 0.98
translation_table["ham"]["räucherschinken"] = 0.98
fertility_table = defaultdict(lambda: defaultdict(float))
fertility_table[1]["ich"] = 0.99
fertility_table[1]["esse"] = 0.99
fertility_table[0]["ja"] = 0.99
fertility_table[1]["gern"] = 0.99
fertility_table[2]["räucherschinken"] = 0.999
fertility_table[1][None] = 0.99
probabilities = {
"p1": 0.167,
"translation_table": translation_table,
"distortion_table": distortion_table,
"fertility_table": fertility_table,
"alignment_table": None,
}
model3 = IBMModel3(corpus, 0, probabilities)
# act
probability = model3.prob_t_a_given_s(alignment_info)
# assert
null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
distortion = 0.97 * 0.97 * 0.97 * 0.97 * 0.97 * 0.97
expected_probability = (
null_generation * fertility * lexical_translation * distortion
)
self.assertEqual(round(probability, 4), round(expected_probability, 4))


@@ -0,0 +1,120 @@
"""
Tests for IBM Model 4 training methods
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent, IBMModel, IBMModel4
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel4(unittest.TestCase):
def test_set_uniform_distortion_probabilities_of_max_displacements(self):
# arrange
src_classes = {"schinken": 0, "eier": 0, "spam": 1}
trg_classes = {"ham": 0, "eggs": 1, "spam": 2}
corpus = [
AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]),
AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]),
]
model4 = IBMModel4(corpus, 0, src_classes, trg_classes)
# act
model4.set_uniform_probabilities(corpus)
# assert
# number of displacement values =
# 2 *(number of words in longest target sentence - 1)
expected_prob = 1.0 / (2 * (4 - 1))
# examine the boundary values for (displacement, src_class, trg_class)
self.assertEqual(model4.head_distortion_table[3][0][0], expected_prob)
self.assertEqual(model4.head_distortion_table[-3][1][2], expected_prob)
self.assertEqual(model4.non_head_distortion_table[3][0], expected_prob)
self.assertEqual(model4.non_head_distortion_table[-3][2], expected_prob)
def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
# arrange
src_classes = {"schinken": 0, "eier": 0, "spam": 1}
trg_classes = {"ham": 0, "eggs": 1, "spam": 2}
corpus = [
AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]),
AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]),
]
model4 = IBMModel4(corpus, 0, src_classes, trg_classes)
# act
model4.set_uniform_probabilities(corpus)
# assert
# examine displacement values that are not in the training data domain
self.assertEqual(model4.head_distortion_table[4][0][0], IBMModel.MIN_PROB)
self.assertEqual(model4.head_distortion_table[100][1][2], IBMModel.MIN_PROB)
self.assertEqual(model4.non_head_distortion_table[4][0], IBMModel.MIN_PROB)
self.assertEqual(model4.non_head_distortion_table[100][2], IBMModel.MIN_PROB)
def test_prob_t_a_given_s(self):
# arrange
src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"]
trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"]
src_classes = {"räucherschinken": 0, "ja": 1, "ich": 2, "esse": 3, "gern": 4}
trg_classes = {"ham": 0, "smoked": 1, "i": 3, "love": 4, "to": 2, "eat": 4}
corpus = [AlignedSent(trg_sentence, src_sentence)]
alignment_info = AlignmentInfo(
(0, 1, 4, 0, 2, 5, 5),
[None] + src_sentence,
["UNUSED"] + trg_sentence,
[[3], [1], [4], [], [2], [5, 6]],
)
head_distortion_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(float))
)
head_distortion_table[1][None][3] = 0.97 # None, i
head_distortion_table[3][2][4] = 0.97 # ich, eat
head_distortion_table[-2][3][4] = 0.97 # esse, love
head_distortion_table[3][4][1] = 0.97 # gern, smoked
non_head_distortion_table = defaultdict(lambda: defaultdict(float))
non_head_distortion_table[1][0] = 0.96 # ham
translation_table = defaultdict(lambda: defaultdict(float))
translation_table["i"]["ich"] = 0.98
translation_table["love"]["gern"] = 0.98
translation_table["to"][None] = 0.98
translation_table["eat"]["esse"] = 0.98
translation_table["smoked"]["räucherschinken"] = 0.98
translation_table["ham"]["räucherschinken"] = 0.98
fertility_table = defaultdict(lambda: defaultdict(float))
fertility_table[1]["ich"] = 0.99
fertility_table[1]["esse"] = 0.99
fertility_table[0]["ja"] = 0.99
fertility_table[1]["gern"] = 0.99
fertility_table[2]["räucherschinken"] = 0.999
fertility_table[1][None] = 0.99
probabilities = {
"p1": 0.167,
"translation_table": translation_table,
"head_distortion_table": head_distortion_table,
"non_head_distortion_table": non_head_distortion_table,
"fertility_table": fertility_table,
"alignment_table": None,
}
model4 = IBMModel4(corpus, 0, src_classes, trg_classes, probabilities)
# act
probability = model4.prob_t_a_given_s(alignment_info)
# assert
null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
distortion = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96
expected_probability = (
null_generation * fertility * lexical_translation * distortion
)
self.assertEqual(round(probability, 4), round(expected_probability, 4))


@@ -0,0 +1,160 @@
"""
Tests for IBM Model 5 training methods
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent, IBMModel, IBMModel4, IBMModel5
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel5(unittest.TestCase):
def test_set_uniform_vacancy_probabilities_of_max_displacements(self):
# arrange
src_classes = {"schinken": 0, "eier": 0, "spam": 1}
trg_classes = {"ham": 0, "eggs": 1, "spam": 2}
corpus = [
AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]),
AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]),
]
model5 = IBMModel5(corpus, 0, src_classes, trg_classes)
# act
model5.set_uniform_probabilities(corpus)
# assert
# number of vacancy difference values =
# 2 * number of words in longest target sentence
expected_prob = 1.0 / (2 * 4)
# examine the boundary values for (dv, max_v, trg_class)
self.assertEqual(model5.head_vacancy_table[4][4][0], expected_prob)
self.assertEqual(model5.head_vacancy_table[-3][1][2], expected_prob)
self.assertEqual(model5.non_head_vacancy_table[4][4][0], expected_prob)
self.assertEqual(model5.non_head_vacancy_table[-3][1][2], expected_prob)
def test_set_uniform_vacancy_probabilities_of_non_domain_values(self):
# arrange
src_classes = {"schinken": 0, "eier": 0, "spam": 1}
trg_classes = {"ham": 0, "eggs": 1, "spam": 2}
corpus = [
AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]),
AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]),
]
model5 = IBMModel5(corpus, 0, src_classes, trg_classes)
# act
model5.set_uniform_probabilities(corpus)
# assert
# examine dv and max_v values that are not in the training data domain
self.assertEqual(model5.head_vacancy_table[5][4][0], IBMModel.MIN_PROB)
self.assertEqual(model5.head_vacancy_table[-4][1][2], IBMModel.MIN_PROB)
self.assertEqual(model5.head_vacancy_table[4][0][0], IBMModel.MIN_PROB)
self.assertEqual(model5.non_head_vacancy_table[5][4][0], IBMModel.MIN_PROB)
self.assertEqual(model5.non_head_vacancy_table[-4][1][2], IBMModel.MIN_PROB)
def test_prob_t_a_given_s(self):
# arrange
src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"]
trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"]
src_classes = {"räucherschinken": 0, "ja": 1, "ich": 2, "esse": 3, "gern": 4}
trg_classes = {"ham": 0, "smoked": 1, "i": 3, "love": 4, "to": 2, "eat": 4}
corpus = [AlignedSent(trg_sentence, src_sentence)]
alignment_info = AlignmentInfo(
(0, 1, 4, 0, 2, 5, 5),
[None] + src_sentence,
["UNUSED"] + trg_sentence,
[[3], [1], [4], [], [2], [5, 6]],
)
head_vacancy_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(float))
)
head_vacancy_table[1 - 0][6][3] = 0.97 # ich -> i
head_vacancy_table[3 - 0][5][4] = 0.97 # esse -> eat
head_vacancy_table[1 - 2][4][4] = 0.97 # gern -> love
head_vacancy_table[2 - 0][2][1] = 0.97 # räucherschinken -> smoked
non_head_vacancy_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(float))
)
non_head_vacancy_table[1 - 0][1][0] = 0.96 # räucherschinken -> ham
translation_table = defaultdict(lambda: defaultdict(float))
translation_table["i"]["ich"] = 0.98
translation_table["love"]["gern"] = 0.98
translation_table["to"][None] = 0.98
translation_table["eat"]["esse"] = 0.98
translation_table["smoked"]["räucherschinken"] = 0.98
translation_table["ham"]["räucherschinken"] = 0.98
fertility_table = defaultdict(lambda: defaultdict(float))
fertility_table[1]["ich"] = 0.99
fertility_table[1]["esse"] = 0.99
fertility_table[0]["ja"] = 0.99
fertility_table[1]["gern"] = 0.99
fertility_table[2]["räucherschinken"] = 0.999
fertility_table[1][None] = 0.99
probabilities = {
"p1": 0.167,
"translation_table": translation_table,
"fertility_table": fertility_table,
"head_vacancy_table": head_vacancy_table,
"non_head_vacancy_table": non_head_vacancy_table,
"head_distortion_table": None,
"non_head_distortion_table": None,
"alignment_table": None,
}
model5 = IBMModel5(corpus, 0, src_classes, trg_classes, probabilities)
# act
probability = model5.prob_t_a_given_s(alignment_info)
# assert
null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
vacancy = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96
expected_probability = (
null_generation * fertility * lexical_translation * vacancy
)
self.assertEqual(round(probability, 4), round(expected_probability, 4))
def test_prune(self):
# arrange
alignment_infos = [
AlignmentInfo((1, 1), None, None, None),
AlignmentInfo((1, 2), None, None, None),
AlignmentInfo((2, 1), None, None, None),
AlignmentInfo((2, 2), None, None, None),
AlignmentInfo((0, 0), None, None, None),
]
min_factor = IBMModel5.MIN_SCORE_FACTOR
best_score = 0.9
scores = {
(1, 1): min(min_factor * 1.5, 1) * best_score, # above threshold
(1, 2): best_score,
(2, 1): min_factor * best_score, # at threshold
(2, 2): min_factor * best_score * 0.5, # low score
(0, 0): min(min_factor * 1.1, 1) * 1.2, # above threshold
}
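# prune() keeps only alignments scoring above MIN_SCORE_FACTOR * best_score, so the
# at-threshold and low-score entries above are expected to be dropped, leaving 3.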
corpus = [AlignedSent(["a"], ["b"])]
original_prob_function = IBMModel4.model4_prob_t_a_given_s
# mock static method
IBMModel4.model4_prob_t_a_given_s = staticmethod(
lambda a, model: scores[a.alignment]
)
model5 = IBMModel5(corpus, 0, None, None)
# act
pruned_alignments = model5.prune(alignment_infos)
# assert
self.assertEqual(len(pruned_alignments), 3)
# restore static method
IBMModel4.model4_prob_t_a_given_s = original_prob_function


@@ -0,0 +1,269 @@
"""
Tests for common methods of IBM translation models
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent, IBMModel
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel(unittest.TestCase):
__TEST_SRC_SENTENCE = ["j'", "aime", "bien", "jambon"]
__TEST_TRG_SENTENCE = ["i", "love", "ham"]
def test_vocabularies_are_initialized(self):
parallel_corpora = [
AlignedSent(["one", "two", "three", "four"], ["un", "deux", "trois"]),
AlignedSent(["five", "one", "six"], ["quatre", "cinq", "six"]),
AlignedSent([], ["sept"]),
]
ibm_model = IBMModel(parallel_corpora)
self.assertEqual(len(ibm_model.src_vocab), 8)
self.assertEqual(len(ibm_model.trg_vocab), 6)
def test_vocabularies_are_initialized_even_with_empty_corpora(self):
parallel_corpora = []
ibm_model = IBMModel(parallel_corpora)
self.assertEqual(len(ibm_model.src_vocab), 1) # addition of NULL token
self.assertEqual(len(ibm_model.trg_vocab), 0)
def test_best_model2_alignment(self):
# arrange
sentence_pair = AlignedSent(
TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
)
# None and 'bien' have zero fertility
translation_table = {
"i": {"j'": 0.9, "aime": 0.05, "bien": 0.02, "jambon": 0.03, None: 0},
"love": {"j'": 0.05, "aime": 0.9, "bien": 0.01, "jambon": 0.01, None: 0.03},
"ham": {"j'": 0, "aime": 0.01, "bien": 0, "jambon": 0.99, None: 0},
}
alignment_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))
)
ibm_model = IBMModel([])
ibm_model.translation_table = translation_table
ibm_model.alignment_table = alignment_table
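# With a flat alignment table, the best alignment follows the translation probabilities:
# 'i' -> "j'" (position 1), 'love' -> 'aime' (2), 'ham' -> 'jambon' (4).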
# act
a_info = ibm_model.best_model2_alignment(sentence_pair)
# assert
self.assertEqual(a_info.alignment[1:], (1, 2, 4)) # 0th element unused
self.assertEqual(a_info.cepts, [[], [1], [2], [], [3]])
def test_best_model2_alignment_does_not_change_pegged_alignment(self):
# arrange
sentence_pair = AlignedSent(
TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
)
translation_table = {
"i": {"j'": 0.9, "aime": 0.05, "bien": 0.02, "jambon": 0.03, None: 0},
"love": {"j'": 0.05, "aime": 0.9, "bien": 0.01, "jambon": 0.01, None: 0.03},
"ham": {"j'": 0, "aime": 0.01, "bien": 0, "jambon": 0.99, None: 0},
}
alignment_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))
)
ibm_model = IBMModel([])
ibm_model.translation_table = translation_table
ibm_model.alignment_table = alignment_table
# act: force 'love' to be pegged to 'jambon'
a_info = ibm_model.best_model2_alignment(sentence_pair, 2, 4)
# assert
self.assertEqual(a_info.alignment[1:], (1, 4, 4))
self.assertEqual(a_info.cepts, [[], [1], [], [], [2, 3]])
def test_best_model2_alignment_handles_fertile_words(self):
# arrange
sentence_pair = AlignedSent(
["i", "really", ",", "really", "love", "ham"],
TestIBMModel.__TEST_SRC_SENTENCE,
)
# 'bien' produces 2 target words: 'really' and another 'really'
translation_table = {
"i": {"j'": 0.9, "aime": 0.05, "bien": 0.02, "jambon": 0.03, None: 0},
"really": {"j'": 0, "aime": 0, "bien": 0.9, "jambon": 0.01, None: 0.09},
",": {"j'": 0, "aime": 0, "bien": 0.3, "jambon": 0, None: 0.7},
"love": {"j'": 0.05, "aime": 0.9, "bien": 0.01, "jambon": 0.01, None: 0.03},
"ham": {"j'": 0, "aime": 0.01, "bien": 0, "jambon": 0.99, None: 0},
}
alignment_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))
)
ibm_model = IBMModel([])
ibm_model.translation_table = translation_table
ibm_model.alignment_table = alignment_table
# act
a_info = ibm_model.best_model2_alignment(sentence_pair)
# assert
self.assertEqual(a_info.alignment[1:], (1, 3, 0, 3, 2, 4))
self.assertEqual(a_info.cepts, [[3], [1], [5], [2, 4], [6]])
def test_best_model2_alignment_handles_empty_src_sentence(self):
# arrange
sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE, [])
ibm_model = IBMModel([])
# act
a_info = ibm_model.best_model2_alignment(sentence_pair)
# assert
self.assertEqual(a_info.alignment[1:], (0, 0, 0))
self.assertEqual(a_info.cepts, [[1, 2, 3]])
def test_best_model2_alignment_handles_empty_trg_sentence(self):
# arrange
sentence_pair = AlignedSent([], TestIBMModel.__TEST_SRC_SENTENCE)
ibm_model = IBMModel([])
# act
a_info = ibm_model.best_model2_alignment(sentence_pair)
# assert
self.assertEqual(a_info.alignment[1:], ())
self.assertEqual(a_info.cepts, [[], [], [], [], []])
def test_neighboring_finds_neighbor_alignments(self):
# arrange
a_info = AlignmentInfo(
(0, 3, 2),
(None, "des", "œufs", "verts"),
("UNUSED", "green", "eggs"),
[[], [], [2], [1]],
)
ibm_model = IBMModel([])
# act
neighbors = ibm_model.neighboring(a_info)
# assert
neighbor_alignments = set()
for neighbor in neighbors:
neighbor_alignments.add(neighbor.alignment)
expected_alignments = {
# moves
(0, 0, 2),
(0, 1, 2),
(0, 2, 2),
(0, 3, 0),
(0, 3, 1),
(0, 3, 3),
# swaps
(0, 2, 3),
# original alignment
(0, 3, 2),
}
self.assertEqual(neighbor_alignments, expected_alignments)
def test_neighboring_sets_neighbor_alignment_info(self):
# arrange
a_info = AlignmentInfo(
(0, 3, 2),
(None, "des", "œufs", "verts"),
("UNUSED", "green", "eggs"),
[[], [], [2], [1]],
)
ibm_model = IBMModel([])
# act
neighbors = ibm_model.neighboring(a_info)
# assert: select a few particular alignments
for neighbor in neighbors:
if neighbor.alignment == (0, 2, 2):
moved_alignment = neighbor
elif neighbor.alignment == (0, 3, 2):
swapped_alignment = neighbor
self.assertEqual(moved_alignment.cepts, [[], [], [1, 2], []])
self.assertEqual(swapped_alignment.cepts, [[], [], [2], [1]])
def test_neighboring_returns_neighbors_with_pegged_alignment(self):
# arrange
a_info = AlignmentInfo(
(0, 3, 2),
(None, "des", "œufs", "verts"),
("UNUSED", "green", "eggs"),
[[], [], [2], [1]],
)
ibm_model = IBMModel([])
# act: peg 'eggs' to align with 'œufs'
neighbors = ibm_model.neighboring(a_info, 2)
# assert
neighbor_alignments = set()
for neighbor in neighbors:
neighbor_alignments.add(neighbor.alignment)
expected_alignments = {
# moves
(0, 0, 2),
(0, 1, 2),
(0, 2, 2),
# no swaps
# original alignment
(0, 3, 2),
}
self.assertEqual(neighbor_alignments, expected_alignments)
def test_hillclimb(self):
# arrange
initial_alignment = AlignmentInfo((0, 3, 2), None, None, None)
def neighboring_mock(a, j):
if a.alignment == (0, 3, 2):
return {
AlignmentInfo((0, 2, 2), None, None, None),
AlignmentInfo((0, 1, 1), None, None, None),
}
elif a.alignment == (0, 2, 2):
return {
AlignmentInfo((0, 3, 3), None, None, None),
AlignmentInfo((0, 4, 4), None, None, None),
}
return set()
def prob_t_a_given_s_mock(a):
prob_values = {
(0, 3, 2): 0.5,
(0, 2, 2): 0.6,
(0, 1, 1): 0.4,
(0, 3, 3): 0.6,
(0, 4, 4): 0.7,
}
return prob_values.get(a.alignment, 0.01)
ibm_model = IBMModel([])
ibm_model.neighboring = neighboring_mock
ibm_model.prob_t_a_given_s = prob_t_a_given_s_mock
# act
best_alignment = ibm_model.hillclimb(initial_alignment)
# assert: hill climbing goes from (0, 3, 2) -> (0, 2, 2) -> (0, 4, 4)
self.assertEqual(best_alignment.alignment, (0, 4, 4))
def test_sample(self):
# arrange
sentence_pair = AlignedSent(
TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
)
ibm_model = IBMModel([])
ibm_model.prob_t_a_given_s = lambda x: 0.001
# act
samples, best_alignment = ibm_model.sample(sentence_pair)
# assert
self.assertEqual(len(samples), 61)


@@ -0,0 +1,20 @@
import unittest
from nltk.translate.meteor_score import meteor_score
class TestMETEOR(unittest.TestCase):
reference = [["this", "is", "a", "test"], ["this", "is" "test"]]
candidate = ["THIS", "Is", "a", "tEST"]
def test_meteor(self):
score = meteor_score(self.reference, self.candidate, preprocess=str.lower)
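# The lowercased candidate exactly matches the first reference: precision = recall = 1,
# and the single 4-word chunk gives a fragmentation penalty of 0.5 * (1 / 4) ** 3 = 0.0078125,
# so the score is 1 - 0.0078125 = 0.9921875.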
assert score == 0.9921875
def test_reference_type_check(self):
str_reference = [" ".join(ref) for ref in self.reference]
self.assertRaises(TypeError, meteor_score, str_reference, self.candidate)
def test_candidate_type_check(self):
str_candidate = " ".join(self.candidate)
self.assertRaises(TypeError, meteor_score, self.reference, str_candidate)


@@ -0,0 +1,36 @@
"""
Tests for NIST translation evaluation metric
"""
import io
import unittest
from nltk.data import find
from nltk.translate.nist_score import corpus_nist
class TestNIST(unittest.TestCase):
def test_sentence_nist(self):
ref_file = find("models/wmt15_eval/ref.ru")
hyp_file = find("models/wmt15_eval/google.ru")
mteval_output_file = find("models/wmt15_eval/mteval-13a.output")
# Reads the NIST scores from the `mteval-13a.output` file.
# The order of the list corresponds to the order of the ngrams.
with open(mteval_output_file) as mteval_fin:
# The scores are located on the fourth-to-last line of the file.
# The first and last items on that line are the score label and system name.
mteval_nist_scores = map(float, mteval_fin.readlines()[-4].split()[1:-1])
with open(ref_file, encoding="utf8") as ref_fin:
with open(hyp_file, encoding="utf8") as hyp_fin:
# Whitespace tokenize the file.
# Note: split() automatically strips whitespace.
hypotheses = list(map(lambda x: x.split(), hyp_fin))
# Note that the corpus_nist input is a list of lists of references.
references = list(map(lambda x: [x.split()], ref_fin))
# Without smoothing.
for i, mteval_nist in zip(range(1, 10), mteval_nist_scores):
nltk_nist = corpus_nist(references, hypotheses, i)
# Check that the NIST score difference is less than 0.05.
assert abs(mteval_nist - nltk_nist) < 0.05


@@ -0,0 +1,294 @@
# Natural Language Toolkit: Stack decoder
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Tests for stack decoder
"""
import unittest
from collections import defaultdict
from math import log
from nltk.translate import PhraseTable, StackDecoder
from nltk.translate.stack_decoder import _Hypothesis, _Stack
class TestStackDecoder(unittest.TestCase):
def test_find_all_src_phrases(self):
# arrange
phrase_table = TestStackDecoder.create_fake_phrase_table()
stack_decoder = StackDecoder(phrase_table, None)
sentence = ("my", "hovercraft", "is", "full", "of", "eels")
# act
src_phrase_spans = stack_decoder.find_all_src_phrases(sentence)
# assert
self.assertEqual(src_phrase_spans[0], [2]) # 'my hovercraft'
self.assertEqual(src_phrase_spans[1], [2]) # 'hovercraft'
self.assertEqual(src_phrase_spans[2], [3]) # 'is'
self.assertEqual(src_phrase_spans[3], [5, 6]) # 'full of', 'full of eels'
self.assertFalse(src_phrase_spans[4]) # no entry starting with 'of'
self.assertEqual(src_phrase_spans[5], [6]) # 'eels'
def test_distortion_score(self):
# arrange
stack_decoder = StackDecoder(None, None)
stack_decoder.distortion_factor = 0.5
hypothesis = _Hypothesis()
hypothesis.src_phrase_span = (3, 5)
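# The previous phrase ends at source position 5 and the new phrase starts at position 8,
# so the distortion distance is 8 - 5 = 3 skipped words.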
# act
score = stack_decoder.distortion_score(hypothesis, (8, 10))
# assert
expected_score = log(stack_decoder.distortion_factor) * (8 - 5)
self.assertEqual(score, expected_score)
def test_distortion_score_of_first_expansion(self):
# arrange
stack_decoder = StackDecoder(None, None)
stack_decoder.distortion_factor = 0.5
hypothesis = _Hypothesis()
# act
score = stack_decoder.distortion_score(hypothesis, (8, 10))
# assert
# Expansion from an empty hypothesis always has zero distortion cost.
self.assertEqual(score, 0.0)
def test_compute_future_costs(self):
# arrange
phrase_table = TestStackDecoder.create_fake_phrase_table()
language_model = TestStackDecoder.create_fake_language_model()
stack_decoder = StackDecoder(phrase_table, language_model)
sentence = ("my", "hovercraft", "is", "full", "of", "eels")
# act
future_scores = stack_decoder.compute_future_scores(sentence)
# assert
self.assertEqual(
future_scores[1][2],
(
phrase_table.translations_for(("hovercraft",))[0].log_prob
+ language_model.probability(("hovercraft",))
),
)
self.assertEqual(
future_scores[0][2],
(
phrase_table.translations_for(("my", "hovercraft"))[0].log_prob
+ language_model.probability(("my", "hovercraft"))
),
)
def test_compute_future_costs_for_phrases_not_in_phrase_table(self):
# arrange
phrase_table = TestStackDecoder.create_fake_phrase_table()
language_model = TestStackDecoder.create_fake_language_model()
stack_decoder = StackDecoder(phrase_table, language_model)
sentence = ("my", "hovercraft", "is", "full", "of", "eels")
# act
future_scores = stack_decoder.compute_future_scores(sentence)
# assert
self.assertEqual(
future_scores[1][3], # 'hovercraft is' is not in phrase table
future_scores[1][2] + future_scores[2][3],
) # backoff
def test_future_score(self):
# arrange: sentence with 8 words; words 2, 3, 4 already translated
hypothesis = _Hypothesis()
hypothesis.untranslated_spans = lambda _: [(0, 2), (5, 8)] # mock
future_score_table = defaultdict(lambda: defaultdict(float))
future_score_table[0][2] = 0.4
future_score_table[5][8] = 0.5
stack_decoder = StackDecoder(None, None)
# act
future_score = stack_decoder.future_score(hypothesis, future_score_table, 8)
# assert
self.assertEqual(future_score, 0.4 + 0.5)
def test_valid_phrases(self):
# arrange
hypothesis = _Hypothesis()
# mock untranslated_spans method
hypothesis.untranslated_spans = lambda _: [(0, 2), (3, 6)]
all_phrases_from = [[1, 4], [2], [], [5], [5, 6, 7], [], [7]]
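# all_phrases_from[i] holds the end positions of source phrases starting at i; only phrases
# falling entirely within an untranslated span are valid, e.g. (0, 4) and (6, 7) are excluded.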
# act
phrase_spans = StackDecoder.valid_phrases(all_phrases_from, hypothesis)
# assert
self.assertEqual(phrase_spans, [(0, 1), (1, 2), (3, 5), (4, 5), (4, 6)])
@staticmethod
def create_fake_phrase_table():
phrase_table = PhraseTable()
phrase_table.add(("hovercraft",), ("",), 0.8)
phrase_table.add(("my", "hovercraft"), ("", ""), 0.7)
phrase_table.add(("my", "cheese"), ("", ""), 0.7)
phrase_table.add(("is",), ("",), 0.8)
phrase_table.add(("is",), ("",), 0.5)
phrase_table.add(("full", "of"), ("", ""), 0.01)
phrase_table.add(("full", "of", "eels"), ("", "", ""), 0.5)
phrase_table.add(("full", "of", "spam"), ("", ""), 0.5)
phrase_table.add(("eels",), ("",), 0.5)
phrase_table.add(("spam",), ("",), 0.5)
return phrase_table
@staticmethod
def create_fake_language_model():
# nltk.model should be used here once it is implemented
language_prob = defaultdict(lambda: -999.0)
language_prob[("my",)] = log(0.1)
language_prob[("hovercraft",)] = log(0.1)
language_prob[("is",)] = log(0.1)
language_prob[("full",)] = log(0.1)
language_prob[("of",)] = log(0.1)
language_prob[("eels",)] = log(0.1)
language_prob[("my", "hovercraft")] = log(0.3)
language_model = type(
"", (object,), {"probability": lambda _, phrase: language_prob[phrase]}
)()
return language_model
class TestHypothesis(unittest.TestCase):
def setUp(self):
root = _Hypothesis()
child = _Hypothesis(
raw_score=0.5,
src_phrase_span=(3, 7),
trg_phrase=("hello", "world"),
previous=root,
)
grandchild = _Hypothesis(
raw_score=0.4,
src_phrase_span=(1, 2),
trg_phrase=("and", "goodbye"),
previous=child,
)
self.hypothesis_chain = grandchild
def test_translation_so_far(self):
# act
translation = self.hypothesis_chain.translation_so_far()
# assert
self.assertEqual(translation, ["hello", "world", "and", "goodbye"])
def test_translation_so_far_for_empty_hypothesis(self):
# arrange
hypothesis = _Hypothesis()
# act
translation = hypothesis.translation_so_far()
# assert
self.assertEqual(translation, [])
def test_total_translated_words(self):
# act
total_translated_words = self.hypothesis_chain.total_translated_words()
# assert
self.assertEqual(total_translated_words, 5)
def test_translated_positions(self):
# act
translated_positions = self.hypothesis_chain.translated_positions()
# assert
translated_positions.sort()
self.assertEqual(translated_positions, [1, 3, 4, 5, 6])
def test_untranslated_spans(self):
# act
untranslated_spans = self.hypothesis_chain.untranslated_spans(10)
# assert
self.assertEqual(untranslated_spans, [(0, 1), (2, 3), (7, 10)])
def test_untranslated_spans_for_empty_hypothesis(self):
# arrange
hypothesis = _Hypothesis()
# act
untranslated_spans = hypothesis.untranslated_spans(10)
# assert
self.assertEqual(untranslated_spans, [(0, 10)])
class TestStack(unittest.TestCase):
def test_push_bumps_off_worst_hypothesis_when_stack_is_full(self):
# arrange
stack = _Stack(3)
poor_hypothesis = _Hypothesis(0.01)
# act
stack.push(_Hypothesis(0.2))
stack.push(poor_hypothesis)
stack.push(_Hypothesis(0.1))
stack.push(_Hypothesis(0.3))
# assert
self.assertFalse(poor_hypothesis in stack)
def test_push_removes_hypotheses_that_fall_below_beam_threshold(self):
# arrange
stack = _Stack(3, 0.5)
poor_hypothesis = _Hypothesis(0.01)
worse_hypothesis = _Hypothesis(0.009)
# act
stack.push(poor_hypothesis)
stack.push(worse_hypothesis)
stack.push(_Hypothesis(0.9)) # greatly superior hypothesis
# assert
self.assertFalse(poor_hypothesis in stack)
self.assertFalse(worse_hypothesis in stack)
def test_push_does_not_add_hypothesis_that_falls_below_beam_threshold(self):
# arrange
stack = _Stack(3, 0.5)
poor_hypothesis = _Hypothesis(0.01)
# act
stack.push(_Hypothesis(0.9)) # greatly superior hypothesis
stack.push(poor_hypothesis)
# assert
self.assertFalse(poor_hypothesis in stack)
def test_best_returns_the_best_hypothesis(self):
# arrange
stack = _Stack(3)
best_hypothesis = _Hypothesis(0.99)
# act
stack.push(_Hypothesis(0.0))
stack.push(best_hypothesis)
stack.push(_Hypothesis(0.5))
# assert
self.assertEqual(stack.best(), best_hypothesis)
def test_best_returns_none_when_stack_is_empty(self):
# arrange
stack = _Stack(3)
# assert
self.assertEqual(stack.best(), None)