Iliyan Angelov
2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions

View File

@@ -0,0 +1,116 @@
# Natural Language Toolkit: Language Model Unit Tests
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import unittest
import pytest
from nltk import FreqDist
from nltk.lm import NgramCounter
from nltk.util import everygrams
class TestNgramCounter:
"""Tests for NgramCounter that only involve lookup, no modification."""
@classmethod
def setup_class(self):
text = [list("abcd"), list("egdbe")]
self.trigram_counter = NgramCounter(
everygrams(sent, max_len=3) for sent in text
)
self.bigram_counter = NgramCounter(everygrams(sent, max_len=2) for sent in text)
self.case = unittest.TestCase()
def test_N(self):
assert self.bigram_counter.N() == 16
assert self.trigram_counter.N() == 21
def test_counter_len_changes_with_lookup(self):
assert len(self.bigram_counter) == 2
self.bigram_counter[50]
assert len(self.bigram_counter) == 3
def test_ngram_order_access_unigrams(self):
assert self.bigram_counter[1] == self.bigram_counter.unigrams
def test_ngram_conditional_freqdist(self):
case = unittest.TestCase()
expected_trigram_contexts = [
("a", "b"),
("b", "c"),
("e", "g"),
("g", "d"),
("d", "b"),
]
expected_bigram_contexts = [("a",), ("b",), ("d",), ("e",), ("c",), ("g",)]
bigrams = self.trigram_counter[2]
trigrams = self.trigram_counter[3]
self.case.assertCountEqual(expected_bigram_contexts, bigrams.conditions())
self.case.assertCountEqual(expected_trigram_contexts, trigrams.conditions())
def test_bigram_counts_seen_ngrams(self):
assert self.bigram_counter[["a"]]["b"] == 1
assert self.bigram_counter[["b"]]["c"] == 1
def test_bigram_counts_unseen_ngrams(self):
assert self.bigram_counter[["b"]]["z"] == 0
def test_unigram_counts_seen_words(self):
assert self.bigram_counter["b"] == 2
def test_unigram_counts_completely_unseen_words(self):
assert self.bigram_counter["z"] == 0
class TestNgramCounterTraining:
@classmethod
def setup_class(self):
self.counter = NgramCounter()
self.case = unittest.TestCase()
@pytest.mark.parametrize("case", ["", [], None])
def test_empty_inputs(self, case):
test = NgramCounter(case)
assert 2 not in test
assert test[1] == FreqDist()
def test_train_on_unigrams(self):
words = list("abcd")
counter = NgramCounter([[(w,) for w in words]])
assert not counter[3]
assert not counter[2]
self.case.assertCountEqual(words, counter[1].keys())
def test_train_on_illegal_sentences(self):
str_sent = ["Check", "this", "out", "!"]
list_sent = [["Check", "this"], ["this", "out"], ["out", "!"]]
with pytest.raises(TypeError):
NgramCounter([str_sent])
with pytest.raises(TypeError):
NgramCounter([list_sent])
def test_train_on_bigrams(self):
bigram_sent = [("a", "b"), ("c", "d")]
counter = NgramCounter([bigram_sent])
assert not bool(counter[3])
def test_train_on_mix(self):
mixed_sent = [("a", "b"), ("c", "d"), ("e", "f", "g"), ("h",)]
counter = NgramCounter([mixed_sent])
unigrams = ["h"]
bigram_contexts = [("a",), ("c",)]
trigram_contexts = [("e", "f")]
self.case.assertCountEqual(unigrams, counter[1].keys())
self.case.assertCountEqual(bigram_contexts, counter[2].keys())
self.case.assertCountEqual(trigram_contexts, counter[3].keys())
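A quick standalone cross-check of where the totals in test_N come from (an illustrative sketch reusing the everygrams import at the top of this file): a sentence of length k contributes k unigrams, k-1 bigrams and k-2 trigrams, so "abcd" and "egdbe" yield 7 + 9 = 16 everygram tokens at max_len=2 and 9 + 12 = 21 at max_len=3.

sents = [list("abcd"), list("egdbe")]
assert sum(len(list(everygrams(s, max_len=2))) for s in sents) == 16
assert sum(len(list(everygrams(s, max_len=3))) for s in sents) == 21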

View File

@@ -0,0 +1,611 @@
# Natural Language Toolkit: Language Model Unit Tests
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import math
from math import fsum as sum
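# (fsum is aliased to sum so the probability sums in the distribution checks below are numerically stable)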
from operator import itemgetter
import pytest
from nltk.lm import (
MLE,
AbsoluteDiscountingInterpolated,
KneserNeyInterpolated,
Laplace,
Lidstone,
StupidBackoff,
Vocabulary,
WittenBellInterpolated,
)
from nltk.lm.preprocessing import padded_everygrams
@pytest.fixture(scope="session")
def vocabulary():
return Vocabulary(["a", "b", "c", "d", "z", "<s>", "</s>"], unk_cutoff=1)
@pytest.fixture(scope="session")
def training_data():
return [["a", "b", "c", "d"], ["e", "g", "a", "d", "b", "e"]]
@pytest.fixture(scope="session")
def bigram_training_data(training_data):
return [list(padded_everygrams(2, sent)) for sent in training_data]
@pytest.fixture(scope="session")
def trigram_training_data(training_data):
return [list(padded_everygrams(3, sent)) for sent in training_data]
@pytest.fixture
def mle_bigram_model(vocabulary, bigram_training_data):
model = MLE(2, vocabulary=vocabulary)
model.fit(bigram_training_data)
return model
@pytest.mark.parametrize(
"word, context, expected_score",
[
("d", ["c"], 1),
# Unseen ngrams should yield 0
("d", ["e"], 0),
# Unigrams should also be 0
("z", None, 0),
# N unigrams = 14
# count('a') = 2
("a", None, 2.0 / 14),
# count('y') = 3
("y", None, 3.0 / 14),
],
)
def test_mle_bigram_scores(mle_bigram_model, word, context, expected_score):
assert pytest.approx(mle_bigram_model.score(word, context), 1e-4) == expected_score
def test_mle_bigram_logscore_for_zero_score(mle_bigram_model):
assert math.isinf(mle_bigram_model.logscore("d", ["e"]))
def test_mle_bigram_entropy_perplexity_seen(mle_bigram_model):
# ngrams seen during training
trained = [
("<s>", "a"),
("a", "b"),
("b", "<UNK>"),
("<UNK>", "a"),
("a", "d"),
("d", "</s>"),
]
# Ngram = Log score
# <s>, a = -1
# a, b = -1
# b, UNK = -1
# UNK, a = -1.585
# a, d = -1
# d, </s> = -1
# TOTAL logscores = -6.585
# - AVG logscores = 1.0975
H = 1.0975
perplexity = 2.1398
assert pytest.approx(mle_bigram_model.entropy(trained), 1e-4) == H
assert pytest.approx(mle_bigram_model.perplexity(trained), 1e-4) == perplexity
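# A standalone cross-check of the relationship used above (illustrative sketch):
# entropy is the negated average log2 score over the ngrams and perplexity is
# 2 ** entropy, so the six seen-bigram logscores listed in the comment give
# H = 6.585 / 6 = 1.0975 and perplexity = 2 ** 1.0975 ~= 2.1398.
_seen_logscores = [-1, -1, -1, -1.585, -1, -1]
assert math.isclose(-sum(_seen_logscores) / len(_seen_logscores), 1.0975)
assert math.isclose(2**1.0975, 2.1398, rel_tol=1e-4)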
def test_mle_bigram_entropy_perplexity_unseen(mle_bigram_model):
# In MLE, even one unseen ngram should make entropy and perplexity infinite
untrained = [("<s>", "a"), ("a", "c"), ("c", "d"), ("d", "</s>")]
assert math.isinf(mle_bigram_model.entropy(untrained))
assert math.isinf(mle_bigram_model.perplexity(untrained))
def test_mle_bigram_entropy_perplexity_unigrams(mle_bigram_model):
# word = score, log score
# <s> = 0.1429, -2.8074
# a = 0.1429, -2.8074
# c = 0.0714, -3.8073
# UNK = 0.2143, -2.2224
# d = 0.1429, -2.8074
# c = 0.0714, -3.8073
# </s> = 0.1429, -2.8074
# TOTAL logscores = -21.6243
# - AVG logscores = 3.0095
H = 3.0095
perplexity = 8.0529
text = [("<s>",), ("a",), ("c",), ("-",), ("d",), ("c",), ("</s>",)]
assert pytest.approx(mle_bigram_model.entropy(text), 1e-4) == H
assert pytest.approx(mle_bigram_model.perplexity(text), 1e-4) == perplexity
@pytest.fixture
def mle_trigram_model(trigram_training_data, vocabulary):
model = MLE(order=3, vocabulary=vocabulary)
model.fit(trigram_training_data)
return model
@pytest.mark.parametrize(
"word, context, expected_score",
[
# count(d | b, c) = 1
# count(b, c) = 1
("d", ("b", "c"), 1),
# count(d | c) = 1
# count(c) = 1
("d", ["c"], 1),
# total number of tokens is 18, of which "a" occurred 2 times
("a", None, 2.0 / 18),
# in vocabulary but unseen
("z", None, 0),
# out of vocabulary should use "UNK" score
("y", None, 3.0 / 18),
],
)
def test_mle_trigram_scores(mle_trigram_model, word, context, expected_score):
assert pytest.approx(mle_trigram_model.score(word, context), 1e-4) == expected_score
@pytest.fixture
def lidstone_bigram_model(bigram_training_data, vocabulary):
model = Lidstone(0.1, order=2, vocabulary=vocabulary)
model.fit(bigram_training_data)
return model
@pytest.mark.parametrize(
"word, context, expected_score",
[
# count(d | c) = 1
# *count(d | c) = 1.1
# Count(w | c for w in vocab) = 1
# *Count(w | c for w in vocab) = 1.8
("d", ["c"], 1.1 / 1.8),
# Total unigrams: 14
# Vocab size: 8
# Denominator: 14 + 0.8 = 14.8
# count("a") = 2
# *count("a") = 2.1
("a", None, 2.1 / 14.8),
# in vocabulary but unseen
# count("z") = 0
# *count("z") = 0.1
("z", None, 0.1 / 14.8),
# out of vocabulary should use "UNK" score
# count("<UNK>") = 3
# *count("<UNK>") = 3.1
("y", None, 3.1 / 14.8),
],
)
def test_lidstone_bigram_score(lidstone_bigram_model, word, context, expected_score):
assert (
pytest.approx(lidstone_bigram_model.score(word, context), 1e-4)
== expected_score
)
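# The Lidstone expectations above all follow one closed form (illustrative
# sketch, not the model's internals): score = (count + gamma) / (total + gamma * V)
# with gamma = 0.1 and V = 8 (the 7 vocabulary words plus <UNK>).
def _lidstone_score(count, total, gamma=0.1, vocab_size=8):
    return (count + gamma) / (total + gamma * vocab_size)

assert math.isclose(_lidstone_score(1, 1), 1.1 / 1.8)  # "d" after "c"
assert math.isclose(_lidstone_score(2, 14), 2.1 / 14.8)  # unigram "a"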
def test_lidstone_entropy_perplexity(lidstone_bigram_model):
text = [
("<s>", "a"),
("a", "c"),
("c", "<UNK>"),
("<UNK>", "d"),
("d", "c"),
("c", "</s>"),
]
# Unlike MLE this should be able to handle completely novel ngrams
# Ngram = score, log score
# <s>, a = 0.3929, -1.3479
# a, c = 0.0357, -4.8074
# c, UNK = 0.0(5), -4.1699
# UNK, d = 0.0263, -5.2479
# d, c = 0.0357, -4.8074
# c, </s> = 0.0(5), -4.1699
# TOTAL logscore: -24.5504
# - AVG logscore: 4.0917
H = 4.0917
perplexity = 17.0504
assert pytest.approx(lidstone_bigram_model.entropy(text), 1e-4) == H
assert pytest.approx(lidstone_bigram_model.perplexity(text), 1e-4) == perplexity
@pytest.fixture
def lidstone_trigram_model(trigram_training_data, vocabulary):
model = Lidstone(0.1, order=3, vocabulary=vocabulary)
model.fit(trigram_training_data)
return model
@pytest.mark.parametrize(
"word, context, expected_score",
[
# Logic behind this is the same as for bigram model
("d", ["c"], 1.1 / 1.8),
# if we choose a word that hasn't appeared after (b, c)
("e", ["c"], 0.1 / 1.8),
# Trigram score now
("d", ["b", "c"], 1.1 / 1.8),
("e", ["b", "c"], 0.1 / 1.8),
],
)
def test_lidstone_trigram_score(lidstone_trigram_model, word, context, expected_score):
assert (
pytest.approx(lidstone_trigram_model.score(word, context), 1e-4)
== expected_score
)
@pytest.fixture
def laplace_bigram_model(bigram_training_data, vocabulary):
model = Laplace(2, vocabulary=vocabulary)
model.fit(bigram_training_data)
return model
@pytest.mark.parametrize(
"word, context, expected_score",
[
# basic sanity-check:
# count(d | c) = 1
# *count(d | c) = 2
# Count(w | c for w in vocab) = 1
# *Count(w | c for w in vocab) = 9
("d", ["c"], 2.0 / 9),
# Total unigrams: 14
# Vocab size: 8
# Denominator: 14 + 8 = 22
# count("a") = 2
# *count("a") = 3
("a", None, 3.0 / 22),
# in vocabulary but unseen
# count("z") = 0
# *count("z") = 1
("z", None, 1.0 / 22),
# out of vocabulary should use "UNK" score
# count("<UNK>") = 3
# *count("<UNK>") = 4
("y", None, 4.0 / 22),
],
)
def test_laplace_bigram_score(laplace_bigram_model, word, context, expected_score):
assert (
pytest.approx(laplace_bigram_model.score(word, context), 1e-4) == expected_score
)
def test_laplace_bigram_entropy_perplexity(laplace_bigram_model):
text = [
("<s>", "a"),
("a", "c"),
("c", "<UNK>"),
("<UNK>", "d"),
("d", "c"),
("c", "</s>"),
]
# Unlike MLE this should be able to handle completely novel ngrams
# Ngram = score, log score
# <s>, a = 0.2, -2.3219
# a, c = 0.1, -3.3219
# c, UNK = 0.(1), -3.1699
# UNK, d = 0.(09), -3.4594
# d, c = 0.1, -3.3219
# c, </s> = 0.(1), -3.1699
# Total logscores: -18.7651
# - AVG logscores: 3.1275
H = 3.1275
perplexity = 8.7393
assert pytest.approx(laplace_bigram_model.entropy(text), 1e-4) == H
assert pytest.approx(laplace_bigram_model.perplexity(text), 1e-4) == perplexity
def test_laplace_gamma(laplace_bigram_model):
assert laplace_bigram_model.gamma == 1
@pytest.fixture
def wittenbell_trigram_model(trigram_training_data, vocabulary):
model = WittenBellInterpolated(3, vocabulary=vocabulary)
model.fit(trigram_training_data)
return model
@pytest.mark.parametrize(
"word, context, expected_score",
[
# For unigram scores by default revert to regular MLE
# Total unigrams: 18
# Vocab Size = 7
# count('c'): 1
("c", None, 1.0 / 18),
# in vocabulary but unseen
# count("z") = 0
("z", None, 0 / 18),
# out of vocabulary should use "UNK" score
# count("<UNK>") = 3
("y", None, 3.0 / 18),
# 2 words follow b and b occurred a total of 2 times
# gamma(['b']) = 2 / (2 + 2) = 0.5
# mle.score('c', ['b']) = 0.5
# mle('c') = 1 / 18 = 0.055
# (1 - gamma) * mle + gamma * mle('c') = 0.25 + 0.0278 ~= 0.2778
("c", ["b"], (1 - 0.5) * 0.5 + 0.5 * 1 / 18),
# building on that, let's try 'a b c' as the trigram
# 1 word follows 'a b' and 'a b' occurred 1 time
# gamma(['a', 'b']) = 1 / (1 + 1) = 0.5
# mle("c", ["a", "b"]) = 1
("c", ["a", "b"], (1 - 0.5) + 0.5 * ((1 - 0.5) * 0.5 + 0.5 * 1 / 18)),
# P(c|zb)
# The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332.
("c", ["z", "b"], ((1 - 0.5) * 0.5 + 0.5 * 1 / 18)),
],
)
def test_wittenbell_trigram_score(
wittenbell_trigram_model, word, context, expected_score
):
assert (
pytest.approx(wittenbell_trigram_model.score(word, context), 1e-4)
== expected_score
)
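# A compact restatement of the interpolation used in the expectations above
# (illustrative sketch): P_WB(w | ctx) = (1 - gamma) * MLE(w | ctx)
# + gamma * P_WB(w | shorter ctx), where gamma = T / (T + N), with T the number
# of distinct words seen after ctx and N the total count of ctx.
# For ctx = ['b']: T = 2 and N = 2, so gamma = 0.5:
gamma_b = 2 / (2 + 2)
p_c_given_b = (1 - gamma_b) * 0.5 + gamma_b * (1 / 18)
assert round(p_c_given_b, 4) == 0.2778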
###############################################################################
# Notation Explained #
###############################################################################
# For all subsequent calculations we use the following notation:
# 1. '*': Placeholder for any word/character. E.g. '*b' stands for
# all bigrams that end in 'b'. '*b*' stands for all trigrams that
# contain 'b' in the middle.
# 1. count(ngram): Count all instances (tokens) of an ngram.
# 1. unique(ngram): Count unique instances (types) of an ngram.
@pytest.fixture
def kneserney_trigram_model(trigram_training_data, vocabulary):
model = KneserNeyInterpolated(order=3, discount=0.75, vocabulary=vocabulary)
model.fit(trigram_training_data)
return model
@pytest.mark.parametrize(
"word, context, expected_score",
[
# P(c) = count('*c') / unique('**')
# = 1 / 14
("c", None, 1.0 / 14),
# P(z) = count('*z') / unique('**')
# = 0 / 14
# 'z' is in the vocabulary, but it was not seen during training.
("z", None, 0.0 / 14),
# P(y)
# Out of vocabulary should use "UNK" score.
# P(y) = P(UNK) = count('*UNK') / unique('**')
("y", None, 3 / 14),
# We start with P(c|b)
# P(c|b) = alpha('bc') + gamma('b') * P(c)
# alpha('bc') = max(unique('*bc') - discount, 0) / unique('*b*')
# = max(1 - 0.75, 0) / 2
# = 0.125
# gamma('b') = discount * unique('b*') / unique('*b*')
# = (0.75 * 2) / 2
# = 0.75
("c", ["b"], (0.125 + 0.75 * (1 / 14))),
# Building on that, let's try P(c|ab).
# P(c|ab) = alpha('abc') + gamma('ab') * P(c|b)
# alpha('abc') = max(count('abc') - discount, 0) / count('ab*')
# = max(1 - 0.75, 0) / 1
# = 0.25
# gamma('ab') = (discount * unique('ab*')) / count('ab*')
# = 0.75 * 1 / 1
("c", ["a", "b"], 0.25 + 0.75 * (0.125 + 0.75 * (1 / 14))),
# P(c|zb)
# The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332.
("c", ["z", "b"], (0.125 + 0.75 * (1 / 14))),
],
)
def test_kneserney_trigram_score(
kneserney_trigram_model, word, context, expected_score
):
assert (
pytest.approx(kneserney_trigram_model.score(word, context), 1e-4)
== expected_score
)
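# A compact restatement of the Kneser-Ney pieces used above (illustrative
# sketch, discount d = 0.75): the highest order mixes a discounted count with
# a backoff weight, and the unigram base case uses continuation counts
# (unique('*c') / unique('**') = 1 / 14) rather than raw frequencies.
kn_alpha_bc = max(1 - 0.75, 0) / 2  # unique('*bc') = 1, unique('*b*') = 2
kn_gamma_b = 0.75 * 2 / 2  # d * unique('b*') / unique('*b*')
assert round(kn_alpha_bc + kn_gamma_b * (1 / 14), 4) == 0.1786  # P(c | b) above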
@pytest.fixture
def absolute_discounting_trigram_model(trigram_training_data, vocabulary):
model = AbsoluteDiscountingInterpolated(order=3, vocabulary=vocabulary)
model.fit(trigram_training_data)
return model
@pytest.mark.parametrize(
"word, context, expected_score",
[
# For unigram scores revert to regular MLE
# P(c) = count('c') / count('**')
("c", None, 1.0 / 18),
# in vocabulary but unseen
# count('z') = 0
("z", None, 0.0 / 18),
# out of vocabulary should use "UNK" score
# count('<UNK>') = 3
("y", None, 3 / 18),
# P(c|b) = alpha('bc') + gamma('b') * P(c)
# alpha('bc') = max(count('bc') - discount, 0) / count('b*')
# = max(1 - 0.75, 0) / 2
# = 0.125
# gamma('b') = discount * unique('b*') / count('b*')
# = (0.75 * 2) / 2
# = 0.75
("c", ["b"], (0.125 + 0.75 * (2 / 2) * (1 / 18))),
# Building on that, let's try P(c|ab).
# P(c|ab) = alpha('abc') + gamma('ab') * P(c|b)
# alpha('abc') = max(count('abc') - discount, 0) / count('ab*')
# = max(1 - 0.75, 0) / 1
# = 0.25
# gamma('ab') = (discount * unique('ab*')) / count('ab*')
# = 0.75 * 1 / 1
("c", ["a", "b"], 0.25 + 0.75 * (0.125 + 0.75 * (2 / 2) * (1 / 18))),
# P(c|zb)
# The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332.
("c", ["z", "b"], (0.125 + 0.75 * (2 / 2) * (1 / 18))),
],
)
def test_absolute_discounting_trigram_score(
absolute_discounting_trigram_model, word, context, expected_score
):
assert (
pytest.approx(absolute_discounting_trigram_model.score(word, context), 1e-4)
== expected_score
)
@pytest.fixture
def stupid_backoff_trigram_model(trigram_training_data, vocabulary):
model = StupidBackoff(order=3, vocabulary=vocabulary)
model.fit(trigram_training_data)
return model
@pytest.mark.parametrize(
"word, context, expected_score",
[
# For unigram scores revert to regular MLE
# Total unigrams: 18
("c", None, 1.0 / 18),
# in vocabulary but unseen
# bigrams ending with z = 0
("z", None, 0.0 / 18),
# out of vocabulary should use "UNK" score
# count('<UNK>'): 3
("y", None, 3 / 18),
# c follows 1 time out of 2 after b
("c", ["b"], 1 / 2),
# c always follows ab
("c", ["a", "b"], 1 / 1),
# The ngram 'z b c' was not seen, so we backoff to
# the score of the ngram 'b c' * smoothing factor
("c", ["z", "b"], (0.4 * (1 / 2))),
],
)
def test_stupid_backoff_trigram_score(
stupid_backoff_trigram_model, word, context, expected_score
):
assert (
pytest.approx(stupid_backoff_trigram_model.score(word, context), 1e-4)
== expected_score
)
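# Sketch of the recursion behind the last expectation above (illustrative):
# Stupid Backoff returns a relative frequency when the full ngram was seen and
# otherwise alpha * the score of the shortened context, with alpha = 0.4 here,
# so S('c' | 'z', 'b') = 0.4 * S('c' | 'b') = 0.4 * (1 / 2).
assert 0.4 * (1 / 2) == 0.2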
###############################################################################
# Probability Distributions Should Sum up to Unity #
###############################################################################
@pytest.fixture(scope="session")
def kneserney_bigram_model(bigram_training_data, vocabulary):
model = KneserNeyInterpolated(order=2, vocabulary=vocabulary)
model.fit(bigram_training_data)
return model
@pytest.mark.parametrize(
"model_fixture",
[
"mle_bigram_model",
"mle_trigram_model",
"lidstone_bigram_model",
"laplace_bigram_model",
"wittenbell_trigram_model",
"absolute_discounting_trigram_model",
"kneserney_bigram_model",
pytest.param(
"stupid_backoff_trigram_model",
marks=pytest.mark.xfail(
reason="Stupid Backoff is not a valid distribution"
),
),
],
)
@pytest.mark.parametrize(
"context",
[("a",), ("c",), ("<s>",), ("b",), ("<UNK>",), ("d",), ("e",), ("r",), ("w",)],
ids=itemgetter(0),
)
def test_sums_to_1(model_fixture, context, request):
model = request.getfixturevalue(model_fixture)
scores_for_context = sum(model.score(w, context) for w in model.vocab)
assert pytest.approx(scores_for_context, 1e-7) == 1.0
###############################################################################
# Generating Text #
###############################################################################
def test_generate_one_no_context(mle_trigram_model):
assert mle_trigram_model.generate(random_seed=3) == "<UNK>"
def test_generate_one_from_limiting_context(mle_trigram_model):
# We don't need random_seed for contexts with only one continuation
assert mle_trigram_model.generate(text_seed=["c"]) == "d"
assert mle_trigram_model.generate(text_seed=["b", "c"]) == "d"
assert mle_trigram_model.generate(text_seed=["a", "c"]) == "d"
def test_generate_one_from_varied_context(mle_trigram_model):
# When context doesn't limit our options enough, seed the random choice
assert mle_trigram_model.generate(text_seed=("a", "<s>"), random_seed=2) == "a"
def test_generate_cycle(mle_trigram_model):
# Add a cycle to the model: bd -> b, db -> d
more_training_text = [padded_everygrams(mle_trigram_model.order, list("bdbdbd"))]
mle_trigram_model.fit(more_training_text)
# Test that we can escape the cycle
assert mle_trigram_model.generate(7, text_seed=("b", "d"), random_seed=5) == [
"b",
"d",
"b",
"d",
"b",
"d",
"</s>",
]
def test_generate_with_text_seed(mle_trigram_model):
assert mle_trigram_model.generate(5, text_seed=("<s>", "e"), random_seed=3) == [
"<UNK>",
"a",
"d",
"b",
"<UNK>",
]
def test_generate_oov_text_seed(mle_trigram_model):
assert mle_trigram_model.generate(
text_seed=("aliens",), random_seed=3
) == mle_trigram_model.generate(text_seed=("<UNK>",), random_seed=3)
def test_generate_None_text_seed(mle_trigram_model):
# should crash with type error when we try to look it up in vocabulary
with pytest.raises(TypeError):
mle_trigram_model.generate(text_seed=(None,))
# This will work
assert mle_trigram_model.generate(
text_seed=None, random_seed=3
) == mle_trigram_model.generate(random_seed=3)

View File

@@ -0,0 +1,30 @@
# Natural Language Toolkit: Language Model Unit Tests
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import unittest
from nltk.lm.preprocessing import padded_everygram_pipeline
class TestPreprocessing(unittest.TestCase):
def test_padded_everygram_pipeline(self):
expected_train = [
[
("<s>",),
("<s>", "a"),
("a",),
("a", "b"),
("b",),
("b", "c"),
("c",),
("c", "</s>"),
("</s>",),
]
]
expected_vocab = ["<s>", "a", "b", "c", "</s>"]
train_data, vocab_data = padded_everygram_pipeline(2, [["a", "b", "c"]])
self.assertEqual([list(sent) for sent in train_data], expected_train)
self.assertEqual(list(vocab_data), expected_vocab)
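The pipeline under test is essentially padding each sentence on both sides and then taking its everygrams; a minimal sketch with the underlying helpers (illustrative only):

from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams

padded = list(pad_both_ends(["a", "b", "c"], n=2))  # ['<s>', 'a', 'b', 'c', '</s>']
assert list(everygrams(padded, max_len=2))[:2] == [("<s>",), ("<s>", "a")]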

View File

@@ -0,0 +1,156 @@
# Natural Language Toolkit: Language Model Unit Tests
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import unittest
from collections import Counter
from timeit import timeit
from nltk.lm import Vocabulary
class NgramModelVocabularyTests(unittest.TestCase):
"""tests Vocabulary Class"""
@classmethod
def setUpClass(cls):
cls.vocab = Vocabulary(
["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"],
unk_cutoff=2,
)
def test_truthiness(self):
self.assertTrue(self.vocab)
def test_cutoff_value_set_correctly(self):
self.assertEqual(self.vocab.cutoff, 2)
def test_unable_to_change_cutoff(self):
with self.assertRaises(AttributeError):
self.vocab.cutoff = 3
def test_cutoff_setter_checks_value(self):
with self.assertRaises(ValueError) as exc_info:
Vocabulary("abc", unk_cutoff=0)
expected_error_msg = "Cutoff value cannot be less than 1. Got: 0"
self.assertEqual(expected_error_msg, str(exc_info.exception))
def test_counts_set_correctly(self):
self.assertEqual(self.vocab.counts["a"], 2)
self.assertEqual(self.vocab.counts["b"], 2)
self.assertEqual(self.vocab.counts["c"], 1)
def test_membership_check_respects_cutoff(self):
# a was seen 2 times, so it should be considered part of the vocabulary
self.assertTrue("a" in self.vocab)
# "c" was seen once, it shouldn't be considered part of the vocab
self.assertFalse("c" in self.vocab)
# "z" was never seen at all, also shouldn't be considered in the vocab
self.assertFalse("z" in self.vocab)
def test_vocab_len_respects_cutoff(self):
# Vocab size is the number of unique tokens that occur at least as often
# as the cutoff value, plus 1 to account for unknown words.
self.assertEqual(5, len(self.vocab))
def test_vocab_iter_respects_cutoff(self):
vocab_counts = ["a", "b", "c", "d", "e", "f", "g", "w", "z"]
vocab_items = ["a", "b", "d", "e", "<UNK>"]
self.assertCountEqual(vocab_counts, list(self.vocab.counts.keys()))
self.assertCountEqual(vocab_items, list(self.vocab))
def test_update_empty_vocab(self):
empty = Vocabulary(unk_cutoff=2)
self.assertEqual(len(empty), 0)
self.assertFalse(empty)
self.assertIn(empty.unk_label, empty)
empty.update(list("abcde"))
self.assertIn(empty.unk_label, empty)
def test_lookup(self):
self.assertEqual(self.vocab.lookup("a"), "a")
self.assertEqual(self.vocab.lookup("c"), "<UNK>")
def test_lookup_iterables(self):
self.assertEqual(self.vocab.lookup(["a", "b"]), ("a", "b"))
self.assertEqual(self.vocab.lookup(("a", "b")), ("a", "b"))
self.assertEqual(self.vocab.lookup(("a", "c")), ("a", "<UNK>"))
self.assertEqual(
self.vocab.lookup(map(str, range(3))), ("<UNK>", "<UNK>", "<UNK>")
)
def test_lookup_empty_iterables(self):
self.assertEqual(self.vocab.lookup(()), ())
self.assertEqual(self.vocab.lookup([]), ())
self.assertEqual(self.vocab.lookup(iter([])), ())
self.assertEqual(self.vocab.lookup(n for n in range(0, 0)), ())
def test_lookup_recursive(self):
self.assertEqual(
self.vocab.lookup([["a", "b"], ["a", "c"]]), (("a", "b"), ("a", "<UNK>"))
)
self.assertEqual(self.vocab.lookup([["a", "b"], "c"]), (("a", "b"), "<UNK>"))
self.assertEqual(self.vocab.lookup([[[[["a", "b"]]]]]), ((((("a", "b"),),),),))
def test_lookup_None(self):
with self.assertRaises(TypeError):
self.vocab.lookup(None)
with self.assertRaises(TypeError):
list(self.vocab.lookup([None, None]))
def test_lookup_int(self):
with self.assertRaises(TypeError):
self.vocab.lookup(1)
with self.assertRaises(TypeError):
list(self.vocab.lookup([1, 2]))
def test_lookup_empty_str(self):
self.assertEqual(self.vocab.lookup(""), "<UNK>")
def test_equality(self):
v1 = Vocabulary(["a", "b", "c"], unk_cutoff=1)
v2 = Vocabulary(["a", "b", "c"], unk_cutoff=1)
v3 = Vocabulary(["a", "b", "c"], unk_cutoff=1, unk_label="blah")
v4 = Vocabulary(["a", "b"], unk_cutoff=1)
self.assertEqual(v1, v2)
self.assertNotEqual(v1, v3)
self.assertNotEqual(v1, v4)
def test_str(self):
self.assertEqual(
str(self.vocab), "<Vocabulary with cutoff=2 unk_label='<UNK>' and 5 items>"
)
def test_creation_with_counter(self):
self.assertEqual(
self.vocab,
Vocabulary(
Counter(
["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"]
),
unk_cutoff=2,
),
)
@unittest.skip(
reason="Test is known to be flaky as it compares (runtime) performance."
)
def test_len_is_constant(self):
# Given an obviously small and an obviously large vocabulary.
small_vocab = Vocabulary("abcde")
from nltk.corpus.europarl_raw import english
large_vocab = Vocabulary(english.words())
# If we time calling `len` on them.
small_vocab_len_time = timeit("len(small_vocab)", globals=locals())
large_vocab_len_time = timeit("len(large_vocab)", globals=locals())
# The timing should be the same order of magnitude.
self.assertAlmostEqual(small_vocab_len_time, large_vocab_len_time, places=1)
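A minimal sketch of the cutoff behaviour exercised above (illustrative, reusing the Vocabulary import at the top of this file): with unk_cutoff=2 only tokens seen at least twice are members, and everything else maps to the unknown label.

v = Vocabulary(["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"], unk_cutoff=2)
assert sorted(w for w in v if w != "<UNK>") == ["a", "b", "d", "e"]  # seen twice
assert len(v) == 4 + 1  # the members above plus <UNK>
assert v.lookup("c") == "<UNK>"  # seen only once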

View File

@@ -0,0 +1,49 @@
"""
Test Aline algorithm for aligning phonetic sequences
"""
from nltk.metrics import aline
def test_aline():
result = aline.align("θin", "tenwis")
expected = [[("θ", "t"), ("i", "e"), ("n", "n")]]
assert result == expected
result = aline.align("jo", "ʒə")
expected = [[("j", "ʒ"), ("o", "ə")]]
assert result == expected
result = aline.align("pematesiweni", "pematesewen")
expected = [
[
("p", "p"),
("e", "e"),
("m", "m"),
("a", "a"),
("t", "t"),
("e", "e"),
("s", "s"),
("i", "e"),
("w", "w"),
("e", "e"),
("n", "n"),
]
]
assert result == expected
result = aline.align("tuwθ", "dentis")
expected = [[("t", "t"), ("u", "i"), ("w", "-"), ("θ", "s")]]
assert result == expected
def test_aline_delta():
"""
Test aline for computing the difference between two segments
"""
assert aline.delta("p", "q") == 20.0
assert aline.delta("a", "A") == 0.0

View File

@@ -0,0 +1,42 @@
import pytest
from nltk.data import find
from nltk.parse.bllip import BllipParser
from nltk.tree import Tree
@pytest.fixture(scope="module")
def parser():
model_dir = find("models/bllip_wsj_no_aux").path
return BllipParser.from_unified_model_dir(model_dir)
def setup_module():
pytest.importorskip("bllipparser")
class TestBllipParser:
def test_parser_loads_a_valid_tree(self, parser):
parsed = parser.parse("I saw the man with the telescope")
tree = next(parsed)
assert isinstance(tree, Tree)
assert (
tree.pformat()
== """
(S1
(S
(NP (PRP I))
(VP
(VBD saw)
(NP (DT the) (NN man))
(PP (IN with) (NP (DT the) (NN telescope))))))
""".strip()
)
def test_tagged_parse_finds_matching_element(self, parser):
parsed = parser.parse("I saw the man with the telescope")
tagged_tree = next(parser.tagged_parse([("telescope", "NN")]))
assert isinstance(tagged_tree, Tree)
assert tagged_tree.pformat() == "(S1 (NP (NN telescope)))"

View File

@@ -0,0 +1,34 @@
"""
Tests for Brill tagger.
"""
import unittest
from nltk.corpus import treebank
from nltk.tag import UnigramTagger, brill, brill_trainer
from nltk.tbl import demo
class TestBrill(unittest.TestCase):
def test_pos_template(self):
train_sents = treebank.tagged_sents()[:1000]
tagger = UnigramTagger(train_sents)
trainer = brill_trainer.BrillTaggerTrainer(
tagger, [brill.Template(brill.Pos([-1]))]
)
brill_tagger = trainer.train(train_sents)
# Example from https://github.com/nltk/nltk/issues/769
result = brill_tagger.tag("This is a foo bar sentence".split())
expected = [
("This", "DT"),
("is", "VBZ"),
("a", "DT"),
("foo", None),
("bar", "NN"),
("sentence", None),
]
self.assertEqual(result, expected)
@unittest.skip("Should be tested in __main__ of nltk.tbl.demo")
def test_brill_demo(self):
demo()

View File

@@ -0,0 +1,39 @@
import unittest
import pytest
from nltk import ConditionalFreqDist, tokenize
class TestEmptyCondFreq(unittest.TestCase):
def test_tabulate(self):
empty = ConditionalFreqDist()
self.assertEqual(empty.conditions(), [])
with pytest.raises(ValueError):
empty.tabulate(conditions="BUG") # nonexistent keys shouldn't be added
self.assertEqual(empty.conditions(), [])
def test_plot(self):
empty = ConditionalFreqDist()
self.assertEqual(empty.conditions(), [])
empty.plot(conditions=["BUG"]) # nonexistent keys shouldn't be added
self.assertEqual(empty.conditions(), [])
def test_increment(self):
# make sure that we can still mutate cfd normally
text = "cow cat mouse cat tiger"
cfd = ConditionalFreqDist()
# create cfd with word length as condition
for word in tokenize.word_tokenize(text):
condition = len(word)
cfd[condition][word] += 1
self.assertEqual(cfd.conditions(), [3, 5])
# incrementing previously unseen key is still possible
cfd[2]["hi"] += 1
self.assertCountEqual(cfd.conditions(), [3, 5, 2]) # new condition added
self.assertEqual(
cfd[2]["hi"], 1
) # key's frequency incremented from 0 (unseen) to 1

View File

@@ -0,0 +1,49 @@
import unittest
import nltk
from nltk.grammar import CFG
class ChomskyNormalFormForCFGTest(unittest.TestCase):
def test_simple(self):
grammar = CFG.fromstring(
"""
S -> NP VP
PP -> P NP
NP -> Det N | NP PP P
VP -> V NP | VP PP
VP -> Det
Det -> 'a' | 'the'
N -> 'dog' | 'cat'
V -> 'chased' | 'sat'
P -> 'on' | 'in'
"""
)
self.assertFalse(grammar.is_flexible_chomsky_normal_form())
self.assertFalse(grammar.is_chomsky_normal_form())
grammar = grammar.chomsky_normal_form(flexible=True)
self.assertTrue(grammar.is_flexible_chomsky_normal_form())
self.assertFalse(grammar.is_chomsky_normal_form())
grammar2 = CFG.fromstring(
"""
S -> NP VP
NP -> VP N P
VP -> P
N -> 'dog' | 'cat'
P -> 'on' | 'in'
"""
)
self.assertFalse(grammar2.is_flexible_chomsky_normal_form())
self.assertFalse(grammar2.is_chomsky_normal_form())
grammar2 = grammar2.chomsky_normal_form()
self.assertTrue(grammar2.is_flexible_chomsky_normal_form())
self.assertTrue(grammar2.is_chomsky_normal_form())
def test_complex(self):
grammar = nltk.data.load("grammars/large_grammars/atis.cfg")
self.assertFalse(grammar.is_flexible_chomsky_normal_form())
self.assertFalse(grammar.is_chomsky_normal_form())
grammar = grammar.chomsky_normal_form(flexible=True)
self.assertTrue(grammar.is_flexible_chomsky_normal_form())
self.assertFalse(grammar.is_chomsky_normal_form())

View File

@@ -0,0 +1,85 @@
import unittest
from nltk import RegexpParser
class TestChunkRule(unittest.TestCase):
def test_tag_pattern2re_pattern_quantifier(self):
"""Test for bug https://github.com/nltk/nltk/issues/1597
Ensures that curly bracket quantifiers can be used inside a chunk rule.
This type of quantifier has been used for the supplementary example
in https://www.nltk.org/book/ch07.html#exploring-text-corpora.
"""
sent = [
("The", "AT"),
("September-October", "NP"),
("term", "NN"),
("jury", "NN"),
("had", "HVD"),
("been", "BEN"),
("charged", "VBN"),
("by", "IN"),
("Fulton", "NP-TL"),
("Superior", "JJ-TL"),
("Court", "NN-TL"),
("Judge", "NN-TL"),
("Durwood", "NP"),
("Pye", "NP"),
("to", "TO"),
("investigate", "VB"),
("reports", "NNS"),
("of", "IN"),
("possible", "JJ"),
("``", "``"),
("irregularities", "NNS"),
("''", "''"),
("in", "IN"),
("the", "AT"),
("hard-fought", "JJ"),
("primary", "NN"),
("which", "WDT"),
("was", "BEDZ"),
("won", "VBN"),
("by", "IN"),
("Mayor-nominate", "NN-TL"),
("Ivan", "NP"),
("Allen", "NP"),
("Jr.", "NP"),
(".", "."),
] # source: brown corpus
cp = RegexpParser("CHUNK: {<N.*>{4,}}")
tree = cp.parse(sent)
assert (
tree.pformat()
== """(S
The/AT
September-October/NP
term/NN
jury/NN
had/HVD
been/BEN
charged/VBN
by/IN
Fulton/NP-TL
Superior/JJ-TL
(CHUNK Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)
to/TO
investigate/VB
reports/NNS
of/IN
possible/JJ
``/``
irregularities/NNS
''/''
in/IN
the/AT
hard-fought/JJ
primary/NN
which/WDT
was/BEDZ
won/VBN
by/IN
(CHUNK Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)
./.)"""
)

View File

@@ -0,0 +1,50 @@
"""
Unit tests for nltk.classify. See also: nltk/test/classify.doctest
"""
import pytest
from nltk import classify
TRAIN = [
(dict(a=1, b=1, c=1), "y"),
(dict(a=1, b=1, c=1), "x"),
(dict(a=1, b=1, c=0), "y"),
(dict(a=0, b=1, c=1), "x"),
(dict(a=0, b=1, c=1), "y"),
(dict(a=0, b=0, c=1), "y"),
(dict(a=0, b=1, c=0), "x"),
(dict(a=0, b=0, c=0), "x"),
(dict(a=0, b=1, c=1), "y"),
]
TEST = [
(dict(a=1, b=0, c=1)), # unseen
(dict(a=1, b=0, c=0)), # unseen
(dict(a=0, b=1, c=1)), # seen 3 times, labels=y,y,x
(dict(a=0, b=1, c=0)), # seen 1 time, label=x
]
RESULTS = [(0.16, 0.84), (0.46, 0.54), (0.41, 0.59), (0.76, 0.24)]
def assert_classifier_correct(algorithm):
try:
classifier = classify.MaxentClassifier.train(
TRAIN, algorithm, trace=0, max_iter=1000
)
except (LookupError, AttributeError) as e:
pytest.skip(str(e))
for (px, py), featureset in zip(RESULTS, TEST):
pdist = classifier.prob_classify(featureset)
assert abs(pdist.prob("x") - px) < 1e-2, (pdist.prob("x"), px)
assert abs(pdist.prob("y") - py) < 1e-2, (pdist.prob("y"), py)
def test_megam():
assert_classifier_correct("MEGAM")
def test_tadm():
assert_classifier_correct("TADM")
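The MEGAM and TADM checks above need external binaries; a small sketch of the same prob_classify API using the pure-Python NaiveBayesClassifier (illustrative only, its probabilities differ from the RESULTS table):

from nltk.classify import NaiveBayesClassifier

nb = NaiveBayesClassifier.train(TRAIN)
dist = nb.prob_classify(dict(a=0, b=1, c=0))  # seen once in training, labelled "x"
assert abs(dist.prob("x") + dist.prob("y") - 1.0) < 1e-6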

View File

@@ -0,0 +1,120 @@
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
## Test bigram counters with discontinuous bigrams and repeated words
_EPSILON = 1e-8
SENT = "this this is is a a test test".split()
def close_enough(x, y):
"""Verify that two sequences of n-gram association values are within
_EPSILON of each other.
"""
return all(abs(x1[1] - y1[1]) <= _EPSILON for x1, y1 in zip(x, y))
def test_bigram2():
b = BigramCollocationFinder.from_words(SENT)
assert sorted(b.ngram_fd.items()) == [
(("a", "a"), 1),
(("a", "test"), 1),
(("is", "a"), 1),
(("is", "is"), 1),
(("test", "test"), 1),
(("this", "is"), 1),
(("this", "this"), 1),
]
assert sorted(b.word_fd.items()) == [("a", 2), ("is", 2), ("test", 2), ("this", 2)]
assert len(SENT) == sum(b.word_fd.values()) == sum(b.ngram_fd.values()) + 1
assert close_enough(
sorted(b.score_ngrams(BigramAssocMeasures.pmi)),
[
(("a", "a"), 1.0),
(("a", "test"), 1.0),
(("is", "a"), 1.0),
(("is", "is"), 1.0),
(("test", "test"), 1.0),
(("this", "is"), 1.0),
(("this", "this"), 1.0),
],
)
def test_bigram3():
b = BigramCollocationFinder.from_words(SENT, window_size=3)
assert sorted(b.ngram_fd.items()) == sorted(
[
(("a", "test"), 3),
(("is", "a"), 3),
(("this", "is"), 3),
(("a", "a"), 1),
(("is", "is"), 1),
(("test", "test"), 1),
(("this", "this"), 1),
]
)
assert sorted(b.word_fd.items()) == sorted(
[("a", 2), ("is", 2), ("test", 2), ("this", 2)]
)
assert (
len(SENT) == sum(b.word_fd.values()) == (sum(b.ngram_fd.values()) + 2 + 1) / 2.0
)
assert close_enough(
sorted(b.score_ngrams(BigramAssocMeasures.pmi)),
sorted(
[
(("a", "test"), 1.584962500721156),
(("is", "a"), 1.584962500721156),
(("this", "is"), 1.584962500721156),
(("a", "a"), 0.0),
(("is", "is"), 0.0),
(("test", "test"), 0.0),
(("this", "this"), 0.0),
]
),
)
def test_bigram5():
b = BigramCollocationFinder.from_words(SENT, window_size=5)
assert sorted(b.ngram_fd.items()) == sorted(
[
(("a", "test"), 4),
(("is", "a"), 4),
(("this", "is"), 4),
(("is", "test"), 3),
(("this", "a"), 3),
(("a", "a"), 1),
(("is", "is"), 1),
(("test", "test"), 1),
(("this", "this"), 1),
]
)
assert sorted(b.word_fd.items()) == sorted(
[("a", 2), ("is", 2), ("test", 2), ("this", 2)]
)
n_word_fd = sum(b.word_fd.values())
n_ngram_fd = (sum(b.ngram_fd.values()) + 4 + 3 + 2 + 1) / 4.0
assert len(SENT) == n_word_fd == n_ngram_fd
assert close_enough(
sorted(b.score_ngrams(BigramAssocMeasures.pmi)),
sorted(
[
(("a", "test"), 1.0),
(("is", "a"), 1.0),
(("this", "is"), 1.0),
(("is", "test"), 0.5849625007211562),
(("this", "a"), 0.5849625007211562),
(("a", "a"), -1.0),
(("is", "is"), -1.0),
(("test", "test"), -1.0),
(("this", "this"), -1.0),
]
),
)
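A worked instance of the PMI arithmetic behind test_bigram2 (illustrative): with window_size=2 every observed bigram occurs once, every word occurs twice, and the word total is 8, so pmi = log2(n_ii * N / (n_ix * n_xi)) = log2(1 * 8 / (2 * 2)) = 1.0, which is why all seven scores above equal 1.0.

import math

assert math.log2(1 * 8 / (2 * 2)) == 1.0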

View File

@@ -0,0 +1,98 @@
import contextlib
import sys
import unittest
from io import StringIO
from nltk.corpus import gutenberg
from nltk.text import Text
@contextlib.contextmanager
def stdout_redirect(where):
sys.stdout = where
try:
yield where
finally:
sys.stdout = sys.__stdout__
class TestConcordance(unittest.TestCase):
"""Text constructed using: https://www.nltk.org/book/ch01.html"""
@classmethod
def setUpClass(cls):
cls.corpus = gutenberg.words("melville-moby_dick.txt")
@classmethod
def tearDownClass(cls):
pass
def setUp(self):
self.text = Text(TestConcordance.corpus)
self.query = "monstrous"
self.maxDiff = None
self.list_out = [
"ong the former , one was of a most monstrous size . ... This came towards us , ",
'ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r',
"ll over with a heathenish array of monstrous clubs and spears . Some were thick",
"d as you gazed , and wondered what monstrous cannibal and savage could ever hav",
"that has survived the flood ; most monstrous and most mountainous ! That Himmal",
"they might scout at Moby Dick as a monstrous fable , or still worse and more de",
"th of Radney .'\" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l",
"ing Scenes . In connexion with the monstrous pictures of whales , I am strongly",
"ere to enter upon those still more monstrous stories of them which are to be fo",
"ght have been rummaged out of this monstrous cabinet there is no telling . But ",
"of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u",
]
def tearDown(self):
pass
def test_concordance_list(self):
concordance_out = self.text.concordance_list(self.query)
self.assertEqual(self.list_out, [c.line for c in concordance_out])
def test_concordance_width(self):
list_out = [
"monstrous",
"monstrous",
"monstrous",
"monstrous",
"monstrous",
"monstrous",
"Monstrous",
"monstrous",
"monstrous",
"monstrous",
"monstrous",
]
concordance_out = self.text.concordance_list(self.query, width=0)
self.assertEqual(list_out, [c.query for c in concordance_out])
def test_concordance_lines(self):
concordance_out = self.text.concordance_list(self.query, lines=3)
self.assertEqual(self.list_out[:3], [c.line for c in concordance_out])
def test_concordance_print(self):
print_out = """Displaying 11 of 11 matches:
ong the former , one was of a most monstrous size . ... This came towards us ,
ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
ll over with a heathenish array of monstrous clubs and spears . Some were thick
d as you gazed , and wondered what monstrous cannibal and savage could ever hav
that has survived the flood ; most monstrous and most mountainous ! That Himmal
they might scout at Moby Dick as a monstrous fable , or still worse and more de
th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
ere to enter upon those still more monstrous stories of them which are to be fo
ght have been rummaged out of this monstrous cabinet there is no telling . But
of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u
"""
with stdout_redirect(StringIO()) as stdout:
self.text.concordance(self.query)
def strip_space(raw_str):
return raw_str.replace(" ", "")
self.assertEqual(strip_space(print_out), strip_space(stdout.getvalue()))

File diff suppressed because it is too large

View File

@@ -0,0 +1,275 @@
import unittest
import pytest
from nltk.corpus import ( # mwa_ppdb
cess_cat,
cess_esp,
conll2007,
floresta,
indian,
ptb,
sinica_treebank,
udhr,
)
from nltk.tree import Tree
class TestUdhr(unittest.TestCase):
def test_words(self):
for name in udhr.fileids():
words = list(udhr.words(name))
self.assertTrue(words)
def test_raw_unicode(self):
for name in udhr.fileids():
txt = udhr.raw(name)
assert not isinstance(txt, bytes), name
def test_polish_encoding(self):
text_pl = udhr.raw("Polish-Latin2")[:164]
text_ppl = udhr.raw("Polish_Polski-Latin2")[:164]
expected = """POWSZECHNA DEKLARACJA PRAW CZŁOWIEKA
[Preamble]
Trzecia Sesja Ogólnego Zgromadzenia ONZ, obradująca w Paryżu, \
uchwaliła 10 grudnia 1948 roku jednomyślnie Powszechną"""
assert text_pl == expected, "Polish-Latin2"
assert text_ppl == expected, "Polish_Polski-Latin2"
class TestIndian(unittest.TestCase):
def test_words(self):
words = indian.words()[:3]
self.assertEqual(words, ["মহিষের", "সন্তান", ":"])
def test_tagged_words(self):
tagged_words = indian.tagged_words()[:3]
self.assertEqual(
tagged_words, [("মহিষের", "NN"), ("সন্তান", "NN"), (":", "SYM")]
)
class TestCess(unittest.TestCase):
def test_catalan(self):
words = cess_cat.words()[:15]
txt = "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna a quatre anys d' inhabilitació especial"
self.assertEqual(words, txt.split())
self.assertEqual(cess_cat.tagged_sents()[0][34][0], "càrrecs")
def test_esp(self):
words = cess_esp.words()[:15]
txt = "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- anunció hoy , jueves , la compra del"
self.assertEqual(words, txt.split())
self.assertEqual(cess_esp.words()[115], "años")
class TestFloresta(unittest.TestCase):
def test_words(self):
words = floresta.words()[:10]
txt = "Um revivalismo refrescante O 7_e_Meio é um ex-libris de a"
self.assertEqual(words, txt.split())
class TestSinicaTreebank(unittest.TestCase):
def test_sents(self):
first_3_sents = sinica_treebank.sents()[:3]
self.assertEqual(
first_3_sents,
[["一"], ["友情"], ["嘉珍", "和", "我", "住在", "同一條", "巷子"]],
)
def test_parsed_sents(self):
parsed_sents = sinica_treebank.parsed_sents()[25]
self.assertEqual(
parsed_sents,
Tree(
"S",
[
Tree("NP", [Tree("Nba", ["嘉珍"])]),
Tree("V‧地", [Tree("VA11", ["不停"]), Tree("DE", [""])]),
Tree("VA4", ["哭泣"]),
],
),
)
class TestCoNLL2007(unittest.TestCase):
# Reading the CoNLL 2007 Dependency Treebanks
def test_sents(self):
sents = conll2007.sents("esp.train")[0]
self.assertEqual(
sents[:6], ["El", "aumento", "del", "índice", "de", "desempleo"]
)
def test_parsed_sents(self):
parsed_sents = conll2007.parsed_sents("esp.train")[0]
self.assertEqual(
parsed_sents.tree(),
Tree(
"fortaleció",
[
Tree(
"aumento",
[
"El",
Tree(
"del",
[
Tree(
"índice",
[
Tree(
"de",
[Tree("desempleo", ["estadounidense"])],
)
],
)
],
),
],
),
"hoy",
"considerablemente",
Tree(
"al",
[
Tree(
"euro",
[
Tree(
"cotizaba",
[
",",
"que",
Tree("a", [Tree("15.35", ["las", "GMT"])]),
"se",
Tree(
"en",
[
Tree(
"mercado",
[
"el",
Tree("de", ["divisas"]),
Tree("de", ["Fráncfort"]),
],
)
],
),
Tree("a", ["0,9452_dólares"]),
Tree(
"frente_a",
[
",",
Tree(
"0,9349_dólares",
[
"los",
Tree(
"de",
[
Tree(
"mañana",
["esta"],
)
],
),
],
),
],
),
],
)
],
)
],
),
".",
],
),
)
@pytest.mark.skipif(
not ptb.fileids(),
reason="A full installation of the Penn Treebank is not available",
)
class TestPTB(unittest.TestCase):
def test_fileids(self):
self.assertEqual(
ptb.fileids()[:4],
[
"BROWN/CF/CF01.MRG",
"BROWN/CF/CF02.MRG",
"BROWN/CF/CF03.MRG",
"BROWN/CF/CF04.MRG",
],
)
def test_words(self):
self.assertEqual(
ptb.words("WSJ/00/WSJ_0003.MRG")[:7],
["A", "form", "of", "asbestos", "once", "used", "*"],
)
def test_tagged_words(self):
self.assertEqual(
ptb.tagged_words("WSJ/00/WSJ_0003.MRG")[:3],
[("A", "DT"), ("form", "NN"), ("of", "IN")],
)
def test_categories(self):
self.assertEqual(
ptb.categories(),
[
"adventure",
"belles_lettres",
"fiction",
"humor",
"lore",
"mystery",
"news",
"romance",
"science_fiction",
],
)
def test_news_fileids(self):
self.assertEqual(
ptb.fileids("news")[:3],
["WSJ/00/WSJ_0001.MRG", "WSJ/00/WSJ_0002.MRG", "WSJ/00/WSJ_0003.MRG"],
)
def test_category_words(self):
self.assertEqual(
ptb.words(categories=["humor", "fiction"])[:6],
["Thirty-three", "Scotty", "did", "not", "go", "back"],
)
@pytest.mark.skip("Skipping test for mwa_ppdb.")
class TestMWAPPDB(unittest.TestCase):
def test_fileids(self):
self.assertEqual(
mwa_ppdb.fileids(), ["ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"]
)
def test_entries(self):
self.assertEqual(
mwa_ppdb.entries()[:10],
[
("10/17/01", "17/10/2001"),
("102,70", "102.70"),
("13,53", "13.53"),
("3.2.5.3.2.1", "3.2.5.3.2.1."),
("53,76", "53.76"),
("6.9.5", "6.9.5."),
("7.7.6.3", "7.7.6.3."),
("76,20", "76.20"),
("79,85", "79.85"),
("93,65", "93.65"),
],
)

View File

@@ -0,0 +1,48 @@
"""
Corpus View Regression Tests
"""
import unittest
import nltk.data
from nltk.corpus.reader.util import (
StreamBackedCorpusView,
read_line_block,
read_whitespace_block,
)
class TestCorpusViews(unittest.TestCase):
linetok = nltk.LineTokenizer(blanklines="keep")
names = [
"corpora/inaugural/README", # A very short file (160 chars)
"corpora/inaugural/1793-Washington.txt", # A relatively short file (791 chars)
"corpora/inaugural/1909-Taft.txt", # A longer file (32k chars)
]
def data(self):
for name in self.names:
f = nltk.data.find(name)
with f.open() as fp:
file_data = fp.read().decode("utf8")
yield f, file_data
def test_correct_values(self):
# Check that corpus views produce the correct sequence of values.
for f, file_data in self.data():
v = StreamBackedCorpusView(f, read_whitespace_block)
self.assertEqual(list(v), file_data.split())
v = StreamBackedCorpusView(f, read_line_block)
self.assertEqual(list(v), self.linetok.tokenize(file_data))
def test_correct_length(self):
# Check that the corpus views report the correct lengths:
for f, file_data in self.data():
v = StreamBackedCorpusView(f, read_whitespace_block)
self.assertEqual(len(v), len(file_data.split()))
v = StreamBackedCorpusView(f, read_line_block)
self.assertEqual(len(v), len(self.linetok.tokenize(file_data)))

View File

@@ -0,0 +1,15 @@
import pytest
import nltk.data
def test_find_raises_exception():
with pytest.raises(LookupError):
nltk.data.find("no_such_resource/foo")
def test_find_raises_exception_with_full_resource_name():
no_such_thing = "no_such_thing/bar"
with pytest.raises(LookupError) as exc:
nltk.data.find(no_such_thing)
assert no_such_thing in str(exc)

View File

@@ -0,0 +1,160 @@
import unittest
from nltk.metrics.agreement import AnnotationTask
class TestDisagreement(unittest.TestCase):
"""
Class containing unit tests for nltk.metrics.agreement.Disagreement.
"""
def test_easy(self):
"""
Simple test, based on
https://github.com/foolswood/krippendorffs_alpha/raw/master/krippendorff.pdf.
"""
data = [
("coder1", "dress1", "YES"),
("coder2", "dress1", "NO"),
("coder3", "dress1", "NO"),
("coder1", "dress2", "YES"),
("coder2", "dress2", "NO"),
("coder3", "dress3", "NO"),
]
annotation_task = AnnotationTask(data)
self.assertAlmostEqual(annotation_task.alpha(), -0.3333333)
def test_easy2(self):
"""
Same simple test with 1 rating removed.
Removal of that rating should not matter: K-Alpha ignores items with
only 1 rating.
"""
data = [
("coder1", "dress1", "YES"),
("coder2", "dress1", "NO"),
("coder3", "dress1", "NO"),
("coder1", "dress2", "YES"),
("coder2", "dress2", "NO"),
]
annotation_task = AnnotationTask(data)
self.assertAlmostEqual(annotation_task.alpha(), -0.3333333)
def test_easy3(self):
"""
If expected disagreement is 0, K-Alpha should be 1.
"""
data = [
("coder1", "1", 1),
("coder2", "1", 1),
("coder1", "2", 2),
("coder2", "2", 2),
]
annotation_task = AnnotationTask(data)
self.assertAlmostEqual(annotation_task.alpha(), 1.0)
data = [("coder1", "1", 1), ("coder2", "1", 1), ("coder1", "2", 2)]
annotation_task = AnnotationTask(data)
self.assertAlmostEqual(annotation_task.alpha(), 1.0)
def test_advanced(self):
"""
More advanced test, based on
http://www.agreestat.com/research_papers/onkrippendorffalpha.pdf
"""
data = [
("A", "1", "1"),
("B", "1", "1"),
("D", "1", "1"),
("A", "2", "2"),
("B", "2", "2"),
("C", "2", "3"),
("D", "2", "2"),
("A", "3", "3"),
("B", "3", "3"),
("C", "3", "3"),
("D", "3", "3"),
("A", "4", "3"),
("B", "4", "3"),
("C", "4", "3"),
("D", "4", "3"),
("A", "5", "2"),
("B", "5", "2"),
("C", "5", "2"),
("D", "5", "2"),
("A", "6", "1"),
("B", "6", "2"),
("C", "6", "3"),
("D", "6", "4"),
("A", "7", "4"),
("B", "7", "4"),
("C", "7", "4"),
("D", "7", "4"),
("A", "8", "1"),
("B", "8", "1"),
("C", "8", "2"),
("D", "8", "1"),
("A", "9", "2"),
("B", "9", "2"),
("C", "9", "2"),
("D", "9", "2"),
("B", "10", "5"),
("C", "10", "5"),
("D", "10", "5"),
("C", "11", "1"),
("D", "11", "1"),
("C", "12", "3"),
]
annotation_task = AnnotationTask(data)
self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)
def test_advanced2(self):
"""
Same more advanced example, but with 1 rating removed.
Again, removal of that 1 rating should not matter.
"""
data = [
("A", "1", "1"),
("B", "1", "1"),
("D", "1", "1"),
("A", "2", "2"),
("B", "2", "2"),
("C", "2", "3"),
("D", "2", "2"),
("A", "3", "3"),
("B", "3", "3"),
("C", "3", "3"),
("D", "3", "3"),
("A", "4", "3"),
("B", "4", "3"),
("C", "4", "3"),
("D", "4", "3"),
("A", "5", "2"),
("B", "5", "2"),
("C", "5", "2"),
("D", "5", "2"),
("A", "6", "1"),
("B", "6", "2"),
("C", "6", "3"),
("D", "6", "4"),
("A", "7", "4"),
("B", "7", "4"),
("C", "7", "4"),
("D", "7", "4"),
("A", "8", "1"),
("B", "8", "1"),
("C", "8", "2"),
("D", "8", "1"),
("A", "9", "2"),
("B", "9", "2"),
("C", "9", "2"),
("D", "9", "2"),
("B", "10", "5"),
("C", "10", "5"),
("D", "10", "5"),
("C", "11", "1"),
("D", "11", "1"),
("C", "12", "3"),
]
annotation_task = AnnotationTask(data)
self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)
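A minimal usage sketch of the statistic under test (illustrative): Krippendorff's alpha is 1 - D_o / D_e (observed over expected disagreement), so perfect agreement yields 1.0 and the small "dress" example above comes out at -1/3.

from nltk.metrics.agreement import AnnotationTask

task = AnnotationTask([
    ("coder1", "dress1", "YES"),
    ("coder2", "dress1", "NO"),
    ("coder3", "dress1", "NO"),
    ("coder1", "dress2", "YES"),
    ("coder2", "dress2", "NO"),
])
assert round(task.alpha(), 4) == -0.3333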

View File

@@ -0,0 +1,129 @@
from typing import Tuple
import pytest
from nltk.metrics.distance import edit_distance
class TestEditDistance:
@pytest.mark.parametrize(
"left,right,substitution_cost,expecteds",
[
# Allowing transpositions reduces the number of edits required.
# with transpositions:
# e.g. "abc" -T-> "cba" -D-> "ca": 2 steps
#
# without transpositions:
# e.g. "abc" -D-> "ab" -D-> "a" -I-> "ca": 3 steps
("abc", "ca", 1, (2, 3)),
("abc", "ca", 5, (2, 3)), # Doesn't *require* substitutions
# Note, a substitution_cost of higher than 2 doesn't make much
# sense, as a deletion + insertion is identical, and always
# costs 2.
#
#
# Transpositions don't always reduce the number of edits required:
# with or without transpositions:
# e.g. "wants" -D-> "wats" -D-> "was" -I-> "wasp": 3 steps
("wants", "wasp", 1, (3, 3)),
("wants", "wasp", 5, (3, 3)), # Doesn't *require* substitutions
#
#
# Ought to have the same results with and without transpositions
# with or without transpositions:
# e.g. "rain" -S-> "sain" -S-> "shin" -I-> "shine": 3 steps
# (but cost 5 if substitution_cost=2)
("rain", "shine", 1, (3, 3)),
("rain", "shine", 2, (5, 5)), # Does *require* substitutions
#
#
# Several potentially interesting typos
# with transpositions:
# e.g. "acbdef" -T-> "abcdef": 1 step
#
# without transpositions:
# e.g. "acbdef" -D-> "abdef" -I-> "abcdef": 2 steps
("acbdef", "abcdef", 1, (1, 2)),
("acbdef", "abcdef", 2, (1, 2)), # Doesn't *require* substitutions
#
#
# with transpositions:
# e.g. "lnaguaeg" -T-> "languaeg" -T-> "language": 2 steps
#
# without transpositions:
# e.g. "lnaguaeg" -D-> "laguaeg" -I-> "languaeg" -D-> "languag" -I-> "language": 4 steps
("lnaguaeg", "language", 1, (2, 4)),
("lnaguaeg", "language", 2, (2, 4)), # Doesn't *require* substitutions
#
#
# with transpositions:
# e.g. "lnaugage" -T-> "lanugage" -T-> "language": 2 steps
#
# without transpositions:
# e.g. "lnaugage" -S-> "lnangage" -D-> "langage" -I-> "language": 3 steps
# (but one substitution, so a cost of 4 if substitution_cost = 2)
("lnaugage", "language", 1, (2, 3)),
("lnaugage", "language", 2, (2, 4)),
# Does *require* substitutions if no transpositions
#
#
# with transpositions:
# e.g. "lngauage" -T-> "lnaguage" -T-> "language": 2 steps
# without transpositions:
# e.g. "lngauage" -I-> "lanaguage" -D-> "language": 2 steps
("lngauage", "language", 1, (2, 2)),
("lngauage", "language", 2, (2, 2)), # Doesn't *require* substitutions
#
#
# with or without transpositions:
# e.g. "wants" -S-> "sants" -S-> "swnts" -S-> "swits" -S-> "swims" -D-> "swim": 5 steps
#
# with substitution_cost=2 and transpositions:
# e.g. "wants" -T-> "santw" -D-> "sntw" -D-> "stw" -D-> "sw"
# -I-> "swi" -I-> "swim": 6 steps
#
# with substitution_cost=2 and no transpositions:
# e.g. "wants" -I-> "swants" -D-> "swant" -D-> "swan" -D-> "swa" -D-> "sw"
# -I-> "swi" -I-> "swim": 7 steps
("wants", "swim", 1, (5, 5)),
("wants", "swim", 2, (6, 7)),
#
#
# with or without transpositions:
# e.g. "kitten" -S-> "sitten" -s-> "sittin" -I-> "sitting": 3 steps
# (but cost 5 if substitution_cost=2)
("kitten", "sitting", 1, (3, 3)),
("kitten", "sitting", 2, (5, 5)),
#
# duplicated letter
# e.g. "duplicated" -I-> "duuplicated": 1 step
("duplicated", "duuplicated", 1, (1, 1)),
("duplicated", "duuplicated", 2, (1, 1)),
("very duplicated", "very duuplicateed", 2, (2, 2)),
],
)
def test_with_transpositions(
self, left: str, right: str, substitution_cost: int, expecteds: Tuple[int, int]
):
"""
Test `edit_distance` between two strings, given some `substitution_cost`,
and whether transpositions are allowed.
:param str left: First input string to `edit_distance`.
:param str right: Second input string to `edit_distance`.
:param int substitution_cost: The cost of a substitution action in `edit_distance`.
:param Tuple[int, int] expecteds: A tuple of expected outputs, such that `expecteds[0]` is
the expected output with `transpositions=True`, and `expecteds[1]` is
the expected output with `transpositions=False`.
"""
# Test the input strings in both orderings
for s1, s2 in ((left, right), (right, left)):
# zip with [True, False] to get the transpositions value
for expected, transpositions in zip(expecteds, [True, False]):
predicted = edit_distance(
s1,
s2,
substitution_cost=substitution_cost,
transpositions=transpositions,
)
assert predicted == expected
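A direct call mirroring one row of the table above (illustrative): the single adjacent swap in "acbdef" is one edit when transpositions are allowed, and a deletion plus an insertion otherwise.

from nltk.metrics.distance import edit_distance

assert edit_distance("acbdef", "abcdef", transpositions=True) == 1
assert edit_distance("acbdef", "abcdef", transpositions=False) == 2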

View File

@@ -0,0 +1,83 @@
import os
import shutil
import unittest.mock
from nltk import download
from nltk.downloader import build_index
def test_downloader_using_existing_parent_download_dir(tmp_path):
"""Test that download works properly when the parent folder of the download_dir exists"""
download_dir = str(tmp_path.joinpath("another_dir"))
download_status = download("mwa_ppdb", download_dir)
assert download_status is True
def test_downloader_using_non_existing_parent_download_dir(tmp_path):
"""Test that download works properly when the parent folder of the download_dir does not exist"""
download_dir = str(
tmp_path.joinpath("non-existing-parent-folder", "another-non-existing-folder")
)
download_status = download("mwa_ppdb", download_dir)
assert download_status is True
def test_downloader_redownload(tmp_path):
"""Test that a second download correctly triggers the 'already up-to-date' message"""
first_download = 0
second_download = 1
download_dir = str(tmp_path.joinpath("test_repeat_download"))
for i in range(first_download, second_download + 1):
# capsys doesn't capture functools.partial stdout, which nltk.download.show uses, so just mock print
with unittest.mock.patch("builtins.print") as print_mock:
download_status = download("stopwords", download_dir)
assert download_status is True
if i == first_download:
expected_second_call = unittest.mock.call(
"[nltk_data] Unzipping %s."
% os.path.join("corpora", "stopwords.zip")
)
assert print_mock.call_args_list[1].args == expected_second_call.args
elif i == second_download:
expected_second_call = unittest.mock.call(
"[nltk_data] Package stopwords is already up-to-date!"
)
assert print_mock.call_args_list[1].args == expected_second_call.args
def test_build_index(tmp_path):
"""Test building index with both checksums."""
test_pkg_dir = str(tmp_path.joinpath("packages"))
test_pkg_name = "test_package"
test_pkg_path = os.path.join(test_pkg_dir, f"{test_pkg_name}")
os.makedirs(test_pkg_path, exist_ok=True)
test_xml_path = os.path.join(test_pkg_path, f"{test_pkg_name}.xml")
with open(test_xml_path, "w") as fi:
fi.write(
f'<package id="{test_pkg_name}" name="A Test Package" webpage="http://www.somefake.url/"'
' unzip="1"/>'
)
# Cannot mock a zip here as we are trying to validate file checksums, so just create a simple one with the XML
zip_path = os.path.join(test_pkg_path, f"{test_pkg_name}")
shutil.make_archive(
base_name=zip_path,
format="zip",
root_dir=test_pkg_dir,
base_dir=os.path.basename(test_pkg_path),
)
xml_index = build_index(
root=os.path.dirname(test_pkg_dir), base_url="https://someurl"
)
package_element = xml_index[0][0]
assert package_element.get("id") == "test_package"
md5_checksum = package_element.get("checksum")
assert isinstance(md5_checksum, str)
assert len(md5_checksum) > 5
sha256_checksum = package_element.get("sha256_checksum")
assert isinstance(sha256_checksum, str)
assert len(sha256_checksum) > 5

View File

@@ -0,0 +1,7 @@
import nltk
def test_iterating_returns_an_iterator_ordered_by_frequency():
samples = ["one", "two", "two"]
distribution = nltk.FreqDist(samples)
assert list(distribution) == ["two", "one"]

View File

@@ -0,0 +1,82 @@
import pytest
from nltk.tag import hmm
def _wikipedia_example_hmm():
# Example from wikipedia
# (https://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm)
states = ["rain", "no rain"]
symbols = ["umbrella", "no umbrella"]
A = [[0.7, 0.3], [0.3, 0.7]] # transition probabilities
B = [[0.9, 0.1], [0.2, 0.8]] # emission probabilities
pi = [0.5, 0.5] # initial probabilities
seq = ["umbrella", "umbrella", "no umbrella", "umbrella", "umbrella"]
seq = list(zip(seq, [None] * len(seq)))
model = hmm._create_hmm_tagger(states, symbols, A, B, pi)
return model, states, symbols, seq
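# The assertions below exponentiate with 2 ** ... because the HMM code appears
# to work with log-base-2 probabilities internally; converting back to plain
# probabilities lets the matrices be compared against the published tables
# (Huang et al. and the Wikipedia article).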
def test_forward_probability():
from numpy.testing import assert_array_almost_equal
# example from p. 385, Huang et al
model, states, symbols = hmm._market_hmm_example()
seq = [("up", None), ("up", None)]
expected = [[0.35, 0.02, 0.09], [0.1792, 0.0085, 0.0357]]
fp = 2 ** model._forward_probability(seq)
assert_array_almost_equal(fp, expected)
def test_forward_probability2():
from numpy.testing import assert_array_almost_equal
model, states, symbols, seq = _wikipedia_example_hmm()
fp = 2 ** model._forward_probability(seq)
# examples in wikipedia are normalized
fp = (fp.T / fp.sum(axis=1)).T
wikipedia_results = [
[0.8182, 0.1818],
[0.8834, 0.1166],
[0.1907, 0.8093],
[0.7308, 0.2692],
[0.8673, 0.1327],
]
assert_array_almost_equal(wikipedia_results, fp, 4)
def test_backward_probability():
from numpy.testing import assert_array_almost_equal
model, states, symbols, seq = _wikipedia_example_hmm()
bp = 2 ** model._backward_probability(seq)
# examples in wikipedia are normalized
bp = (bp.T / bp.sum(axis=1)).T
wikipedia_results = [
# Forward-backward algorithm doesn't need b0_5,
# so .backward_probability doesn't compute it.
# [0.6469, 0.3531],
[0.5923, 0.4077],
[0.3763, 0.6237],
[0.6533, 0.3467],
[0.6273, 0.3727],
[0.5, 0.5],
]
assert_array_almost_equal(wikipedia_results, bp, 4)
def setup_module(module):
pytest.importorskip("numpy")

View File

@@ -0,0 +1,210 @@
# Natural Language Toolkit: Twitter client
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Regression tests for `json2csv()` and `json2csv_entities()` in the Twitter
package.
"""
from pathlib import Path
import pytest
from nltk.corpus import twitter_samples
from nltk.twitter.common import json2csv, json2csv_entities
def files_are_identical(pathA, pathB):
"""
Compare two files, ignoring carriage returns,
leading whitespace, and trailing whitespace
"""
f1 = [l.strip() for l in pathA.read_bytes().splitlines()]
f2 = [l.strip() for l in pathB.read_bytes().splitlines()]
return f1 == f2
subdir = Path(__file__).parent / "files"
@pytest.fixture
def infile():
with open(twitter_samples.abspath("tweets.20150430-223406.json")) as infile:
return [next(infile) for x in range(100)]
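# Each test below converts the first 100 tweets of the bundled twitter_samples
# corpus to CSV and compares the output line by line with a pre-generated .ref
# file in the `files` directory next to this module (see `subdir` above).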
def test_textoutput(tmp_path, infile):
ref_fn = subdir / "tweets.20150430-223406.text.csv.ref"
outfn = tmp_path / "tweets.20150430-223406.text.csv"
json2csv(infile, outfn, ["text"], gzip_compress=False)
assert files_are_identical(outfn, ref_fn)
def test_tweet_metadata(tmp_path, infile):
ref_fn = subdir / "tweets.20150430-223406.tweet.csv.ref"
fields = [
"created_at",
"favorite_count",
"id",
"in_reply_to_status_id",
"in_reply_to_user_id",
"retweet_count",
"retweeted",
"text",
"truncated",
"user.id",
]
outfn = tmp_path / "tweets.20150430-223406.tweet.csv"
json2csv(infile, outfn, fields, gzip_compress=False)
assert files_are_identical(outfn, ref_fn)
def test_user_metadata(tmp_path, infile):
ref_fn = subdir / "tweets.20150430-223406.user.csv.ref"
fields = ["id", "text", "user.id", "user.followers_count", "user.friends_count"]
outfn = tmp_path / "tweets.20150430-223406.user.csv"
json2csv(infile, outfn, fields, gzip_compress=False)
assert files_are_identical(outfn, ref_fn)
def test_tweet_hashtag(tmp_path, infile):
ref_fn = subdir / "tweets.20150430-223406.hashtag.csv.ref"
outfn = tmp_path / "tweets.20150430-223406.hashtag.csv"
json2csv_entities(
infile,
outfn,
["id", "text"],
"hashtags",
["text"],
gzip_compress=False,
)
assert files_are_identical(outfn, ref_fn)
def test_tweet_usermention(tmp_path, infile):
ref_fn = subdir / "tweets.20150430-223406.usermention.csv.ref"
outfn = tmp_path / "tweets.20150430-223406.usermention.csv"
json2csv_entities(
infile,
outfn,
["id", "text"],
"user_mentions",
["id", "screen_name"],
gzip_compress=False,
)
assert files_are_identical(outfn, ref_fn)
def test_tweet_media(tmp_path, infile):
ref_fn = subdir / "tweets.20150430-223406.media.csv.ref"
outfn = tmp_path / "tweets.20150430-223406.media.csv"
json2csv_entities(
infile,
outfn,
["id"],
"media",
["media_url", "url"],
gzip_compress=False,
)
assert files_are_identical(outfn, ref_fn)
def test_tweet_url(tmp_path, infile):
ref_fn = subdir / "tweets.20150430-223406.url.csv.ref"
outfn = tmp_path / "tweets.20150430-223406.url.csv"
json2csv_entities(
infile,
outfn,
["id"],
"urls",
["url", "expanded_url"],
gzip_compress=False,
)
assert files_are_identical(outfn, ref_fn)
def test_userurl(tmp_path, infile):
ref_fn = subdir / "tweets.20150430-223406.userurl.csv.ref"
outfn = tmp_path / "tweets.20150430-223406.userurl.csv"
json2csv_entities(
infile,
outfn,
["id", "screen_name"],
"user.urls",
["url", "expanded_url"],
gzip_compress=False,
)
assert files_are_identical(outfn, ref_fn)
def test_tweet_place(tmp_path, infile):
ref_fn = subdir / "tweets.20150430-223406.place.csv.ref"
outfn = tmp_path / "tweets.20150430-223406.place.csv"
json2csv_entities(
infile,
outfn,
["id", "text"],
"place",
["name", "country"],
gzip_compress=False,
)
assert files_are_identical(outfn, ref_fn)
def test_tweet_place_boundingbox(tmp_path, infile):
ref_fn = subdir / "tweets.20150430-223406.placeboundingbox.csv.ref"
outfn = tmp_path / "tweets.20150430-223406.placeboundingbox.csv"
json2csv_entities(
infile,
outfn,
["id", "name"],
"place.bounding_box",
["coordinates"],
gzip_compress=False,
)
assert files_are_identical(outfn, ref_fn)
def test_retweet_original_tweet(tmp_path, infile):
ref_fn = subdir / "tweets.20150430-223406.retweet.csv.ref"
outfn = tmp_path / "tweets.20150430-223406.retweet.csv"
json2csv_entities(
infile,
outfn,
["id"],
"retweeted_status",
[
"created_at",
"favorite_count",
"id",
"in_reply_to_status_id",
"in_reply_to_user_id",
"retweet_count",
"text",
"truncated",
"user.id",
],
gzip_compress=False,
)
assert files_are_identical(outfn, ref_fn)
def test_file_is_wrong(tmp_path, infile):
"""
Sanity check that file comparison is not giving false positives.
"""
ref_fn = subdir / "tweets.20150430-223406.retweet.csv.ref"
outfn = tmp_path / "tweets.20150430-223406.text.csv"
json2csv(infile, outfn, ["text"], gzip_compress=False)
assert not files_are_identical(outfn, ref_fn)

View File

@@ -0,0 +1,95 @@
import unittest
from nltk.corpus import brown
from nltk.jsontags import JSONTaggedDecoder, JSONTaggedEncoder
from nltk.tag import (
AffixTagger,
BigramTagger,
BrillTagger,
BrillTaggerTrainer,
DefaultTagger,
NgramTagger,
PerceptronTagger,
RegexpTagger,
TrigramTagger,
UnigramTagger,
)
from nltk.tag.brill import nltkdemo18
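# These tests round-trip each tagger through JSON (self.encoder.encode(...)
# followed by self.decoder.decode(...)) and then compare the repr() and the
# key attributes of the original tagger against the reconstructed copy.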
class TestJSONSerialization(unittest.TestCase):
def setUp(self):
self.corpus = brown.tagged_sents()[:35]
self.decoder = JSONTaggedDecoder()
self.encoder = JSONTaggedEncoder()
self.default_tagger = DefaultTagger("NN")
def test_default_tagger(self):
encoded = self.encoder.encode(self.default_tagger)
decoded = self.decoder.decode(encoded)
self.assertEqual(repr(self.default_tagger), repr(decoded))
self.assertEqual(self.default_tagger._tag, decoded._tag)
def test_regexp_tagger(self):
tagger = RegexpTagger([(r".*", "NN")], backoff=self.default_tagger)
encoded = self.encoder.encode(tagger)
decoded = self.decoder.decode(encoded)
self.assertEqual(repr(tagger), repr(decoded))
self.assertEqual(repr(tagger.backoff), repr(decoded.backoff))
self.assertEqual(tagger._regexps, decoded._regexps)
def test_affix_tagger(self):
tagger = AffixTagger(self.corpus, backoff=self.default_tagger)
encoded = self.encoder.encode(tagger)
decoded = self.decoder.decode(encoded)
self.assertEqual(repr(tagger), repr(decoded))
self.assertEqual(repr(tagger.backoff), repr(decoded.backoff))
self.assertEqual(tagger._affix_length, decoded._affix_length)
self.assertEqual(tagger._min_word_length, decoded._min_word_length)
self.assertEqual(tagger._context_to_tag, decoded._context_to_tag)
def test_ngram_taggers(self):
unitagger = UnigramTagger(self.corpus, backoff=self.default_tagger)
bitagger = BigramTagger(self.corpus, backoff=unitagger)
tritagger = TrigramTagger(self.corpus, backoff=bitagger)
ntagger = NgramTagger(4, self.corpus, backoff=tritagger)
encoded = self.encoder.encode(ntagger)
decoded = self.decoder.decode(encoded)
self.assertEqual(repr(ntagger), repr(decoded))
self.assertEqual(repr(tritagger), repr(decoded.backoff))
self.assertEqual(repr(bitagger), repr(decoded.backoff.backoff))
self.assertEqual(repr(unitagger), repr(decoded.backoff.backoff.backoff))
self.assertEqual(
repr(self.default_tagger), repr(decoded.backoff.backoff.backoff.backoff)
)
def test_perceptron_tagger(self):
tagger = PerceptronTagger(load=False)
tagger.train(self.corpus)
encoded = self.encoder.encode(tagger)
decoded = self.decoder.decode(encoded)
self.assertEqual(tagger.model.weights, decoded.model.weights)
self.assertEqual(tagger.tagdict, decoded.tagdict)
self.assertEqual(tagger.classes, decoded.classes)
def test_brill_tagger(self):
trainer = BrillTaggerTrainer(
self.default_tagger, nltkdemo18(), deterministic=True
)
tagger = trainer.train(self.corpus, max_rules=30)
encoded = self.encoder.encode(tagger)
decoded = self.decoder.decode(encoded)
self.assertEqual(repr(tagger._initial_tagger), repr(decoded._initial_tagger))
self.assertEqual(tagger._rules, decoded._rules)
self.assertEqual(tagger._training_stats, decoded._training_stats)

View File

@@ -0,0 +1,66 @@
import unittest
from nltk.metrics import (
BigramAssocMeasures,
QuadgramAssocMeasures,
TrigramAssocMeasures,
)
## Test the likelihood ratio metric
_DELTA = 1e-8
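# As called below, BigramAssocMeasures.likelihood_ratio takes
# (n_ii, (n_ix, n_xi), n_xx): the joint bigram count, the two marginal counts,
# and the total number of bigrams; the trigram and quadgram variants take
# correspondingly longer tuples of joint and marginal counts.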
class TestLikelihoodRatio(unittest.TestCase):
def test_lr_bigram(self):
self.assertAlmostEqual(
BigramAssocMeasures.likelihood_ratio(2, (4, 4), 20),
2.4142743368419755,
delta=_DELTA,
)
self.assertAlmostEqual(
BigramAssocMeasures.likelihood_ratio(1, (1, 1), 1), 0.0, delta=_DELTA
)
self.assertRaises(
ValueError,
BigramAssocMeasures.likelihood_ratio,
*(0, (2, 2), 2),
)
def test_lr_trigram(self):
self.assertAlmostEqual(
TrigramAssocMeasures.likelihood_ratio(1, (1, 1, 1), (1, 1, 1), 2),
5.545177444479562,
delta=_DELTA,
)
self.assertAlmostEqual(
TrigramAssocMeasures.likelihood_ratio(1, (1, 1, 1), (1, 1, 1), 1),
0.0,
delta=_DELTA,
)
self.assertRaises(
ValueError,
TrigramAssocMeasures.likelihood_ratio,
*(1, (1, 1, 2), (1, 1, 2), 2),
)
def test_lr_quadgram(self):
self.assertAlmostEqual(
QuadgramAssocMeasures.likelihood_ratio(
1, (1, 1, 1, 1), (1, 1, 1, 1, 1, 1), (1, 1, 1, 1), 2
),
8.317766166719343,
delta=_DELTA,
)
self.assertAlmostEqual(
QuadgramAssocMeasures.likelihood_ratio(
1, (1, 1, 1, 1), (1, 1, 1, 1, 1, 1), (1, 1, 1, 1), 1
),
0.0,
delta=_DELTA,
)
self.assertRaises(
ValueError,
QuadgramAssocMeasures.likelihood_ratio,
*(1, (1, 1, 1, 1), (1, 1, 1, 1, 1, 2), (1, 1, 1, 1), 1),
)

View File

@@ -0,0 +1,21 @@
import unittest
from nltk.classify.naivebayes import NaiveBayesClassifier
class NaiveBayesClassifierTest(unittest.TestCase):
def test_simple(self):
training_features = [
({"nice": True, "good": True}, "positive"),
({"bad": True, "mean": True}, "negative"),
]
classifier = NaiveBayesClassifier.train(training_features)
result = classifier.prob_classify({"nice": True})
self.assertTrue(result.prob("positive") > result.prob("negative"))
self.assertEqual(result.max(), "positive")
result = classifier.prob_classify({"bad": True})
self.assertTrue(result.prob("positive") < result.prob("negative"))
self.assertEqual(result.max(), "negative")

View File

@@ -0,0 +1,27 @@
"""
Unit tests for nltk.corpus.nombank
"""
import unittest
from nltk.corpus import nombank
# Load the nombank once.
nombank.nouns()
class NombankDemo(unittest.TestCase):
def test_numbers(self):
# No. of instances.
self.assertEqual(len(nombank.instances()), 114574)
# No. of rolesets
self.assertEqual(len(nombank.rolesets()), 5577)
# No. of nouns.
self.assertEqual(len(nombank.nouns()), 4704)
def test_instance(self):
self.assertEqual(nombank.instances()[0].roleset, "perc-sign.01")
def test_framefiles_fileids(self):
self.assertEqual(len(nombank.fileids()), 4705)
self.assertTrue(all(fileid.endswith(".xml") for fileid in nombank.fileids()))

View File

@@ -0,0 +1,13 @@
import unittest
import nltk
from nltk.corpus.reader import pl196x
class TestCorpusViews(unittest.TestCase):
def test_corpus_reader(self):
pl196x_dir = nltk.data.find("corpora/pl196x")
pl = pl196x.Pl196xCorpusReader(
pl196x_dir, r".*\.xml", textids="textids.txt", cat_file="cats.txt"
)
pl.tagged_words(fileids=pl.fileids(), categories="cats.txt")

View File

@@ -0,0 +1,117 @@
"""
Tests for nltk.pos_tag
"""
import io
import unittest
import unittest.mock
from nltk import pos_tag, word_tokenize
from nltk.help import brown_tagset, claws5_tagset, upenn_tagset
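# The constants below mirror the exact stdout produced by the nltk.help tagset
# query functions for the regex queries used in TestPosTag; the example word
# lines are indented by four spaces to match that output.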
UPENN_TAGSET_DOLLAR_TEST = """$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
PRP$: pronoun, possessive
    her his mine my our ours their thy your
WP$: WH-pronoun, possessive
    whose
"""
BROWN_TAGSET_NNS_TEST = """NNS: noun, plural, common
    irregularities presentments thanks reports voters laws legislators
    years areas adjustments chambers $100 bonds courts sales details raises
    sessions members congressmen votes polls calls ...
"""
CLAW5_TAGSET_VHD_TEST = """VHD: past tense form of the verb "HAVE"
    had, 'd
"""
class TestPosTag(unittest.TestCase):
def test_pos_tag_eng(self):
text = "John's big idea isn't all that bad."
expected_tagged = [
("John", "NNP"),
("'s", "POS"),
("big", "JJ"),
("idea", "NN"),
("is", "VBZ"),
("n't", "RB"),
("all", "PDT"),
("that", "DT"),
("bad", "JJ"),
(".", "."),
]
assert pos_tag(word_tokenize(text)) == expected_tagged
def test_pos_tag_eng_universal(self):
text = "John's big idea isn't all that bad."
expected_tagged = [
("John", "NOUN"),
("'s", "PRT"),
("big", "ADJ"),
("idea", "NOUN"),
("is", "VERB"),
("n't", "ADV"),
("all", "DET"),
("that", "DET"),
("bad", "ADJ"),
(".", "."),
]
assert pos_tag(word_tokenize(text), tagset="universal") == expected_tagged
@unittest.mock.patch("sys.stdout", new_callable=io.StringIO)
def check_stdout(self, tagset, query_regex, expected_output, mock_stdout):
tagset(query_regex)
self.assertEqual(mock_stdout.getvalue(), expected_output)
def test_tagsets_upenn(self):
self.check_stdout(upenn_tagset, r".*\$", UPENN_TAGSET_DOLLAR_TEST)
def test_tagsets_brown(self):
self.check_stdout(brown_tagset, r"NNS", BROWN_TAGSET_NNS_TEST)
def test_tagsets_claw5(self):
self.check_stdout(claws5_tagset, r"VHD", CLAW5_TAGSET_VHD_TEST)
def test_pos_tag_rus(self):
text = "Илья оторопел и дважды перечитал бумажку."
expected_tagged = [
("Илья", "S"),
("оторопел", "V"),
("и", "CONJ"),
("дважды", "ADV"),
("перечитал", "V"),
("бумажку", "S"),
(".", "NONLEX"),
]
assert pos_tag(word_tokenize(text), lang="rus") == expected_tagged
def test_pos_tag_rus_universal(self):
text = "Илья оторопел и дважды перечитал бумажку."
expected_tagged = [
("Илья", "NOUN"),
("оторопел", "VERB"),
("и", "CONJ"),
("дважды", "ADV"),
("перечитал", "VERB"),
("бумажку", "NOUN"),
(".", "."),
]
assert (
pos_tag(word_tokenize(text), tagset="universal", lang="rus")
== expected_tagged
)
def test_pos_tag_unknown_lang(self):
text = "모르겠 습니 다"
self.assertRaises(NotImplementedError, pos_tag, word_tokenize(text), lang="kor")
# Test for default kwarg, `lang=None`
self.assertRaises(NotImplementedError, pos_tag, word_tokenize(text), lang=None)
def test_unspecified_lang(self):
# Tries to force the lang='eng' option.
text = "모르겠 습니 다"
        expected_but_wrong = [("모르겠", "JJ"), ("습니", "NNP"), ("다", "NN")]
assert pos_tag(word_tokenize(text)) == expected_but_wrong

View File

@@ -0,0 +1,246 @@
from nltk.translate.ribes_score import corpus_ribes, word_rank_alignment
def test_ribes_empty_worder(): # worder as in word order
# Verifies that these two sentences have no alignment,
# and hence have the lowest possible RIBES score.
hyp = "This is a nice sentence which I quite like".split()
ref = "Okay well that's neat and all but the reference's different".split()
assert word_rank_alignment(ref, hyp) == []
list_of_refs = [[ref]]
hypotheses = [hyp]
assert corpus_ribes(list_of_refs, hypotheses) == 0.0
def test_ribes_one_worder():
# Verifies that these two sentences have just one match,
# and the RIBES score for this sentence with very little
# correspondence is 0.
hyp = "This is a nice sentence which I quite like".split()
ref = "Okay well that's nice and all but the reference's different".split()
assert word_rank_alignment(ref, hyp) == [3]
list_of_refs = [[ref]]
hypotheses = [hyp]
assert corpus_ribes(list_of_refs, hypotheses) == 0.0
def test_ribes_two_worder():
# Verifies that these two sentences have two matches,
# but still get the lowest possible RIBES score due
# to the lack of similarity.
hyp = "This is a nice sentence which I quite like".split()
ref = "Okay well that's nice and all but the reference is different".split()
assert word_rank_alignment(ref, hyp) == [9, 3]
list_of_refs = [[ref]]
hypotheses = [hyp]
assert corpus_ribes(list_of_refs, hypotheses) == 0.0
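# Roughly speaking, RIBES multiplies a rank-correlation statistic computed over
# the word-order alignment (`worder`) by a unigram-precision penalty, so
# hypotheses with fewer than two aligned words, as in the three cases above,
# contribute a sentence score of 0.0.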
def test_ribes():
# Based on the doctest of the corpus_ribes function
hyp1 = [
"It",
"is",
"a",
"guide",
"to",
"action",
"which",
"ensures",
"that",
"the",
"military",
"always",
"obeys",
"the",
"commands",
"of",
"the",
"party",
]
ref1a = [
"It",
"is",
"a",
"guide",
"to",
"action",
"that",
"ensures",
"that",
"the",
"military",
"will",
"forever",
"heed",
"Party",
"commands",
]
ref1b = [
"It",
"is",
"the",
"guiding",
"principle",
"which",
"guarantees",
"the",
"military",
"forces",
"always",
"being",
"under",
"the",
"command",
"of",
"the",
"Party",
]
ref1c = [
"It",
"is",
"the",
"practical",
"guide",
"for",
"the",
"army",
"always",
"to",
"heed",
"the",
"directions",
"of",
"the",
"party",
]
hyp2 = [
"he",
"read",
"the",
"book",
"because",
"he",
"was",
"interested",
"in",
"world",
"history",
]
ref2a = [
"he",
"was",
"interested",
"in",
"world",
"history",
"because",
"he",
"read",
"the",
"book",
]
list_of_refs = [[ref1a, ref1b, ref1c], [ref2a]]
hypotheses = [hyp1, hyp2]
score = corpus_ribes(list_of_refs, hypotheses)
assert round(score, 4) == 0.3597
def test_no_zero_div():
# Regression test for Issue 2529, assure that no ZeroDivisionError is thrown.
hyp1 = [
"It",
"is",
"a",
"guide",
"to",
"action",
"which",
"ensures",
"that",
"the",
"military",
"always",
"obeys",
"the",
"commands",
"of",
"the",
"party",
]
ref1a = [
"It",
"is",
"a",
"guide",
"to",
"action",
"that",
"ensures",
"that",
"the",
"military",
"will",
"forever",
"heed",
"Party",
"commands",
]
ref1b = [
"It",
"is",
"the",
"guiding",
"principle",
"which",
"guarantees",
"the",
"military",
"forces",
"always",
"being",
"under",
"the",
"command",
"of",
"the",
"Party",
]
ref1c = [
"It",
"is",
"the",
"practical",
"guide",
"for",
"the",
"army",
"always",
"to",
"heed",
"the",
"directions",
"of",
"the",
"party",
]
hyp2 = ["he", "read", "the"]
ref2a = ["he", "was", "interested", "in", "world", "history", "because", "he"]
list_of_refs = [[ref1a, ref1b, ref1c], [ref2a]]
hypotheses = [hyp1, hyp2]
score = corpus_ribes(list_of_refs, hypotheses)
assert round(score, 4) == 0.1688

View File

@@ -0,0 +1,94 @@
import pytest
from nltk import config_megam
from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features
from nltk.corpus import rte as rte_corpus
expected_from_rte_feature_extration = """
alwayson        => True
ne_hyp_extra    => 0
ne_overlap      => 1
neg_hyp         => 0
neg_txt         => 0
word_hyp_extra  => 3
word_overlap    => 3

alwayson        => True
ne_hyp_extra    => 0
ne_overlap      => 1
neg_hyp         => 0
neg_txt         => 0
word_hyp_extra  => 2
word_overlap    => 1

alwayson        => True
ne_hyp_extra    => 1
ne_overlap      => 1
neg_hyp         => 0
neg_txt         => 0
word_hyp_extra  => 1
word_overlap    => 2

alwayson        => True
ne_hyp_extra    => 1
ne_overlap      => 0
neg_hyp         => 0
neg_txt         => 0
word_hyp_extra  => 6
word_overlap    => 2

alwayson        => True
ne_hyp_extra    => 1
ne_overlap      => 0
neg_hyp         => 0
neg_txt         => 0
word_hyp_extra  => 4
word_overlap    => 0

alwayson        => True
ne_hyp_extra    => 1
ne_overlap      => 0
neg_hyp         => 0
neg_txt         => 0
word_hyp_extra  => 3
word_overlap    => 1
"""
class TestRTEClassifier:
# Test the feature extraction method.
def test_rte_feature_extraction(self):
pairs = rte_corpus.pairs(["rte1_dev.xml"])[:6]
test_output = [
f"{key:<15} => {rte_features(pair)[key]}"
for pair in pairs
for key in sorted(rte_features(pair))
]
expected_output = expected_from_rte_feature_extration.strip().split("\n")
# Remove null strings.
expected_output = list(filter(None, expected_output))
assert test_output == expected_output
# Test the RTEFeatureExtractor object.
def test_feature_extractor_object(self):
rtepair = rte_corpus.pairs(["rte3_dev.xml"])[33]
extractor = RTEFeatureExtractor(rtepair)
assert extractor.hyp_words == {"member", "China", "SCO."}
assert extractor.overlap("word") == set()
assert extractor.overlap("ne") == {"China"}
assert extractor.hyp_extra("word") == {"member"}
# Test the RTE classifier training.
def test_rte_classification_without_megam(self):
# Use a sample size for unit testing, since we
# don't need to fully train these classifiers
clf = rte_classifier("IIS", sample_N=100)
clf = rte_classifier("GIS", sample_N=100)
def test_rte_classification_with_megam(self):
try:
config_megam()
except (LookupError, AttributeError) as e:
pytest.skip("Skipping tests with dependencies on MEGAM")
clf = rte_classifier("megam", sample_N=100)

View File

@@ -0,0 +1,86 @@
import os
from io import BytesIO
import pytest
from nltk.corpus.reader import SeekableUnicodeStreamReader
def check_reader(unicode_string, encoding):
bytestr = unicode_string.encode(encoding)
stream = BytesIO(bytestr)
reader = SeekableUnicodeStreamReader(stream, encoding)
# Should open at the start of the file
assert reader.tell() == 0
# Compare original string to contents from `.readlines()`
assert unicode_string == "".join(reader.readlines())
# Should be at the end of the file now
stream.seek(0, os.SEEK_END)
assert reader.tell() == stream.tell()
reader.seek(0) # go back to start
# Compare original string to contents from `.read()`
contents = ""
char = None
while char != "":
char = reader.read(1)
contents += char
assert unicode_string == contents
# Call `check_reader` with a variety of input strings and encodings.
ENCODINGS = ["ascii", "latin1", "greek", "hebrew", "utf-16", "utf-8"]
STRINGS = [
"""
This is a test file.
It is fairly short.
""",
"This file can be encoded with latin1. \x83",
"""\
This is a test file.
Here's a blank line:

And here's some unicode: \xee \u0123 \uffe3
""",
"""\
This is a test file.
Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555
""",
"""\
This is a larger file. It has some lines that are longer \
than 72 characters. It's got lots of repetition. Here's \
some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345
How fun! Let's repeat it twenty times.
"""
* 20,
]
@pytest.mark.parametrize("string", STRINGS)
def test_reader(string):
for encoding in ENCODINGS:
# skip strings that can't be encoded with the current encoding
try:
string.encode(encoding)
except UnicodeEncodeError:
continue
check_reader(string, encoding)
def test_reader_stream_closes_when_deleted():
reader = SeekableUnicodeStreamReader(BytesIO(b""), "ascii")
assert not reader.stream.closed
reader.__del__()
assert reader.stream.closed
def teardown_module(module=None):
import gc
gc.collect()

View File

@@ -0,0 +1,112 @@
"""
Unit tests for Senna
"""
import unittest
from os import environ, path, sep
from nltk.classify import Senna
from nltk.tag import SennaChunkTagger, SennaNERTagger, SennaTagger
# Use the Senna executable path from the SENNA environment variable if it is set;
# otherwise fall back to the default install location below.
if "SENNA" in environ:
SENNA_EXECUTABLE_PATH = path.normpath(environ["SENNA"]) + sep
else:
SENNA_EXECUTABLE_PATH = "/usr/share/senna-v3.0"
senna_is_installed = path.exists(SENNA_EXECUTABLE_PATH)
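# To run these tests against a local Senna install, point the SENNA environment
# variable at the directory containing the Senna executable (e.g. SENNA=/opt/senna,
# an illustrative path); otherwise the tests are skipped unless Senna is found
# at the default location above.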
@unittest.skipUnless(senna_is_installed, "Requires Senna executable")
class TestSennaPipeline(unittest.TestCase):
"""Unittest for nltk.classify.senna"""
def test_senna_pipeline(self):
"""Senna pipeline interface"""
pipeline = Senna(SENNA_EXECUTABLE_PATH, ["pos", "chk", "ner"])
sent = "Dusseldorf is an international business center".split()
result = [
(token["word"], token["chk"], token["ner"], token["pos"])
for token in pipeline.tag(sent)
]
expected = [
("Dusseldorf", "B-NP", "B-LOC", "NNP"),
("is", "B-VP", "O", "VBZ"),
("an", "B-NP", "O", "DT"),
("international", "I-NP", "O", "JJ"),
("business", "I-NP", "O", "NN"),
("center", "I-NP", "O", "NN"),
]
self.assertEqual(result, expected)
@unittest.skipUnless(senna_is_installed, "Requires Senna executable")
class TestSennaTagger(unittest.TestCase):
"""Unittest for nltk.tag.senna"""
def test_senna_tagger(self):
tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
result = tagger.tag("What is the airspeed of an unladen swallow ?".split())
expected = [
("What", "WP"),
("is", "VBZ"),
("the", "DT"),
("airspeed", "NN"),
("of", "IN"),
("an", "DT"),
("unladen", "NN"),
("swallow", "NN"),
("?", "."),
]
self.assertEqual(result, expected)
def test_senna_chunk_tagger(self):
chktagger = SennaChunkTagger(SENNA_EXECUTABLE_PATH)
result_1 = chktagger.tag("What is the airspeed of an unladen swallow ?".split())
expected_1 = [
("What", "B-NP"),
("is", "B-VP"),
("the", "B-NP"),
("airspeed", "I-NP"),
("of", "B-PP"),
("an", "B-NP"),
("unladen", "I-NP"),
("swallow", "I-NP"),
("?", "O"),
]
result_2 = list(chktagger.bio_to_chunks(result_1, chunk_type="NP"))
expected_2 = [
("What", "0"),
("the airspeed", "2-3"),
("an unladen swallow", "5-6-7"),
]
self.assertEqual(result_1, expected_1)
self.assertEqual(result_2, expected_2)
def test_senna_ner_tagger(self):
nertagger = SennaNERTagger(SENNA_EXECUTABLE_PATH)
result_1 = nertagger.tag("Shakespeare theatre was in London .".split())
expected_1 = [
("Shakespeare", "B-PER"),
("theatre", "O"),
("was", "O"),
("in", "O"),
("London", "B-LOC"),
(".", "O"),
]
result_2 = nertagger.tag("UN headquarters are in NY , USA .".split())
expected_2 = [
("UN", "B-ORG"),
("headquarters", "O"),
("are", "O"),
("in", "O"),
("NY", "B-LOC"),
(",", "O"),
("USA", "B-LOC"),
(".", "O"),
]
self.assertEqual(result_1, expected_1)
self.assertEqual(result_2, expected_2)

View File

@@ -0,0 +1,157 @@
import unittest
from contextlib import closing
from nltk import data
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
class SnowballTest(unittest.TestCase):
def test_arabic(self):
"""
        Unit test for the Snowball Arabic light stemmer, which handles
        prefixes and suffixes.
"""
        # Test with ignore_stopwords=True.
ar_stemmer = SnowballStemmer("arabic", True)
assert ar_stemmer.stem("الْعَرَبِــــــيَّة") == "عرب"
assert ar_stemmer.stem("العربية") == "عرب"
assert ar_stemmer.stem("فقالوا") == "قال"
assert ar_stemmer.stem("الطالبات") == "طالب"
assert ar_stemmer.stem("فالطالبات") == "طالب"
assert ar_stemmer.stem("والطالبات") == "طالب"
assert ar_stemmer.stem("الطالبون") == "طالب"
assert ar_stemmer.stem("اللذان") == "اللذان"
assert ar_stemmer.stem("من") == "من"
        # Test with ignore_stopwords=False.
ar_stemmer = SnowballStemmer("arabic", False)
assert ar_stemmer.stem("اللذان") == "اللذ" # this is a stop word
assert ar_stemmer.stem("الطالبات") == "طالب"
assert ar_stemmer.stem("الكلمات") == "كلم"
        # Test creating the Arabic stemmer without passing a value for ignore_stopwords.
ar_stemmer = SnowballStemmer("arabic")
assert ar_stemmer.stem("الْعَرَبِــــــيَّة") == "عرب"
assert ar_stemmer.stem("العربية") == "عرب"
assert ar_stemmer.stem("فقالوا") == "قال"
assert ar_stemmer.stem("الطالبات") == "طالب"
assert ar_stemmer.stem("الكلمات") == "كلم"
def test_russian(self):
stemmer_russian = SnowballStemmer("russian")
assert stemmer_russian.stem("авантненькая") == "авантненьк"
def test_german(self):
stemmer_german = SnowballStemmer("german")
stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)
assert stemmer_german.stem("Schr\xe4nke") == "schrank"
assert stemmer_german2.stem("Schr\xe4nke") == "schrank"
assert stemmer_german.stem("keinen") == "kein"
assert stemmer_german2.stem("keinen") == "keinen"
def test_spanish(self):
stemmer = SnowballStemmer("spanish")
assert stemmer.stem("Visionado") == "vision"
# The word 'algue' was raising an IndexError
assert stemmer.stem("algue") == "algu"
def test_short_strings_bug(self):
stemmer = SnowballStemmer("english")
assert stemmer.stem("y's") == "y"
class PorterTest(unittest.TestCase):
def _vocabulary(self):
with closing(
data.find("stemmers/porter_test/porter_vocabulary.txt").open(
encoding="utf-8"
)
) as fp:
return fp.read().splitlines()
def _test_against_expected_output(self, stemmer_mode, expected_stems):
stemmer = PorterStemmer(mode=stemmer_mode)
for word, true_stem in zip(self._vocabulary(), expected_stems):
our_stem = stemmer.stem(word)
assert (
our_stem == true_stem
), "{} should stem to {} in {} mode but got {}".format(
word,
true_stem,
stemmer_mode,
our_stem,
)
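    # The expected stem lists used below are data files located via
    # nltk.data.find() under stemmers/porter_test, one output file per stemmer
    # mode; each is compared word by word against the shared test vocabulary.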
def test_vocabulary_martin_mode(self):
"""Tests all words from the test vocabulary provided by M Porter
The sample vocabulary and output were sourced from
https://tartarus.org/martin/PorterStemmer/voc.txt and
https://tartarus.org/martin/PorterStemmer/output.txt
and are linked to from the Porter Stemmer algorithm's homepage
at https://tartarus.org/martin/PorterStemmer/
"""
with closing(
data.find("stemmers/porter_test/porter_martin_output.txt").open(
encoding="utf-8"
)
) as fp:
self._test_against_expected_output(
PorterStemmer.MARTIN_EXTENSIONS, fp.read().splitlines()
)
def test_vocabulary_nltk_mode(self):
with closing(
data.find("stemmers/porter_test/porter_nltk_output.txt").open(
encoding="utf-8"
)
) as fp:
self._test_against_expected_output(
PorterStemmer.NLTK_EXTENSIONS, fp.read().splitlines()
)
def test_vocabulary_original_mode(self):
# The list of stems for this test was generated by taking the
# Martin-blessed stemmer from
# https://tartarus.org/martin/PorterStemmer/c.txt
# and removing all the --DEPARTURE-- sections from it and
# running it against Martin's test vocabulary.
with closing(
data.find("stemmers/porter_test/porter_original_output.txt").open(
encoding="utf-8"
)
) as fp:
self._test_against_expected_output(
PorterStemmer.ORIGINAL_ALGORITHM, fp.read().splitlines()
)
self._test_against_expected_output(
PorterStemmer.ORIGINAL_ALGORITHM,
data.find("stemmers/porter_test/porter_original_output.txt")
.open(encoding="utf-8")
.read()
.splitlines(),
)
def test_oed_bug(self):
"""Test for bug https://github.com/nltk/nltk/issues/1581
Ensures that 'oed' can be stemmed without throwing an error.
"""
assert PorterStemmer().stem("oed") == "o"
def test_lowercase_option(self):
"""Test for improvement on https://github.com/nltk/nltk/issues/2507
Ensures that stems are lowercased when `to_lowercase=True`
"""
porter = PorterStemmer()
assert porter.stem("On") == "on"
assert porter.stem("I") == "i"
assert porter.stem("I", to_lowercase=False) == "I"
assert porter.stem("Github") == "github"
assert porter.stem("Github", to_lowercase=False) == "Github"

View File

@@ -0,0 +1,23 @@
def test_basic():
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
result = pos_tag(word_tokenize("John's big idea isn't all that bad."))
assert result == [
("John", "NNP"),
("'s", "POS"),
("big", "JJ"),
("idea", "NN"),
("is", "VBZ"),
("n't", "RB"),
("all", "PDT"),
("that", "DT"),
("bad", "JJ"),
(".", "."),
]
def setup_module(module):
import pytest
pytest.importorskip("numpy")

View File

@@ -0,0 +1,779 @@
#!/usr/bin/env python
#
# Natural Language Toolkit: TGrep search
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Will Roberts <wildwilhelm@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Unit tests for nltk.tgrep.
"""
import unittest
from nltk import tgrep
from nltk.tree import ParentedTree
class TestSequenceFunctions(unittest.TestCase):
"""
Class containing unit tests for nltk.tgrep.
"""
def test_tokenize_simple(self):
"""
Simple test of tokenization.
"""
tokens = tgrep.tgrep_tokenize("A .. (B !< C . D) | ![<< (E , F) $ G]")
self.assertEqual(
tokens,
[
"A",
"..",
"(",
"B",
"!",
"<",
"C",
".",
"D",
")",
"|",
"!",
"[",
"<<",
"(",
"E",
",",
"F",
")",
"$",
"G",
"]",
],
)
def test_tokenize_encoding(self):
"""
Test that tokenization handles bytes and strs the same way.
"""
self.assertEqual(
tgrep.tgrep_tokenize(b"A .. (B !< C . D) | ![<< (E , F) $ G]"),
tgrep.tgrep_tokenize("A .. (B !< C . D) | ![<< (E , F) $ G]"),
)
def test_tokenize_link_types(self):
"""
Test tokenization of basic link types.
"""
self.assertEqual(tgrep.tgrep_tokenize("A<B"), ["A", "<", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A>B"), ["A", ">", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A<3B"), ["A", "<3", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A>3B"), ["A", ">3", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A<,B"), ["A", "<,", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A>,B"), ["A", ">,", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A<-3B"), ["A", "<-3", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A>-3B"), ["A", ">-3", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A<-B"), ["A", "<-", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A>-B"), ["A", ">-", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A<'B"), ["A", "<'", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A>'B"), ["A", ">'", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A<:B"), ["A", "<:", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A>:B"), ["A", ">:", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A<<B"), ["A", "<<", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A>>B"), ["A", ">>", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A<<,B"), ["A", "<<,", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A>>,B"), ["A", ">>,", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A<<'B"), ["A", "<<'", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A>>'B"), ["A", ">>'", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A<<:B"), ["A", "<<:", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A>>:B"), ["A", ">>:", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A.B"), ["A", ".", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A,B"), ["A", ",", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A..B"), ["A", "..", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A,,B"), ["A", ",,", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A$B"), ["A", "$", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A$.B"), ["A", "$.", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A$,B"), ["A", "$,", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A$..B"), ["A", "$..", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A$,,B"), ["A", "$,,", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!<B"), ["A", "!", "<", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!>B"), ["A", "!", ">", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!<3B"), ["A", "!", "<3", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!>3B"), ["A", "!", ">3", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!<,B"), ["A", "!", "<,", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!>,B"), ["A", "!", ">,", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!<-3B"), ["A", "!", "<-3", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!>-3B"), ["A", "!", ">-3", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!<-B"), ["A", "!", "<-", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!>-B"), ["A", "!", ">-", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!<'B"), ["A", "!", "<'", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!>'B"), ["A", "!", ">'", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!<:B"), ["A", "!", "<:", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!>:B"), ["A", "!", ">:", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!<<B"), ["A", "!", "<<", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!>>B"), ["A", "!", ">>", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!<<,B"), ["A", "!", "<<,", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!>>,B"), ["A", "!", ">>,", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!<<'B"), ["A", "!", "<<'", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!>>'B"), ["A", "!", ">>'", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!<<:B"), ["A", "!", "<<:", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!>>:B"), ["A", "!", ">>:", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!.B"), ["A", "!", ".", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!,B"), ["A", "!", ",", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!..B"), ["A", "!", "..", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!,,B"), ["A", "!", ",,", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!$B"), ["A", "!", "$", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!$.B"), ["A", "!", "$.", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!$,B"), ["A", "!", "$,", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!$..B"), ["A", "!", "$..", "B"])
self.assertEqual(tgrep.tgrep_tokenize("A!$,,B"), ["A", "!", "$,,", "B"])
def test_tokenize_examples(self):
"""
Test tokenization of the TGrep2 manual example patterns.
"""
self.assertEqual(tgrep.tgrep_tokenize("NP < PP"), ["NP", "<", "PP"])
self.assertEqual(tgrep.tgrep_tokenize("/^NP/"), ["/^NP/"])
self.assertEqual(
tgrep.tgrep_tokenize("NP << PP . VP"), ["NP", "<<", "PP", ".", "VP"]
)
self.assertEqual(
tgrep.tgrep_tokenize("NP << PP | . VP"), ["NP", "<<", "PP", "|", ".", "VP"]
)
self.assertEqual(
tgrep.tgrep_tokenize("NP !<< PP [> NP | >> VP]"),
["NP", "!", "<<", "PP", "[", ">", "NP", "|", ">>", "VP", "]"],
)
self.assertEqual(
tgrep.tgrep_tokenize("NP << (PP . VP)"),
["NP", "<<", "(", "PP", ".", "VP", ")"],
)
self.assertEqual(
tgrep.tgrep_tokenize("NP <' (PP <, (IN < on))"),
["NP", "<'", "(", "PP", "<,", "(", "IN", "<", "on", ")", ")"],
)
self.assertEqual(
tgrep.tgrep_tokenize("S < (A < B) < C"),
["S", "<", "(", "A", "<", "B", ")", "<", "C"],
)
self.assertEqual(
tgrep.tgrep_tokenize("S < ((A < B) < C)"),
["S", "<", "(", "(", "A", "<", "B", ")", "<", "C", ")"],
)
self.assertEqual(
tgrep.tgrep_tokenize("S < (A < B < C)"),
["S", "<", "(", "A", "<", "B", "<", "C", ")"],
)
self.assertEqual(tgrep.tgrep_tokenize("A<B&.C"), ["A", "<", "B", "&", ".", "C"])
def test_tokenize_quoting(self):
"""
Test tokenization of quoting.
"""
self.assertEqual(
tgrep.tgrep_tokenize('"A<<:B"<<:"A $.. B"<"A>3B"<C'),
['"A<<:B"', "<<:", '"A $.. B"', "<", '"A>3B"', "<", "C"],
)
def test_tokenize_nodenames(self):
"""
Test tokenization of node names.
"""
self.assertEqual(tgrep.tgrep_tokenize("Robert"), ["Robert"])
self.assertEqual(tgrep.tgrep_tokenize("/^[Bb]ob/"), ["/^[Bb]ob/"])
self.assertEqual(tgrep.tgrep_tokenize("*"), ["*"])
self.assertEqual(tgrep.tgrep_tokenize("__"), ["__"])
# test tokenization of NLTK tree position syntax
self.assertEqual(tgrep.tgrep_tokenize("N()"), ["N(", ")"])
self.assertEqual(tgrep.tgrep_tokenize("N(0,)"), ["N(", "0", ",", ")"])
self.assertEqual(tgrep.tgrep_tokenize("N(0,0)"), ["N(", "0", ",", "0", ")"])
self.assertEqual(
tgrep.tgrep_tokenize("N(0,0,)"), ["N(", "0", ",", "0", ",", ")"]
)
def test_tokenize_macros(self):
"""
Test tokenization of macro definitions.
"""
self.assertEqual(
tgrep.tgrep_tokenize(
"@ NP /^NP/;\n@ NN /^NN/;\n@NP [!< NP | < @NN] !$.. @NN"
),
[
"@",
"NP",
"/^NP/",
";",
"@",
"NN",
"/^NN/",
";",
"@NP",
"[",
"!",
"<",
"NP",
"|",
"<",
"@NN",
"]",
"!",
"$..",
"@NN",
],
)
def test_node_simple(self):
"""
Test a simple use of tgrep for finding nodes matching a given
pattern.
"""
tree = ParentedTree.fromstring(
"(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))"
)
self.assertEqual(list(tgrep.tgrep_positions("NN", [tree])), [[(0, 2), (2, 1)]])
self.assertEqual(
list(tgrep.tgrep_nodes("NN", [tree])), [[tree[0, 2], tree[2, 1]]]
)
self.assertEqual(
list(tgrep.tgrep_positions("NN|JJ", [tree])), [[(0, 1), (0, 2), (2, 1)]]
)
def test_node_printing(self):
"""Test that the tgrep print operator ' is properly ignored."""
tree = ParentedTree.fromstring("(S (n x) (N x))")
self.assertEqual(
list(tgrep.tgrep_positions("N", [tree])),
list(tgrep.tgrep_positions("'N", [tree])),
)
self.assertEqual(
list(tgrep.tgrep_positions("/[Nn]/", [tree])),
list(tgrep.tgrep_positions("'/[Nn]/", [tree])),
)
def test_node_encoding(self):
"""
        Test that tgrep handles byte and str search strings the same
        way.
"""
tree = ParentedTree.fromstring(
"(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))"
)
self.assertEqual(
list(tgrep.tgrep_positions(b"NN", [tree])),
list(tgrep.tgrep_positions(b"NN", [tree])),
)
self.assertEqual(
list(tgrep.tgrep_nodes(b"NN", [tree])),
list(tgrep.tgrep_nodes("NN", [tree])),
)
self.assertEqual(
list(tgrep.tgrep_positions(b"NN|JJ", [tree])),
list(tgrep.tgrep_positions("NN|JJ", [tree])),
)
def test_node_nocase(self):
"""
Test selecting nodes using case insensitive node names.
"""
tree = ParentedTree.fromstring("(S (n x) (N x))")
self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[(1,)]])
self.assertEqual(list(tgrep.tgrep_positions('i@"N"', [tree])), [[(0,), (1,)]])
def test_node_quoted(self):
"""
Test selecting nodes using quoted node names.
"""
tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')
self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[()]])
self.assertEqual(list(tgrep.tgrep_positions('"\\"N\\""', [tree])), [[(0,)]])
self.assertEqual(list(tgrep.tgrep_positions('"N\\""', [tree])), [[(1,)]])
self.assertEqual(list(tgrep.tgrep_positions('"\\"\\\\\\""', [tree])), [[(2,)]])
def test_node_regex(self):
"""
Test regex matching on nodes.
"""
tree = ParentedTree.fromstring("(S (NP-SBJ x) (NP x) (NNP x) (VP x))")
# This is a regular expression that matches any node whose
# name starts with NP, including NP-SBJ:
self.assertEqual(list(tgrep.tgrep_positions("/^NP/", [tree])), [[(0,), (1,)]])
def test_node_regex_2(self):
"""
Test regex matching on nodes.
"""
tree = ParentedTree.fromstring("(S (SBJ x) (SBJ1 x) (NP-SBJ x))")
self.assertEqual(list(tgrep.tgrep_positions("/^SBJ/", [tree])), [[(0,), (1,)]])
# This is a regular expression that matches any node whose
# name includes SBJ, including NP-SBJ:
self.assertEqual(
list(tgrep.tgrep_positions("/SBJ/", [tree])), [[(0,), (1,), (2,)]]
)
def test_node_tree_position(self):
"""
Test matching on nodes based on NLTK tree position.
"""
tree = ParentedTree.fromstring("(S (NP-SBJ x) (NP x) (NNP x) (VP x))")
# test all tree positions that are not leaves
leaf_positions = {tree.leaf_treeposition(x) for x in range(len(tree.leaves()))}
tree_positions = [x for x in tree.treepositions() if x not in leaf_positions]
for position in tree_positions:
node_id = f"N{position}"
tgrep_positions = list(tgrep.tgrep_positions(node_id, [tree]))
self.assertEqual(len(tgrep_positions[0]), 1)
self.assertEqual(tgrep_positions[0][0], position)
def test_node_noleaves(self):
"""
Test node name matching with the search_leaves flag set to False.
"""
tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
self.assertEqual(
list(tgrep.tgrep_positions("x", [tree])), [[(0, 0, 0), (1, 0, 0)]]
)
self.assertEqual(list(tgrep.tgrep_positions("x", [tree], False)), [[]])
def tests_rel_dominance(self):
"""
Test matching nodes based on dominance relations.
"""
tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
self.assertEqual(list(tgrep.tgrep_positions("* < T", [tree])), [[(0,)]])
self.assertEqual(list(tgrep.tgrep_positions("* < T > S", [tree])), [[(0,)]])
self.assertEqual(
list(tgrep.tgrep_positions("* !< T", [tree])),
[[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]],
)
self.assertEqual(list(tgrep.tgrep_positions("* !< T > S", [tree])), [[(1,)]])
self.assertEqual(list(tgrep.tgrep_positions("* > A", [tree])), [[(0, 0)]])
self.assertEqual(list(tgrep.tgrep_positions("* > B", [tree])), [[(1, 0)]])
self.assertEqual(
list(tgrep.tgrep_positions("* !> B", [tree])),
[[(), (0,), (0, 0), (0, 0, 0), (1,), (1, 0, 0)]],
)
self.assertEqual(
list(tgrep.tgrep_positions("* !> B >> S", [tree])), [[(0,), (0, 0), (1,)]]
)
self.assertEqual(
list(tgrep.tgrep_positions("* >> S", [tree])),
[[(0,), (0, 0), (1,), (1, 0)]],
)
self.assertEqual(
list(tgrep.tgrep_positions("* >>, S", [tree])), [[(0,), (0, 0)]]
)
self.assertEqual(
list(tgrep.tgrep_positions("* >>' S", [tree])), [[(1,), (1, 0)]]
)
# Known issue:
# self.assertEqual(list(tgrep.tgrep_positions('* !>> S', [tree])),
# [[()]])
self.assertEqual(list(tgrep.tgrep_positions("* << T", [tree])), [[(), (0,)]])
self.assertEqual(list(tgrep.tgrep_positions("* <<' T", [tree])), [[(0,)]])
self.assertEqual(list(tgrep.tgrep_positions("* <<1 N", [tree])), [[(1,)]])
self.assertEqual(
list(tgrep.tgrep_positions("* !<< T", [tree])),
[[(0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]],
)
tree = ParentedTree.fromstring("(S (A (T x)) (B (T x) (N x )))")
self.assertEqual(list(tgrep.tgrep_positions("* <: T", [tree])), [[(0,)]])
self.assertEqual(list(tgrep.tgrep_positions("* < T", [tree])), [[(0,), (1,)]])
self.assertEqual(
list(tgrep.tgrep_positions("* !<: T", [tree])),
[[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0), (1, 1), (1, 1, 0)]],
)
self.assertEqual(list(tgrep.tgrep_positions("* !<: T > S", [tree])), [[(1,)]])
tree = ParentedTree.fromstring("(S (T (A x) (B x)) (T (C x)))")
self.assertEqual(list(tgrep.tgrep_positions("* >: T", [tree])), [[(1, 0)]])
self.assertEqual(
list(tgrep.tgrep_positions("* !>: T", [tree])),
[[(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0, 0)]],
)
tree = ParentedTree.fromstring(
"(S (A (B (C (D (E (T x))))))" " (A (B (C (D (E (T x))) (N x)))))"
)
self.assertEqual(
list(tgrep.tgrep_positions("* <<: T", [tree])),
[
[
(0,),
(0, 0),
(0, 0, 0),
(0, 0, 0, 0),
(0, 0, 0, 0, 0),
(1, 0, 0, 0),
(1, 0, 0, 0, 0),
]
],
)
self.assertEqual(
list(tgrep.tgrep_positions("* >>: A", [tree])),
[
[
(0, 0),
(0, 0, 0),
(0, 0, 0, 0),
(0, 0, 0, 0, 0),
(0, 0, 0, 0, 0, 0),
(1, 0),
(1, 0, 0),
]
],
)
def test_bad_operator(self):
"""
Test error handling of undefined tgrep operators.
"""
tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
self.assertRaises(
tgrep.TgrepException, list, tgrep.tgrep_positions("* >>> S", [tree])
)
def test_comments(self):
"""
Test that comments are correctly filtered out of tgrep search
strings.
"""
tree = ParentedTree.fromstring("(S (NN x) (NP x) (NN x))")
search1 = """
@ NP /^NP/;
@ NN /^NN/;
@NN
"""
self.assertEqual(list(tgrep.tgrep_positions(search1, [tree])), [[(0,), (2,)]])
search2 = """
# macros
@ NP /^NP/;
@ NN /^NN/;
# search string
@NN
"""
self.assertEqual(list(tgrep.tgrep_positions(search2, [tree])), [[(0,), (2,)]])
def test_rel_sister_nodes(self):
"""
Test matching sister nodes in a tree.
"""
tree = ParentedTree.fromstring("(S (A x) (B x) (C x))")
self.assertEqual(list(tgrep.tgrep_positions("* $. B", [tree])), [[(0,)]])
self.assertEqual(list(tgrep.tgrep_positions("* $.. B", [tree])), [[(0,)]])
self.assertEqual(list(tgrep.tgrep_positions("* $, B", [tree])), [[(2,)]])
self.assertEqual(list(tgrep.tgrep_positions("* $,, B", [tree])), [[(2,)]])
self.assertEqual(list(tgrep.tgrep_positions("* $ B", [tree])), [[(0,), (2,)]])
def tests_rel_indexed_children(self):
"""
Test matching nodes based on their index in their parent node.
"""
tree = ParentedTree.fromstring("(S (A x) (B x) (C x))")
self.assertEqual(list(tgrep.tgrep_positions("* >, S", [tree])), [[(0,)]])
self.assertEqual(list(tgrep.tgrep_positions("* >1 S", [tree])), [[(0,)]])
self.assertEqual(list(tgrep.tgrep_positions("* >2 S", [tree])), [[(1,)]])
self.assertEqual(list(tgrep.tgrep_positions("* >3 S", [tree])), [[(2,)]])
self.assertEqual(list(tgrep.tgrep_positions("* >' S", [tree])), [[(2,)]])
self.assertEqual(list(tgrep.tgrep_positions("* >-1 S", [tree])), [[(2,)]])
self.assertEqual(list(tgrep.tgrep_positions("* >-2 S", [tree])), [[(1,)]])
self.assertEqual(list(tgrep.tgrep_positions("* >-3 S", [tree])), [[(0,)]])
tree = ParentedTree.fromstring(
"(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) " "(F (C x) (A x) (B x)))"
)
self.assertEqual(list(tgrep.tgrep_positions("* <, A", [tree])), [[(0,)]])
self.assertEqual(list(tgrep.tgrep_positions("* <1 A", [tree])), [[(0,)]])
self.assertEqual(list(tgrep.tgrep_positions("* <2 A", [tree])), [[(2,)]])
self.assertEqual(list(tgrep.tgrep_positions("* <3 A", [tree])), [[(1,)]])
self.assertEqual(list(tgrep.tgrep_positions("* <' A", [tree])), [[(1,)]])
self.assertEqual(list(tgrep.tgrep_positions("* <-1 A", [tree])), [[(1,)]])
self.assertEqual(list(tgrep.tgrep_positions("* <-2 A", [tree])), [[(2,)]])
self.assertEqual(list(tgrep.tgrep_positions("* <-3 A", [tree])), [[(0,)]])
def test_rel_precedence(self):
"""
Test matching nodes based on precedence relations.
"""
tree = ParentedTree.fromstring(
"(S (NP (NP (PP x)) (NP (AP x)))"
" (VP (AP (X (PP x)) (Y (AP x))))"
" (NP (RC (NP (AP x)))))"
)
self.assertEqual(
list(tgrep.tgrep_positions("* . X", [tree])), [[(0,), (0, 1), (0, 1, 0)]]
)
self.assertEqual(
list(tgrep.tgrep_positions("* . Y", [tree])), [[(1, 0, 0), (1, 0, 0, 0)]]
)
self.assertEqual(
list(tgrep.tgrep_positions("* .. X", [tree])),
[[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]],
)
self.assertEqual(
list(tgrep.tgrep_positions("* .. Y", [tree])),
[[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1, 0, 0), (1, 0, 0, 0)]],
)
self.assertEqual(
list(tgrep.tgrep_positions("* , X", [tree])), [[(1, 0, 1), (1, 0, 1, 0)]]
)
self.assertEqual(
list(tgrep.tgrep_positions("* , Y", [tree])),
[[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
)
self.assertEqual(
list(tgrep.tgrep_positions("* ,, X", [tree])),
[[(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
)
self.assertEqual(
list(tgrep.tgrep_positions("* ,, Y", [tree])),
[[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
)
def test_examples(self):
"""
Test the Basic Examples from the TGrep2 manual.
"""
tree = ParentedTree.fromstring("(S (NP (AP x)) (NP (PP x)))")
# This matches any NP node that immediately dominates a PP:
self.assertEqual(list(tgrep.tgrep_positions("NP < PP", [tree])), [[(1,)]])
tree = ParentedTree.fromstring("(S (NP x) (VP x) (NP (PP x)) (VP x))")
# This matches an NP that dominates a PP and is immediately
# followed by a VP:
self.assertEqual(list(tgrep.tgrep_positions("NP << PP . VP", [tree])), [[(2,)]])
tree = ParentedTree.fromstring(
"(S (NP (AP x)) (NP (PP x)) " "(NP (DET x) (NN x)) (VP x))"
)
# This matches an NP that dominates a PP or is immediately
# followed by a VP:
self.assertEqual(
list(tgrep.tgrep_positions("NP << PP | . VP", [tree])), [[(1,), (2,)]]
)
tree = ParentedTree.fromstring(
"(S (NP (NP (PP x)) (NP (AP x)))"
" (VP (AP (NP (PP x)) (NP (AP x))))"
" (NP (RC (NP (AP x)))))"
)
# This matches an NP that does not dominate a PP. Also, the NP
# must either have a parent that is an NP or be dominated by a
# VP:
self.assertEqual(
list(tgrep.tgrep_positions("NP !<< PP [> NP | >> VP]", [tree])),
[[(0, 1), (1, 0, 1)]],
)
tree = ParentedTree.fromstring(
"(S (NP (AP (PP x) (VP x))) " "(NP (AP (PP x) (NP x))) (NP x))"
)
# This matches an NP that dominates a PP which itself is
# immediately followed by a VP. Note the use of parentheses to
# group ". VP" with the PP rather than with the NP:
self.assertEqual(
list(tgrep.tgrep_positions("NP << (PP . VP)", [tree])), [[(0,)]]
)
tree = ParentedTree.fromstring(
"(S (NP (DET a) (NN cat) (PP (IN on) (NP x)))"
" (NP (DET a) (NN cat) (PP (IN on) (NP x)) (PP x))"
" (NP x))"
)
# This matches an NP whose last child is a PP that begins with
# the preposition "on":
self.assertEqual(
list(tgrep.tgrep_positions("NP <' (PP <, (IN < on))", [tree])), [[(0,)]]
)
tree = ParentedTree.fromstring(
"(S (S (C x) (A (B x))) (S (C x) (A x)) " "(S (D x) (A (B x))))"
)
# The following pattern matches an S which has a child A and
# another child that is a C and that the A has a child B:
self.assertEqual(
list(tgrep.tgrep_positions("S < (A < B) < C", [tree])), [[(0,)]]
)
tree = ParentedTree.fromstring(
"(S (S (A (B x) (C x))) (S (S (C x) (A (B x)))))"
)
# However, this pattern means that S has child A and that A
# has children B and C:
self.assertEqual(
list(tgrep.tgrep_positions("S < ((A < B) < C)", [tree])), [[(0,)]]
)
# It is equivalent to this:
self.assertEqual(
list(tgrep.tgrep_positions("S < (A < B < C)", [tree])), [[(0,)]]
)
def test_use_macros(self):
"""
Test defining and using tgrep2 macros.
"""
tree = ParentedTree.fromstring(
"(VP (VB sold) (NP (DET the) "
"(NN heiress)) (NP (NN deed) (PREP to) "
"(NP (DET the) (NN school) (NN house))))"
)
self.assertEqual(
list(
tgrep.tgrep_positions(
"@ NP /^NP/;\n@ NN /^NN/;\n@NP !< @NP !$.. @NN", [tree]
)
),
[[(1,), (2, 2)]],
)
# use undefined macro @CNP
self.assertRaises(
tgrep.TgrepException,
list,
tgrep.tgrep_positions(
"@ NP /^NP/;\n@ NN /^NN/;\n@CNP !< @NP !$.. @NN", [tree]
),
)
def test_tokenize_node_labels(self):
"""Test tokenization of labeled nodes."""
self.assertEqual(
tgrep.tgrep_tokenize("S < @SBJ < (@VP < (@VB $.. @OBJ))"),
[
"S",
"<",
"@SBJ",
"<",
"(",
"@VP",
"<",
"(",
"@VB",
"$..",
"@OBJ",
")",
")",
],
)
self.assertEqual(
tgrep.tgrep_tokenize("S < @SBJ=s < (@VP=v < (@VB $.. @OBJ))"),
[
"S",
"<",
"@SBJ",
"=",
"s",
"<",
"(",
"@VP",
"=",
"v",
"<",
"(",
"@VB",
"$..",
"@OBJ",
")",
")",
],
)
def test_tokenize_segmented_patterns(self):
"""Test tokenization of segmented patterns."""
self.assertEqual(
tgrep.tgrep_tokenize("S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v"),
[
"S",
"<",
"@SBJ",
"=",
"s",
"<",
"(",
"@VP",
"=",
"v",
"<",
"(",
"@VB",
"$..",
"@OBJ",
")",
")",
":",
"=s",
"..",
"=v",
],
)
def test_labeled_nodes(self):
"""
Test labeled nodes.
Test case from Emily M. Bender.
"""
search = """
# macros
@ SBJ /SBJ/;
@ VP /VP/;
@ VB /VB/;
@ VPoB /V[PB]/;
            @ OBJ /OBJ/;

            # 1 svo
S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v"""
sent1 = ParentedTree.fromstring(
"(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))"
)
sent2 = ParentedTree.fromstring(
"(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))"
)
search_firsthalf = search.split("\n\n")[0] + "S < @SBJ < (@VP < (@VB $.. @OBJ))"
search_rewrite = "S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))"
self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent1]))[0])
self.assertTrue(list(tgrep.tgrep_positions(search, [sent1]))[0])
self.assertTrue(list(tgrep.tgrep_positions(search_rewrite, [sent1]))[0])
self.assertEqual(
list(tgrep.tgrep_positions(search, [sent1])),
list(tgrep.tgrep_positions(search_rewrite, [sent1])),
)
self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent2]))[0])
self.assertFalse(list(tgrep.tgrep_positions(search, [sent2]))[0])
self.assertFalse(list(tgrep.tgrep_positions(search_rewrite, [sent2]))[0])
self.assertEqual(
list(tgrep.tgrep_positions(search, [sent2])),
list(tgrep.tgrep_positions(search_rewrite, [sent2])),
)
def test_multiple_conjs(self):
"""
Test that multiple (3 or more) conjunctions of node relations are
handled properly.
"""
sent = ParentedTree.fromstring("((A (B b) (C c)) (A (B b) (C c) (D d)))")
# search = '(A < B < C < D)'
# search_tworels = '(A < B < C)'
self.assertEqual(
list(tgrep.tgrep_positions("(A < B < C < D)", [sent])), [[(1,)]]
)
self.assertEqual(
list(tgrep.tgrep_positions("(A < B < C)", [sent])), [[(0,), (1,)]]
)
def test_trailing_semicolon(self):
"""
Test that semicolons at the end of a tgrep2 search string won't
cause a parse failure.
"""
tree = ParentedTree.fromstring(
"(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))"
)
self.assertEqual(list(tgrep.tgrep_positions("NN", [tree])), [[(0, 2), (2, 1)]])
self.assertEqual(list(tgrep.tgrep_positions("NN;", [tree])), [[(0, 2), (2, 1)]])
self.assertEqual(
list(tgrep.tgrep_positions("NN;;", [tree])), [[(0, 2), (2, 1)]]
)

View File

@@ -0,0 +1,905 @@
"""
Unit tests for nltk.tokenize.
See also nltk/test/tokenize.doctest
"""
from typing import List, Tuple
import pytest
from nltk.tokenize import (
LegalitySyllableTokenizer,
StanfordSegmenter,
SyllableTokenizer,
TreebankWordTokenizer,
TweetTokenizer,
punkt,
sent_tokenize,
word_tokenize,
)
from nltk.tokenize.simple import CharTokenizer
def load_stanford_segmenter():
try:
seg = StanfordSegmenter()
seg.default_config("ar")
seg.default_config("zh")
return True
except LookupError:
return False
check_stanford_segmenter = pytest.mark.skipif(
not load_stanford_segmenter(),
reason="NLTK was unable to find stanford-segmenter.jar.",
)
class TestTokenize:
def test_tweet_tokenizer(self):
"""
Test TweetTokenizer using words with special and accented characters.
"""
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
s9 = "@myke: Let's test these words: resumé España München français"
tokens = tokenizer.tokenize(s9)
expected = [
":",
"Let's",
"test",
"these",
"words",
":",
"resumé",
"España",
"München",
"français",
]
assert tokens == expected
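    # Note that the "@myke" handle is absent from the expected tokens above
    # because the tokenizer was constructed with strip_handles=True.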
@pytest.mark.parametrize(
"test_input, expecteds",
[
(
"My text 0106404243030 is great text",
(
["My", "text", "01064042430", "30", "is", "great", "text"],
["My", "text", "0106404243030", "is", "great", "text"],
),
),
(
"My ticket id is 1234543124123",
(
["My", "ticket", "id", "is", "12345431241", "23"],
["My", "ticket", "id", "is", "1234543124123"],
),
),
(
"@remy: This is waaaaayyyy too much for you!!!!!! 01064042430",
(
[
":",
"This",
"is",
"waaayyy",
"too",
"much",
"for",
"you",
"!",
"!",
"!",
"01064042430",
],
[
":",
"This",
"is",
"waaayyy",
"too",
"much",
"for",
"you",
"!",
"!",
"!",
"01064042430",
],
),
),
# Further tests from https://github.com/nltk/nltk/pull/2798#issuecomment-922533085,
# showing the TweetTokenizer performance for `match_phone_numbers=True` and
# `match_phone_numbers=False`.
(
                # Some phone numbers are always tokenized, even with `match_phone_numbers=False`
"My number is 06-46124080, except it's not.",
(
[
"My",
"number",
"is",
"06-46124080",
",",
"except",
"it's",
"not",
".",
],
[
"My",
"number",
"is",
"06-46124080",
",",
"except",
"it's",
"not",
".",
],
),
),
(
# Phone number here is only tokenized correctly if `match_phone_numbers=True`
"My number is 601-984-4813, except it's not.",
(
[
"My",
"number",
"is",
"601-984-4813",
",",
"except",
"it's",
"not",
".",
],
[
"My",
"number",
"is",
"601-984-",
"4813",
",",
"except",
"it's",
"not",
".",
],
),
),
(
# Phone number here is only tokenized correctly if `match_phone_numbers=True`
"My number is (393) 928 -3010, except it's not.",
(
[
"My",
"number",
"is",
"(393) 928 -3010",
",",
"except",
"it's",
"not",
".",
],
[
"My",
"number",
"is",
"(",
"393",
")",
"928",
"-",
"3010",
",",
"except",
"it's",
"not",
".",
],
),
),
(
# A long number is tokenized correctly only if `match_phone_numbers=False`
"The product identification number is 48103284512.",
(
[
"The",
"product",
"identification",
"number",
"is",
"4810328451",
"2",
".",
],
[
"The",
"product",
"identification",
"number",
"is",
"48103284512",
".",
],
),
),
(
                # `match_phone_numbers=True` can have some unforeseen side effects:
                # here a subtraction is grouped as if it were a phone number.
"My favourite substraction is 240 - 1353.",
(
["My", "favourite", "substraction", "is", "240 - 1353", "."],
["My", "favourite", "substraction", "is", "240", "-", "1353", "."],
),
),
],
)
def test_tweet_tokenizer_expanded(
self, test_input: str, expecteds: Tuple[List[str], List[str]]
):
"""
Test `match_phone_numbers` in TweetTokenizer.
Note that TweetTokenizer is also passed the following for these tests:
* strip_handles=True
* reduce_len=True
:param test_input: The input string to tokenize using TweetTokenizer.
:type test_input: str
:param expecteds: A 2-tuple of tokenized sentences. The first of the two
            tokenized lists is the expected output of tokenization with `match_phone_numbers=True`.
The second of the two tokenized lists is the expected output of tokenization
with `match_phone_numbers=False`.
:type expecteds: Tuple[List[str], List[str]]
"""
for match_phone_numbers, expected in zip([True, False], expecteds):
tokenizer = TweetTokenizer(
strip_handles=True,
reduce_len=True,
match_phone_numbers=match_phone_numbers,
)
predicted = tokenizer.tokenize(test_input)
assert predicted == expected
def test_sonority_sequencing_syllable_tokenizer(self):
"""
Test SyllableTokenizer tokenizer.
"""
tokenizer = SyllableTokenizer()
tokens = tokenizer.tokenize("justification")
assert tokens == ["jus", "ti", "fi", "ca", "tion"]
def test_syllable_tokenizer_numbers(self):
"""
        Test SyllableTokenizer on a long string of digits.
"""
tokenizer = SyllableTokenizer()
text = "9" * 10000
tokens = tokenizer.tokenize(text)
assert tokens == [text]
def test_legality_principle_syllable_tokenizer(self):
"""
Test LegalitySyllableTokenizer tokenizer.
"""
from nltk.corpus import words
test_word = "wonderful"
tokenizer = LegalitySyllableTokenizer(words.words())
tokens = tokenizer.tokenize(test_word)
assert tokens == ["won", "der", "ful"]
@check_stanford_segmenter
def test_stanford_segmenter_arabic(self):
"""
Test the Stanford Word Segmenter for Arabic (default config)
"""
seg = StanfordSegmenter()
seg.default_config("ar")
sent = "يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات"
segmented_sent = seg.segment(sent.split())
assert segmented_sent.split() == [
"يبحث",
"علم",
"الحاسوب",
"استخدام",
"الحوسبة",
"ب",
"جميع",
"اشكال",
"ها",
"ل",
"حل",
"المشكلات",
]
@check_stanford_segmenter
def test_stanford_segmenter_chinese(self):
"""
Test the Stanford Word Segmenter for Chinese (default config)
"""
seg = StanfordSegmenter()
seg.default_config("zh")
sent = "这是斯坦福中文分词器测试"
segmented_sent = seg.segment(sent.split())
assert segmented_sent.split() == [
"",
"",
"斯坦福",
"中文",
"分词器",
"测试",
]
def test_phone_tokenizer(self):
"""
Test a string that resembles a phone number but contains a newline
"""
# Should be recognized as a phone number, albeit one with multiple spaces
tokenizer = TweetTokenizer()
test1 = "(393) 928 -3010"
expected = ["(393) 928 -3010"]
result = tokenizer.tokenize(test1)
assert result == expected
# Due to newline, first three elements aren't part of a phone number;
# fourth is
test2 = "(393)\n928 -3010"
expected = ["(", "393", ")", "928 -3010"]
result = tokenizer.tokenize(test2)
assert result == expected
def test_emoji_tokenizer(self):
"""
Test a string that contains Emoji ZWJ Sequences and skin tone modifier
"""
tokenizer = TweetTokenizer()
        # An emoji ZWJ sequence: the joined characters form a single emoji and should not be split.
test1 = "👨‍👩‍👧‍👧"
expected = ["👨‍👩‍👧‍👧"]
result = tokenizer.tokenize(test1)
assert result == expected
        # An emoji with a skin tone modifier: the two characters form a single emoji and should not be split.
test2 = "👨🏿"
expected = ["👨🏿"]
result = tokenizer.tokenize(test2)
assert result == expected
        # A string containing both skin tone modifiers and ZWJ sequences
test3 = "🤔 🙈 me así, se😌 ds 💕👭👙 hello 👩🏾‍🎓 emoji hello 👨‍👩‍👦‍👦 how are 😊 you today🙅🏽🙅🏽"
expected = [
"🤔",
"🙈",
"me",
"así",
",",
"se",
"😌",
"ds",
"💕",
"👭",
"👙",
"hello",
"👩🏾\u200d🎓",
"emoji",
"hello",
"👨\u200d👩\u200d👦\u200d👦",
"how",
"are",
"😊",
"you",
"today",
"🙅🏽",
"🙅🏽",
]
result = tokenizer.tokenize(test3)
assert result == expected
# emoji flag sequences, including enclosed letter pairs
# Expected behavior from #3034
test4 = "🇦🇵🇵🇱🇪"
expected = ["🇦🇵", "🇵🇱", "🇪"]
result = tokenizer.tokenize(test4)
assert result == expected
test5 = "Hi 🇨🇦, 😍!!"
expected = ["Hi", "🇨🇦", ",", "😍", "!", "!"]
result = tokenizer.tokenize(test5)
assert result == expected
test6 = "<3 🇨🇦 🤝 🇵🇱 <3"
expected = ["<3", "🇨🇦", "🤝", "🇵🇱", "<3"]
result = tokenizer.tokenize(test6)
assert result == expected
def test_pad_asterisk(self):
"""
Test padding of asterisk for word tokenization.
"""
text = "This is a, *weird sentence with *asterisks in it."
expected = [
"This",
"is",
"a",
",",
"*",
"weird",
"sentence",
"with",
"*",
"asterisks",
"in",
"it",
".",
]
assert word_tokenize(text) == expected
def test_pad_dotdot(self):
"""
Test padding of dotdot* for word tokenization.
"""
text = "Why did dotdot.. not get tokenized but dotdotdot... did? How about manydots....."
expected = [
"Why",
"did",
"dotdot",
"..",
"not",
"get",
"tokenized",
"but",
"dotdotdot",
"...",
"did",
"?",
"How",
"about",
"manydots",
".....",
]
assert word_tokenize(text) == expected
def test_remove_handle(self):
"""
Test remove_handle() from casual.py with specially crafted edge cases
"""
tokenizer = TweetTokenizer(strip_handles=True)
# Simple example. Handles with just numbers should be allowed
test1 = "@twitter hello @twi_tter_. hi @12345 @123news"
expected = ["hello", ".", "hi"]
result = tokenizer.tokenize(test1)
assert result == expected
# Handles are allowed to follow any of the following characters
test2 = "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n<@n>@n @n\n@n ñ@n.ü@n.ç@n."
expected = [
"`",
"~",
"(",
")",
"-",
"=",
"+",
"\\",
"|",
"[",
"]",
"{",
"}",
";",
":",
"'",
'"',
"/",
"?",
".",
",",
"<",
">",
"ñ",
".",
"ü",
".",
"ç",
".",
]
result = tokenizer.tokenize(test2)
assert result == expected
# Handles are NOT allowed to follow any of the following characters
test3 = "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n"
expected = [
"a",
"@n",
"j",
"@n",
"z",
"@n",
"A",
"@n",
"L",
"@n",
"Z",
"@n",
"1",
"@n",
"4",
"@n",
"7",
"@n",
"9",
"@n",
"0",
"@n",
"_",
"@n",
"!",
"@n",
"@",
"@n",
"#",
"@n",
"$",
"@n",
"%",
"@n",
"&",
"@n",
"*",
"@n",
]
result = tokenizer.tokenize(test3)
assert result == expected
# Handles are allowed to precede the following characters
test4 = "@n!a @n#a @n$a @n%a @n&a @n*a"
expected = ["!", "a", "#", "a", "$", "a", "%", "a", "&", "a", "*", "a"]
result = tokenizer.tokenize(test4)
assert result == expected
# Tests interactions with special symbols and multiple @
test5 = "@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n"
expected = [
"!",
"@n",
"#",
"@n",
"$",
"@n",
"%",
"@n",
"&",
"@n",
"*",
"@n",
"@n",
"@n",
"@",
"@n",
"@n",
"@",
"@n",
"@n_",
"@n",
"@n7",
"@n",
"@nj",
"@n",
]
result = tokenizer.tokenize(test5)
assert result == expected
# Tests that handles can have a max length of 15
test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmno1234 @abcdefghijklmno_ @abcdefghijklmnoendofhandle"
expected = ["pqrstuvwxyz", "1234", "_", "endofhandle"]
result = tokenizer.tokenize(test6)
assert result == expected
# Edge case where an @ comes directly after a long handle
test7 = "@abcdefghijklmnop@abcde @abcdefghijklmno@abcde @abcdefghijklmno_@abcde @abcdefghijklmno5@abcde"
expected = [
"p",
"@abcde",
"@abcdefghijklmno",
"@abcde",
"_",
"@abcde",
"5",
"@abcde",
]
result = tokenizer.tokenize(test7)
assert result == expected
def test_treebank_span_tokenizer(self):
"""
Test TreebankWordTokenizer.span_tokenize function
"""
tokenizer = TreebankWordTokenizer()
# Test case in the docstring
test1 = "Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks)."
expected = [
(0, 4),
(5, 12),
(13, 17),
(18, 19),
(19, 23),
(24, 26),
(27, 30),
(31, 32),
(32, 36),
(36, 37),
(37, 38),
(40, 46),
(47, 48),
(48, 51),
(51, 52),
(53, 55),
(56, 59),
(60, 62),
(63, 68),
(69, 70),
(70, 76),
(76, 77),
(77, 78),
]
result = list(tokenizer.span_tokenize(test1))
assert result == expected
# Test case with double quotation
test2 = 'The DUP is similar to the "religious right" in the United States and takes a hardline stance on social issues'
expected = [
(0, 3),
(4, 7),
(8, 10),
(11, 18),
(19, 21),
(22, 25),
(26, 27),
(27, 36),
(37, 42),
(42, 43),
(44, 46),
(47, 50),
(51, 57),
(58, 64),
(65, 68),
(69, 74),
(75, 76),
(77, 85),
(86, 92),
(93, 95),
(96, 102),
(103, 109),
]
result = list(tokenizer.span_tokenize(test2))
assert result == expected
        # Test case with double quotation marks as well as converted quotations
test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
expected = [
(0, 3),
(4, 7),
(8, 10),
(11, 18),
(19, 21),
(22, 25),
(26, 27),
(27, 36),
(37, 42),
(42, 43),
(44, 46),
(47, 50),
(51, 57),
(58, 64),
(65, 68),
(69, 74),
(75, 76),
(77, 79),
(79, 87),
(87, 89),
(90, 96),
(97, 99),
(100, 106),
(107, 113),
]
result = list(tokenizer.span_tokenize(test3))
assert result == expected
def test_word_tokenize(self):
"""
Test word_tokenize function
"""
sentence = "The 'v', I've been fooled but I'll seek revenge."
expected = [
"The",
"'",
"v",
"'",
",",
"I",
"'ve",
"been",
"fooled",
"but",
"I",
"'ll",
"seek",
"revenge",
".",
]
assert word_tokenize(sentence) == expected
sentence = "'v' 're'"
expected = ["'", "v", "'", "'re", "'"]
assert word_tokenize(sentence) == expected
def test_punkt_pair_iter(self):
test_cases = [
("12", [("1", "2"), ("2", None)]),
("123", [("1", "2"), ("2", "3"), ("3", None)]),
("1234", [("1", "2"), ("2", "3"), ("3", "4"), ("4", None)]),
]
for test_input, expected_output in test_cases:
actual_output = [x for x in punkt._pair_iter(test_input)]
assert actual_output == expected_output
def test_punkt_pair_iter_handles_stop_iteration_exception(self):
# test input to trigger StopIteration from next()
it = iter([])
# call method under test and produce a generator
gen = punkt._pair_iter(it)
# unpack generator, ensure that no error is raised
list(gen)
def test_punkt_tokenize_words_handles_stop_iteration_exception(self):
obj = punkt.PunktBaseClass()
class TestPunktTokenizeWordsMock:
def word_tokenize(self, s):
return iter([])
obj._lang_vars = TestPunktTokenizeWordsMock()
# unpack generator, ensure that no error is raised
list(obj._tokenize_words("test"))
def test_punkt_tokenize_custom_lang_vars(self):
# Create LangVars including a full stop end character as used in Bengali
class BengaliLanguageVars(punkt.PunktLanguageVars):
sent_end_chars = (".", "?", "!", "\u0964")
obj = punkt.PunktSentenceTokenizer(lang_vars=BengaliLanguageVars())
# We now expect these sentences to be split up into the individual sentences
sentences = "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
expected = [
"উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন।",
"অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন।",
"এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।",
]
assert obj.tokenize(sentences) == expected
def test_punkt_tokenize_no_custom_lang_vars(self):
obj = punkt.PunktSentenceTokenizer()
# We expect these sentences to not be split properly, as the Bengali full stop '।' is not included in the default language vars
sentences = "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
expected = [
"উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
]
assert obj.tokenize(sentences) == expected
@pytest.mark.parametrize(
"input_text,n_sents,n_splits,lang_vars",
[
# Test debug_decisions on a text with two sentences, split by a dot.
("Subject: Some subject. Attachments: Some attachments", 2, 1),
# The sentence should be split into two sections,
# with one split and hence one decision.
# Test debug_decisions on a text with two sentences, split by an exclamation mark.
("Subject: Some subject! Attachments: Some attachments", 2, 1),
# The sentence should be split into two sections,
# with one split and hence one decision.
            # Test debug_decisions on a text with one sentence,
# which is not split.
("This is just a normal sentence, just like any other.", 1, 0),
            # Hence one sentence and zero splits.
],
)
def punkt_debug_decisions(self, input_text, n_sents, n_splits, lang_vars=None):
tokenizer = punkt.PunktSentenceTokenizer()
        if lang_vars is not None:
tokenizer._lang_vars = lang_vars
assert len(tokenizer.tokenize(input_text)) == n_sents
assert len(list(tokenizer.debug_decisions(input_text))) == n_splits
def test_punkt_debug_decisions_custom_end(self):
# Test debug_decisions on a text with two sentences,
# split by a custom end character, based on Issue #2519
class ExtLangVars(punkt.PunktLanguageVars):
sent_end_chars = (".", "?", "!", "^")
self.punkt_debug_decisions(
"Subject: Some subject^ Attachments: Some attachments",
n_sents=2,
n_splits=1,
lang_vars=ExtLangVars(),
)
# The sentence should be split into two sections,
# with one split and hence one decision.
@pytest.mark.parametrize(
"sentences, expected",
[
(
"this is a test. . new sentence.",
["this is a test.", ".", "new sentence."],
),
("This. . . That", ["This.", ".", ".", "That"]),
("This..... That", ["This..... That"]),
("This... That", ["This... That"]),
("This.. . That", ["This.. .", "That"]),
("This. .. That", ["This.", ".. That"]),
("This. ,. That", ["This.", ",.", "That"]),
("This!!! That", ["This!!!", "That"]),
("This! That", ["This!", "That"]),
(
"1. This is R .\n2. This is A .\n3. That's all",
["1.", "This is R .", "2.", "This is A .", "3.", "That's all"],
),
(
"1. This is R .\t2. This is A .\t3. That's all",
["1.", "This is R .", "2.", "This is A .", "3.", "That's all"],
),
("Hello.\tThere", ["Hello.", "There"]),
],
)
def test_sent_tokenize(self, sentences: str, expected: List[str]):
assert sent_tokenize(sentences) == expected
def test_string_tokenizer(self) -> None:
sentence = "Hello there"
tokenizer = CharTokenizer()
assert tokenizer.tokenize(sentence) == list(sentence)
assert list(tokenizer.span_tokenize(sentence)) == [
(0, 1),
(1, 2),
(2, 3),
(3, 4),
(4, 5),
(5, 6),
(6, 7),
(7, 8),
(8, 9),
(9, 10),
(10, 11),
]
class TestPunktTrainer:
def test_punkt_train(self) -> None:
trainer = punkt.PunktTrainer()
trainer.train("This is a test.")
def test_punkt_train_single_word(self) -> None:
trainer = punkt.PunktTrainer()
trainer.train("This.")
def test_punkt_train_no_punc(self) -> None:
trainer = punkt.PunktTrainer()
trainer.train("This is a test")


@@ -0,0 +1,77 @@
"""
Tests for static parts of Twitter package
"""
import os
import pytest
pytest.importorskip("twython")
from nltk.twitter import Authenticate
@pytest.fixture
def auth():
return Authenticate()
class TestCredentials:
"""
Tests that Twitter credentials from a file are handled correctly.
"""
@classmethod
def setup_class(self):
self.subdir = os.path.join(os.path.dirname(__file__), "files")
os.environ["TWITTER"] = "twitter-files"
def test_environment(self, auth):
"""
Test that environment variable has been read correctly.
"""
fn = os.path.basename(auth.creds_subdir)
assert fn == os.environ["TWITTER"]
@pytest.mark.parametrize(
"kwargs",
[
# Each of the following scenarios should raise an error:
# An empty subdir path
{"subdir": ""},
# A subdir path of None
{"subdir": None},
# A nonexistent directory
{"subdir": "/nosuchdir"},
# 'credentials.txt' is not in default subdir, as read from `os.environ['TWITTER']`
{},
# Nonexistent credentials file ('foobar')
{"creds_file": "foobar"},
# 'bad_oauth1-1.txt' is incomplete
{"creds_file": "bad_oauth1-1.txt"},
# The first key in credentials file 'bad_oauth1-2.txt' is ill-formed
{"creds_file": "bad_oauth1-2.txt"},
# The first two lines in 'bad_oauth1-3.txt' are collapsed
{"creds_file": "bad_oauth1-3.txt"},
],
)
def test_scenarios_that_should_raise_errors(self, kwargs, auth):
"""Various scenarios that should raise errors"""
try:
auth.load_creds(**kwargs)
# raises ValueError (zero length field name in format) for python 2.6
# OSError for the rest
except (OSError, ValueError):
pass
except Exception as e:
pytest.fail("Unexpected exception thrown: %s" % e)
else:
pytest.fail("OSError exception not thrown.")
def test_correct_file(self, auth):
"""Test that a proper file succeeds and is read correctly"""
oauth = auth.load_creds(subdir=self.subdir)
assert auth.creds_fullpath == os.path.join(self.subdir, auth.creds_file)
assert auth.creds_file == "credentials.txt"
assert oauth["app_key"] == "a"
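
# A minimal, hedged usage sketch (not collected by pytest): loading credentials
# from an explicit directory, mirroring test_correct_file above. It assumes
# twython is installed and that the chosen directory contains a well-formed
# 'credentials.txt'; the path used here is purely illustrative.
if __name__ == "__main__":
    import os
    from nltk.twitter import Authenticate

    auth = Authenticate()
    # Without a subdir argument, load_creds() falls back to the directory named
    # by the TWITTER environment variable.
    oauth = auth.load_creds(subdir=os.path.expanduser("~/twitter-files"))
    print(auth.creds_fullpath, sorted(oauth))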


@@ -0,0 +1,82 @@
import pytest
from nltk.util import everygrams
@pytest.fixture
def everygram_input():
"""Form test data for tests."""
return iter(["a", "b", "c"])
def test_everygrams_without_padding(everygram_input):
expected_output = [
("a",),
("a", "b"),
("a", "b", "c"),
("b",),
("b", "c"),
("c",),
]
output = list(everygrams(everygram_input))
assert output == expected_output
def test_everygrams_max_len(everygram_input):
expected_output = [
("a",),
("a", "b"),
("b",),
("b", "c"),
("c",),
]
output = list(everygrams(everygram_input, max_len=2))
assert output == expected_output
def test_everygrams_min_len(everygram_input):
expected_output = [
("a", "b"),
("a", "b", "c"),
("b", "c"),
]
output = list(everygrams(everygram_input, min_len=2))
assert output == expected_output
def test_everygrams_pad_right(everygram_input):
expected_output = [
("a",),
("a", "b"),
("a", "b", "c"),
("b",),
("b", "c"),
("b", "c", None),
("c",),
("c", None),
("c", None, None),
(None,),
(None, None),
(None,),
]
output = list(everygrams(everygram_input, max_len=3, pad_right=True))
assert output == expected_output
def test_everygrams_pad_left(everygram_input):
expected_output = [
(None,),
(None, None),
(None, None, "a"),
(None,),
(None, "a"),
(None, "a", "b"),
("a",),
("a", "b"),
("a", "b", "c"),
("b",),
("b", "c"),
("c",),
]
output = list(everygrams(everygram_input, max_len=3, pad_left=True))
assert output == expected_output
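
# A minimal, hedged usage sketch (not collected by pytest): the same
# everygrams() switches exercised above, applied to the same three tokens.
if __name__ == "__main__":
    from nltk.util import everygrams

    tokens = ["a", "b", "c"]
    print(list(everygrams(tokens)))                             # all 1- to 3-grams
    print(list(everygrams(tokens, max_len=2)))                  # capped at bigrams
    print(list(everygrams(tokens, min_len=2)))                  # bigrams and longer
    print(list(everygrams(tokens, max_len=3, pad_right=True)))  # right-padded with None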


@@ -0,0 +1,292 @@
"""
Unit tests for nltk.corpus.wordnet
See also nltk/test/wordnet.doctest
"""
import unittest
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic as wnic
wn.ensure_loaded()
S = wn.synset
L = wn.lemma
class WordNetDemo(unittest.TestCase):
def test_retrieve_synset(self):
move_synset = S("go.v.21")
self.assertEqual(move_synset.name(), "move.v.15")
self.assertEqual(move_synset.lemma_names(), ["move", "go"])
self.assertEqual(
move_synset.definition(), "have a turn; make one's move in a game"
)
self.assertEqual(move_synset.examples(), ["Can I go now?"])
def test_retrieve_synsets(self):
self.assertEqual(sorted(wn.synsets("zap", pos="n")), [S("zap.n.01")])
self.assertEqual(
sorted(wn.synsets("zap", pos="v")),
[S("microwave.v.01"), S("nuke.v.01"), S("zap.v.01"), S("zap.v.02")],
)
def test_hyperhyponyms(self):
        # Not every synset has hypernyms().
self.assertEqual(S("travel.v.01").hypernyms(), [])
self.assertEqual(S("travel.v.02").hypernyms(), [S("travel.v.03")])
self.assertEqual(S("travel.v.03").hypernyms(), [])
# Test hyper-/hyponyms.
self.assertEqual(S("breakfast.n.1").hypernyms(), [S("meal.n.01")])
first_five_meal_hypo = [
S("banquet.n.02"),
S("bite.n.04"),
S("breakfast.n.01"),
S("brunch.n.01"),
S("buffet.n.02"),
]
self.assertEqual(sorted(S("meal.n.1").hyponyms())[:5], first_five_meal_hypo)
self.assertEqual(S("Austen.n.1").instance_hypernyms(), [S("writer.n.01")])
first_five_composer_hypo = [
S("ambrose.n.01"),
S("bach.n.01"),
S("barber.n.01"),
S("bartok.n.01"),
S("beethoven.n.01"),
]
self.assertEqual(
sorted(S("composer.n.1").instance_hyponyms())[:5], first_five_composer_hypo
)
# Test root hyper-/hyponyms
self.assertEqual(S("person.n.01").root_hypernyms(), [S("entity.n.01")])
self.assertEqual(S("sail.v.01").root_hypernyms(), [S("travel.v.01")])
self.assertEqual(
sorted(S("fall.v.12").root_hypernyms()), [S("act.v.01"), S("fall.v.17")]
)
def test_derivationally_related_forms(self):
# Test `derivationally_related_forms()`
self.assertEqual(
L("zap.v.03.nuke").derivationally_related_forms(),
[L("atomic_warhead.n.01.nuke")],
)
self.assertEqual(
L("zap.v.03.atomize").derivationally_related_forms(),
[L("atomization.n.02.atomization")],
)
self.assertEqual(
L("zap.v.03.atomise").derivationally_related_forms(),
[L("atomization.n.02.atomisation")],
)
self.assertEqual(L("zap.v.03.zap").derivationally_related_forms(), [])
def test_meronyms_holonyms(self):
# Test meronyms, holonyms.
self.assertEqual(
sorted(S("dog.n.01").member_holonyms()), [S("canis.n.01"), S("pack.n.06")]
)
self.assertEqual(S("dog.n.01").part_meronyms(), [S("flag.n.07")])
self.assertEqual(S("faculty.n.2").member_meronyms(), [S("professor.n.01")])
self.assertEqual(S("copilot.n.1").member_holonyms(), [S("crew.n.01")])
self.assertEqual(
sorted(S("table.n.2").part_meronyms()),
[S("leg.n.03"), S("tabletop.n.01"), S("tableware.n.01")],
)
self.assertEqual(S("course.n.7").part_holonyms(), [S("meal.n.01")])
self.assertEqual(
sorted(S("water.n.1").substance_meronyms()),
[S("hydrogen.n.01"), S("oxygen.n.01")],
)
self.assertEqual(
sorted(S("gin.n.1").substance_holonyms()),
[
S("gin_and_it.n.01"),
S("gin_and_tonic.n.01"),
S("martini.n.01"),
S("pink_lady.n.01"),
],
)
def test_antonyms(self):
# Test antonyms.
self.assertEqual(
L("leader.n.1.leader").antonyms(), [L("follower.n.01.follower")]
)
self.assertEqual(
L("increase.v.1.increase").antonyms(), [L("decrease.v.01.decrease")]
)
def test_misc_relations(self):
# Test misc relations.
self.assertEqual(S("snore.v.1").entailments(), [S("sleep.v.01")])
self.assertEqual(
sorted(S("heavy.a.1").similar_tos()),
[
S("dense.s.03"),
S("doughy.s.01"),
S("heavier-than-air.s.01"),
S("hefty.s.02"),
S("massive.s.04"),
S("non-buoyant.s.01"),
S("ponderous.s.02"),
],
)
self.assertEqual(S("light.a.1").attributes(), [S("weight.n.01")])
self.assertEqual(S("heavy.a.1").attributes(), [S("weight.n.01")])
# Test pertainyms.
self.assertEqual(
L("English.a.1.English").pertainyms(), [L("england.n.01.England")]
)
def test_lch(self):
# Test LCH.
self.assertEqual(
S("person.n.01").lowest_common_hypernyms(S("dog.n.01")),
[S("organism.n.01")],
)
self.assertEqual(
S("woman.n.01").lowest_common_hypernyms(S("girlfriend.n.02")),
[S("woman.n.01")],
)
def test_domains(self):
# Test domains.
self.assertEqual(S("code.n.03").topic_domains(), [S("computer_science.n.01")])
self.assertEqual(S("pukka.a.01").region_domains(), [S("india.n.01")])
self.assertEqual(S("freaky.a.01").usage_domains(), [S("slang.n.02")])
def test_in_topic_domains(self):
# Test in domains.
self.assertEqual(
sorted(S("computer_science.n.01").in_topic_domains())[0], S("access.n.05")
)
self.assertEqual(
sorted(S("germany.n.01").in_region_domains())[23], S("trillion.n.02")
)
self.assertEqual(
sorted(S("slang.n.02").in_usage_domains())[1], S("airhead.n.01")
)
def test_wordnet_similarities(self):
# Path based similarities.
self.assertAlmostEqual(S("cat.n.01").path_similarity(S("cat.n.01")), 1.0)
self.assertAlmostEqual(S("dog.n.01").path_similarity(S("cat.n.01")), 0.2)
self.assertAlmostEqual(
S("car.n.01").path_similarity(S("automobile.v.01")),
S("automobile.v.01").path_similarity(S("car.n.01")),
)
self.assertAlmostEqual(
S("big.a.01").path_similarity(S("dog.n.01")),
S("dog.n.01").path_similarity(S("big.a.01")),
)
self.assertAlmostEqual(
S("big.a.01").path_similarity(S("long.a.01")),
S("long.a.01").path_similarity(S("big.a.01")),
)
self.assertAlmostEqual(
S("dog.n.01").lch_similarity(S("cat.n.01")), 2.028, places=3
)
self.assertAlmostEqual(
S("dog.n.01").wup_similarity(S("cat.n.01")), 0.8571, places=3
)
self.assertAlmostEqual(
S("car.n.01").wup_similarity(S("automobile.v.01")),
S("automobile.v.01").wup_similarity(S("car.n.01")),
)
self.assertAlmostEqual(
S("big.a.01").wup_similarity(S("dog.n.01")),
S("dog.n.01").wup_similarity(S("big.a.01")),
)
self.assertAlmostEqual(
S("big.a.01").wup_similarity(S("long.a.01")),
S("long.a.01").wup_similarity(S("big.a.01")),
)
self.assertAlmostEqual(
S("big.a.01").lch_similarity(S("long.a.01")),
S("long.a.01").lch_similarity(S("big.a.01")),
)
# Information Content similarities.
brown_ic = wnic.ic("ic-brown.dat")
self.assertAlmostEqual(
S("dog.n.01").jcn_similarity(S("cat.n.01"), brown_ic), 0.4497, places=3
)
semcor_ic = wnic.ic("ic-semcor.dat")
self.assertAlmostEqual(
S("dog.n.01").lin_similarity(S("cat.n.01"), semcor_ic), 0.8863, places=3
)
def test_omw_lemma_no_trailing_underscore(self):
expected = sorted(
[
"popolna_sprememba_v_mišljenju",
"popoln_obrat",
"preobrat",
"preobrat_v_mišljenju",
]
)
self.assertEqual(sorted(S("about-face.n.02").lemma_names(lang="slv")), expected)
def test_iterable_type_for_all_lemma_names(self):
# Duck-test for iterables.
# See https://stackoverflow.com/a/36230057/610569
cat_lemmas = wn.all_lemma_names(lang="cat")
eng_lemmas = wn.all_lemma_names(lang="eng")
self.assertTrue(hasattr(eng_lemmas, "__iter__"))
self.assertTrue(hasattr(eng_lemmas, "__next__") or hasattr(eng_lemmas, "next"))
self.assertTrue(eng_lemmas.__iter__() is eng_lemmas)
self.assertTrue(hasattr(cat_lemmas, "__iter__"))
        self.assertTrue(hasattr(cat_lemmas, "__next__") or hasattr(cat_lemmas, "next"))
self.assertTrue(cat_lemmas.__iter__() is cat_lemmas)
def test_en_ptb_tags(self):
# Common PTB tags (mapped in both PTB and Brown)
self.assertEqual(wn.tag2pos("NN"), "n") # noun
self.assertEqual(wn.tag2pos("VB"), "v") # verb
self.assertEqual(wn.tag2pos("JJ"), "a") # adjective
self.assertEqual(wn.tag2pos("RB"), "r") # adverb
# PTB-specific tags (mapped in PTB, not in Brown)
self.assertEqual(wn.tag2pos("NNS"), "n") # plural noun (PTB only)
self.assertEqual(wn.tag2pos("VBD"), "v") # verb, past tense (PTB only)
self.assertEqual(
wn.tag2pos("VBG"), "v"
) # verb, gerund/present participle (PTB only)
self.assertEqual(wn.tag2pos("JJR"), "a") # adjective, comparative (PTB only)
self.assertEqual(wn.tag2pos("RBR"), "r") # adverb, comparative (PTB only)
# Tags that should yield None (not mapped in WordNet)
self.assertIsNone(wn.tag2pos("PRP"))
self.assertIsNone(wn.tag2pos("WP"))
self.assertIsNone(wn.tag2pos("TO"))
self.assertIsNone(wn.tag2pos("PRT"))
self.assertIsNone(wn.tag2pos("POS"))
self.assertIsNone(wn.tag2pos("."))
def test_en_brown_tags(self):
# Common Brown tags (mapped in both PTB and Brown)
self.assertEqual(wn.tag2pos("NN", tagset="en-brown"), "n") # noun
self.assertEqual(wn.tag2pos("VB", tagset="en-brown"), "v") # verb
self.assertEqual(wn.tag2pos("JJ", tagset="en-brown"), "a") # adjective
self.assertEqual(wn.tag2pos("RB", tagset="en-brown"), "r") # adverb
# Brown-specific tags (mapped in Brown, not in PTB)
self.assertEqual(
wn.tag2pos("HV", tagset="en-brown"), "v"
) # 'have' auxiliary (Brown only)
self.assertEqual(
wn.tag2pos("BEZ", tagset="en-brown"), "v"
) # 'be' auxiliary, 3rd person singular present (Brown only)
self.assertEqual(
wn.tag2pos("DOZ", tagset="en-brown"), "v"
) # 'do' auxiliary, 3rd person singular present (Brown only)
# Tags that should yield None (not mapped in WordNet)
self.assertIsNone(wn.tag2pos("PPL", tagset="en-brown"))
self.assertIsNone(wn.tag2pos("(", tagset="en-brown"))


@@ -0,0 +1,416 @@
"""
Tests for BLEU translation evaluation metric
"""
import unittest
import numpy as np
from nltk.data import find
from nltk.translate.bleu_score import (
SmoothingFunction,
brevity_penalty,
closest_ref_length,
corpus_bleu,
modified_precision,
sentence_bleu,
)
class TestBLEU(unittest.TestCase):
def test_modified_precision(self):
"""
Examples from the original BLEU paper
https://www.aclweb.org/anthology/P02-1040.pdf
"""
# Example 1: the "the*" example.
# Reference sentences.
ref1 = "the cat is on the mat".split()
ref2 = "there is a cat on the mat".split()
# Hypothesis sentence(s).
hyp1 = "the the the the the the the".split()
references = [ref1, ref2]
# Testing modified unigram precision.
hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
assert round(hyp1_unigram_precision, 4) == 0.2857
# With assertAlmostEqual at 4 place precision.
self.assertAlmostEqual(hyp1_unigram_precision, 0.28571428, places=4)
# Testing modified bigram precision.
assert float(modified_precision(references, hyp1, n=2)) == 0.0
# Example 2: the "of the" example.
# Reference sentences
ref1 = str(
"It is a guide to action that ensures that the military "
"will forever heed Party commands"
).split()
ref2 = str(
"It is the guiding principle which guarantees the military "
"forces always being under the command of the Party"
).split()
ref3 = str(
"It is the practical guide for the army always to heed "
"the directions of the party"
).split()
# Hypothesis sentence(s).
hyp1 = "of the".split()
references = [ref1, ref2, ref3]
# Testing modified unigram precision.
assert float(modified_precision(references, hyp1, n=1)) == 1.0
# Testing modified bigram precision.
assert float(modified_precision(references, hyp1, n=2)) == 1.0
# Example 3: Proper MT outputs.
hyp1 = str(
"It is a guide to action which ensures that the military "
"always obeys the commands of the party"
).split()
hyp2 = str(
"It is to insure the troops forever hearing the activity "
"guidebook that party direct"
).split()
references = [ref1, ref2, ref3]
# Unigram precision.
hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
hyp2_unigram_precision = float(modified_precision(references, hyp2, n=1))
# Test unigram precision with assertAlmostEqual at 4 place precision.
self.assertAlmostEqual(hyp1_unigram_precision, 0.94444444, places=4)
self.assertAlmostEqual(hyp2_unigram_precision, 0.57142857, places=4)
# Test unigram precision with rounding.
assert round(hyp1_unigram_precision, 4) == 0.9444
assert round(hyp2_unigram_precision, 4) == 0.5714
# Bigram precision
hyp1_bigram_precision = float(modified_precision(references, hyp1, n=2))
hyp2_bigram_precision = float(modified_precision(references, hyp2, n=2))
# Test bigram precision with assertAlmostEqual at 4 place precision.
self.assertAlmostEqual(hyp1_bigram_precision, 0.58823529, places=4)
self.assertAlmostEqual(hyp2_bigram_precision, 0.07692307, places=4)
# Test bigram precision with rounding.
assert round(hyp1_bigram_precision, 4) == 0.5882
assert round(hyp2_bigram_precision, 4) == 0.0769
def test_brevity_penalty(self):
# Test case from brevity_penalty_closest function in mteval-v13a.pl.
# Same test cases as in the doctest in nltk.translate.bleu_score.py
references = [["a"] * 11, ["a"] * 8]
hypothesis = ["a"] * 7
hyp_len = len(hypothesis)
closest_ref_len = closest_ref_length(references, hyp_len)
self.assertAlmostEqual(
brevity_penalty(closest_ref_len, hyp_len), 0.8669, places=4
)
references = [["a"] * 11, ["a"] * 8, ["a"] * 6, ["a"] * 7]
hypothesis = ["a"] * 7
hyp_len = len(hypothesis)
closest_ref_len = closest_ref_length(references, hyp_len)
assert brevity_penalty(closest_ref_len, hyp_len) == 1.0
def test_zero_matches(self):
        # Test case where there are 0 matches.
references = ["The candidate has no alignment to any of the references".split()]
hypothesis = "John loves Mary".split()
# Test BLEU to nth order of n-grams, where n is len(hypothesis).
for n in range(1, len(hypothesis)):
weights = (1.0 / n,) * n # Uniform weights.
assert sentence_bleu(references, hypothesis, weights) == 0
def test_full_matches(self):
        # Test case where there are 100% matches.
references = ["John loves Mary".split()]
hypothesis = "John loves Mary".split()
# Test BLEU to nth order of n-grams, where n is len(hypothesis).
for n in range(1, len(hypothesis)):
weights = (1.0 / n,) * n # Uniform weights.
assert sentence_bleu(references, hypothesis, weights) == 1.0
def test_partial_matches_hypothesis_longer_than_reference(self):
references = ["John loves Mary".split()]
hypothesis = "John loves Mary who loves Mike".split()
        # Since no 4-gram matches were found, the result should be zero:
        # exp(w_1*log(p_1) + w_2*log(p_2) + w_3*log(p_3) + w_4*(-inf)) = 0
self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
# Checks that the warning has been raised because len(reference) < 4.
try:
self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
except AttributeError:
pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
# @unittest.skip("Skipping fringe cases for BLEU.")
class TestBLEUFringeCases(unittest.TestCase):
def test_case_where_n_is_bigger_than_hypothesis_length(self):
# Test BLEU to nth order of n-grams, where n > len(hypothesis).
references = ["John loves Mary ?".split()]
hypothesis = "John loves Mary".split()
        n = len(hypothesis) + 1  # n is deliberately larger than the hypothesis length.
weights = (1.0 / n,) * n # Uniform weights.
        # Since no n-gram matches were found, the result should be zero:
        # exp(w_1*log(p_1) + ... + w_n*(-inf)) = 0
self.assertAlmostEqual(
sentence_bleu(references, hypothesis, weights), 0.0, places=4
)
# Checks that the warning has been raised because len(hypothesis) < 4.
try:
self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
except AttributeError:
pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
# Test case where n > len(hypothesis) but so is n > len(reference), and
# it's a special case where reference == hypothesis.
references = ["John loves Mary".split()]
hypothesis = "John loves Mary".split()
        # Since no 4-gram matches were found, the result should be zero:
        # exp(w_1*log(p_1) + w_2*log(p_2) + w_3*log(p_3) + w_4*(-inf)) = 0
self.assertAlmostEqual(
sentence_bleu(references, hypothesis, weights), 0.0, places=4
)
def test_empty_hypothesis(self):
        # Test case where the hypothesis is empty.
references = ["The candidate has no alignment to any of the references".split()]
hypothesis = []
assert sentence_bleu(references, hypothesis) == 0
def test_length_one_hypothesis(self):
        # Test case where the hypothesis has length 1, using smoothing method 4.
references = ["The candidate has no alignment to any of the references".split()]
hypothesis = ["Foo"]
method4 = SmoothingFunction().method4
try:
sentence_bleu(references, hypothesis, smoothing_function=method4)
except ValueError:
            pass  # Smoothing method 4 may raise a ValueError for a one-word hypothesis.
def test_empty_references(self):
        # Test case where the reference is empty.
references = [[]]
hypothesis = "John loves Mary".split()
assert sentence_bleu(references, hypothesis) == 0
def test_empty_references_and_hypothesis(self):
        # Test case where both the references and the hypothesis are empty.
references = [[]]
hypothesis = []
assert sentence_bleu(references, hypothesis) == 0
def test_reference_or_hypothesis_shorter_than_fourgrams(self):
# Test case where the length of reference or hypothesis
# is shorter than 4.
references = ["let it go".split()]
hypothesis = "let go it".split()
        # Checks that the score for this hypothesis and reference pair is 0.0:
        # exp(w_1*log(p_1) + w_2*log(p_2) + w_3*log(p_3) + w_4*(-inf)) = 0
self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
# Checks that the warning has been raised.
try:
self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
except AttributeError:
pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
def test_numpy_weights(self):
        # Test case where there are 0 matches.
references = ["The candidate has no alignment to any of the references".split()]
hypothesis = "John loves Mary".split()
weights = np.array([0.25] * 4)
assert sentence_bleu(references, hypothesis, weights) == 0
class TestBLEUvsMteval13a(unittest.TestCase):
def test_corpus_bleu(self):
ref_file = find("models/wmt15_eval/ref.ru")
hyp_file = find("models/wmt15_eval/google.ru")
mteval_output_file = find("models/wmt15_eval/mteval-13a.output")
# Reads the BLEU scores from the `mteval-13a.output` file.
# The order of the list corresponds to the order of the ngrams.
with open(mteval_output_file) as mteval_fin:
            # The scores are on the second-to-last line of the file.
            # The first and last items on that line are not n-gram scores, so they are sliced off.
mteval_bleu_scores = map(float, mteval_fin.readlines()[-2].split()[1:-1])
with open(ref_file, encoding="utf8") as ref_fin:
with open(hyp_file, encoding="utf8") as hyp_fin:
# Whitespace tokenize the file.
                # Note: split() with no arguments also strips surrounding whitespace.
hypothesis = list(map(lambda x: x.split(), hyp_fin))
# Note that the corpus_bleu input is list of list of references.
references = list(map(lambda x: [x.split()], ref_fin))
# Without smoothing.
for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
nltk_bleu = corpus_bleu(
references, hypothesis, weights=(1.0 / i,) * i
)
                    # Check that the difference between the BLEU scores is less than 0.005.
# Note: This is an approximate comparison; as much as
# +/- 0.01 BLEU might be "statistically significant",
# the actual translation quality might not be.
assert abs(mteval_bleu - nltk_bleu) < 0.005
# With the same smoothing method used in mteval-v13a.pl
chencherry = SmoothingFunction()
for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
nltk_bleu = corpus_bleu(
references,
hypothesis,
weights=(1.0 / i,) * i,
smoothing_function=chencherry.method3,
)
assert abs(mteval_bleu - nltk_bleu) < 0.005
class TestBLEUWithBadSentence(unittest.TestCase):
def test_corpus_bleu_with_bad_sentence(self):
hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
ref = str(
"Their tasks include changing a pump on the faulty stokehold ."
"Likewise , two species that are very similar in morphology "
"were distinguished using genetics ."
)
references = [[ref.split()]]
hypotheses = [hyp.split()]
        try:  # Check that the warning is raised since the no. of 2-gram overlaps is 0.
with self.assertWarns(UserWarning):
                # Verify that the BLEU output degenerates to 0.0 since there are no 2-gram overlaps.
self.assertAlmostEqual(
corpus_bleu(references, hypotheses), 0.0, places=4
)
except (
AttributeError
): # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.0, places=4)
class TestBLEUWithMultipleWeights(unittest.TestCase):
def test_corpus_bleu_with_multiple_weights(self):
hyp1 = [
"It",
"is",
"a",
"guide",
"to",
"action",
"which",
"ensures",
"that",
"the",
"military",
"always",
"obeys",
"the",
"commands",
"of",
"the",
"party",
]
ref1a = [
"It",
"is",
"a",
"guide",
"to",
"action",
"that",
"ensures",
"that",
"the",
"military",
"will",
"forever",
"heed",
"Party",
"commands",
]
ref1b = [
"It",
"is",
"the",
"guiding",
"principle",
"which",
"guarantees",
"the",
"military",
"forces",
"always",
"being",
"under",
"the",
"command",
"of",
"the",
"Party",
]
ref1c = [
"It",
"is",
"the",
"practical",
"guide",
"for",
"the",
"army",
"always",
"to",
"heed",
"the",
"directions",
"of",
"the",
"party",
]
hyp2 = [
"he",
"read",
"the",
"book",
"because",
"he",
"was",
"interested",
"in",
"world",
"history",
]
ref2a = [
"he",
"was",
"interested",
"in",
"world",
"history",
"because",
"he",
"read",
"the",
"book",
]
weight_1 = (1, 0, 0, 0)
weight_2 = (0.25, 0.25, 0.25, 0.25)
weight_3 = (0, 0, 0, 0, 1)
bleu_scores = corpus_bleu(
list_of_references=[[ref1a, ref1b, ref1c], [ref2a]],
hypotheses=[hyp1, hyp2],
weights=[weight_1, weight_2, weight_3],
)
assert bleu_scores[0] == corpus_bleu(
[[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_1
)
assert bleu_scores[1] == corpus_bleu(
[[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_2
)
assert bleu_scores[2] == corpus_bleu(
[[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_3
)
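
# A minimal, hedged usage sketch (not collected by pytest): sentence-level BLEU
# for the "proper MT output" hypothesis from the paper example above, with and
# without the Chen & Cherry smoothing used elsewhere in this module.
if __name__ == "__main__":
    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

    reference = (
        "It is a guide to action that ensures that the military "
        "will forever heed Party commands"
    ).split()
    hypothesis = (
        "It is a guide to action which ensures that the military "
        "always obeys the commands of the party"
    ).split()
    print(sentence_bleu([reference], hypothesis))
    print(
        sentence_bleu(
            [reference], hypothesis, smoothing_function=SmoothingFunction().method3
        )
    )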


@@ -0,0 +1,154 @@
"""
Tests GDFA alignments
"""
import unittest
from nltk.translate.gdfa import grow_diag_final_and
class TestGDFA(unittest.TestCase):
def test_from_eflomal_outputs(self):
"""
        Testing GDFA with the first 10 eflomal outputs from issue #1829
https://github.com/nltk/nltk/issues/1829
"""
# Input.
forwards = [
"0-0 1-2",
"0-0 1-1",
"0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 7-8 9-9 10-10 9-11 11-12 12-13 13-14",
"0-0 1-1 1-2 2-3 3-4 4-5 4-6 5-7 6-8 8-9 9-10",
"0-0 14-1 15-2 16-3 20-5 21-6 22-7 5-8 6-9 7-10 8-11 9-12 10-13 11-14 12-15 13-16 14-17 17-18 18-19 19-20 20-21 23-22 24-23 25-24 26-25 27-27 28-28 29-29 30-30 31-31",
"0-0 1-1 0-2 2-3",
"0-0 2-2 4-4",
"0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-20",
"3-0 4-1 6-2 5-3 6-4 7-5 8-6 9-7 10-8 11-9 16-10 9-12 10-13 12-14",
"1-0",
]
backwards = [
"0-0 1-2",
"0-0 1-1",
"0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 9-8 10-10 11-12 12-11 13-13",
"0-0 1-2 2-3 3-4 4-6 6-8 7-5 8-7 9-8",
"0-0 1-8 2-9 3-10 4-11 5-12 6-11 8-13 9-14 10-15 11-16 12-17 13-18 14-19 15-20 16-21 17-22 18-23 19-24 20-29 21-30 22-31 23-2 24-3 25-4 26-5 27-5 28-6 29-7 30-28 31-31",
"0-0 1-1 2-3",
"0-0 1-1 2-3 4-4",
"0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-16 21-18",
"0-0 1-1 3-2 4-1 5-3 6-4 7-5 8-6 9-7 10-8 11-9 12-8 13-9 14-8 15-9 16-10",
"1-0",
]
source_lens = [2, 3, 3, 15, 11, 33, 4, 6, 23, 18]
target_lens = [2, 4, 3, 16, 12, 33, 5, 6, 22, 16]
# Expected Output.
expected = [
[(0, 0), (1, 2)],
[(0, 0), (1, 1)],
[
(0, 0),
(2, 1),
(3, 2),
(4, 3),
(5, 4),
(6, 5),
(7, 6),
(8, 7),
(10, 10),
(11, 12),
],
[
(0, 0),
(1, 1),
(1, 2),
(2, 3),
(3, 4),
(4, 5),
(4, 6),
(5, 7),
(6, 8),
(7, 5),
(8, 7),
(8, 9),
(9, 8),
(9, 10),
],
[
(0, 0),
(1, 8),
(2, 9),
(3, 10),
(4, 11),
(5, 8),
(6, 9),
(6, 11),
(7, 10),
(8, 11),
(31, 31),
],
[(0, 0), (0, 2), (1, 1), (2, 3)],
[(0, 0), (1, 1), (2, 2), (2, 3), (4, 4)],
[
(0, 0),
(1, 1),
(2, 3),
(3, 4),
(5, 5),
(7, 6),
(8, 7),
(9, 8),
(10, 9),
(11, 10),
(12, 11),
(13, 12),
(14, 13),
(15, 14),
(16, 16),
(17, 17),
(18, 18),
(19, 19),
],
[
(0, 0),
(1, 1),
(3, 0),
(3, 2),
(4, 1),
(5, 3),
(6, 2),
(6, 4),
(7, 5),
(8, 6),
(9, 7),
(9, 12),
(10, 8),
(10, 13),
(11, 9),
(12, 8),
(12, 14),
(13, 9),
(14, 8),
(15, 9),
(16, 10),
],
[(1, 0)],
[
(0, 0),
(1, 1),
(3, 2),
(4, 3),
(5, 4),
(6, 5),
(7, 6),
(9, 10),
(10, 12),
(11, 13),
(12, 14),
(13, 15),
],
]
# Iterate through all 10 examples and check for expected outputs.
for fw, bw, src_len, trg_len, expect in zip(
forwards, backwards, source_lens, target_lens, expected
):
self.assertListEqual(expect, grow_diag_final_and(src_len, trg_len, fw, bw))
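
# A minimal, hedged usage sketch (not collected by pytest): grow-diag-final-and
# symmetrisation on the first eflomal example above. Alignments are passed as
# whitespace-separated "source-target" index pairs.
if __name__ == "__main__":
    from nltk.translate.gdfa import grow_diag_final_and

    forward = "0-0 1-2"
    backward = "0-0 1-2"
    # Positional arguments: source length, target length, forward alignment,
    # backward alignment (same order as in the test above).
    print(grow_diag_final_and(2, 2, forward, backward))  # [(0, 0), (1, 2)] above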


@@ -0,0 +1,73 @@
"""
Tests for IBM Model 1 training methods
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent, IBMModel, IBMModel1
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel1(unittest.TestCase):
def test_set_uniform_translation_probabilities(self):
# arrange
corpus = [
AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]),
AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]),
]
model1 = IBMModel1(corpus, 0)
# act
model1.set_uniform_probabilities(corpus)
# assert
# expected_prob = 1.0 / (target vocab size + 1)
self.assertEqual(model1.translation_table["ham"]["eier"], 1.0 / 3)
self.assertEqual(model1.translation_table["eggs"][None], 1.0 / 3)
def test_set_uniform_translation_probabilities_of_non_domain_values(self):
# arrange
corpus = [
AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]),
AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]),
]
model1 = IBMModel1(corpus, 0)
# act
model1.set_uniform_probabilities(corpus)
# assert
# examine target words that are not in the training data domain
self.assertEqual(model1.translation_table["parrot"]["eier"], IBMModel.MIN_PROB)
def test_prob_t_a_given_s(self):
# arrange
src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"]
trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"]
corpus = [AlignedSent(trg_sentence, src_sentence)]
alignment_info = AlignmentInfo(
(0, 1, 4, 0, 2, 5, 5),
[None] + src_sentence,
["UNUSED"] + trg_sentence,
None,
)
translation_table = defaultdict(lambda: defaultdict(float))
translation_table["i"]["ich"] = 0.98
translation_table["love"]["gern"] = 0.98
translation_table["to"][None] = 0.98
translation_table["eat"]["esse"] = 0.98
translation_table["smoked"]["räucherschinken"] = 0.98
translation_table["ham"]["räucherschinken"] = 0.98
model1 = IBMModel1(corpus, 0)
model1.translation_table = translation_table
# act
probability = model1.prob_t_a_given_s(alignment_info)
# assert
lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
expected_probability = lexical_translation
self.assertEqual(round(probability, 4), round(expected_probability, 4))
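
# A minimal, hedged usage sketch (not collected by pytest): training IBM Model 1
# for a few EM iterations on a tiny illustrative corpus and reading the learned
# lexical translation probabilities. The corpus below is an assumption made for
# demonstration only.
if __name__ == "__main__":
    from nltk.translate import AlignedSent, IBMModel1

    corpus = [
        AlignedSent(["the", "house"], ["das", "haus"]),
        AlignedSent(["the", "book"], ["das", "buch"]),
        AlignedSent(["a", "book"], ["ein", "buch"]),
    ]
    # The second argument is the number of EM iterations (0 in the unit tests above).
    model = IBMModel1(corpus, 5)
    print(round(model.translation_table["book"]["buch"], 3))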

Some files were not shown because too many files have changed in this diff.