This commit is contained in:
Iliyan Angelov
2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions

View File

@@ -0,0 +1,31 @@
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Transformation Based Learning
A general purpose package for Transformation Based Learning,
currently used by nltk.tag.BrillTagger.
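
A minimal usage sketch (Word and Pos are concrete Feature subclasses,
importable from nltk.tag.brill):

>>> from nltk.tbl import Template
>>> from nltk.tag.brill import Word, Pos
>>> Template(Pos([-1]), Word([0]))
Template(Pos([-1]),Word([0]))
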
isort:skip_file
"""
from nltk.tbl.template import Template
# API: Template(...), Template.expand(...)
from nltk.tbl.feature import Feature
# API: Feature(...), Feature.expand(...)
from nltk.tbl.rule import Rule
# API: Rule.format(...), Rule.templateid
from nltk.tbl.erroranalysis import error_list

View File

@@ -0,0 +1,418 @@
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import os
import pickle
import random
import time
from nltk.corpus import treebank
from nltk.tag import BrillTaggerTrainer, RegexpTagger, UnigramTagger
from nltk.tag.brill import Pos, Word
from nltk.tbl import Template, error_list
def demo():
"""
Run a demo with defaults. See source comments for details,
or docstrings of any of the more specific demo_* functions.
"""
postag()
def demo_repr_rule_format():
"""
Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose"))
"""
postag(ruleformat="repr")
def demo_str_rule_format():
"""
Exemplify str(Rule) (see also repr(Rule) and Rule.format("verbose"))
"""
postag(ruleformat="str")
def demo_verbose_rule_format():
"""
Exemplify Rule.format("verbose")
"""
postag(ruleformat="verbose")
def demo_multiposition_feature():
"""
The feature(s) of a template take a list of positions
relative to the current word where the feature should be
looked for, conceptually joined by logical OR. For instance,
Pos([-1, 1]), given a value V, will hold whenever V is found
one step to the left and/or one step to the right.
For contiguous ranges, a 2-arg form giving inclusive end
points can also be used: Pos(-3, -1) is the same as the arg
below.
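For instance, the following small sketch shows the equivalence (Pos is
already imported at module level from nltk.tag.brill):
>>> Pos(-3, -1) == Pos([-3, -2, -1])
True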
"""
postag(templates=[Template(Pos([-3, -2, -1]))])
def demo_multifeature_template():
"""
Templates can have more than a single feature.
"""
postag(templates=[Template(Word([0]), Pos([-2, -1]))])
def demo_template_statistics():
"""
Show aggregate statistics per template. Little-used templates are
candidates for deletion; much-used templates may possibly be refined.
Deleting unused templates is mostly about saving time and/or space:
training is basically O(T) in the number of templates T
(also in terms of memory usage, which often will be the limiting factor).
"""
postag(incremental_stats=True, template_stats=True)
def demo_generated_templates():
"""
Template.expand and Feature.expand are class methods facilitating the
generation of large numbers of templates. See their documentation for
details.
Note: training with 500 templates can easily fill all available memory,
even on relatively small corpora.
"""
wordtpls = Word.expand([-1, 0, 1], [1, 2], excludezero=False)
tagtpls = Pos.expand([-2, -1, 0, 1], [1, 2], excludezero=True)
templates = list(Template.expand([wordtpls, tagtpls], combinations=(1, 3)))
print(
"Generated {} templates for transformation-based learning".format(
len(templates)
)
)
postag(templates=templates, incremental_stats=True, template_stats=True)
def demo_learning_curve():
"""
Plot a learning curve -- the contribution to tagging accuracy of
the individual rules.
Note: requires matplotlib
"""
postag(
incremental_stats=True,
separate_baseline_data=True,
learning_curve_output="learningcurve.png",
)
def demo_error_analysis():
"""
Writes a file with context for each erroneous word after tagging testing data
"""
postag(error_output="errors.txt")
def demo_serialize_tagger():
"""
Serializes the learned tagger to a file in pickle format; reloads it
and validates the process.
"""
postag(serialize_output="tagger.pcl")
def demo_high_accuracy_rules():
"""
Discard rules with low accuracy. This may hurt performance a bit,
but will often produce rules that are more interesting for a human to read.
"""
postag(num_sents=3000, min_acc=0.96, min_score=10)
def postag(
templates=None,
tagged_data=None,
num_sents=1000,
max_rules=300,
min_score=3,
min_acc=None,
train=0.8,
trace=3,
randomize=False,
ruleformat="str",
incremental_stats=False,
template_stats=False,
error_output=None,
serialize_output=None,
learning_curve_output=None,
learning_curve_take=300,
baseline_backoff_tagger=None,
separate_baseline_data=False,
cache_baseline_tagger=None,
):
"""
Brill Tagger Demonstration
:param templates: the rule templates to use in training
:type templates: list of Template
:param tagged_data: the corpus of tagged sentences to use (defaults to the treebank sample)
:type tagged_data: C{list} of C{list} of C{tuple}
:param num_sents: how many sentences of training and testing data to use
:type num_sents: C{int}
:param max_rules: maximum number of rule instances to create
:type max_rules: C{int}
:param min_score: the minimum score for a rule in order for it to be considered
:type min_score: C{int}
:param min_acc: the minimum accuracy for a rule in order for it to be considered
:type min_acc: C{float}
:param train: the fraction of the corpus to be used for training (1=all)
:type train: C{float}
:param trace: the level of diagnostic tracing output to produce (0-4)
:type trace: C{int}
:param randomize: whether the training data should be a random subset of the corpus
:type randomize: C{bool}
:param ruleformat: rule output format, one of "str", "repr", "verbose"
:type ruleformat: C{str}
:param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
:type incremental_stats: C{bool}
:param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
:type template_stats: C{bool}
:param error_output: the file where errors will be saved
:type error_output: C{string}
:param serialize_output: the file where the learned tbl tagger will be saved
:type serialize_output: C{string}
:param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
:type learning_curve_output: C{string}
:param learning_curve_take: how many rules plotted
:type learning_curve_take: C{int}
:param baseline_backoff_tagger: the backoff tagger used by the baseline (unigram) tagger
:type baseline_backoff_tagger: tagger
:param separate_baseline_data: use a fraction of the training data exclusively for training baseline
:type separate_baseline_data: C{bool}
:param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
deterministic output from the baseline unigram tagger between python versions)
:type cache_baseline_tagger: C{string}
Note on separate_baseline_data: if False, the training data is reused both for the baseline and for the rule learner. This
is fast and fine for a demo, but is likely to generalize worse on unseen data.
It also cannot be sensibly used for learning curves on training data (the baseline will be artificially high).
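A typical invocation might look like the following sketch (parameter values
are arbitrary examples; skipped in doctests since training takes a while):
>>> postag(num_sents=500, max_rules=50)  # doctest: +SKIP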
"""
# defaults
baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER
if templates is None:
from nltk.tag.brill import brill24, describe_template_sets
# some pre-built template sets taken from typical systems or publications are
# available. Print a list with describe_template_sets()
# for instance:
templates = brill24()
(training_data, baseline_data, gold_data, testing_data) = _demo_prepare_data(
tagged_data, train, num_sents, randomize, separate_baseline_data
)
# creating (or reloading from cache) a baseline tagger (unigram tagger)
# this is just a mechanism for getting deterministic output from the baseline between
# python versions
if cache_baseline_tagger:
if not os.path.exists(cache_baseline_tagger):
baseline_tagger = UnigramTagger(
baseline_data, backoff=baseline_backoff_tagger
)
with open(cache_baseline_tagger, "w") as print_rules:
pickle.dump(baseline_tagger, print_rules)
print(
"Trained baseline tagger, pickled it to {}".format(
cache_baseline_tagger
)
)
with open(cache_baseline_tagger) as print_rules:
baseline_tagger = pickle.load(print_rules)
print(f"Reloaded pickled tagger from {cache_baseline_tagger}")
else:
baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
print("Trained baseline tagger")
if gold_data:
print(
" Accuracy on test set: {:0.4f}".format(
baseline_tagger.accuracy(gold_data)
)
)
# creating a Brill tagger
tbrill = time.time()
trainer = BrillTaggerTrainer(
baseline_tagger, templates, trace, ruleformat=ruleformat
)
print("Training tbl tagger...")
brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc)
print(f"Trained tbl tagger in {time.time() - tbrill:0.2f} seconds")
if gold_data:
print(" Accuracy on test set: %.4f" % brill_tagger.accuracy(gold_data))
# printing the learned rules, if learned silently
if trace == 1:
print("\nLearned rules: ")
for ruleno, rule in enumerate(brill_tagger.rules(), 1):
print(f"{ruleno:4d} {rule.format(ruleformat):s}")
# printing template statistics (optionally including comparison with the training data)
# note: if not separate_baseline_data, then baseline accuracy will be artificially high
if incremental_stats:
print(
"Incrementally tagging the test data, collecting individual rule statistics"
)
(taggedtest, teststats) = brill_tagger.batch_tag_incremental(
testing_data, gold_data
)
print(" Rule statistics collected")
if not separate_baseline_data:
print(
"WARNING: train_stats asked for separate_baseline_data=True; the baseline "
"will be artificially high"
)
trainstats = brill_tagger.train_stats()
if template_stats:
brill_tagger.print_template_statistics(teststats)
if learning_curve_output:
_demo_plot(
learning_curve_output, teststats, trainstats, take=learning_curve_take
)
print(f"Wrote plot of learning curve to {learning_curve_output}")
else:
print("Tagging the test data")
taggedtest = brill_tagger.tag_sents(testing_data)
if template_stats:
brill_tagger.print_template_statistics()
# writing error analysis to file
if error_output is not None:
with open(error_output, "w") as f:
f.write("Errors for Brill Tagger %r\n\n" % serialize_output)
f.write("\n".join(error_list(gold_data, taggedtest)).encode("utf-8") + "\n")
print(f"Wrote tagger errors including context to {error_output}")
# serializing the tagger to a pickle file and reloading (just to see it works)
if serialize_output is not None:
taggedtest = brill_tagger.tag_sents(testing_data)
with open(serialize_output, "w") as print_rules:
pickle.dump(brill_tagger, print_rules)
print(f"Wrote pickled tagger to {serialize_output}")
with open(serialize_output) as print_rules:
brill_tagger_reloaded = pickle.load(print_rules)
print(f"Reloaded pickled tagger from {serialize_output}")
taggedtest_reloaded = brill_tagger.tag_sents(testing_data)
if taggedtest == taggedtest_reloaded:
print("Reloaded tagger tried on test set, results identical")
else:
print("PROBLEM: Reloaded tagger gave different results on test set")
def _demo_prepare_data(
tagged_data, train, num_sents, randomize, separate_baseline_data
):
# train is the proportion of data used in training; the rest is reserved
# for testing.
if tagged_data is None:
print("Loading tagged data from treebank... ")
tagged_data = treebank.tagged_sents()
if num_sents is None or len(tagged_data) <= num_sents:
num_sents = len(tagged_data)
if randomize:
random.seed(len(tagged_data))
# corpus views are read-only; copy to a list before shuffling in place
tagged_data = list(tagged_data)
random.shuffle(tagged_data)
cutoff = int(num_sents * train)
training_data = tagged_data[:cutoff]
gold_data = tagged_data[cutoff:num_sents]
testing_data = [[t[0] for t in sent] for sent in gold_data]
if not separate_baseline_data:
baseline_data = training_data
else:
bl_cutoff = len(training_data) // 3
(baseline_data, training_data) = (
training_data[:bl_cutoff],
training_data[bl_cutoff:],
)
(trainseqs, traintokens) = corpus_size(training_data)
(testseqs, testtokens) = corpus_size(testing_data)
(bltrainseqs, bltraintokens) = corpus_size(baseline_data)
print(f"Read testing data ({testseqs:d} sents/{testtokens:d} wds)")
print(f"Read training data ({trainseqs:d} sents/{traintokens:d} wds)")
print(
"Read baseline data ({:d} sents/{:d} wds) {:s}".format(
bltrainseqs,
bltraintokens,
"" if separate_baseline_data else "[reused the training set]",
)
)
return (training_data, baseline_data, gold_data, testing_data)
def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None):
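# Sketch of the computation below: each curve starts from the number of
# errors made by the baseline ("initialerrors") and subtracts the score
# (net error reduction) of each successive rule; accuracy is then
# 1 - remaining_errors / tokencount, truncated to the first `take` rules.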
testcurve = [teststats["initialerrors"]]
for rulescore in teststats["rulescores"]:
testcurve.append(testcurve[-1] - rulescore)
testcurve = [1 - x / teststats["tokencount"] for x in testcurve[:take]]
traincurve = [trainstats["initialerrors"]]
for rulescore in trainstats["rulescores"]:
traincurve.append(traincurve[-1] - rulescore)
traincurve = [1 - x / trainstats["tokencount"] for x in traincurve[:take]]
import matplotlib.pyplot as plt
r = list(range(len(testcurve)))
plt.plot(r, testcurve, r, traincurve)
plt.axis([None, None, None, 1.0])
plt.savefig(learning_curve_output)
NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), (r".*", "NN")])
REGEXP_TAGGER = RegexpTagger(
[
(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
(r"(The|the|A|a|An|an)$", "AT"), # articles
(r".*able$", "JJ"), # adjectives
(r".*ness$", "NN"), # nouns formed from adjectives
(r".*ly$", "RB"), # adverbs
(r".*s$", "NNS"), # plural nouns
(r".*ing$", "VBG"), # gerunds
(r".*ed$", "VBD"), # past tense verbs
(r".*", "NN"), # nouns (default)
]
)
def corpus_size(seqs):
return (len(seqs), sum(len(x) for x in seqs))
if __name__ == "__main__":
demo_learning_curve()

View File

@@ -0,0 +1,38 @@
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# returns a list of errors in string format
def error_list(train_sents, test_sents):
"""
Returns a list of human-readable strings indicating the errors in the
given tagging of the corpus.
:param train_sents: The correct tagging of the corpus
:type train_sents: list(list(tuple))
:param test_sents: The tagged corpus
:type test_sents: list(list(tuple))
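For example, with hypothetical toy data (element 0 of the result is the
header line):
>>> gold = [[("the", "DT"), ("dog", "NN")]]
>>> tagged = [[("the", "DT"), ("dog", "VB")]]
>>> errors = error_list(gold, tagged)
>>> len(errors)
2
>>> "dog/VB->NN" in errors[1]
True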
"""
hdr = ("%25s | %s | %s\n" + "-" * 26 + "+" + "-" * 24 + "+" + "-" * 26) % (
"left context",
"word/test->gold".center(22),
"right context",
)
errors = [hdr]
for train_sent, test_sent in zip(train_sents, test_sents):
for wordnum, (word, train_pos) in enumerate(train_sent):
test_pos = test_sent[wordnum][1]
if train_pos != test_pos:
left = " ".join("%s/%s" % w for w in train_sent[:wordnum])
right = " ".join("%s/%s" % w for w in train_sent[wordnum + 1 :])
mid = f"{word}/{test_pos}->{train_pos}"
errors.append(f"{left[-25:]:>25} | {mid.center(22)} | {right[:25]}")
return errors

View File

@@ -0,0 +1,267 @@
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from abc import ABCMeta, abstractmethod
class Feature(metaclass=ABCMeta):
"""
An abstract base class for Features. A Feature is a combination of
a specific property-computing method and a list of relative positions
to apply that method to.
The property-computing method, M{extract_property(tokens, index)},
must be implemented by every subclass. It extracts or computes a specific
property for the token at the current index. Typical extract_property()
methods return features such as the token text or tag; but more involved
methods may consider the entire sequence M{tokens} and
for instance compute the length of the sentence the token belongs to.
In addition, the subclass may have a PROPERTY_NAME, which is how
it will be printed (in Rules and Templates, etc). If not given, defaults
to the classname.
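For illustration, a minimal concrete subclass might look like this
(SentLen is a hypothetical example, not part of the library):
>>> class SentLen(Feature):
...     @staticmethod
...     def extract_property(tokens, index):
...         return len(tokens)
>>> SentLen([0])
SentLen([0])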
"""
json_tag = "nltk.tbl.Feature"
PROPERTY_NAME = None
def __init__(self, positions, end=None):
"""
Construct a Feature which may apply at C{positions}.
>>> # For instance, importing some concrete subclasses (Feature is abstract)
>>> from nltk.tag.brill import Word, Pos
>>> # Feature Word, applying at one of [-2, -1]
>>> Word([-2,-1])
Word([-2, -1])
>>> # Positions need not be contiguous
>>> Word([-2,-1, 1])
Word([-2, -1, 1])
>>> # Contiguous ranges can alternatively be specified giving the
>>> # two endpoints (inclusive)
>>> Pos(-3, -1)
Pos([-3, -2, -1])
>>> # In two-arg form, start <= end is enforced
>>> Pos(2, 1)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "nltk/tbl/template.py", line 306, in __init__
raise TypeError
ValueError: illegal interval specification: (start=2, end=1)
:type positions: list of int
:param positions: the positions at which this features should apply
:raises ValueError: illegal position specifications
An alternative calling convention, for contiguous positions only,
is Feature(start, end):
:type start: int
:param start: start of range where this feature should apply
:type end: int
:param end: end of range (NOTE: inclusive!) where this feature should apply
"""
self.positions = None # to avoid warnings
if end is None:
self.positions = tuple(sorted({int(i) for i in positions}))
else: # positions was actually not a list, but only the start index
try:
if positions > end:
raise TypeError
self.positions = tuple(range(positions, end + 1))
except TypeError as e:
# let any kind of erroneous spec raise ValueError
raise ValueError(
"illegal interval specification: (start={}, end={})".format(
positions, end
)
) from e
# set property name given in subclass, or otherwise name of subclass
self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__
def encode_json_obj(self):
return self.positions
@classmethod
def decode_json_obj(cls, obj):
positions = obj
return cls(positions)
def __repr__(self):
return f"{self.__class__.__name__}({list(self.positions)!r})"
@classmethod
def expand(cls, starts, winlens, excludezero=False):
"""
Return a list of features, one for each start point in starts
and for each window length in winlens. If excludezero is True,
no Features containing 0 in their positions will be generated
(many tbl trainers have a special representation for the
target feature at [0]).
For instance, importing a concrete subclass (Feature is abstract)
>>> from nltk.tag.brill import Word
First argument gives the possible start positions, second the
possible window lengths
>>> Word.expand([-3,-2,-1], [1])
[Word([-3]), Word([-2]), Word([-1])]
>>> Word.expand([-2,-1], [1])
[Word([-2]), Word([-1])]
>>> Word.expand([-3,-2,-1], [1,2])
[Word([-3]), Word([-2]), Word([-1]), Word([-3, -2]), Word([-2, -1])]
>>> Word.expand([-2,-1], [1])
[Word([-2]), Word([-1])]
A third optional argument excludes all Features whose positions contain zero
>>> Word.expand([-2,-1,0], [1,2], excludezero=False)
[Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])]
>>> Word.expand([-2,-1,0], [1,2], excludezero=True)
[Word([-2]), Word([-1]), Word([-2, -1])]
All window lengths must be positive
>>> Word.expand([-2,-1], [0])
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "nltk/tag/tbl/template.py", line 371, in expand
:param starts: where to start looking for Feature
ValueError: non-positive window length in [0]
:param starts: where to start looking for Feature
:type starts: list of ints
:param winlens: window lengths where to look for Feature
:type winlens: list of ints
:param excludezero: do not output any Feature with 0 in any of its positions.
:type excludezero: bool
:returns: list of Features
:raises ValueError: for non-positive window lengths
"""
if not all(x > 0 for x in winlens):
raise ValueError(f"non-positive window length in {winlens}")
xs = (starts[i : i + w] for w in winlens for i in range(len(starts) - w + 1))
return [cls(x) for x in xs if not (excludezero and 0 in x)]
def issuperset(self, other):
"""
Return True if this Feature always returns True when other does
More precisely, return True if this feature refers to the same property as other;
and this Feature looks at all positions that other does (and possibly
other positions in addition).
#For instance, importing a concrete subclass (Feature is abstract)
>>> from nltk.tag.brill import Word, Pos
>>> Word([-3,-2,-1]).issuperset(Word([-3,-2]))
True
>>> Word([-3,-2,-1]).issuperset(Word([-3,-2, 0]))
False
#Feature subclasses must agree
>>> Word([-3,-2,-1]).issuperset(Pos([-3,-2]))
False
:param other: feature with which to compare
:type other: (subclass of) Feature
:return: True if this feature is superset, otherwise False
:rtype: bool
"""
return self.__class__ is other.__class__ and set(self.positions) >= set(
other.positions
)
def intersects(self, other):
"""
Return True if the positions of this Feature intersect with those of other
More precisely, return True if this feature refers to the same property as other;
and there is some overlap in the positions they look at.
#For instance, importing a concrete subclass (Feature is abstract)
>>> from nltk.tag.brill import Word, Pos
>>> Word([-3,-2,-1]).intersects(Word([-3,-2]))
True
>>> Word([-3,-2,-1]).intersects(Word([-3,-2, 0]))
True
>>> Word([-3,-2,-1]).intersects(Word([0]))
False
#Feature subclasses must agree
>>> Word([-3,-2,-1]).intersects(Pos([-3,-2]))
False
:param other: feature with which to compare
:type other: (subclass of) Feature
:return: True if feature classes agree and there is some overlap in the positions they look at
:rtype: bool
"""
return bool(
self.__class__ is other.__class__
and set(self.positions) & set(other.positions)
)
# Rich comparisons for Features. With @functools.total_ordering (Python 2.7+),
# it will be enough to define __lt__ and __eq__
def __eq__(self, other):
return self.__class__ is other.__class__ and self.positions == other.positions
def __lt__(self, other):
return self.__class__.__name__ < other.__class__.__name__ or (
self.__class__.__name__ == other.__class__.__name__
# self.positions is a sorted tuple of ints
and self.positions < other.positions
)
def __ne__(self, other):
return not (self == other)
def __gt__(self, other):
return other < self
def __ge__(self, other):
return not self < other
def __le__(self, other):
return self < other or self == other
@staticmethod
@abstractmethod
def extract_property(tokens, index):
"""
Any subclass of Feature must define static method extract_property(tokens, index)
:param tokens: the sequence of tokens
:type tokens: list of tokens
:param index: the current index
:type index: int
:return: feature value
:rtype: any (but usually scalar)
"""

View File

@@ -0,0 +1,319 @@
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from abc import ABCMeta, abstractmethod
from nltk import jsontags
######################################################################
# Tag Rules
######################################################################
class TagRule(metaclass=ABCMeta):
"""
An interface for tag transformations on a tagged corpus, as
performed by tbl taggers. Each transformation finds all tokens
in the corpus that are tagged with a specific original tag and
satisfy a specific condition, and replaces their tags with a
replacement tag. For any given transformation, the original
tag, replacement tag, and condition are fixed. Conditions may
depend on the token under consideration, as well as any other
tokens in the corpus.
Tag rules must be comparable and hashable.
"""
def __init__(self, original_tag, replacement_tag):
self.original_tag = original_tag
"""The tag which this TagRule may cause to be replaced."""
self.replacement_tag = replacement_tag
"""The tag with which this TagRule may replace another tag."""
def apply(self, tokens, positions=None):
"""
Apply this rule at every position in positions where it
applies to the given sentence. I.e., for each position p
in *positions*, if *tokens[p]* is tagged with this rule's
original tag, and satisfies this rule's condition, then set
its tag to be this rule's replacement tag.
:param tokens: The tagged sentence
:type tokens: list(tuple(str, str))
:type positions: list(int)
:param positions: The positions where the transformation is to
be tried. If not specified, try it at all positions.
:return: The indices of tokens whose tags were changed by this
rule.
:rtype: list(int)
"""
if positions is None:
positions = list(range(len(tokens)))
# Determine the indices at which this rule applies.
change = [i for i in positions if self.applies(tokens, i)]
# Make the changes. Note: this must be done in a separate
# step from finding applicable locations, since we don't want
# the rule to interact with itself.
for i in change:
tokens[i] = (tokens[i][0], self.replacement_tag)
return change
@abstractmethod
def applies(self, tokens, index):
"""
:return: True if the rule would change the tag of
``tokens[index]``, False otherwise
:rtype: bool
:param tokens: A tagged sentence
:type tokens: list(tuple(str, str))
:param index: The index to check
:type index: int
"""
# Rules must be comparable and hashable for the algorithm to work
def __eq__(self, other):
raise TypeError("Rules must implement __eq__()")
def __ne__(self, other):
raise TypeError("Rules must implement __ne__()")
def __hash__(self):
raise TypeError("Rules must implement __hash__()")
@jsontags.register_tag
class Rule(TagRule):
"""
A Rule checks the current corpus position for a certain set of conditions;
if they are all fulfilled, the Rule is triggered, meaning that it
will change tag A to tag B. For other tags than A, nothing happens.
The conditions are parameters to the Rule instance. Each condition is a feature-value pair,
with a set of positions to check for the value of the corresponding feature.
Conceptually, the positions are joined by logical OR, and the feature set by logical AND.
More formally, the Rule is then applicable to the M{n}th token iff:
- The M{n}th token is tagged with the Rule's original tag; and
- For each (Feature(positions), M{value}) tuple:
- The value of Feature of at least one token in {n+p for p in positions}
is M{value}.
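For example (a minimal sketch; Pos is a concrete Feature from
nltk.tag.brill, applied to a hypothetical toy sentence):
>>> from nltk.tag.brill import Pos
>>> rule = Rule("001", "NN", "VB", [(Pos([-1]), "TO")])
>>> sent = [("to", "TO"), ("race", "NN")]
>>> rule.apply(sent)
[1]
>>> sent
[('to', 'TO'), ('race', 'VB')]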
"""
json_tag = "nltk.tbl.Rule"
def __init__(self, templateid, original_tag, replacement_tag, conditions):
"""
Construct a new Rule that changes a token's tag from
C{original_tag} to C{replacement_tag} if all of the properties
specified in C{conditions} hold.
:param templateid: the template id (a zero-padded string, '001' etc,
so it will sort nicely)
:type templateid: string
:param conditions: A list of (feature, value) pairs, each of which
specifies that the property (computed by
feature.extract_property()) of at least one
token at position M{n} + p (for p in the feature's positions) is C{value}.
:type conditions: C{iterable} of (C{Feature}, value) pairs
"""
TagRule.__init__(self, original_tag, replacement_tag)
self._conditions = conditions
self.templateid = templateid
def encode_json_obj(self):
return {
"templateid": self.templateid,
"original": self.original_tag,
"replacement": self.replacement_tag,
"conditions": self._conditions,
}
@classmethod
def decode_json_obj(cls, obj):
return cls(
obj["templateid"],
obj["original"],
obj["replacement"],
tuple(tuple(feat) for feat in obj["conditions"]),
)
def applies(self, tokens, index):
# Inherit docs from TagRule
# Does the given token have this Rule's "original tag"?
if tokens[index][1] != self.original_tag:
return False
# Check to make sure that every condition holds.
for feature, val in self._conditions:
# Look for *any* token that satisfies the condition.
for pos in feature.positions:
if not (0 <= index + pos < len(tokens)):
continue
if feature.extract_property(tokens, index + pos) == val:
break
else:
# No token satisfied the condition; return false.
return False
# Every condition checked out, so the Rule is applicable.
return True
def __eq__(self, other):
return self is other or (
other is not None
and other.__class__ == self.__class__
and self.original_tag == other.original_tag
and self.replacement_tag == other.replacement_tag
and self._conditions == other._conditions
)
def __ne__(self, other):
return not (self == other)
def __hash__(self):
# Cache our hash value (justified by profiling.)
try:
return self.__hash
except AttributeError:
self.__hash = hash(repr(self))
return self.__hash
def __repr__(self):
# Cache the repr (justified by profiling -- this is used as
# a sort key when deterministic=True.)
try:
return self.__repr
except AttributeError:
self.__repr = "{}('{}', {}, {}, [{}])".format(
self.__class__.__name__,
self.templateid,
repr(self.original_tag),
repr(self.replacement_tag),
# list(self._conditions) would be simpler but will not generate
# the same Rule.__repr__ in python 2 and 3 and thus break some tests
", ".join(f"({f},{repr(v)})" for (f, v) in self._conditions),
)
return self.__repr
def __str__(self):
def _condition_to_logic(feature, value):
"""
Return a compact, predicate-logic styled string representation
of the given condition.
"""
return "{}:{}@[{}]".format(
feature.PROPERTY_NAME,
value,
",".join(str(w) for w in feature.positions),
)
conditions = " & ".join(
[_condition_to_logic(f, v) for (f, v) in self._conditions]
)
s = f"{self.original_tag}->{self.replacement_tag} if {conditions}"
return s
def format(self, fmt):
"""
Return a string representation of this rule.
>>> from nltk.tbl.rule import Rule
>>> from nltk.tag.brill import Pos
>>> r = Rule("23", "VB", "NN", [(Pos([-2,-1]), 'DT')])
r.format("str") == str(r)
True
>>> r.format("str")
'VB->NN if Pos:DT@[-2,-1]'
r.format("repr") == repr(r)
True
>>> r.format("repr")
"Rule('23', 'VB', 'NN', [(Pos([-2, -1]),'DT')])"
>>> r.format("verbose")
'VB -> NN if the Pos of words i-2...i-1 is "DT"'
>>> r.format("not_found")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "nltk/tbl/rule.py", line 256, in format
raise ValueError("unknown rule format spec: {0}".format(fmt))
ValueError: unknown rule format spec: not_found
>>>
:param fmt: format specification
:type fmt: str
:return: string representation
:rtype: str
"""
if fmt == "str":
return self.__str__()
elif fmt == "repr":
return self.__repr__()
elif fmt == "verbose":
return self._verbose_format()
else:
raise ValueError(f"unknown rule format spec: {fmt}")
def _verbose_format(self):
"""
Return a wordy, human-readable string representation
of the given rule.
Not sure how useful this is.
"""
def condition_to_str(feature, value):
return 'the {} of {} is "{}"'.format(
feature.PROPERTY_NAME,
range_to_str(feature.positions),
value,
)
def range_to_str(positions):
if len(positions) == 1:
p = positions[0]
if p == 0:
return "this word"
if p == -1:
return "the preceding word"
elif p == 1:
return "the following word"
elif p < 0:
return "word i-%d" % -p
elif p > 0:
return "word i+%d" % p
else:
# for complete compatibility with the wordy format of nltk2
mx = max(positions)
mn = min(positions)
if mx - mn == len(positions) - 1:
return "words i%+d...i%+d" % (mn, mx)
else:
return "words {{{}}}".format(
",".join("i%+d" % d for d in positions)
)
replacement = f"{self.original_tag} -> {self.replacement_tag}"
conditions = (" if " if self._conditions else "") + ", and ".join(
condition_to_str(f, v) for (f, v) in self._conditions
)
return replacement + conditions

View File

@@ -0,0 +1,325 @@
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import itertools as it
from abc import ABCMeta, abstractmethod
from nltk.tbl.feature import Feature
from nltk.tbl.rule import Rule
class BrillTemplateI(metaclass=ABCMeta):
"""
An interface for generating lists of transformational rules that
apply at given sentence positions. ``BrillTemplateI`` is used by
``Brill`` training algorithms to generate candidate rules.
"""
@abstractmethod
def applicable_rules(self, tokens, i, correctTag):
"""
Return a list of the transformational rules that would correct
the tag of the ``i``-th token in the given sentence. In particular,
return a list of zero or more rules that would change
``tokens[i][1]`` to ``correctTag``, if applied to ``tokens[i]``.
If the ``i``-th token already has the correct tag (i.e., if
``tokens[i][1] == correctTag``), then
``applicable_rules()`` should return the empty list.
:param tokens: The tagged sentence being corrected.
:type tokens: list(tuple)
:param i: The index of the token whose tag should be corrected.
:type i: int
:param correctTag: The correct tag for the ``i``-th token.
:type correctTag: any
:rtype: list(BrillRule)
"""
@abstractmethod
def get_neighborhood(self, token, index):
"""
Returns the set of indices *i* such that
``applicable_rules(token, i, ...)`` depends on the value of
the *index*th token of *token*.
This method is used by the "fast" Brill tagger trainer.
:param token: The tokens being tagged.
:type token: list(tuple)
:param index: The index whose neighborhood should be returned.
:type index: int
:rtype: set
"""
class Template(BrillTemplateI):
"""
A tbl Template that generates a list of L{Rule}s that apply at a given sentence
position. In particular, each C{Template} is parameterized by a list of
independent features (a combination of a specific
property to extract and a list C{L} of relative positions at which to extract
it) and generates all Rules that:
- use the given features, each at its own independent position; and
- are applicable to the given token.
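For example (a minimal sketch; Pos is a concrete Feature from
nltk.tag.brill, applied to a hypothetical toy sentence):
>>> from nltk.tag.brill import Pos
>>> tpl = Template(Pos([-1]))
>>> rules = tpl.applicable_rules([("to", "TO"), ("race", "NN")], 1, "VB")
>>> [r.format("str") for r in rules]
['NN->VB if Pos:TO@[-1]']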
"""
ALLTEMPLATES = []
# record a unique id of form "001", for each template created
# _ids = it.count(0)
def __init__(self, *features):
"""
Construct a Template for generating Rules.
Takes a list of Features. A C{Feature} is a combination
of a specific property and its relative positions and should be
a subclass of L{nltk.tbl.feature.Feature}.
An alternative calling convention (kept for backwards compatibility,
but less expressive as it only permits one feature type) is
Template(Feature, (start1, end1), (start2, end2), ...)
In new code, that would be better written
Template(Feature(start1, end1), Feature(start2, end2), ...)
For instance, importing some features
>>> from nltk.tbl.template import Template
>>> from nltk.tag.brill import Word, Pos
Create some features
>>> wfeat1, wfeat2, pfeat = (Word([-1]), Word([1,2]), Pos([-2,-1]))
Create a single-feature template
>>> Template(wfeat1)
Template(Word([-1]))
Or a two-feature one
>>> Template(wfeat1, wfeat2)
Template(Word([-1]),Word([1, 2]))
Or a three-feature one with two different feature types
>>> Template(wfeat1, wfeat2, pfeat)
Template(Word([-1]),Word([1, 2]),Pos([-2, -1]))
deprecated api: Feature subclass, followed by list of (start,end) pairs
(permits only a single Feature)
>>> Template(Word, (-2,-1), (0,0))
Template(Word([-2, -1]),Word([0]))
Incorrect specification raises TypeError
>>> Template(Word, (-2,-1), Pos, (0,0))
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "nltk/tag/tbl/template.py", line 143, in __init__
raise TypeError(
TypeError: expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ...
:type features: list of Features
:param features: the features to build this Template on
"""
# determine the calling form: either
# Template(Feature, args1, [args2, ...)]
# Template(Feature1(args), Feature2(args), ...)
if all(isinstance(f, Feature) for f in features):
self._features = features
elif issubclass(features[0], Feature) and all(
isinstance(a, tuple) for a in features[1:]
):
self._features = [features[0](*tp) for tp in features[1:]]
else:
raise TypeError(
"expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ..."
)
self.id = f"{len(self.ALLTEMPLATES):03d}"
self.ALLTEMPLATES.append(self)
def __repr__(self):
return "{}({})".format(
self.__class__.__name__,
",".join([str(f) for f in self._features]),
)
def applicable_rules(self, tokens, index, correct_tag):
if tokens[index][1] == correct_tag:
return []
# For each of this Template's features, find the conditions
# that are applicable for the given token.
# Then, generate one Rule for each combination of features
# (the crossproduct of the conditions).
applicable_conditions = self._applicable_conditions(tokens, index)
xs = list(it.product(*applicable_conditions))
return [Rule(self.id, tokens[index][1], correct_tag, tuple(x)) for x in xs]
def _applicable_conditions(self, tokens, index):
"""
:returns: A list (one sublist per feature) of the (feature, value)
conditions applicable to C{tokens[index]}.
"""
conditions = []
for feature in self._features:
conditions.append([])
for pos in feature.positions:
if not (0 <= index + pos < len(tokens)):
continue
value = feature.extract_property(tokens, index + pos)
conditions[-1].append((feature, value))
return conditions
def get_neighborhood(self, tokens, index):
# inherit docs from BrillTemplateI
# applicable_rules(tokens, index, ...) depends on index.
neighborhood = {index} # set literal for python 2.7+
# applicable_rules(tokens, i, ...) depends on index if
# i+start <= index <= i+end.
allpositions = [0] + [p for feat in self._features for p in feat.positions]
start, end = min(allpositions), max(allpositions)
s = max(0, index + (-end))
e = min(index + (-start) + 1, len(tokens))
for i in range(s, e):
neighborhood.add(i)
return neighborhood
@classmethod
def expand(cls, featurelists, combinations=None, skipintersecting=True):
"""
Factory method to mass generate Templates from a list L of lists of Features.
#With combinations=(k1, k2), the function will in all possible ways choose k1 ... k2
#of the sublists in L; it will output all Templates formed by the Cartesian product
#of this selection, with duplicates and other semantically equivalent
#forms removed. Default for combinations is (1, len(L)).
The feature lists may have been specified
manually, or generated from Feature.expand(). For instance,
>>> from nltk.tbl.template import Template
>>> from nltk.tag.brill import Word, Pos
#creating some features
>>> (wd_0, wd_01) = (Word([0]), Word([0,1]))
>>> (pos_m2, pos_m33) = (Pos([-2]), Pos([-3,-2,-1,0,1,2,3]))
>>> list(Template.expand([[wd_0], [pos_m2]]))
[Template(Word([0])), Template(Pos([-2])), Template(Pos([-2]),Word([0]))]
>>> list(Template.expand([[wd_0, wd_01], [pos_m2]]))
[Template(Word([0])), Template(Word([0, 1])), Template(Pos([-2])), Template(Pos([-2]),Word([0])), Template(Pos([-2]),Word([0, 1]))]
#note: with Feature.expand(), it is very easy to generate more templates
#than your system can handle -- for instance,
>>> wordtpls = Word.expand([-2,-1,0,1], [1,2], excludezero=False)
>>> len(wordtpls)
7
>>> postpls = Pos.expand([-3,-2,-1,0,1,2], [1,2,3], excludezero=True)
>>> len(postpls)
9
#and now the Cartesian product of all non-empty combinations of two wordtpls and
#two postpls, with semantic equivalents removed
>>> templates = list(Template.expand([wordtpls, wordtpls, postpls, postpls]))
>>> len(templates)
713
For comparison, a smaller expansion such as Template.expand([[wd_0, wd_01], [pos_m2, Pos([-1])]]) would return a list of just eight templates:
Template(Word([0])),
Template(Word([0, 1])),
Template(Pos([-2])),
Template(Pos([-1])),
Template(Pos([-2]),Word([0])),
Template(Pos([-1]),Word([0])),
Template(Pos([-2]),Word([0, 1])),
Template(Pos([-1]),Word([0, 1]))]
#Templates where one feature is a subset of another, such as
#Template(Word([0,1]), Word([1])), will not appear in the output.
#By default, this non-subset constraint is tightened to disjointness:
#Templates of type Template(Word([0,1]), Word([1,2])) will also be filtered out.
#With skipintersecting=False, such Templates are allowed.
WARNING: this method makes it very easy to fill all your memory when training
generated templates on any real-world corpus
:param featurelists: lists of Features, whose Cartesian product will return a set of Templates
:type featurelists: list of (list of Features)
:param combinations: given n featurelists: if combinations=k, all generated Templates will have
k features; if combinations=(k1,k2) they will have k1..k2 features; if None, defaults to 1..n
:type combinations: None, int, or (int, int)
:param skipintersecting: if True, do not output intersecting Templates (non-disjoint positions for some feature)
:type skipintersecting: bool
:returns: generator of Templates
"""
def nonempty_powerset(xs): # xs is a list
# nonempty_powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
# (cf. the powerset recipe in the itertools docs)
# find the correct tuple given combinations, one of {None, k, (k1,k2)}
k = combinations # for brevity
combrange = (
(1, len(xs) + 1)  # n over 1 .. n over n (all non-empty combinations)
if k is None
else (
(k, k + 1)  # n over k only
if isinstance(k, int)
else (k[0], k[1] + 1)  # n over k1, n over k1+1 ... n over k2
)
)
return it.chain.from_iterable(
it.combinations(xs, r) for r in range(*combrange)
)
seentemplates = set()
for picks in nonempty_powerset(featurelists):
for pick in it.product(*picks):
if any(
i != j and x.issuperset(y)
for (i, x) in enumerate(pick)
for (j, y) in enumerate(pick)
):
continue
if skipintersecting and any(
i != j and x.intersects(y)
for (i, x) in enumerate(pick)
for (j, y) in enumerate(pick)
):
continue
thistemplate = cls(*sorted(pick))
strpick = str(thistemplate)
#!!FIXME --this is hackish
if strpick in seentemplates: # already added
cls._poptemplate()
continue
seentemplates.add(strpick)
yield thistemplate
@classmethod
def _cleartemplates(cls):
cls.ALLTEMPLATES = []
@classmethod
def _poptemplate(cls):
return cls.ALLTEMPLATES.pop() if cls.ALLTEMPLATES else None