updates
Backend/venv/lib/python3.12/site-packages/nltk/tbl/__init__.py
@@ -0,0 +1,31 @@
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
#   based on previous (nltk2) version by
#   Christopher Maloof, Edward Loper, Steven Bird
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Transformation Based Learning

A general purpose package for Transformation Based Learning,
currently used by nltk.tag.BrillTagger.

isort:skip_file
"""

from nltk.tbl.template import Template

# API: Template(...), Template.expand(...)

from nltk.tbl.feature import Feature

# API: Feature(...), Feature.expand(...)

from nltk.tbl.rule import Rule

# API: Rule.format(...), Rule.templateid

from nltk.tbl.erroranalysis import error_list
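
(Editor's sketch, not part of the package: one way the four exported names fit together. Assumes NLTK with the treebank sample is installed; all argument values below are illustrative.)

# Train a small Brill tagger and inspect the learned tbl Rules.
from nltk.corpus import treebank
from nltk.tag import BrillTaggerTrainer, UnigramTagger
from nltk.tag.brill import Pos, Word  # concrete Feature subclasses
from nltk.tbl import Template

train_sents = treebank.tagged_sents()[:200]
baseline = UnigramTagger(train_sents)
templates = [Template(Pos([-1]), Word([0]))]  # one two-feature template
tagger = BrillTaggerTrainer(baseline, templates).train(train_sents, max_rules=10)
for rule in tagger.rules():  # each learned rule is an nltk.tbl.Rule
    print(rule.format("verbose"))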
Backend/venv/lib/python3.12/site-packages/nltk/tbl/demo.py
@@ -0,0 +1,418 @@
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
#   based on previous (nltk2) version by
#   Christopher Maloof, Edward Loper, Steven Bird
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import os
import pickle
import random
import time

from nltk.corpus import treebank
from nltk.tag import BrillTaggerTrainer, RegexpTagger, UnigramTagger
from nltk.tag.brill import Pos, Word
from nltk.tbl import Template, error_list


def demo():
    """
    Run a demo with defaults. See source comments for details,
    or docstrings of any of the more specific demo_* functions.
    """
    postag()


def demo_repr_rule_format():
    """
    Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose"))
    """
    postag(ruleformat="repr")


def demo_str_rule_format():
    """
    Exemplify str(Rule) (see also repr(Rule) and Rule.format("verbose"))
    """
    postag(ruleformat="str")


def demo_verbose_rule_format():
    """
    Exemplify Rule.format("verbose")
    """
    postag(ruleformat="verbose")


def demo_multiposition_feature():
    """
    The feature(s) of a template take a list of positions
    relative to the current word where the feature should be
    looked for, conceptually joined by logical OR. For instance,
    Pos([-1, 1]), given a value V, will hold whenever V is found
    one step to the left and/or one step to the right.

    For contiguous ranges, a 2-arg form giving inclusive end
    points can also be used: Pos(-3, -1) is the same as the arg
    below.
    """
    postag(templates=[Template(Pos([-3, -2, -1]))])


def demo_multifeature_template():
    """
    Templates can have more than a single feature.
    """
    postag(templates=[Template(Word([0]), Pos([-2, -1]))])


def demo_template_statistics():
    """
    Show aggregate statistics per template. Little-used templates are
    candidates for deletion; much-used templates may possibly be refined.

    Deleting unused templates is mostly about saving time and/or space:
    training is basically O(T) in the number of templates T
    (also in terms of memory usage, which often will be the limiting factor).
    """
    postag(incremental_stats=True, template_stats=True)


def demo_generated_templates():
    """
    Template.expand and Feature.expand are class methods facilitating
    generating large amounts of templates. See their documentation for
    details.

    Note: training with 500 templates can easily fill all available
    memory, even on relatively small corpora.
    """
    wordtpls = Word.expand([-1, 0, 1], [1, 2], excludezero=False)
    tagtpls = Pos.expand([-2, -1, 0, 1], [1, 2], excludezero=True)
    templates = list(Template.expand([wordtpls, tagtpls], combinations=(1, 3)))
    print(
        "Generated {} templates for transformation-based learning".format(
            len(templates)
        )
    )
    postag(templates=templates, incremental_stats=True, template_stats=True)


def demo_learning_curve():
    """
    Plot a learning curve -- the contribution on tagging accuracy of
    the individual rules.
    Note: requires matplotlib
    """
    postag(
        incremental_stats=True,
        separate_baseline_data=True,
        learning_curve_output="learningcurve.png",
    )


def demo_error_analysis():
    """
    Writes a file with context for each erroneous word after tagging testing data
    """
    postag(error_output="errors.txt")


def demo_serialize_tagger():
    """
    Serializes the learned tagger to a file in pickle format; reloads it
    and validates the process.
    """
    postag(serialize_output="tagger.pcl")


def demo_high_accuracy_rules():
    """
    Discard rules with low accuracy. This may hurt performance a bit,
    but will often produce rules which are more interesting for a human to read.
    """
    postag(num_sents=3000, min_acc=0.96, min_score=10)


def postag(
    templates=None,
    tagged_data=None,
    num_sents=1000,
    max_rules=300,
    min_score=3,
    min_acc=None,
    train=0.8,
    trace=3,
    randomize=False,
    ruleformat="str",
    incremental_stats=False,
    template_stats=False,
    error_output=None,
    serialize_output=None,
    learning_curve_output=None,
    learning_curve_take=300,
    baseline_backoff_tagger=None,
    separate_baseline_data=False,
    cache_baseline_tagger=None,
):
    """
    Brill Tagger Demonstration

    :param templates: the templates to train on (default: the brill24() template set)
    :type templates: list of Template

    :param tagged_data: the corpus of tagged sentences to draw training and testing data from (default: treebank)
    :type tagged_data: C{list} of C{list} of C{tuple}

    :param num_sents: how many sentences of training and testing data to use
    :type num_sents: C{int}

    :param max_rules: maximum number of rule instances to create
    :type max_rules: C{int}

    :param min_score: the minimum score for a rule in order for it to be considered
    :type min_score: C{int}

    :param min_acc: the minimum accuracy for a rule in order for it to be considered
    :type min_acc: C{float}

    :param train: the fraction of the corpus to be used for training (1=all)
    :type train: C{float}

    :param trace: the level of diagnostic tracing output to produce (0-4)
    :type trace: C{int}

    :param randomize: whether the training data should be a random subset of the corpus
    :type randomize: C{bool}

    :param ruleformat: rule output format, one of "str", "repr", "verbose"
    :type ruleformat: C{str}

    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
    :type incremental_stats: C{bool}

    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
    :type template_stats: C{bool}

    :param error_output: the file where errors will be saved
    :type error_output: C{string}

    :param serialize_output: the file where the learned tbl tagger will be saved
    :type serialize_output: C{string}

    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
    :type learning_curve_output: C{string}

    :param learning_curve_take: how many rules plotted
    :type learning_curve_take: C{int}

    :param baseline_backoff_tagger: the backoff tagger used by the baseline (unigram) tagger (default: REGEXP_TAGGER)
    :type baseline_backoff_tagger: tagger

    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}

    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
        deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}


    Note on separate_baseline_data: if False, the training data is reused both for the baseline
    and the rule learner. This is fast and fine for a demo, but is likely to generalize worse on unseen data.
    It also cannot be sensibly used for learning curves on training data (the baseline will be artificially high).
    """

    # defaults
    baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER
    if templates is None:
        from nltk.tag.brill import brill24, describe_template_sets

        # some pre-built template sets taken from typical systems or publications are
        # available. Print a list with describe_template_sets()
        # for instance:
        templates = brill24()
    (training_data, baseline_data, gold_data, testing_data) = _demo_prepare_data(
        tagged_data, train, num_sents, randomize, separate_baseline_data
    )

    # creating (or reloading from cache) a baseline tagger (unigram tagger)
    # this is just a mechanism for getting deterministic output from the baseline between
    # python versions
    if cache_baseline_tagger:
        if not os.path.exists(cache_baseline_tagger):
            baseline_tagger = UnigramTagger(
                baseline_data, backoff=baseline_backoff_tagger
            )
            # pickle requires binary mode ("wb"/"rb", not "w"/"r")
            with open(cache_baseline_tagger, "wb") as print_rules:
                pickle.dump(baseline_tagger, print_rules)
            print(
                "Trained baseline tagger, pickled it to {}".format(
                    cache_baseline_tagger
                )
            )
        with open(cache_baseline_tagger, "rb") as print_rules:
            baseline_tagger = pickle.load(print_rules)
        print(f"Reloaded pickled tagger from {cache_baseline_tagger}")
    else:
        baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
        print("Trained baseline tagger")
    if gold_data:
        print(
            "    Accuracy on test set: {:0.4f}".format(
                baseline_tagger.accuracy(gold_data)
            )
        )

    # creating a Brill tagger
    tbrill = time.time()
    trainer = BrillTaggerTrainer(
        baseline_tagger, templates, trace, ruleformat=ruleformat
    )
    print("Training tbl tagger...")
    brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc)
    print(f"Trained tbl tagger in {time.time() - tbrill:0.2f} seconds")
    if gold_data:
        print("    Accuracy on test set: %.4f" % brill_tagger.accuracy(gold_data))

    # printing the learned rules, if learned silently
    if trace == 1:
        print("\nLearned rules: ")
        for ruleno, rule in enumerate(brill_tagger.rules(), 1):
            print(f"{ruleno:4d} {rule.format(ruleformat):s}")

    # printing template statistics (optionally including comparison with the training data)
    # note: if not separate_baseline_data, then baseline accuracy will be artificially high
    if incremental_stats:
        print(
            "Incrementally tagging the test data, collecting individual rule statistics"
        )
        (taggedtest, teststats) = brill_tagger.batch_tag_incremental(
            testing_data, gold_data
        )
        print("    Rule statistics collected")
        if not separate_baseline_data:
            print(
                "WARNING: train stats requested without separate_baseline_data=True; "
                "the baseline will be artificially high"
            )
        trainstats = brill_tagger.train_stats()
        if template_stats:
            brill_tagger.print_template_statistics(teststats)
        if learning_curve_output:
            _demo_plot(
                learning_curve_output, teststats, trainstats, take=learning_curve_take
            )
            print(f"Wrote plot of learning curve to {learning_curve_output}")
    else:
        print("Tagging the test data")
        taggedtest = brill_tagger.tag_sents(testing_data)
        if template_stats:
            brill_tagger.print_template_statistics()

    # writing error analysis to file
    if error_output is not None:
        with open(error_output, "w", encoding="utf-8") as f:
            f.write("Errors for Brill Tagger %r\n\n" % serialize_output)
            # the file handles the encoding; join and write the strings directly
            f.write("\n".join(error_list(gold_data, taggedtest)) + "\n")
        print(f"Wrote tagger errors including context to {error_output}")

    # serializing the tagger to a pickle file and reloading (just to see it works)
    if serialize_output is not None:
        taggedtest = brill_tagger.tag_sents(testing_data)
        # pickle requires binary mode ("wb"/"rb", not "w"/"r")
        with open(serialize_output, "wb") as print_rules:
            pickle.dump(brill_tagger, print_rules)
        print(f"Wrote pickled tagger to {serialize_output}")
        with open(serialize_output, "rb") as print_rules:
            brill_tagger_reloaded = pickle.load(print_rules)
        print(f"Reloaded pickled tagger from {serialize_output}")
        # tag with the reloaded tagger (not the original) so the comparison is meaningful
        taggedtest_reloaded = brill_tagger_reloaded.tag_sents(testing_data)
        if taggedtest == taggedtest_reloaded:
            print("Reloaded tagger tried on test set, results identical")
        else:
            print("PROBLEM: Reloaded tagger gave different results on test set")


def _demo_prepare_data(
    tagged_data, train, num_sents, randomize, separate_baseline_data
):
    # train is the proportion of data used in training; the rest is reserved
    # for testing.
    if tagged_data is None:
        print("Loading tagged data from treebank... ")
        tagged_data = treebank.tagged_sents()
    if num_sents is None or len(tagged_data) <= num_sents:
        num_sents = len(tagged_data)
    if randomize:
        random.seed(len(tagged_data))
        # corpus views are read-only; copy to a list before shuffling in place
        tagged_data = list(tagged_data)
        random.shuffle(tagged_data)
    cutoff = int(num_sents * train)
    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:num_sents]
    testing_data = [[t[0] for t in sent] for sent in gold_data]
    if not separate_baseline_data:
        baseline_data = training_data
    else:
        bl_cutoff = len(training_data) // 3
        (baseline_data, training_data) = (
            training_data[:bl_cutoff],
            training_data[bl_cutoff:],
        )
    (trainseqs, traintokens) = corpus_size(training_data)
    (testseqs, testtokens) = corpus_size(testing_data)
    (bltrainseqs, bltraintokens) = corpus_size(baseline_data)
    print(f"Read testing data ({testseqs:d} sents/{testtokens:d} wds)")
    print(f"Read training data ({trainseqs:d} sents/{traintokens:d} wds)")
    print(
        "Read baseline data ({:d} sents/{:d} wds) {:s}".format(
            bltrainseqs,
            bltraintokens,
            "" if separate_baseline_data else "[reused the training set]",
        )
    )
    return (training_data, baseline_data, gold_data, testing_data)


def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None):
    testcurve = [teststats["initialerrors"]]
    for rulescore in teststats["rulescores"]:
        testcurve.append(testcurve[-1] - rulescore)
    testcurve = [1 - x / teststats["tokencount"] for x in testcurve[:take]]

    traincurve = [trainstats["initialerrors"]]
    for rulescore in trainstats["rulescores"]:
        traincurve.append(traincurve[-1] - rulescore)
    traincurve = [1 - x / trainstats["tokencount"] for x in traincurve[:take]]

    import matplotlib.pyplot as plt

    r = list(range(len(testcurve)))
    plt.plot(r, testcurve, r, traincurve)
    plt.axis([None, None, None, 1.0])
    plt.savefig(learning_curve_output)


NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), (r".*", "NN")])

REGEXP_TAGGER = RegexpTagger(
    [
        (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # cardinal numbers
        (r"(The|the|A|a|An|an)$", "AT"),  # articles
        (r".*able$", "JJ"),  # adjectives
        (r".*ness$", "NN"),  # nouns formed from adjectives
        (r".*ly$", "RB"),  # adverbs
        (r".*s$", "NNS"),  # plural nouns
        (r".*ing$", "VBG"),  # gerunds
        (r".*ed$", "VBD"),  # past tense verbs
        (r".*", "NN"),  # nouns (default)
    ]
)


def corpus_size(seqs):
    return (len(seqs), sum(len(x) for x in seqs))


if __name__ == "__main__":
    demo_learning_curve()
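
(Editor's sketch, not part of the module: calling postag() directly instead of one of the demo_* wrappers. All argument values are illustrative; expect some runtime, since training runs on a treebank slice.)

from nltk.tag.brill import Pos, Word
from nltk.tbl import Template
from nltk.tbl.demo import postag

postag(
    templates=[Template(Pos([-1]), Word([0]))],  # a single two-feature template
    num_sents=500,  # keep the treebank slice small for a quick run
    max_rules=50,
    trace=1,  # train silently, then print the learned rules
    ruleformat="verbose",
)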
Backend/venv/lib/python3.12/site-packages/nltk/tbl/erroranalysis.py
@@ -0,0 +1,38 @@
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
#   based on previous (nltk2) version by
#   Christopher Maloof, Edward Loper, Steven Bird
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

# returns a list of errors in string format


def error_list(train_sents, test_sents):
    """
    Returns a list of human-readable strings indicating the errors in the
    given tagging of the corpus.

    :param train_sents: The correct tagging of the corpus
    :type train_sents: list(tuple)
    :param test_sents: The tagged corpus
    :type test_sents: list(tuple)
    """
    hdr = ("%25s | %s | %s\n" + "-" * 26 + "+" + "-" * 24 + "+" + "-" * 26) % (
        "left context",
        "word/test->gold".center(22),
        "right context",
    )
    errors = [hdr]
    for train_sent, test_sent in zip(train_sents, test_sents):
        for wordnum, (word, train_pos) in enumerate(train_sent):
            test_pos = test_sent[wordnum][1]
            if train_pos != test_pos:
                left = " ".join("%s/%s" % w for w in train_sent[:wordnum])
                right = " ".join("%s/%s" % w for w in train_sent[wordnum + 1 :])
                mid = f"{word}/{test_pos}->{train_pos}"
                errors.append(f"{left[-25:]:>25} | {mid.center(22)} | {right[:25]}")

    return errors
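
(Editor's sketch: error_list() on a tiny invented corpus; gold is the correct tagging, test the tagger's output.)

from nltk.tbl import error_list

gold = [[("the", "DT"), ("cat", "NN"), ("runs", "VBZ")]]
test = [[("the", "DT"), ("cat", "NN"), ("runs", "NNS")]]  # "runs" mistagged
for line in error_list(gold, test):
    print(line)
# prints the header row, then one line per error, roughly:
#             the/DT cat/NN |     runs/NNS->VBZ      |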
Backend/venv/lib/python3.12/site-packages/nltk/tbl/feature.py
@@ -0,0 +1,267 @@
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
#   based on previous (nltk2) version by
#   Christopher Maloof, Edward Loper, Steven Bird
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from abc import ABCMeta, abstractmethod


class Feature(metaclass=ABCMeta):
    """
    An abstract base class for Features. A Feature is a combination of
    a specific property-computing method and a list of relative positions
    to apply that method to.

    The property-computing method, M{extract_property(tokens, index)},
    must be implemented by every subclass. It extracts or computes a specific
    property for the token at the current index. Typical extract_property()
    methods return features such as the token text or tag; but more involved
    methods may consider the entire sequence M{tokens} and
    for instance compute the length of the sentence the token belongs to.

    In addition, the subclass may have a PROPERTY_NAME, which is how
    it will be printed (in Rules and Templates, etc). If not given, it defaults
    to the classname.

    """

    json_tag = "nltk.tbl.Feature"
    PROPERTY_NAME = None

    def __init__(self, positions, end=None):
        """
        Construct a Feature which may apply at C{positions}.

        >>> # For instance, importing some concrete subclasses (Feature is abstract)
        >>> from nltk.tag.brill import Word, Pos

        >>> # Feature Word, applying at one of [-2, -1]
        >>> Word([-2,-1])
        Word([-2, -1])

        >>> # Positions need not be contiguous
        >>> Word([-2,-1, 1])
        Word([-2, -1, 1])

        >>> # Contiguous ranges can alternatively be specified giving the
        >>> # two endpoints (inclusive)
        >>> Pos(-3, -1)
        Pos([-3, -2, -1])

        >>> # In two-arg form, start <= end is enforced
        >>> Pos(2, 1)
        Traceback (most recent call last):
            File "<stdin>", line 1, in <module>
            File "nltk/tbl/template.py", line 306, in __init__
                raise TypeError
        ValueError: illegal interval specification: (start=2, end=1)

        :type positions: list of int
        :param positions: the positions at which this feature should apply
        :raises ValueError: illegal position specifications

        An alternative calling convention, for contiguous positions only,
        is Feature(start, end):

        :type start: int
        :param start: start of range where this feature should apply
        :type end: int
        :param end: end of range (NOTE: inclusive!) where this feature should apply
        """
        self.positions = None  # to avoid warnings
        if end is None:
            self.positions = tuple(sorted({int(i) for i in positions}))
        else:  # positions was actually not a list, but only the start index
            try:
                if positions > end:
                    raise TypeError
                self.positions = tuple(range(positions, end + 1))
            except TypeError as e:
                # let any kind of erroneous spec raise ValueError
                raise ValueError(
                    "illegal interval specification: (start={}, end={})".format(
                        positions, end
                    )
                ) from e

        # set property name given in subclass, or otherwise name of subclass
        self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__

    def encode_json_obj(self):
        return self.positions

    @classmethod
    def decode_json_obj(cls, obj):
        positions = obj
        return cls(positions)

    def __repr__(self):
        return f"{self.__class__.__name__}({list(self.positions)!r})"

    @classmethod
    def expand(cls, starts, winlens, excludezero=False):
        """
        Return a list of features, one for each start point in starts
        and for each window length in winlens. If excludezero is True,
        no Features containing 0 in their positions will be generated
        (many tbl trainers have a special representation for the
        target feature at [0])

        For instance, importing a concrete subclass (Feature is abstract)

        >>> from nltk.tag.brill import Word

        First argument gives the possible start positions, second the
        possible window lengths

        >>> Word.expand([-3,-2,-1], [1])
        [Word([-3]), Word([-2]), Word([-1])]

        >>> Word.expand([-2,-1], [1])
        [Word([-2]), Word([-1])]

        >>> Word.expand([-3,-2,-1], [1,2])
        [Word([-3]), Word([-2]), Word([-1]), Word([-3, -2]), Word([-2, -1])]

        A third optional argument excludes all Features whose positions contain zero

        >>> Word.expand([-2,-1,0], [1,2], excludezero=False)
        [Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])]

        >>> Word.expand([-2,-1,0], [1,2], excludezero=True)
        [Word([-2]), Word([-1]), Word([-2, -1])]

        All window lengths must be positive

        >>> Word.expand([-2,-1], [0])
        Traceback (most recent call last):
            File "<stdin>", line 1, in <module>
            File "nltk/tag/tbl/template.py", line 371, in expand
                :param starts: where to start looking for Feature
        ValueError: non-positive window length in [0]

        :param starts: where to start looking for Feature
        :type starts: list of ints
        :param winlens: window lengths where to look for Feature
        :type winlens: list of ints
        :param excludezero: do not output any Feature with 0 in any of its positions.
        :type excludezero: bool
        :returns: list of Features
        :raises ValueError: for non-positive window lengths
        """
        if not all(x > 0 for x in winlens):
            raise ValueError(f"non-positive window length in {winlens}")
        xs = (starts[i : i + w] for w in winlens for i in range(len(starts) - w + 1))
        return [cls(x) for x in xs if not (excludezero and 0 in x)]

    def issuperset(self, other):
        """
        Return True if this Feature always returns True when other does

        More precisely, return True if this feature refers to the same property as other;
        and this Feature looks at all positions that other does (and possibly
        other positions in addition).

        #For instance, importing a concrete subclass (Feature is abstract)
        >>> from nltk.tag.brill import Word, Pos

        >>> Word([-3,-2,-1]).issuperset(Word([-3,-2]))
        True

        >>> Word([-3,-2,-1]).issuperset(Word([-3,-2, 0]))
        False

        #Feature subclasses must agree
        >>> Word([-3,-2,-1]).issuperset(Pos([-3,-2]))
        False

        :param other: feature with which to compare
        :type other: (subclass of) Feature
        :return: True if this feature is superset, otherwise False
        :rtype: bool
        """
        return self.__class__ is other.__class__ and set(self.positions) >= set(
            other.positions
        )

    def intersects(self, other):
        """
        Return True if the positions of this Feature intersect with those of other

        More precisely, return True if this feature refers to the same property as other;
        and there is some overlap in the positions they look at.

        #For instance, importing a concrete subclass (Feature is abstract)
        >>> from nltk.tag.brill import Word, Pos

        >>> Word([-3,-2,-1]).intersects(Word([-3,-2]))
        True

        >>> Word([-3,-2,-1]).intersects(Word([-3,-2, 0]))
        True

        >>> Word([-3,-2,-1]).intersects(Word([0]))
        False

        #Feature subclasses must agree
        >>> Word([-3,-2,-1]).intersects(Pos([-3,-2]))
        False

        :param other: feature with which to compare
        :type other: (subclass of) Feature
        :return: True if feature classes agree and there is some overlap in the positions they look at
        :rtype: bool
        """
        return bool(
            self.__class__ is other.__class__
            and set(self.positions) & set(other.positions)
        )

    # Rich comparisons for Features. With @functools.total_ordering,
    # defining __lt__ and __eq__ would be enough.
    def __eq__(self, other):
        return self.__class__ is other.__class__ and self.positions == other.positions

    def __lt__(self, other):
        # order by class name first, then by positions (a sorted tuple of ints);
        # comparing as tuples keeps the ordering consistent across classes
        return (self.__class__.__name__, self.positions) < (
            other.__class__.__name__,
            other.positions,
        )

    def __ne__(self, other):
        return not (self == other)

    def __gt__(self, other):
        return other < self

    def __ge__(self, other):
        return not self < other

    def __le__(self, other):
        return self < other or self == other

    @staticmethod
    @abstractmethod
    def extract_property(tokens, index):
        """
        Any subclass of Feature must define static method extract_property(tokens, index)

        :param tokens: the sequence of tokens
        :type tokens: list of tokens
        :param index: the current index
        :type index: int
        :return: feature value
        :rtype: any (but usually scalar)
        """
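
(Editor's sketch: a minimal concrete Feature subclass. The class WordLen and its property are invented for illustration; the real Word and Pos in nltk.tag.brill are built the same way.)

from nltk.tbl.feature import Feature

class WordLen(Feature):
    PROPERTY_NAME = "wordlen"  # used when printing Rules and Templates
    json_tag = "demo.WordLen"  # only needed for JSON (de)serialization

    @staticmethod
    def extract_property(tokens, index):
        # tokens is a list of (word, tag) pairs; compute the word's length
        return len(tokens[index][0])

print(WordLen([-1, 0]))  # WordLen([-1, 0])
print(WordLen(-2, 0))  # two-arg form, inclusive: WordLen([-2, -1, 0])
print(WordLen([0]).intersects(WordLen([-1, 0])))  # True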
Backend/venv/lib/python3.12/site-packages/nltk/tbl/rule.py
@@ -0,0 +1,319 @@
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
#   based on previous (nltk2) version by
#   Christopher Maloof, Edward Loper, Steven Bird
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from abc import ABCMeta, abstractmethod

from nltk import jsontags


######################################################################
# Tag Rules
######################################################################
class TagRule(metaclass=ABCMeta):
    """
    An interface for tag transformations on a tagged corpus, as
    performed by tbl taggers. Each transformation finds all tokens
    in the corpus that are tagged with a specific original tag and
    satisfy a specific condition, and replaces their tags with a
    replacement tag. For any given transformation, the original
    tag, replacement tag, and condition are fixed. Conditions may
    depend on the token under consideration, as well as any other
    tokens in the corpus.

    Tag rules must be comparable and hashable.
    """

    def __init__(self, original_tag, replacement_tag):
        self.original_tag = original_tag
        """The tag which this TagRule may cause to be replaced."""

        self.replacement_tag = replacement_tag
        """The tag with which this TagRule may replace another tag."""

    def apply(self, tokens, positions=None):
        """
        Apply this rule at every position in positions where it
        applies to the given sentence. I.e., for each position p
        in *positions*, if *tokens[p]* is tagged with this rule's
        original tag, and satisfies this rule's condition, then set
        its tag to be this rule's replacement tag.

        :param tokens: The tagged sentence
        :type tokens: list(tuple(str, str))
        :type positions: list(int)
        :param positions: The positions where the transformation is to
            be tried. If not specified, try it at all positions.
        :return: The indices of tokens whose tags were changed by this
            rule.
        :rtype: list(int)
        """
        if positions is None:
            positions = list(range(len(tokens)))

        # Determine the indices at which this rule applies.
        change = [i for i in positions if self.applies(tokens, i)]

        # Make the changes. Note: this must be done in a separate
        # step from finding applicable locations, since we don't want
        # the rule to interact with itself.
        for i in change:
            tokens[i] = (tokens[i][0], self.replacement_tag)

        return change

    @abstractmethod
    def applies(self, tokens, index):
        """
        :return: True if the rule would change the tag of
            ``tokens[index]``, False otherwise
        :rtype: bool
        :param tokens: A tagged sentence
        :type tokens: list(str)
        :param index: The index to check
        :type index: int
        """

    # Rules must be comparable and hashable for the algorithm to work
    def __eq__(self, other):
        raise TypeError("Rules must implement __eq__()")

    def __ne__(self, other):
        raise TypeError("Rules must implement __ne__()")

    def __hash__(self):
        raise TypeError("Rules must implement __hash__()")


@jsontags.register_tag
class Rule(TagRule):
    """
    A Rule checks the current corpus position for a certain set of conditions;
    if they are all fulfilled, the Rule is triggered, meaning that it
    will change tag A to tag B. For other tags than A, nothing happens.

    The conditions are parameters to the Rule instance. Each condition is a feature-value pair,
    with a set of positions to check for the value of the corresponding feature.
    Conceptually, the positions are joined by logical OR, and the feature set by logical AND.

    More formally, the Rule is then applicable to the M{n}th token iff:

    - The M{n}th token is tagged with the Rule's original tag; and
    - For each (Feature(positions), M{value}) tuple:

      - The value of Feature of at least one token in {n+p for p in positions}
        is M{value}.
    """

    json_tag = "nltk.tbl.Rule"

    def __init__(self, templateid, original_tag, replacement_tag, conditions):
        """
        Construct a new Rule that changes a token's tag from
        C{original_tag} to C{replacement_tag} if all of the properties
        specified in C{conditions} hold.

        :param templateid: the template id (a zero-padded string, '001' etc,
            so it will sort nicely)
        :type templateid: string

        :param conditions: A list of (Feature(positions), value) pairs,
            each of which specifies that the property (computed by
            Feature.extract_property()) of at least one
            token in M{n} + p in positions is C{value}.
        :type conditions: C{iterable} of C{(Feature, value)} pairs

        """
        TagRule.__init__(self, original_tag, replacement_tag)
        self._conditions = conditions
        self.templateid = templateid

    def encode_json_obj(self):
        return {
            "templateid": self.templateid,
            "original": self.original_tag,
            "replacement": self.replacement_tag,
            "conditions": self._conditions,
        }

    @classmethod
    def decode_json_obj(cls, obj):
        return cls(
            obj["templateid"],
            obj["original"],
            obj["replacement"],
            tuple(tuple(feat) for feat in obj["conditions"]),
        )

    def applies(self, tokens, index):
        # Inherit docs from TagRule

        # Does the given token have this Rule's "original tag"?
        if tokens[index][1] != self.original_tag:
            return False

        # Check to make sure that every condition holds.
        for feature, val in self._conditions:
            # Look for *any* token that satisfies the condition.
            for pos in feature.positions:
                if not (0 <= index + pos < len(tokens)):
                    continue
                if feature.extract_property(tokens, index + pos) == val:
                    break
            else:
                # No token satisfied the condition; return false.
                return False

        # Every condition checked out, so the Rule is applicable.
        return True

    def __eq__(self, other):
        return self is other or (
            other is not None
            and other.__class__ == self.__class__
            and self.original_tag == other.original_tag
            and self.replacement_tag == other.replacement_tag
            and self._conditions == other._conditions
        )

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        # Cache our hash value (justified by profiling.)
        try:
            return self.__hash
        except AttributeError:
            self.__hash = hash(repr(self))
            return self.__hash

    def __repr__(self):
        # Cache the repr (justified by profiling -- this is used as
        # a sort key when deterministic=True.)
        try:
            return self.__repr
        except AttributeError:
            self.__repr = "{}('{}', {}, {}, [{}])".format(
                self.__class__.__name__,
                self.templateid,
                repr(self.original_tag),
                repr(self.replacement_tag),
                # list(self._conditions) would be simpler but will not generate
                # the same Rule.__repr__ in python 2 and 3 and thus break some tests
                ", ".join(f"({f},{repr(v)})" for (f, v) in self._conditions),
            )

            return self.__repr

    def __str__(self):
        def _condition_to_logic(feature, value):
            """
            Return a compact, predicate-logic styled string representation
            of the given condition.
            """
            return "{}:{}@[{}]".format(
                feature.PROPERTY_NAME,
                value,
                ",".join(str(w) for w in feature.positions),
            )

        conditions = " & ".join(
            [_condition_to_logic(f, v) for (f, v) in self._conditions]
        )
        s = f"{self.original_tag}->{self.replacement_tag} if {conditions}"

        return s

    def format(self, fmt):
        """
        Return a string representation of this rule.

        >>> from nltk.tbl.rule import Rule
        >>> from nltk.tag.brill import Pos

        >>> r = Rule("23", "VB", "NN", [(Pos([-2,-1]), 'DT')])

        >>> r.format("str") == str(r)
        True
        >>> r.format("str")
        'VB->NN if Pos:DT@[-2,-1]'

        >>> r.format("repr") == repr(r)
        True
        >>> r.format("repr")
        "Rule('23', 'VB', 'NN', [(Pos([-2, -1]),'DT')])"

        >>> r.format("verbose")
        'VB -> NN if the Pos of words i-2...i-1 is "DT"'

        >>> r.format("not_found")
        Traceback (most recent call last):
            File "<stdin>", line 1, in <module>
            File "nltk/tbl/rule.py", line 256, in format
                raise ValueError("unknown rule format spec: {0}".format(fmt))
        ValueError: unknown rule format spec: not_found

        :param fmt: format specification
        :type fmt: str
        :return: string representation
        :rtype: str
        """
        if fmt == "str":
            return self.__str__()
        elif fmt == "repr":
            return self.__repr__()
        elif fmt == "verbose":
            return self._verbose_format()
        else:
            raise ValueError(f"unknown rule format spec: {fmt}")

    def _verbose_format(self):
        """
        Return a wordy, human-readable string representation
        of the given rule.

        Not sure how useful this is.
        """

        def condition_to_str(feature, value):
            return 'the {} of {} is "{}"'.format(
                feature.PROPERTY_NAME,
                range_to_str(feature.positions),
                value,
            )

        def range_to_str(positions):
            if len(positions) == 1:
                p = positions[0]
                if p == 0:
                    return "this word"
                if p == -1:
                    return "the preceding word"
                elif p == 1:
                    return "the following word"
                elif p < 0:
                    return "word i-%d" % -p
                elif p > 0:
                    return "word i+%d" % p
            else:
                # for complete compatibility with the wordy format of nltk2
                mx = max(positions)
                mn = min(positions)
                if mx - mn == len(positions) - 1:
                    return "words i%+d...i%+d" % (mn, mx)
                else:
                    return "words {{{}}}".format(
                        ",".join("i%+d" % d for d in positions)
                    )

        replacement = f"{self.original_tag} -> {self.replacement_tag}"
        conditions = (" if " if self._conditions else "") + ", and ".join(
            condition_to_str(f, v) for (f, v) in self._conditions
        )
        return replacement + conditions
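
(Editor's sketch: constructing a Rule by hand and applying it to one invented tagged sentence; Pos comes from nltk.tag.brill.)

from nltk.tag.brill import Pos
from nltk.tbl.rule import Rule

# change VB -> NN when a DT appears one or two words to the left
r = Rule("001", "VB", "NN", [(Pos([-2, -1]), "DT")])

sent = [("the", "DT"), ("dog", "VB"), ("barks", "VBZ")]
print(r.apply(sent))  # [1] -- index of the retagged token; sent is mutated
print(sent)  # [('the', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')]
print(r.format("verbose"))  # VB -> NN if the Pos of words i-2...i-1 is "DT"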
Backend/venv/lib/python3.12/site-packages/nltk/tbl/template.py
@@ -0,0 +1,325 @@
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
#   based on previous (nltk2) version by
#   Christopher Maloof, Edward Loper, Steven Bird
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import itertools as it
from abc import ABCMeta, abstractmethod

from nltk.tbl.feature import Feature
from nltk.tbl.rule import Rule


class BrillTemplateI(metaclass=ABCMeta):
    """
    An interface for generating lists of transformational rules that
    apply at given sentence positions. ``BrillTemplateI`` is used by
    ``Brill`` training algorithms to generate candidate rules.
    """

    @abstractmethod
    def applicable_rules(self, tokens, i, correctTag):
        """
        Return a list of the transformational rules that would correct
        the ``i``-th subtoken's tag in the given token. In particular,
        return a list of zero or more rules that would change
        ``tokens[i][1]`` to ``correctTag``, if applied to ``token[i]``.

        If the ``i``-th token already has the correct tag (i.e., if
        ``tagged_tokens[i][1] == correctTag``), then
        ``applicable_rules()`` should return the empty list.

        :param tokens: The tagged tokens being tagged.
        :type tokens: list(tuple)
        :param i: The index of the token whose tag should be corrected.
        :type i: int
        :param correctTag: The correct tag for the ``i``-th token.
        :type correctTag: any
        :rtype: list(BrillRule)
        """

    @abstractmethod
    def get_neighborhood(self, token, index):
        """
        Returns the set of indices *i* such that
        ``applicable_rules(token, i, ...)`` depends on the value of
        the *index*th token of *token*.

        This method is used by the "fast" Brill tagger trainer.

        :param token: The tokens being tagged.
        :type token: list(tuple)
        :param index: The index whose neighborhood should be returned.
        :type index: int
        :rtype: set
        """


class Template(BrillTemplateI):
    """
    A tbl Template that generates a list of L{Rule}s that apply at a given sentence
    position. In particular, each C{Template} is parameterized by a list of
    independent features (a combination of a specific
    property to extract and a list C{L} of relative positions at which to extract
    it) and generates all Rules that:

    - use the given features, each at its own independent position; and
    - are applicable to the given token.
    """

    ALLTEMPLATES = []
    # record a unique id of form "001", for each template created
    # _ids = it.count(0)

    def __init__(self, *features):
        """
        Construct a Template for generating Rules.

        Takes a list of Features. A C{Feature} is a combination
        of a specific property and its relative positions and should be
        a subclass of L{nltk.tbl.feature.Feature}.

        An alternative calling convention (kept for backwards compatibility,
        but less expressive as it only permits one feature type) is
        Template(Feature, (start1, end1), (start2, end2), ...)
        In new code, that would be better written
        Template(Feature(start1, end1), Feature(start2, end2), ...)

        For instance, importing some features

        >>> from nltk.tbl.template import Template
        >>> from nltk.tag.brill import Word, Pos

        Create some features

        >>> wfeat1, wfeat2, pfeat = (Word([-1]), Word([1,2]), Pos([-2,-1]))

        Create a single-feature template

        >>> Template(wfeat1)
        Template(Word([-1]))

        Or a two-feature one

        >>> Template(wfeat1, wfeat2)
        Template(Word([-1]),Word([1, 2]))

        Or a three-feature one with two different feature types

        >>> Template(wfeat1, wfeat2, pfeat)
        Template(Word([-1]),Word([1, 2]),Pos([-2, -1]))

        deprecated api: Feature subclass, followed by list of (start,end) pairs
        (permits only a single Feature)

        >>> Template(Word, (-2,-1), (0,0))
        Template(Word([-2, -1]),Word([0]))

        Incorrect specification raises TypeError

        >>> Template(Word, (-2,-1), Pos, (0,0))
        Traceback (most recent call last):
            File "<stdin>", line 1, in <module>
            File "nltk/tag/tbl/template.py", line 143, in __init__
                raise TypeError(
        TypeError: expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ...

        :type features: list of Features
        :param features: the features to build this Template on
        """
        # determine the calling form: either
        # Template(Feature, args1, [args2, ...)]
        # Template(Feature1(args), Feature2(args), ...)
        if all(isinstance(f, Feature) for f in features):
            self._features = features
        elif issubclass(features[0], Feature) and all(
            isinstance(a, tuple) for a in features[1:]
        ):
            self._features = [features[0](*tp) for tp in features[1:]]
        else:
            raise TypeError(
                "expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ..."
            )
        self.id = f"{len(self.ALLTEMPLATES):03d}"
        self.ALLTEMPLATES.append(self)

    def __repr__(self):
        return "{}({})".format(
            self.__class__.__name__,
            ",".join([str(f) for f in self._features]),
        )

    def applicable_rules(self, tokens, index, correct_tag):
        if tokens[index][1] == correct_tag:
            return []

        # For each of this Template's features, find the conditions
        # that are applicable for the given token.
        # Then, generate one Rule for each combination of features
        # (the crossproduct of the conditions).

        applicable_conditions = self._applicable_conditions(tokens, index)
        xs = list(it.product(*applicable_conditions))
        return [Rule(self.id, tokens[index][1], correct_tag, tuple(x)) for x in xs]

    def _applicable_conditions(self, tokens, index):
        """
        :returns: A set of all conditions for rules
            that are applicable to C{tokens[index]}.
        """
        conditions = []

        for feature in self._features:
            conditions.append([])
            for pos in feature.positions:
                if not (0 <= index + pos < len(tokens)):
                    continue
                value = feature.extract_property(tokens, index + pos)
                conditions[-1].append((feature, value))
        return conditions

    def get_neighborhood(self, tokens, index):
        # inherit docs from BrillTemplateI

        # applicable_rules(tokens, index, ...) depends on index.
        neighborhood = {index}  # set literal for python 2.7+

        # applicable_rules(tokens, i, ...) depends on index if
        # i+start < index <= i+end.

        allpositions = [0] + [p for feat in self._features for p in feat.positions]
        start, end = min(allpositions), max(allpositions)
        s = max(0, index + (-end))
        e = min(index + (-start) + 1, len(tokens))
        for i in range(s, e):
            neighborhood.add(i)
        return neighborhood

    @classmethod
    def expand(cls, featurelists, combinations=None, skipintersecting=True):
        """
        Factory method to mass generate Templates from a list L of lists of Features.

        #With combinations=(k1, k2), the function will in all possible ways choose k1 ... k2
        #of the sublists in L; it will output all Templates formed by the Cartesian product
        #of this selection, with duplicates and other semantically equivalent
        #forms removed. Default for combinations is (1, len(L)).

        The feature lists may have been specified
        manually, or generated from Feature.expand(). For instance,

        >>> from nltk.tbl.template import Template
        >>> from nltk.tag.brill import Word, Pos

        #creating some features
        >>> (wd_0, wd_01) = (Word([0]), Word([0,1]))

        >>> (pos_m2, pos_m33) = (Pos([-2]), Pos([-3,-2,-1,0,1,2,3]))

        >>> list(Template.expand([[wd_0], [pos_m2]]))
        [Template(Word([0])), Template(Pos([-2])), Template(Pos([-2]),Word([0]))]

        >>> list(Template.expand([[wd_0, wd_01], [pos_m2]]))
        [Template(Word([0])), Template(Word([0, 1])), Template(Pos([-2])), Template(Pos([-2]),Word([0])), Template(Pos([-2]),Word([0, 1]))]

        #note: with Feature.expand(), it is very easy to generate more templates
        #than your system can handle -- for instance,
        >>> wordtpls = Word.expand([-2,-1,0,1], [1,2], excludezero=False)
        >>> len(wordtpls)
        7

        >>> postpls = Pos.expand([-3,-2,-1,0,1,2], [1,2,3], excludezero=True)
        >>> len(postpls)
        9

        #and now the Cartesian product of all non-empty combinations of two wordtpls and
        #two postpls, with semantic equivalents removed
        >>> templates = list(Template.expand([wordtpls, wordtpls, postpls, postpls]))
        >>> len(templates)
        713

        #Correspondingly, a call such as Template.expand([[wd_0, wd_01], [pos_m2, Pos([-1])]])
        #will return a list of eight templates
             Template(Word([0])),
             Template(Word([0, 1])),
             Template(Pos([-2])),
             Template(Pos([-1])),
             Template(Pos([-2]),Word([0])),
             Template(Pos([-1]),Word([0])),
             Template(Pos([-2]),Word([0, 1])),
             Template(Pos([-1]),Word([0, 1]))

        #Templates where one feature is a subset of another, such as
        #Template(Word([0,1]), Word([1])), will not appear in the output.
        #By default, this non-subset constraint is tightened to disjointness:
        #Templates of type Template(Word([0,1]), Word([1,2])) will also be filtered out.
        #With skipintersecting=False, such Templates are allowed.

        WARNING: this method makes it very easy to fill all your memory when training
        generated templates on any real-world corpus

        :param featurelists: lists of Features, whose Cartesian product will return a set of Templates
        :type featurelists: list of (list of Features)
        :param combinations: given n featurelists: if combinations=k, all generated Templates will have
            k features; if combinations=(k1,k2) they will have k1..k2 features; if None, defaults to 1..n
        :type combinations: None, int, or (int, int)
        :param skipintersecting: if True, do not output intersecting Templates (non-disjoint positions for some feature)
        :type skipintersecting: bool
        :returns: generator of Templates

        """

        def nonempty_powerset(xs):  # xs is a list
            # cf. itertools docs: nonempty_powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)

            # find the correct range given combinations, one of {None, k, (k1,k2)}
            k = combinations  # for brevity
            if k is None:
                # n over 1 .. n over n (all non-empty combinations)
                combrange = (1, len(xs) + 1)
            elif isinstance(k, int):
                # n over k (only)
                combrange = (k, k + 1)
            else:
                # n over k1, n over k1+1, ... n over k2
                combrange = (k[0], k[1] + 1)
            return it.chain.from_iterable(
                it.combinations(xs, r) for r in range(*combrange)
            )

        seentemplates = set()
        for picks in nonempty_powerset(featurelists):
            for pick in it.product(*picks):
                if any(
                    i != j and x.issuperset(y)
                    for (i, x) in enumerate(pick)
                    for (j, y) in enumerate(pick)
                ):
                    continue
                if skipintersecting and any(
                    i != j and x.intersects(y)
                    for (i, x) in enumerate(pick)
                    for (j, y) in enumerate(pick)
                ):
                    continue
                thistemplate = cls(*sorted(pick))
                strpick = str(thistemplate)
                #!!FIXME --this is hackish
                if strpick in seentemplates:  # already added
                    cls._poptemplate()
                    continue
                seentemplates.add(strpick)
                yield thistemplate

    @classmethod
    def _cleartemplates(cls):
        cls.ALLTEMPLATES = []

    @classmethod
    def _poptemplate(cls):
        return cls.ALLTEMPLATES.pop() if cls.ALLTEMPLATES else None
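
(Editor's sketch: how a trainer uses a Template to propose candidate Rules for one mistagged position; the sentence is invented.)

from nltk.tag.brill import Pos, Word
from nltk.tbl.template import Template

tpl = Template(Pos([-1]), Word([0]))
sent = [("the", "DT"), ("dog", "VB"), ("barks", "VBZ")]  # "dog" should be NN
for rule in tpl.applicable_rules(sent, 1, "NN"):
    print(rule.format("str"))
# -> VB->NN if Pos:DT@[-1] & Word:dog@[0]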