updates
This commit is contained in:
347
Backend/venv/lib/python3.12/site-packages/nltk/classify/util.py
Normal file
347
Backend/venv/lib/python3.12/site-packages/nltk/classify/util.py
Normal file
@@ -0,0 +1,347 @@
|
||||
# Natural Language Toolkit: Classifier Utility Functions
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Utility functions and classes for classifiers.
|
||||
"""
|
||||
|
||||
import math
|
||||
|
||||
# from nltk.util import Deprecated
|
||||
import nltk.classify.util # for accuracy & log_likelihood
|
||||
from nltk.util import LazyMap
|
||||
|
||||
######################################################################
|
||||
# { Helper Functions
|
||||
######################################################################
|
||||
|
||||
|
||||
# alternative name possibility: 'map_featurefunc()'?
|
||||
# alternative name possibility: 'detect_features()'?
|
||||
# alternative name possibility: 'map_featuredetect()'?
|
||||
# or.. just have users use LazyMap directly?
|
||||
def apply_features(feature_func, toks, labeled=None):
|
||||
"""
|
||||
Use the ``LazyMap`` class to construct a lazy list-like
|
||||
object that is analogous to ``map(feature_func, toks)``. In
|
||||
particular, if ``labeled=False``, then the returned list-like
|
||||
object's values are equal to::
|
||||
|
||||
[feature_func(tok) for tok in toks]
|
||||
|
||||
If ``labeled=True``, then the returned list-like object's values
|
||||
are equal to::
|
||||
|
||||
[(feature_func(tok), label) for (tok, label) in toks]
|
||||
|
||||
The primary purpose of this function is to avoid the memory
|
||||
overhead involved in storing all the featuresets for every token
|
||||
in a corpus. Instead, these featuresets are constructed lazily,
|
||||
as-needed. The reduction in memory overhead can be especially
|
||||
significant when the underlying list of tokens is itself lazy (as
|
||||
is the case with many corpus readers).
|
||||
|
||||
:param feature_func: The function that will be applied to each
|
||||
token. It should return a featureset -- i.e., a dict
|
||||
mapping feature names to feature values.
|
||||
:param toks: The list of tokens to which ``feature_func`` should be
|
||||
applied. If ``labeled=True``, then the list elements will be
|
||||
passed directly to ``feature_func()``. If ``labeled=False``,
|
||||
then the list elements should be tuples ``(tok,label)``, and
|
||||
``tok`` will be passed to ``feature_func()``.
|
||||
:param labeled: If true, then ``toks`` contains labeled tokens --
|
||||
i.e., tuples of the form ``(tok, label)``. (Default:
|
||||
auto-detect based on types.)
|
||||
"""
|
||||
if labeled is None:
|
||||
labeled = toks and isinstance(toks[0], (tuple, list))
|
||||
if labeled:
|
||||
|
||||
def lazy_func(labeled_token):
|
||||
return (feature_func(labeled_token[0]), labeled_token[1])
|
||||
|
||||
return LazyMap(lazy_func, toks)
|
||||
else:
|
||||
return LazyMap(feature_func, toks)
|
||||
|
||||
|
||||
def attested_labels(tokens):
|
||||
"""
|
||||
:return: A list of all labels that are attested in the given list
|
||||
of tokens.
|
||||
:rtype: list of (immutable)
|
||||
:param tokens: The list of classified tokens from which to extract
|
||||
labels. A classified token has the form ``(token, label)``.
|
||||
:type tokens: list
|
||||
"""
|
||||
return tuple({label for (tok, label) in tokens})
|
||||
|
||||
|
||||
def log_likelihood(classifier, gold):
|
||||
results = classifier.prob_classify_many([fs for (fs, l) in gold])
|
||||
ll = [pdist.prob(l) for ((fs, l), pdist) in zip(gold, results)]
|
||||
return math.log(sum(ll) / len(ll))
|
||||
|
||||
|
||||
def accuracy(classifier, gold):
|
||||
results = classifier.classify_many([fs for (fs, l) in gold])
|
||||
correct = [l == r for ((fs, l), r) in zip(gold, results)]
|
||||
if correct:
|
||||
return sum(correct) / len(correct)
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
class CutoffChecker:
|
||||
"""
|
||||
A helper class that implements cutoff checks based on number of
|
||||
iterations and log likelihood.
|
||||
|
||||
Accuracy cutoffs are also implemented, but they're almost never
|
||||
a good idea to use.
|
||||
"""
|
||||
|
||||
def __init__(self, cutoffs):
|
||||
self.cutoffs = cutoffs.copy()
|
||||
if "min_ll" in cutoffs:
|
||||
cutoffs["min_ll"] = -abs(cutoffs["min_ll"])
|
||||
if "min_lldelta" in cutoffs:
|
||||
cutoffs["min_lldelta"] = abs(cutoffs["min_lldelta"])
|
||||
self.ll = None
|
||||
self.acc = None
|
||||
self.iter = 1
|
||||
|
||||
def check(self, classifier, train_toks):
|
||||
cutoffs = self.cutoffs
|
||||
self.iter += 1
|
||||
if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
|
||||
return True # iteration cutoff.
|
||||
|
||||
new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
|
||||
if math.isnan(new_ll):
|
||||
return True
|
||||
|
||||
if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
|
||||
if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
|
||||
return True # log likelihood cutoff
|
||||
if (
|
||||
"min_lldelta" in cutoffs
|
||||
and self.ll
|
||||
and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"]))
|
||||
):
|
||||
return True # log likelihood delta cutoff
|
||||
self.ll = new_ll
|
||||
|
||||
if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
|
||||
new_acc = nltk.classify.util.log_likelihood(classifier, train_toks)
|
||||
if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
|
||||
return True # log likelihood cutoff
|
||||
if (
|
||||
"min_accdelta" in cutoffs
|
||||
and self.acc
|
||||
and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"]))
|
||||
):
|
||||
return True # log likelihood delta cutoff
|
||||
self.acc = new_acc
|
||||
|
||||
return False # no cutoff reached.
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Demos
|
||||
######################################################################
|
||||
|
||||
|
||||
def names_demo_features(name):
|
||||
features = {}
|
||||
features["alwayson"] = True
|
||||
features["startswith"] = name[0].lower()
|
||||
features["endswith"] = name[-1].lower()
|
||||
for letter in "abcdefghijklmnopqrstuvwxyz":
|
||||
features["count(%s)" % letter] = name.lower().count(letter)
|
||||
features["has(%s)" % letter] = letter in name.lower()
|
||||
return features
|
||||
|
||||
|
||||
def binary_names_demo_features(name):
|
||||
features = {}
|
||||
features["alwayson"] = True
|
||||
features["startswith(vowel)"] = name[0].lower() in "aeiouy"
|
||||
features["endswith(vowel)"] = name[-1].lower() in "aeiouy"
|
||||
for letter in "abcdefghijklmnopqrstuvwxyz":
|
||||
features["count(%s)" % letter] = name.lower().count(letter)
|
||||
features["has(%s)" % letter] = letter in name.lower()
|
||||
features["startswith(%s)" % letter] = letter == name[0].lower()
|
||||
features["endswith(%s)" % letter] = letter == name[-1].lower()
|
||||
return features
|
||||
|
||||
|
||||
def names_demo(trainer, features=names_demo_features):
|
||||
import random
|
||||
|
||||
from nltk.corpus import names
|
||||
|
||||
# Construct a list of classified names, using the names corpus.
|
||||
namelist = [(name, "male") for name in names.words("male.txt")] + [
|
||||
(name, "female") for name in names.words("female.txt")
|
||||
]
|
||||
|
||||
# Randomly split the names into a test & train set.
|
||||
random.seed(123456)
|
||||
random.shuffle(namelist)
|
||||
train = namelist[:5000]
|
||||
test = namelist[5000:5500]
|
||||
|
||||
# Train up a classifier.
|
||||
print("Training classifier...")
|
||||
classifier = trainer([(features(n), g) for (n, g) in train])
|
||||
|
||||
# Run the classifier on the test data.
|
||||
print("Testing classifier...")
|
||||
acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
|
||||
print("Accuracy: %6.4f" % acc)
|
||||
|
||||
# For classifiers that can find probabilities, show the log
|
||||
# likelihood and some sample probability distributions.
|
||||
try:
|
||||
test_featuresets = [features(n) for (n, g) in test]
|
||||
pdists = classifier.prob_classify_many(test_featuresets)
|
||||
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
|
||||
print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
|
||||
print()
|
||||
print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
|
||||
for (name, gender), pdist in list(zip(test, pdists))[:5]:
|
||||
if gender == "male":
|
||||
fmt = " %-15s *%6.4f %6.4f"
|
||||
else:
|
||||
fmt = " %-15s %6.4f *%6.4f"
|
||||
print(fmt % (name, pdist.prob("male"), pdist.prob("female")))
|
||||
except NotImplementedError:
|
||||
pass
|
||||
|
||||
# Return the classifier
|
||||
return classifier
|
||||
|
||||
|
||||
def partial_names_demo(trainer, features=names_demo_features):
|
||||
import random
|
||||
|
||||
from nltk.corpus import names
|
||||
|
||||
male_names = names.words("male.txt")
|
||||
female_names = names.words("female.txt")
|
||||
|
||||
random.seed(654321)
|
||||
random.shuffle(male_names)
|
||||
random.shuffle(female_names)
|
||||
|
||||
# Create a list of male names to be used as positive-labeled examples for training
|
||||
positive = map(features, male_names[:2000])
|
||||
|
||||
# Create a list of male and female names to be used as unlabeled examples
|
||||
unlabeled = map(features, male_names[2000:2500] + female_names[:500])
|
||||
|
||||
# Create a test set with correctly-labeled male and female names
|
||||
test = [(name, True) for name in male_names[2500:2750]] + [
|
||||
(name, False) for name in female_names[500:750]
|
||||
]
|
||||
|
||||
random.shuffle(test)
|
||||
|
||||
# Train up a classifier.
|
||||
print("Training classifier...")
|
||||
classifier = trainer(positive, unlabeled)
|
||||
|
||||
# Run the classifier on the test data.
|
||||
print("Testing classifier...")
|
||||
acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
|
||||
print("Accuracy: %6.4f" % acc)
|
||||
|
||||
# For classifiers that can find probabilities, show the log
|
||||
# likelihood and some sample probability distributions.
|
||||
try:
|
||||
test_featuresets = [features(n) for (n, m) in test]
|
||||
pdists = classifier.prob_classify_many(test_featuresets)
|
||||
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
|
||||
print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
|
||||
print()
|
||||
print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
|
||||
for (name, is_male), pdist in zip(test, pdists)[:5]:
|
||||
if is_male == True:
|
||||
fmt = " %-15s *%6.4f %6.4f"
|
||||
else:
|
||||
fmt = " %-15s %6.4f *%6.4f"
|
||||
print(fmt % (name, pdist.prob(True), pdist.prob(False)))
|
||||
except NotImplementedError:
|
||||
pass
|
||||
|
||||
# Return the classifier
|
||||
return classifier
|
||||
|
||||
|
||||
_inst_cache = {}
|
||||
|
||||
|
||||
def wsd_demo(trainer, word, features, n=1000):
|
||||
import random
|
||||
|
||||
from nltk.corpus import senseval
|
||||
|
||||
# Get the instances.
|
||||
print("Reading data...")
|
||||
global _inst_cache
|
||||
if word not in _inst_cache:
|
||||
_inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
|
||||
instances = _inst_cache[word][:]
|
||||
if n > len(instances):
|
||||
n = len(instances)
|
||||
senses = list({l for (i, l) in instances})
|
||||
print(" Senses: " + " ".join(senses))
|
||||
|
||||
# Randomly split the names into a test & train set.
|
||||
print("Splitting into test & train...")
|
||||
random.seed(123456)
|
||||
random.shuffle(instances)
|
||||
train = instances[: int(0.8 * n)]
|
||||
test = instances[int(0.8 * n) : n]
|
||||
|
||||
# Train up a classifier.
|
||||
print("Training classifier...")
|
||||
classifier = trainer([(features(i), l) for (i, l) in train])
|
||||
|
||||
# Run the classifier on the test data.
|
||||
print("Testing classifier...")
|
||||
acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
|
||||
print("Accuracy: %6.4f" % acc)
|
||||
|
||||
# For classifiers that can find probabilities, show the log
|
||||
# likelihood and some sample probability distributions.
|
||||
try:
|
||||
test_featuresets = [features(i) for (i, n) in test]
|
||||
pdists = classifier.prob_classify_many(test_featuresets)
|
||||
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
|
||||
print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
|
||||
except NotImplementedError:
|
||||
pass
|
||||
|
||||
# Return the classifier
|
||||
return classifier
|
||||
|
||||
|
||||
def check_megam_config():
|
||||
"""
|
||||
Checks whether the MEGAM binary is configured.
|
||||
"""
|
||||
try:
|
||||
_megam_bin
|
||||
except NameError as e:
|
||||
err_msg = str(
|
||||
"Please configure your megam binary first, e.g.\n"
|
||||
">>> nltk.config_megam('/usr/bin/local/megam')"
|
||||
)
|
||||
raise NameError(err_msg) from e
|
||||
Reference in New Issue
Block a user