This commit is contained in:
Iliyan Angelov
2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions


@@ -0,0 +1,101 @@
# Natural Language Toolkit: Classifiers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Classes and interfaces for labeling tokens with category labels (or
"class labels"). Typically, labels are represented with strings
(such as ``'health'`` or ``'sports'``). Classifiers can be used to
perform a wide range of classification tasks. For example,
classifiers can be used...
- to classify documents by topic
- to classify ambiguous words by which word sense is intended
- to classify acoustic signals by which phoneme they represent
- to classify sentences by their author
Features
========
In order to decide which category label is appropriate for a given
token, classifiers examine one or more 'features' of the token. These
"features" are typically chosen by hand, and indicate which aspects
of the token are relevant to the classification decision. For
example, a document classifier might use a separate feature for each
word, recording how often that word occurred in the document.
Featuresets
===========
The features describing a token are encoded using a "featureset",
which is a dictionary that maps from "feature names" to "feature
values". Feature names are unique strings that indicate what aspect
of the token is encoded by the feature. Examples include
``'prevword'``, for a feature whose value is the previous word; and
``'contains-word(library)'`` for a feature that is true when a document
contains the word ``'library'``. Feature values are typically
booleans, numbers, or strings, depending on which feature they
describe.
Featuresets are typically constructed using a "feature detector"
(also known as a "feature extractor"). A feature detector is a
function that takes a token (and sometimes information about its
context) as its input, and returns a featureset describing that token.
For example, the following feature detector converts a document
(stored as a list of words) to a featureset describing the set of
words included in the document:
>>> # Define a feature detector function.
>>> def document_features(document):
...     return dict([('contains-word(%s)' % w, True) for w in document])
Feature detectors are typically applied to each token before it is fed
to the classifier:
>>> # Classify each Gutenberg document.
>>> from nltk.corpus import gutenberg
>>> for fileid in gutenberg.fileids(): # doctest: +SKIP
...     doc = gutenberg.words(fileid) # doctest: +SKIP
...     print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP
The parameters that a feature detector expects will vary, depending on
the task and the needs of the feature detector. For example, a
feature detector for word sense disambiguation (WSD) might take as its
input a sentence, and the index of a word that should be classified,
and return a featureset for that word. The following feature detector
for WSD includes features describing the left and right contexts of
the target word:
>>> def wsd_features(sentence, index):
...     featureset = {}
...     for i in range(max(0, index-3), index):
...         featureset['left-context(%s)' % sentence[i]] = True
...     for i in range(index, min(index+3, len(sentence))):
...         featureset['right-context(%s)' % sentence[i]] = True
...     return featureset
Training Classifiers
====================
Most classifiers are built by training them on a list of hand-labeled
examples, known as the "training set". Training sets are represented
as lists of ``(featuredict, label)`` tuples.
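For example, a classifier can be trained on a small hand-labeled training
set as follows (a minimal sketch; the feature names and labels are purely
illustrative):
>>> train_set = [({'contains-word(ball)': True}, 'sports'),
...              ({'contains-word(vote)': True}, 'politics')]
>>> from nltk.classify import NaiveBayesClassifier
>>> classifier = NaiveBayesClassifier.train(train_set) # doctest: +SKIP
>>> classifier.classify({'contains-word(ball)': True}) # doctest: +SKIP
'sports'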
"""
from nltk.classify.api import ClassifierI, MultiClassifierI
from nltk.classify.decisiontree import DecisionTreeClassifier
from nltk.classify.maxent import (
BinaryMaxentFeatureEncoding,
ConditionalExponentialClassifier,
MaxentClassifier,
TypedMaxentFeatureEncoding,
)
from nltk.classify.megam import call_megam, config_megam
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier
from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify.senna import Senna
from nltk.classify.textcat import TextCat
from nltk.classify.util import accuracy, apply_features, log_likelihood
from nltk.classify.weka import WekaClassifier, config_weka


@@ -0,0 +1,195 @@
# Natural Language Toolkit: Classifier Interface
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Interfaces for labeling tokens with category labels (or "class labels").
``ClassifierI`` is a standard interface for "single-category
classification", in which the set of categories is known, the number
of categories is finite, and each text belongs to exactly one
category.
``MultiClassifierI`` is a standard interface for "multi-category
classification", which is like single-category classification except
that each text belongs to zero or more categories.
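The following is a minimal sketch of a ``ClassifierI`` implementation
(illustrative only; a real classifier would inspect the featureset rather
than return a fixed label):
>>> from nltk.classify.api import ClassifierI
>>> class MajorityClassifier(ClassifierI):
...     def __init__(self, label):
...         self._label = label
...     def labels(self):
...         return [self._label]
...     def classify(self, featureset):
...         return self._label
>>> MajorityClassifier('spam').classify({'contains(free)': True})
'spam'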
"""
from nltk.internals import overridden
##//////////////////////////////////////////////////////
# { Classification Interfaces
##//////////////////////////////////////////////////////
class ClassifierI:
"""
A processing interface for labeling tokens with a single category
label (or "class"). Labels are typically strs or
ints, but can be any immutable type. The set of labels
that the classifier chooses from must be fixed and finite.
Subclasses must define:
- ``labels()``
- either ``classify()`` or ``classify_many()`` (or both)
Subclasses may define:
- either ``prob_classify()`` or ``prob_classify_many()`` (or both)
"""
def labels(self):
"""
:return: the list of category labels used by this classifier.
:rtype: list of (immutable)
"""
raise NotImplementedError()
def classify(self, featureset):
"""
:return: the most appropriate label for the given featureset.
:rtype: label
"""
if overridden(self.classify_many):
return self.classify_many([featureset])[0]
else:
raise NotImplementedError()
def prob_classify(self, featureset):
"""
:return: a probability distribution over labels for the given
featureset.
:rtype: ProbDistI
"""
if overridden(self.prob_classify_many):
return self.prob_classify_many([featureset])[0]
else:
raise NotImplementedError()
def classify_many(self, featuresets):
"""
Apply ``self.classify()`` to each element of ``featuresets``. I.e.:
return [self.classify(fs) for fs in featuresets]
:rtype: list(label)
"""
return [self.classify(fs) for fs in featuresets]
def prob_classify_many(self, featuresets):
"""
Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.:
return [self.prob_classify(fs) for fs in featuresets]
:rtype: list(ProbDistI)
"""
return [self.prob_classify(fs) for fs in featuresets]
class MultiClassifierI:
"""
A processing interface for labeling tokens with zero or more
category labels (or "labels"). Labels are typically strs
or ints, but can be any immutable type. The set of labels
that the multi-classifier chooses from must be fixed and finite.
Subclasses must define:
- ``labels()``
- either ``classify()`` or ``classify_many()`` (or both)
Subclasses may define:
- either ``prob_classify()`` or ``prob_classify_many()`` (or both)
"""
def labels(self):
"""
:return: the list of category labels used by this classifier.
:rtype: list of (immutable)
"""
raise NotImplementedError()
def classify(self, featureset):
"""
:return: the most appropriate set of labels for the given featureset.
:rtype: set(label)
"""
if overridden(self.classify_many):
return self.classify_many([featureset])[0]
else:
raise NotImplementedError()
def prob_classify(self, featureset):
"""
:return: a probability distribution over sets of labels for the
given featureset.
:rtype: ProbDistI
"""
if overridden(self.prob_classify_many):
return self.prob_classify_many([featureset])[0]
else:
raise NotImplementedError()
def classify_many(self, featuresets):
"""
Apply ``self.classify()`` to each element of ``featuresets``. I.e.:
return [self.classify(fs) for fs in featuresets]
:rtype: list(set(label))
"""
return [self.classify(fs) for fs in featuresets]
def prob_classify_many(self, featuresets):
"""
Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.:
return [self.prob_classify(fs) for fs in featuresets]
:rtype: list(ProbDistI)
"""
return [self.prob_classify(fs) for fs in featuresets]
# # [XX] IN PROGRESS:
# class SequenceClassifierI:
# """
# A processing interface for labeling sequences of tokens with a
# single category label (or "class"). Labels are typically
# strs or ints, but can be any immutable type. The set
# of labels that the classifier chooses from must be fixed and
# finite.
# """
# def labels(self):
# """
# :return: the list of category labels used by this classifier.
# :rtype: list of (immutable)
# """
# raise NotImplementedError()
# def prob_classify(self, featureset):
# """
# Return a probability distribution over labels for the given
# featureset.
# If ``featureset`` is a list of featuresets, then return a
# corresponding list containing the probability distribution
# over labels for each of the given featuresets, where the
# *i*\ th element of this list is the most appropriate label for
# the *i*\ th element of ``featuresets``.
# """
# raise NotImplementedError()
# def classify(self, featureset):
# """
# Return the most appropriate label for the given featureset.
# If ``featureset`` is a list of featuresets, then return a
# corresponding list containing the most appropriate label for
# each of the given featuresets, where the *i*\ th element of
# this list is the most appropriate label for the *i*\ th element
# of ``featuresets``.
# """
# raise NotImplementedError()


@@ -0,0 +1,349 @@
# Natural Language Toolkit: Decision Tree Classifiers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A classifier model that decides which label to assign to a token on
the basis of a tree structure, where branches correspond to conditions
on feature values, and leaves correspond to label assignments.
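For example (a minimal sketch; the toy training data and the printed
output are illustrative only):
>>> from nltk.classify import DecisionTreeClassifier
>>> toy = [({'size': 'big'}, 'heavy'), ({'size': 'small'}, 'light')] * 5
>>> tree = DecisionTreeClassifier.train(toy, entropy_cutoff=0, support_cutoff=0) # doctest: +SKIP
>>> tree.classify({'size': 'big'}) # doctest: +SKIP
'heavy'
>>> print(tree.pseudocode()) # doctest: +SKIP
if size == 'big': return 'heavy'
if size == 'small': return 'light'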
"""
from collections import defaultdict
from nltk.classify.api import ClassifierI
from nltk.probability import FreqDist, MLEProbDist, entropy
class DecisionTreeClassifier(ClassifierI):
def __init__(self, label, feature_name=None, decisions=None, default=None):
"""
:param label: The most likely label for tokens that reach
this node in the decision tree. If this decision tree
has no children, then this label will be assigned to
any token that reaches this decision tree.
:param feature_name: The name of the feature that this
decision tree selects for.
:param decisions: A dictionary mapping from feature values
for the feature identified by ``feature_name`` to
child decision trees.
:param default: The child that will be used if the value of
feature ``feature_name`` does not match any of the keys in
``decisions``. This is used when constructing binary
decision trees.
"""
self._label = label
self._fname = feature_name
self._decisions = decisions
self._default = default
def labels(self):
labels = [self._label]
if self._decisions is not None:
for dt in self._decisions.values():
labels.extend(dt.labels())
if self._default is not None:
labels.extend(self._default.labels())
return list(set(labels))
def classify(self, featureset):
# Decision leaf:
if self._fname is None:
return self._label
# Decision tree:
fval = featureset.get(self._fname)
if fval in self._decisions:
return self._decisions[fval].classify(featureset)
elif self._default is not None:
return self._default.classify(featureset)
else:
return self._label
def error(self, labeled_featuresets):
errors = 0
for featureset, label in labeled_featuresets:
if self.classify(featureset) != label:
errors += 1
return errors / len(labeled_featuresets)
def pretty_format(self, width=70, prefix="", depth=4):
"""
Return a string containing a pretty-printed version of this
decision tree. Each line in this string corresponds to a
single decision tree node or leaf, and indentation is used to
display the structure of the decision tree.
"""
# [xx] display default!!
if self._fname is None:
n = width - len(prefix) - 15
return "{}{} {}\n".format(prefix, "." * n, self._label)
s = ""
for i, (fval, result) in enumerate(
sorted(
self._decisions.items(),
key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
)
):
hdr = f"{prefix}{self._fname}={fval}? "
n = width - 15 - len(hdr)
s += "{}{} {}\n".format(hdr, "." * (n), result._label)
if result._fname is not None and depth > 1:
s += result.pretty_format(width, prefix + " ", depth - 1)
if self._default is not None:
n = width - len(prefix) - 21
s += "{}else: {} {}\n".format(prefix, "." * n, self._default._label)
if self._default._fname is not None and depth > 1:
s += self._default.pretty_format(width, prefix + " ", depth - 1)
return s
def pseudocode(self, prefix="", depth=4):
"""
Return a string representation of this decision tree that
expresses the decisions it makes as a nested set of pseudocode
if statements.
"""
if self._fname is None:
return f"{prefix}return {self._label!r}\n"
s = ""
for fval, result in sorted(
self._decisions.items(),
key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
):
s += f"{prefix}if {self._fname} == {fval!r}: "
if result._fname is not None and depth > 1:
s += "\n" + result.pseudocode(prefix + " ", depth - 1)
else:
s += f"return {result._label!r}\n"
if self._default is not None:
if len(self._decisions) == 1:
s += "{}if {} != {!r}: ".format(
prefix, self._fname, list(self._decisions.keys())[0]
)
else:
s += f"{prefix}else: "
if self._default._fname is not None and depth > 1:
s += "\n" + self._default.pseudocode(prefix + " ", depth - 1)
else:
s += f"return {self._default._label!r}\n"
return s
def __str__(self):
return self.pretty_format()
@staticmethod
def train(
labeled_featuresets,
entropy_cutoff=0.05,
depth_cutoff=100,
support_cutoff=10,
binary=False,
feature_values=None,
verbose=False,
):
"""
:param binary: If true, then treat all feature/value pairs as
individual binary features, rather than using a single n-way
branch for each feature.
"""
# Collect a list of all feature names.
feature_names = set()
for featureset, label in labeled_featuresets:
for fname in featureset:
feature_names.add(fname)
# Collect a list of the values each feature can take.
if feature_values is None and binary:
feature_values = defaultdict(set)
for featureset, label in labeled_featuresets:
for fname, fval in featureset.items():
feature_values[fname].add(fval)
# Start with a stump.
if not binary:
tree = DecisionTreeClassifier.best_stump(
feature_names, labeled_featuresets, verbose
)
else:
tree = DecisionTreeClassifier.best_binary_stump(
feature_names, labeled_featuresets, feature_values, verbose
)
# Refine the stump.
tree.refine(
labeled_featuresets,
entropy_cutoff,
depth_cutoff - 1,
support_cutoff,
binary,
feature_values,
verbose,
)
# Return it
return tree
@staticmethod
def leaf(labeled_featuresets):
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
return DecisionTreeClassifier(label)
@staticmethod
def stump(feature_name, labeled_featuresets):
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
# Find the best label for each value.
freqs = defaultdict(FreqDist) # freq(label|value)
for featureset, label in labeled_featuresets:
feature_value = featureset.get(feature_name)
freqs[feature_value][label] += 1
decisions = {val: DecisionTreeClassifier(freqs[val].max()) for val in freqs}
return DecisionTreeClassifier(label, feature_name, decisions)
def refine(
self,
labeled_featuresets,
entropy_cutoff,
depth_cutoff,
support_cutoff,
binary=False,
feature_values=None,
verbose=False,
):
if len(labeled_featuresets) <= support_cutoff:
return
if self._fname is None:
return
if depth_cutoff <= 0:
return
for fval in self._decisions:
fval_featuresets = [
(featureset, label)
for (featureset, label) in labeled_featuresets
if featureset.get(self._fname) == fval
]
label_freqs = FreqDist(label for (featureset, label) in fval_featuresets)
if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
self._decisions[fval] = DecisionTreeClassifier.train(
fval_featuresets,
entropy_cutoff,
depth_cutoff,
support_cutoff,
binary,
feature_values,
verbose,
)
if self._default is not None:
default_featuresets = [
(featureset, label)
for (featureset, label) in labeled_featuresets
if featureset.get(self._fname) not in self._decisions
]
label_freqs = FreqDist(label for (featureset, label) in default_featuresets)
if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
self._default = DecisionTreeClassifier.train(
default_featuresets,
entropy_cutoff,
depth_cutoff,
support_cutoff,
binary,
feature_values,
verbose,
)
@staticmethod
def best_stump(feature_names, labeled_featuresets, verbose=False):
best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
best_error = best_stump.error(labeled_featuresets)
for fname in feature_names:
stump = DecisionTreeClassifier.stump(fname, labeled_featuresets)
stump_error = stump.error(labeled_featuresets)
if stump_error < best_error:
best_error = stump_error
best_stump = stump
if verbose:
print(
"best stump for {:6d} toks uses {:20} err={:6.4f}".format(
len(labeled_featuresets), best_stump._fname, best_error
)
)
return best_stump
@staticmethod
def binary_stump(feature_name, feature_value, labeled_featuresets):
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
# Find the best label for each value.
pos_fdist = FreqDist()
neg_fdist = FreqDist()
for featureset, label in labeled_featuresets:
if featureset.get(feature_name) == feature_value:
pos_fdist[label] += 1
else:
neg_fdist[label] += 1
decisions = {}
default = label
# But hopefully we have observations!
if pos_fdist.N() > 0:
decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
if neg_fdist.N() > 0:
default = DecisionTreeClassifier(neg_fdist.max())
return DecisionTreeClassifier(label, feature_name, decisions, default)
@staticmethod
def best_binary_stump(
feature_names, labeled_featuresets, feature_values, verbose=False
):
best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
best_error = best_stump.error(labeled_featuresets)
for fname in feature_names:
for fval in feature_values[fname]:
stump = DecisionTreeClassifier.binary_stump(
fname, fval, labeled_featuresets
)
stump_error = stump.error(labeled_featuresets)
if stump_error < best_error:
best_error = stump_error
best_stump = stump
if verbose:
if best_stump._decisions:
descr = "{}={}".format(
best_stump._fname, list(best_stump._decisions.keys())[0]
)
else:
descr = "(default)"
print(
"best stump for {:6d} toks uses {:20} err={:6.4f}".format(
len(labeled_featuresets), descr, best_error
)
)
return best_stump
##//////////////////////////////////////////////////////
## Demo
##//////////////////////////////////////////////////////
def f(x):
return DecisionTreeClassifier.train(x, binary=True, verbose=True)
def demo():
from nltk.classify.util import binary_names_demo_features, names_demo
classifier = names_demo(
f, binary_names_demo_features # DecisionTreeClassifier.train,
)
print(classifier.pretty_format(depth=7))
print(classifier.pseudocode(depth=7))
if __name__ == "__main__":
demo()

File diff suppressed because it is too large


@@ -0,0 +1,184 @@
# Natural Language Toolkit: Interface to Megam Classifier
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A set of functions used to interface with the external megam_ maxent
optimization package. Before megam can be used, you should tell NLTK where it
can find the megam binary, using the ``config_megam()`` function. Typical
usage:
>>> from nltk.classify import megam
>>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP
[Found megam: ...]
For use with MaxentClassifier, see the example below and the
MaxentClassifier documentation for details.
nltk.classify.MaxentClassifier.train(corpus, 'megam')
.. _megam: https://www.umiacs.umd.edu/~hal/megam/index.html
"""
import subprocess
from nltk.internals import find_binary
try:
import numpy
except ImportError:
numpy = None
######################################################################
# { Configuration
######################################################################
_megam_bin = None
def config_megam(bin=None):
"""
Configure NLTK's interface to the ``megam`` maxent optimization
package.
:param bin: The full path to the ``megam`` binary. If not specified,
then nltk will search the system for a ``megam`` binary; and if
one is not found, it will raise a ``LookupError`` exception.
:type bin: str
"""
global _megam_bin
_megam_bin = find_binary(
"megam",
bin,
env_vars=["MEGAM"],
binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"],
url="https://www.umiacs.umd.edu/~hal/megam/index.html",
)
######################################################################
# { Megam Interface Functions
######################################################################
def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True):
"""
Generate an input file for ``megam`` based on the given corpus of
classified tokens.
:type train_toks: list(tuple(dict, str))
:param train_toks: Training data, represented as a list of
pairs, the first member of which is a feature dictionary,
and the second of which is a classification label.
:type encoding: MaxentFeatureEncodingI
:param encoding: A feature encoding, used to convert featuresets
into feature vectors. May optionally implement a cost() method
in order to assign different costs to different class predictions.
:type stream: stream
:param stream: The stream to which the megam input file should be
written.
:param bernoulli: If true, then use the 'bernoulli' format. I.e.,
all joint features have binary values, and are listed iff they
are true. Otherwise, list feature values explicitly. If
``bernoulli=False``, then you must call ``megam`` with the
``-fvals`` option.
:param explicit: If true, then use the 'explicit' format. I.e.,
list the features that would fire for any of the possible
labels, for each token. If ``explicit=True``, then you must
call ``megam`` with the ``-explicit`` option.
"""
# Look up the set of labels.
labels = encoding.labels()
labelnum = {label: i for (i, label) in enumerate(labels)}
# Write the file, which contains one line per instance.
for featureset, label in train_toks:
# First, the label number (or, in the weighted multiclass case, the cost of each label).
if hasattr(encoding, "cost"):
stream.write(
":".join(str(encoding.cost(featureset, label, l)) for l in labels)
)
else:
stream.write("%d" % labelnum[label])
# For implicit file formats, just list the features that fire
# for this instance's actual label.
if not explicit:
_write_megam_features(encoding.encode(featureset, label), stream, bernoulli)
# For explicit formats, list the features that would fire for
# any of the possible labels.
else:
for l in labels:
stream.write(" #")
_write_megam_features(encoding.encode(featureset, l), stream, bernoulli)
# End of the instance.
stream.write("\n")
def parse_megam_weights(s, features_count, explicit=True):
"""
Given the stdout output generated by ``megam`` when training a
model, return a ``numpy`` array containing the corresponding weight
vector. This function does not currently handle bias features.
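A sketch of typical use, where each line of the ``megam`` output holds a
"feature-id weight" pair (the values shown are illustrative only):
>>> parse_megam_weights("0 1.5", 1) # doctest: +SKIP
array([1.5])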
"""
if numpy is None:
raise ValueError("This function requires that numpy be installed")
assert explicit, "non-explicit not supported yet"
lines = s.strip().split("\n")
weights = numpy.zeros(features_count, "d")
for line in lines:
if line.strip():
fid, weight = line.split()
weights[int(fid)] = float(weight)
return weights
def _write_megam_features(vector, stream, bernoulli):
if not vector:
raise ValueError(
"MEGAM classifier requires the use of an " "always-on feature."
)
for fid, fval in vector:
if bernoulli:
if fval == 1:
stream.write(" %s" % fid)
elif fval != 0:
raise ValueError(
"If bernoulli=True, then all" "features must be binary."
)
else:
stream.write(f" {fid} {fval}")
def call_megam(args):
"""
Call the ``megam`` binary with the given arguments.
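A sketch of typical use (the option list and the file name are placeholders
that depend on how the training file was written):
>>> stdout = call_megam(['-fvals', 'multiclass', 'train.megam']) # doctest: +SKIP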
"""
if isinstance(args, str):
raise TypeError("args should be a list of strings")
if _megam_bin is None:
config_megam()
# Call megam via a subprocess
cmd = [_megam_bin] + args
p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
(stdout, stderr) = p.communicate()
# Check the return code.
if p.returncode != 0:
print()
print(stderr)
raise OSError("megam command failed!")
if isinstance(stdout, str):
return stdout
else:
return stdout.decode("utf-8")


@@ -0,0 +1,260 @@
# Natural Language Toolkit: Naive Bayes Classifiers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A classifier based on the Naive Bayes algorithm. In order to find the
probability for a label, this algorithm first uses the Bayes rule to
express P(label|features) in terms of P(label) and P(features|label):
| P(label) * P(features|label)
| P(label|features) = ------------------------------
| P(features)
The algorithm then makes the 'naive' assumption that all features are
independent, given the label:
| P(label) * P(f1|label) * ... * P(fn|label)
| P(label|features) = --------------------------------------------
| P(features)
Rather than computing P(features) explicitly, the algorithm just
calculates the numerator for each label, and normalizes them so they
sum to one:
| P(label) * P(f1|label) * ... * P(fn|label)
| P(label|features) = --------------------------------------------
| SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
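For example (a minimal sketch; the features and labels are illustrative
only):
>>> train_set = [({'a': 1}, 'x'), ({'b': 1}, 'y'), ({'a': 1, 'b': 1}, 'x')]
>>> classifier = NaiveBayesClassifier.train(train_set) # doctest: +SKIP
>>> dist = classifier.prob_classify({'a': 1}) # doctest: +SKIP
>>> dist.max() # doctest: +SKIP
'x'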
"""
from collections import defaultdict
from nltk.classify.api import ClassifierI
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs
##//////////////////////////////////////////////////////
## Naive Bayes Classifier
##//////////////////////////////////////////////////////
class NaiveBayesClassifier(ClassifierI):
"""
A Naive Bayes classifier. Naive Bayes classifiers are
parameterized by two probability distributions:
- P(label) gives the probability that an input will receive each
label, given no information about the input's features.
- P(fname=fval|label) gives the probability that a given feature
(fname) will receive a given value (fval), given the label
(label).
If the classifier encounters an input with a feature that has
never been seen with any label, then rather than assigning a
probability of 0 to all labels, it will ignore that feature.
The feature value 'None' is reserved for unseen feature values;
you generally should not use 'None' as a feature value for one of
your own features.
"""
def __init__(self, label_probdist, feature_probdist):
"""
:param label_probdist: P(label), the probability distribution
over labels. It is expressed as a ``ProbDistI`` whose
samples are labels. I.e., P(label) =
``label_probdist.prob(label)``.
:param feature_probdist: P(fname=fval|label), the probability
distribution for feature values, given labels. It is
expressed as a dictionary whose keys are ``(label, fname)``
pairs and whose values are ``ProbDistI`` objects over feature
values. I.e., P(fname=fval|label) =
``feature_probdist[label,fname].prob(fval)``. If a given
``(label,fname)`` is not a key in ``feature_probdist``, then
it is assumed that the corresponding P(fname=fval|label)
is 0 for all values of ``fval``.
"""
self._label_probdist = label_probdist
self._feature_probdist = feature_probdist
self._labels = list(label_probdist.samples())
def labels(self):
return self._labels
def classify(self, featureset):
return self.prob_classify(featureset).max()
def prob_classify(self, featureset):
# Discard any feature names that we've never seen before.
# Otherwise, we'll just assign a probability of 0 to
# everything.
featureset = featureset.copy()
for fname in list(featureset.keys()):
for label in self._labels:
if (label, fname) in self._feature_probdist:
break
else:
# print('Ignoring unseen feature %s' % fname)
del featureset[fname]
# Find the log probability of each label, given the features.
# Start with the log probability of the label itself.
logprob = {}
for label in self._labels:
logprob[label] = self._label_probdist.logprob(label)
# Then add in the log probability of features given labels.
for label in self._labels:
for fname, fval in featureset.items():
if (label, fname) in self._feature_probdist:
feature_probs = self._feature_probdist[label, fname]
logprob[label] += feature_probs.logprob(fval)
else:
# nb: This case will never come up if the
# classifier was created by
# NaiveBayesClassifier.train().
logprob[label] += sum_logs([]) # = -INF.
return DictionaryProbDist(logprob, normalize=True, log=True)
def show_most_informative_features(self, n=10):
# Determine the most relevant features, and display them.
cpdist = self._feature_probdist
print("Most Informative Features")
for fname, fval in self.most_informative_features(n):
def labelprob(l):
return cpdist[l, fname].prob(fval)
labels = sorted(
(l for l in self._labels if fval in cpdist[l, fname].samples()),
key=lambda element: (-labelprob(element), element),
reverse=True,
)
if len(labels) == 1:
continue
l0 = labels[0]
l1 = labels[-1]
if cpdist[l0, fname].prob(fval) == 0:
ratio = "INF"
else:
ratio = "%8.1f" % (
cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
)
print(
"%24s = %-14r %6s : %-6s = %s : 1.0"
% (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
)
def most_informative_features(self, n=100):
"""
Return a list of the 'most informative' features used by this
classifier. For the purpose of this function, the
informativeness of a feature ``(fname,fval)`` is equal to the
highest value of P(fname=fval|label), for any label, divided by
the lowest value of P(fname=fval|label), for any label:
| max[ P(fname=fval|label1) / P(fname=fval|label2) ]
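Example (assuming a classifier trained on document features; the
output shown is illustrative only):
>>> classifier.most_informative_features(2) # doctest: +SKIP
[('contains-word(ball)', True), ('contains-word(vote)', True)]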
"""
if hasattr(self, "_most_informative_features"):
return self._most_informative_features[:n]
else:
# The set of (fname, fval) pairs used by this classifier.
features = set()
# The max & min probability associated w/ each (fname, fval)
# pair. Maps (fname,fval) -> float.
maxprob = defaultdict(float)
minprob = defaultdict(lambda: 1.0)
for (label, fname), probdist in self._feature_probdist.items():
for fval in probdist.samples():
feature = (fname, fval)
features.add(feature)
p = probdist.prob(fval)
maxprob[feature] = max(p, maxprob[feature])
minprob[feature] = min(p, minprob[feature])
if minprob[feature] == 0:
features.discard(feature)
# Convert features to a list, & sort it by how informative
# features are.
self._most_informative_features = sorted(
features,
key=lambda feature_: (
minprob[feature_] / maxprob[feature_],
feature_[0],
feature_[1] in [None, False, True],
str(feature_[1]).lower(),
),
)
return self._most_informative_features[:n]
@classmethod
def train(cls, labeled_featuresets, estimator=ELEProbDist):
"""
:param labeled_featuresets: A list of classified featuresets,
i.e., a list of tuples ``(featureset, label)``.
"""
label_freqdist = FreqDist()
feature_freqdist = defaultdict(FreqDist)
feature_values = defaultdict(set)
fnames = set()
# Count up how many times each feature value occurred, given
# the label and featurename.
for featureset, label in labeled_featuresets:
label_freqdist[label] += 1
for fname, fval in featureset.items():
# Increment freq(fval|label, fname)
feature_freqdist[label, fname][fval] += 1
# Record that fname can take the value fval.
feature_values[fname].add(fval)
# Keep a list of all feature names.
fnames.add(fname)
# If a feature didn't have a value given for an instance, then
# we assume that it gets the implicit value 'None.' This loop
# counts up the number of 'missing' feature values for each
# (label,fname) pair, and increments the count of the fval
# 'None' by that amount.
for label in label_freqdist:
num_samples = label_freqdist[label]
for fname in fnames:
count = feature_freqdist[label, fname].N()
# Only add a None key when necessary, i.e. if there are
# any samples with feature 'fname' missing.
if num_samples - count > 0:
feature_freqdist[label, fname][None] += num_samples - count
feature_values[fname].add(None)
# Create the P(label) distribution
label_probdist = estimator(label_freqdist)
# Create the P(fval|label, fname) distribution
feature_probdist = {}
for (label, fname), freqdist in feature_freqdist.items():
probdist = estimator(freqdist, bins=len(feature_values[fname]))
feature_probdist[label, fname] = probdist
return cls(label_probdist, feature_probdist)
##//////////////////////////////////////////////////////
## Demo
##//////////////////////////////////////////////////////
def demo():
from nltk.classify.util import names_demo
classifier = names_demo(NaiveBayesClassifier.train)
classifier.show_most_informative_features()
if __name__ == "__main__":
demo()


@@ -0,0 +1,180 @@
# Natural Language Toolkit: Positive Naive Bayes Classifier
#
# Copyright (C) 2012 NLTK Project
# Author: Alessandro Presta <alessandro.presta@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A variant of the Naive Bayes Classifier that performs binary classification with
partially-labeled training sets. In other words, assume we want to build a classifier
that assigns each example to one of two complementary classes (e.g., male names and
female names).
If we have a training set with labeled examples for both classes, we can use a
standard Naive Bayes Classifier. However, consider the case when we only have labeled
examples for one of the classes, and other, unlabeled, examples.
Then, assuming a prior distribution on the two labels, we can use the unlabeled set
to estimate the frequencies of the various features.
Let the two possible labels be 1 and 0, and let's say we only have examples labeled 1
and unlabeled examples. We are also given an estimate of P(1).
We compute P(feature|1) exactly as in the standard case.
To compute P(feature|0), we first estimate P(feature) from the unlabeled set (we are
assuming that the unlabeled examples are drawn according to the given prior distribution)
and then express the conditional probability as:
| P(feature) - P(feature|1) * P(1)
| P(feature|0) = ----------------------------------
| P(0)
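For instance (with purely illustrative numbers): if P(feature) = 0.4,
P(feature|1) = 0.6 and P(1) = 0.5, then
| P(feature|0) = (0.4 - 0.6 * 0.5) / 0.5 = 0.2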
Example:
>>> from nltk.classify import PositiveNaiveBayesClassifier
Some sentences about sports:
>>> sports_sentences = [ 'The team dominated the game',
... 'They lost the ball',
... 'The game was intense',
... 'The goalkeeper catched the ball',
... 'The other team controlled the ball' ]
Mixed topics, including sports:
>>> various_sentences = [ 'The President did not comment',
... 'I lost the keys',
... 'The team won the game',
... 'Sara has two kids',
... 'The ball went off the court',
... 'They had the ball for the whole game',
... 'The show is over' ]
The features of a sentence are simply the words it contains:
>>> def features(sentence):
...     words = sentence.lower().split()
...     return dict(('contains(%s)' % w, True) for w in words)
We use the sports sentences as positive examples and the mixed ones as unlabeled examples:
>>> positive_featuresets = map(features, sports_sentences)
>>> unlabeled_featuresets = map(features, various_sentences)
>>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
... unlabeled_featuresets)
Is the following sentence about sports?
>>> classifier.classify(features('The cat is on the table'))
False
What about this one?
>>> classifier.classify(features('My team lost the game'))
True
"""
from collections import defaultdict
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist
##//////////////////////////////////////////////////////
## Positive Naive Bayes Classifier
##//////////////////////////////////////////////////////
class PositiveNaiveBayesClassifier(NaiveBayesClassifier):
@staticmethod
def train(
positive_featuresets,
unlabeled_featuresets,
positive_prob_prior=0.5,
estimator=ELEProbDist,
):
"""
:param positive_featuresets: An iterable of featuresets that are known as positive
examples (i.e., their label is ``True``).
:param unlabeled_featuresets: An iterable of featuresets whose label is unknown.
:param positive_prob_prior: A prior estimate of the probability of the label
``True`` (default 0.5).
"""
positive_feature_freqdist = defaultdict(FreqDist)
unlabeled_feature_freqdist = defaultdict(FreqDist)
feature_values = defaultdict(set)
fnames = set()
# Count up how many times each feature value occurred in positive examples.
num_positive_examples = 0
for featureset in positive_featuresets:
for fname, fval in featureset.items():
positive_feature_freqdist[fname][fval] += 1
feature_values[fname].add(fval)
fnames.add(fname)
num_positive_examples += 1
# Count up how many times each feature value occurred in unlabeled examples.
num_unlabeled_examples = 0
for featureset in unlabeled_featuresets:
for fname, fval in featureset.items():
unlabeled_feature_freqdist[fname][fval] += 1
feature_values[fname].add(fval)
fnames.add(fname)
num_unlabeled_examples += 1
# If a feature didn't have a value given for an instance, then we assume that
# it gets the implicit value 'None'.
for fname in fnames:
count = positive_feature_freqdist[fname].N()
positive_feature_freqdist[fname][None] += num_positive_examples - count
feature_values[fname].add(None)
for fname in fnames:
count = unlabeled_feature_freqdist[fname].N()
unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count
feature_values[fname].add(None)
negative_prob_prior = 1.0 - positive_prob_prior
# Create the P(label) distribution.
label_probdist = DictionaryProbDist(
{True: positive_prob_prior, False: negative_prob_prior}
)
# Create the P(fval|label, fname) distribution.
feature_probdist = {}
for fname, freqdist in positive_feature_freqdist.items():
probdist = estimator(freqdist, bins=len(feature_values[fname]))
feature_probdist[True, fname] = probdist
for fname, freqdist in unlabeled_feature_freqdist.items():
global_probdist = estimator(freqdist, bins=len(feature_values[fname]))
negative_feature_probs = {}
for fval in feature_values[fname]:
prob = (
global_probdist.prob(fval)
- positive_prob_prior * feature_probdist[True, fname].prob(fval)
) / negative_prob_prior
# TODO: We need to add some kind of smoothing here, instead of
# setting negative probabilities to zero and normalizing.
negative_feature_probs[fval] = max(prob, 0.0)
feature_probdist[False, fname] = DictionaryProbDist(
negative_feature_probs, normalize=True
)
return PositiveNaiveBayesClassifier(label_probdist, feature_probdist)
##//////////////////////////////////////////////////////
## Demo
##//////////////////////////////////////////////////////
def demo():
from nltk.classify.util import partial_names_demo
classifier = partial_names_demo(PositiveNaiveBayesClassifier.train)
classifier.show_most_informative_features()


@@ -0,0 +1,183 @@
# Natural Language Toolkit: RTE Classifier
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Simple classifier for RTE corpus.
It calculates the overlap in words and named entities between text and
hypothesis, and also whether there are words / named entities in the
hypothesis which fail to occur in the text, since this is an indicator that
the hypothesis is more informative than (i.e. not entailed by) the text.
TO DO: better Named Entity classification
TO DO: add lemmatization
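A sketch of typical use (requires the RTE corpus data to be installed;
the feature values shown are illustrative only):
>>> from nltk.corpus import rte as rte_corpus # doctest: +SKIP
>>> pair = rte_corpus.pairs(['rte3_dev.xml'])[0] # doctest: +SKIP
>>> rte_features(pair) # doctest: +SKIP
{'alwayson': True, 'word_overlap': 3, 'word_hyp_extra': 1, 'ne_overlap': 1, 'ne_hyp_extra': 0, 'neg_txt': 0, 'neg_hyp': 0}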
"""
from nltk.classify.maxent import MaxentClassifier
from nltk.classify.util import accuracy
from nltk.tokenize import RegexpTokenizer
class RTEFeatureExtractor:
"""
This builds a bag of words for both the text and the hypothesis after
throwing away some stopwords, then calculates overlap and difference.
"""
def __init__(self, rtepair, stop=True, use_lemmatize=False):
"""
:param rtepair: a ``RTEPair`` from which features should be extracted
:param stop: if ``True``, stopwords are thrown away.
:type stop: bool
"""
self.stop = stop
self.stopwords = {
"a",
"the",
"it",
"they",
"of",
"in",
"to",
"is",
"have",
"are",
"were",
"and",
"very",
".",
",",
}
self.negwords = {"no", "not", "never", "failed", "rejected", "denied"}
# Try to tokenize so that abbreviations, monetary amounts, email
# addresses, URLs are single tokens.
tokenizer = RegexpTokenizer(r"[\w.@:/]+|\w+|\$[\d.]+")
# Get the set of word types for text and hypothesis
self.text_tokens = tokenizer.tokenize(rtepair.text)
self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
self.text_words = set(self.text_tokens)
self.hyp_words = set(self.hyp_tokens)
if use_lemmatize:
self.text_words = {self._lemmatize(token) for token in self.text_tokens}
self.hyp_words = {self._lemmatize(token) for token in self.hyp_tokens}
if self.stop:
self.text_words = self.text_words - self.stopwords
self.hyp_words = self.hyp_words - self.stopwords
self._overlap = self.hyp_words & self.text_words
self._hyp_extra = self.hyp_words - self.text_words
self._txt_extra = self.text_words - self.hyp_words
def overlap(self, toktype, debug=False):
"""
Compute the overlap between text and hypothesis.
:param toktype: distinguish Named Entities from ordinary words
:type toktype: 'ne' or 'word'
"""
ne_overlap = {token for token in self._overlap if self._ne(token)}
if toktype == "ne":
if debug:
print("ne overlap", ne_overlap)
return ne_overlap
elif toktype == "word":
if debug:
print("word overlap", self._overlap - ne_overlap)
return self._overlap - ne_overlap
else:
raise ValueError("Type not recognized:'%s'" % toktype)
def hyp_extra(self, toktype, debug=True):
"""
Compute the extraneous material in the hypothesis.
:param toktype: distinguish Named Entities from ordinary words
:type toktype: 'ne' or 'word'
"""
ne_extra = {token for token in self._hyp_extra if self._ne(token)}
if toktype == "ne":
return ne_extra
elif toktype == "word":
return self._hyp_extra - ne_extra
else:
raise ValueError("Type not recognized: '%s'" % toktype)
@staticmethod
def _ne(token):
"""
This just assumes that words in all caps or titles are
named entities.
:type token: str
"""
if token.istitle() or token.isupper():
return True
return False
@staticmethod
def _lemmatize(word):
"""
Use morphy from WordNet to find the base form of verbs.
"""
from nltk.corpus import wordnet as wn
lemma = wn.morphy(word, pos=wn.VERB)
if lemma is not None:
return lemma
return word
def rte_features(rtepair):
extractor = RTEFeatureExtractor(rtepair)
features = {}
features["alwayson"] = True
features["word_overlap"] = len(extractor.overlap("word"))
features["word_hyp_extra"] = len(extractor.hyp_extra("word"))
features["ne_overlap"] = len(extractor.overlap("ne"))
features["ne_hyp_extra"] = len(extractor.hyp_extra("ne"))
features["neg_txt"] = len(extractor.negwords & extractor.text_words)
features["neg_hyp"] = len(extractor.negwords & extractor.hyp_words)
return features
def rte_featurize(rte_pairs):
return [(rte_features(pair), pair.value) for pair in rte_pairs]
def rte_classifier(algorithm, sample_N=None):
from nltk.corpus import rte as rte_corpus
train_set = rte_corpus.pairs(["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"])
test_set = rte_corpus.pairs(["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"])
if sample_N is not None:
train_set = train_set[:sample_N]
test_set = test_set[:sample_N]
featurized_train_set = rte_featurize(train_set)
featurized_test_set = rte_featurize(test_set)
# Train the classifier
print("Training classifier...")
if algorithm in ["megam"]: # MEGAM based algorithms.
clf = MaxentClassifier.train(featurized_train_set, algorithm)
elif algorithm in ["GIS", "IIS"]: # Use default GIS/IIS MaxEnt algorithm
clf = MaxentClassifier.train(featurized_train_set, algorithm)
else:
err_msg = str(
"RTEClassifier only supports these algorithms:\n "
"'megam', 'GIS', 'IIS'.\n"
)
raise Exception(err_msg)
print("Testing classifier...")
acc = accuracy(clf, featurized_test_set)
print("Accuracy: %6.4f" % acc)
return clf


@@ -0,0 +1,143 @@
# Natural Language Toolkit: Interface to scikit-learn classifiers
#
# Author: Lars Buitinck <L.J.Buitinck@uva.nl>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
scikit-learn (https://scikit-learn.org) is a machine learning library for
Python. It supports many classification algorithms, including SVMs,
Naive Bayes, logistic regression (MaxEnt) and decision trees.
This package implements a wrapper around scikit-learn classifiers. To use this
wrapper, construct a scikit-learn estimator object, then use that to construct
a SklearnClassifier. E.g., to wrap a linear SVM with default settings:
>>> from sklearn.svm import LinearSVC
>>> from nltk.classify.scikitlearn import SklearnClassifier
>>> classif = SklearnClassifier(LinearSVC())
A scikit-learn classifier may include preprocessing steps when it's wrapped
in a Pipeline object. The following constructs and wraps a Naive Bayes text
classifier with tf-idf weighting and chi-square feature selection to get the
best 1000 features:
>>> from sklearn.feature_extraction.text import TfidfTransformer
>>> from sklearn.feature_selection import SelectKBest, chi2
>>> from sklearn.naive_bayes import MultinomialNB
>>> from sklearn.pipeline import Pipeline
>>> pipeline = Pipeline([('tfidf', TfidfTransformer()),
... ('chi2', SelectKBest(chi2, k=1000)),
... ('nb', MultinomialNB())])
>>> classif = SklearnClassifier(pipeline)
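Either classifier is then trained and used through the standard NLTK
interface (a minimal sketch; the featuresets and labels are illustrative
only):
>>> train_data = [({'a': 4, 'b': 1}, 'x'), ({'a': 5, 'b': 2}, 'x'), ({'a': 0, 'b': 3}, 'y')]
>>> _ = classif.train(train_data) # doctest: +SKIP
>>> classif.classify_many([{'a': 3, 'b': 1}, {'a': 0, 'b': 4}]) # doctest: +SKIP
['x', 'y']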
"""
from nltk.classify.api import ClassifierI
from nltk.probability import DictionaryProbDist
try:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
except ImportError:
pass
__all__ = ["SklearnClassifier"]
class SklearnClassifier(ClassifierI):
"""Wrapper for scikit-learn classifiers."""
def __init__(self, estimator, dtype=float, sparse=True):
"""
:param estimator: scikit-learn classifier object.
:param dtype: data type used when building feature array.
scikit-learn estimators work exclusively on numeric data. The
default value should be fine for almost all situations.
:param sparse: Whether to use sparse matrices internally.
The estimator must support these; not all scikit-learn classifiers
do (see their respective documentation and look for "sparse
matrix"). The default value is True, since most NLP problems
involve sparse feature sets. Setting this to False may require a
large amount of memory.
:type sparse: boolean.
"""
self._clf = estimator
self._encoder = LabelEncoder()
self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
def __repr__(self):
return "<SklearnClassifier(%r)>" % self._clf
def classify_many(self, featuresets):
"""Classify a batch of samples.
:param featuresets: An iterable over featuresets, each a dict mapping
strings to either numbers, booleans or strings.
:return: The predicted class label for each input sample.
:rtype: list
"""
X = self._vectorizer.transform(featuresets)
classes = self._encoder.classes_
return [classes[i] for i in self._clf.predict(X)]
def prob_classify_many(self, featuresets):
"""Compute per-class probabilities for a batch of samples.
:param featuresets: An iterable over featuresets, each a dict mapping
strings to either numbers, booleans or strings.
:rtype: list of ``ProbDistI``
"""
X = self._vectorizer.transform(featuresets)
y_proba_list = self._clf.predict_proba(X)
return [self._make_probdist(y_proba) for y_proba in y_proba_list]
def labels(self):
"""The class labels used by this classifier.
:rtype: list
"""
return list(self._encoder.classes_)
def train(self, labeled_featuresets):
"""
Train (fit) the scikit-learn estimator.
:param labeled_featuresets: A list of ``(featureset, label)``
where each ``featureset`` is a dict mapping strings to either
numbers, booleans or strings.
"""
X, y = list(zip(*labeled_featuresets))
X = self._vectorizer.fit_transform(X)
y = self._encoder.fit_transform(y)
self._clf.fit(X, y)
return self
def _make_probdist(self, y_proba):
classes = self._encoder.classes_
return DictionaryProbDist({classes[i]: p for i, p in enumerate(y_proba)})
if __name__ == "__main__":
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from nltk.classify.util import names_demo, names_demo_features
# Bernoulli Naive Bayes is designed for binary classification. We set the
# binarize option to False since we know we're passing boolean features.
print("scikit-learn Naive Bayes:")
names_demo(
SklearnClassifier(BernoulliNB(binarize=False)).train,
features=names_demo_features,
)
# The C parameter on logistic regression (MaxEnt) controls regularization.
# The higher it's set, the less regularized the classifier is.
print("\n\nscikit-learn logistic regression:")
names_demo(
SklearnClassifier(LogisticRegression(C=1000)).train,
features=names_demo_features,
)


@@ -0,0 +1,175 @@
# Natural Language Toolkit: Senna Interface
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A general interface to the SENNA pipeline that supports any of the
operations specified in SUPPORTED_OPERATIONS.
Applying multiple operations at once has a speed advantage. For example,
Senna will automatically determine POS tags if you are extracting named
entities, so applying both operations costs only the time of extracting the
named entities.
The SENNA pipeline has a fixed maximum sentence size that it can read.
By default it is 1024 tokens per sentence. If you have larger sentences,
consider changing the MAX_SENTENCE_SIZE value in SENNA_main.c and rebuilding
your system-specific binary; otherwise misalignment errors could be
introduced.
The input is:
- the path to the directory that contains the SENNA executables. If the path is
  incorrect, Senna will fall back to the directory specified in the SENNA
  environment variable.
- the list of operations to be performed.
- (optionally) the encoding of the input data (default: utf-8).
Note: Unit tests for this module can be found in test/unit/test_senna.py
>>> from nltk.classify import Senna
>>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner']) # doctest: +SKIP
>>> sent = 'Dusseldorf is an international business center'.split()
>>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP
[('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'),
('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
"""
from os import environ, path, sep
from platform import architecture, system
from subprocess import PIPE, Popen
from nltk.tag.api import TaggerI
class Senna(TaggerI):
SUPPORTED_OPERATIONS = ["pos", "chk", "ner"]
def __init__(self, senna_path, operations, encoding="utf-8"):
self._encoding = encoding
self._path = path.normpath(senna_path) + sep
# Verifies the existence of the executable on the self._path first
# senna_binary_file_1 = self.executable(self._path)
exe_file_1 = self.executable(self._path)
if not path.isfile(exe_file_1):
# Check for the system environment
if "SENNA" in environ:
# self._path = path.join(environ['SENNA'],'')
self._path = path.normpath(environ["SENNA"]) + sep
exe_file_2 = self.executable(self._path)
if not path.isfile(exe_file_2):
raise LookupError(
"Senna executable expected at %s or %s but not found"
% (exe_file_1, exe_file_2)
)
self.operations = operations
def executable(self, base_path):
"""
Determine the system-specific binary that should be used in the
pipeline. If the system is not recognized, the default senna binary
will be used.
"""
os_name = system()
if os_name == "Linux":
bits = architecture()[0]
if bits == "64bit":
return path.join(base_path, "senna-linux64")
return path.join(base_path, "senna-linux32")
if os_name == "Windows":
return path.join(base_path, "senna-win32.exe")
if os_name == "Darwin":
return path.join(base_path, "senna-osx")
return path.join(base_path, "senna")
def _map(self):
"""
Calculate the column in which the SENNA pipeline will output the
tags for each requested operation. The column order follows
``SUPPORTED_OPERATIONS``, restricted to the operations that were requested.
"""
_map = {}
i = 1
for operation in Senna.SUPPORTED_OPERATIONS:
if operation in self.operations:
_map[operation] = i
i += 1
return _map
def tag(self, tokens):
"""
Applies the specified operation(s) on a list of tokens.
"""
return self.tag_sents([tokens])[0]
def tag_sents(self, sentences):
"""
Applies the tag method over a list of sentences. This method will return a
list of dictionaries. Every dictionary will contain a word with its
calculated annotations/tags.
"""
encoding = self._encoding
if not path.isfile(self.executable(self._path)):
raise LookupError(
"Senna executable expected at %s but not found"
% self.executable(self._path)
)
# Build the senna command to run the tagger
_senna_cmd = [
self.executable(self._path),
"-path",
self._path,
"-usrtokens",
"-iobtags",
]
_senna_cmd.extend(["-" + op for op in self.operations])
# Serialize the actual sentences to a temporary string
_input = "\n".join(" ".join(x) for x in sentences) + "\n"
if isinstance(_input, str) and encoding:
_input = _input.encode(encoding)
# Run the tagger and get the output
p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
(stdout, stderr) = p.communicate(input=_input)
senna_output = stdout
# Check the return code.
if p.returncode != 0:
raise RuntimeError("Senna command failed! Details: %s" % stderr)
if encoding:
senna_output = stdout.decode(encoding)
# Output the tagged sentences
map_ = self._map()
tagged_sentences = [[]]
sentence_index = 0
token_index = 0
for tagged_word in senna_output.strip().split("\n"):
if not tagged_word:
tagged_sentences.append([])
sentence_index += 1
token_index = 0
continue
tags = tagged_word.split("\t")
result = {}
for tag in map_:
result[tag] = tags[map_[tag]].strip()
try:
result["word"] = sentences[sentence_index][token_index]
except IndexError as e:
raise IndexError(
"Misalignment error occurred at sentence number %d. Possible reason"
" is that the sentence size exceeded the maximum size. Check the "
"documentation of Senna class for more information."
% sentence_index
) from e
tagged_sentences[-1].append(result)
token_index += 1
return tagged_sentences


@@ -0,0 +1,17 @@
# Natural Language Toolkit: SVM-based classifier
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Leon Derczynski <leon@dcs.shef.ac.uk>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
nltk.classify.svm has been deprecated. For classification based
on support vector machines (SVMs), use nltk.classify.scikitlearn
(or `scikit-learn <https://scikit-learn.org>`_ directly).
"""
class SvmClassifier:
def __init__(self, *args, **kwargs):
raise NotImplementedError(__doc__)


@@ -0,0 +1,122 @@
# Natural Language Toolkit: Interface to TADM Classifier
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Joseph Frazee <jfrazee@mail.utexas.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import subprocess
import sys
from nltk.internals import find_binary
try:
import numpy
except ImportError:
pass
_tadm_bin = None
def config_tadm(bin=None):
global _tadm_bin
_tadm_bin = find_binary(
"tadm", bin, env_vars=["TADM"], binary_names=["tadm"], url="http://tadm.sf.net"
)
def write_tadm_file(train_toks, encoding, stream):
"""
Generate an input file for ``tadm`` based on the given corpus of
classified tokens.
:type train_toks: list(tuple(dict, str))
:param train_toks: Training data, represented as a list of
pairs, the first member of which is a feature dictionary,
and the second of which is a classification label.
:type encoding: TadmEventMaxentFeatureEncoding
:param encoding: A feature encoding, used to convert featuresets
into feature vectors.
:type stream: stream
:param stream: The stream to which the ``tadm`` input file should be
written.
"""
# See the following for a file format description:
#
# https://sf.net/forum/forum.php?thread_id=1391502&forum_id=473054
# https://sf.net/forum/forum.php?thread_id=1675097&forum_id=473054
labels = encoding.labels()
for featureset, label in train_toks:
length_line = "%d\n" % len(labels)
stream.write(length_line)
for known_label in labels:
v = encoding.encode(featureset, known_label)
line = "%d %d %s\n" % (
int(label == known_label),
len(v),
" ".join("%d %d" % u for u in v),
)
stream.write(line)
def parse_tadm_weights(paramfile):
"""
Given the stdout output generated by ``tadm`` when training a
model, return a ``numpy`` array containing the corresponding weight
vector.
"""
weights = []
for line in paramfile:
weights.append(float(line.strip()))
return numpy.array(weights, "d")
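# --- Editorial example (not part of the original module) ---------------------
# parse_tadm_weights() expects one float per line, which is how tadm reports
# its parameters; this sketch assumes numpy is importable.
def _parse_tadm_weights_sketch():
    import io

    return parse_tadm_weights(io.StringIO("0.25\n-1.5\n0.0\n"))
    # -> array([ 0.25, -1.5 ,  0.  ])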
def call_tadm(args):
"""
Call the ``tadm`` binary with the given arguments.
"""
if isinstance(args, str):
raise TypeError("args should be a list of strings")
if _tadm_bin is None:
config_tadm()
# Call tadm via a subprocess
cmd = [_tadm_bin] + args
    # Let tadm's stdout pass through, but capture stderr so it can be
    # reported if the command fails (otherwise `stderr` is always None).
    p = subprocess.Popen(cmd, stdout=sys.stdout, stderr=subprocess.PIPE)
    (stdout, stderr) = p.communicate()
    # Check the return code.
    if p.returncode != 0:
        print()
        print(stderr.decode(errors="replace"))
        raise OSError("tadm command failed!")
def names_demo():
from nltk.classify.maxent import TadmMaxentClassifier
from nltk.classify.util import names_demo
classifier = names_demo(TadmMaxentClassifier.train)
def encoding_demo():
import sys
from nltk.classify.maxent import TadmEventMaxentFeatureEncoding
tokens = [
({"f0": 1, "f1": 1, "f3": 1}, "A"),
({"f0": 1, "f2": 1, "f4": 1}, "B"),
({"f0": 2, "f2": 1, "f3": 1, "f4": 1}, "A"),
]
encoding = TadmEventMaxentFeatureEncoding.train(tokens)
write_tadm_file(tokens, encoding, sys.stdout)
print()
for i in range(encoding.length()):
print("%s --> %d" % (encoding.describe(i), i))
print()
if __name__ == "__main__":
encoding_demo()
names_demo()

View File

@@ -0,0 +1,193 @@
# Natural Language Toolkit: Language ID module using TextCat algorithm
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".
The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and the text yet to
be identified, then compares them using a distance measure.
Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.
For details regarding the algorithm, see:
https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
For details about An Crubadan, see:
https://borel.slu.edu/crubadan/index.html
"""
from sys import maxsize
from nltk.util import trigrams
# Note: this is NOT the standard library "re" module. The regex module
# is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex".
try:
import regex as re
except ImportError:
re = None
######################################################################
## Language identification using TextCat
######################################################################
class TextCat:
_corpus = None
fingerprints = {}
_START_CHAR = "<"
_END_CHAR = ">"
last_distances = {}
def __init__(self):
if not re:
raise OSError(
"classify.textcat requires the regex module that "
"supports unicode. Try '$ pip install regex' and "
"see https://pypi.python.org/pypi/regex for "
"further details."
)
from nltk.corpus import crubadan
self._corpus = crubadan
# Load all language ngrams into cache
for lang in self._corpus.langs():
self._corpus.lang_freq(lang)
def remove_punctuation(self, text):
"""Get rid of punctuation except apostrophes"""
return re.sub(r"[^\P{P}\']+", "", text)
def profile(self, text):
"""Create FreqDist of trigrams within text"""
from nltk import FreqDist, word_tokenize
clean_text = self.remove_punctuation(text)
tokens = word_tokenize(clean_text)
fingerprint = FreqDist()
for t in tokens:
token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
token_trigrams = ["".join(tri) for tri in token_trigram_tuples]
for cur_trigram in token_trigrams:
if cur_trigram in fingerprint:
fingerprint[cur_trigram] += 1
else:
fingerprint[cur_trigram] = 1
return fingerprint
def calc_dist(self, lang, trigram, text_profile):
"""Calculate the "out-of-place" measure between the
text and language profile for a single trigram"""
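        # Worked example (editorial note): if a trigram is the 3rd most
        # frequent in the language profile (index 2) and the 7th most
        # frequent in the text profile (index 6), its out-of-place distance
        # is |2 - 6| = 4; trigrams missing from the language profile
        # contribute sys.maxsize instead.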
lang_fd = self._corpus.lang_freq(lang)
dist = 0
if trigram in lang_fd:
idx_lang_profile = list(lang_fd.keys()).index(trigram)
idx_text = list(text_profile.keys()).index(trigram)
# print(idx_lang_profile, ", ", idx_text)
dist = abs(idx_lang_profile - idx_text)
else:
# Arbitrary but should be larger than
# any possible trigram file length
# in terms of total lines
dist = maxsize
return dist
def lang_dists(self, text):
"""Calculate the "out-of-place" measure between
the text and all languages"""
distances = {}
profile = self.profile(text)
# For all the languages
for lang in self._corpus._all_lang_freq.keys():
# Calculate distance metric for every trigram in
# input text to be identified
lang_dist = 0
for trigram in profile:
lang_dist += self.calc_dist(lang, trigram, profile)
distances[lang] = lang_dist
return distances
def guess_language(self, text):
"""Find the language with the min distance
to the text and return its ISO 639-3 code"""
self.last_distances = self.lang_dists(text)
return min(self.last_distances, key=self.last_distances.get)
#################################################
def demo():
from nltk.corpus import udhr
langs = [
"Kurdish-UTF8",
"Abkhaz-UTF8",
"Farsi_Persian-UTF8",
"Hindi-UTF8",
"Hawaiian-UTF8",
"Russian-UTF8",
"Vietnamese-UTF8",
"Serbian_Srpski-UTF8",
"Esperanto-UTF8",
]
friendly = {
"kmr": "Northern Kurdish",
"abk": "Abkhazian",
"pes": "Iranian Persian",
"hin": "Hindi",
"haw": "Hawaiian",
"rus": "Russian",
"vie": "Vietnamese",
"srp": "Serbian",
"epo": "Esperanto",
}
tc = TextCat()
for cur_lang in langs:
# Get raw data from UDHR corpus
raw_sentences = udhr.sents(cur_lang)
rows = len(raw_sentences) - 1
cols = list(map(len, raw_sentences))
sample = ""
# Generate a sample text of the language
for i in range(0, rows):
cur_sent = " " + " ".join([raw_sentences[i][j] for j in range(0, cols[i])])
sample += cur_sent
# Try to detect what it is
print("Language snippet: " + sample[0:140] + "...")
guess = tc.guess_language(sample)
print(f"Language detection: {guess} ({friendly[guess]})")
print("#" * 140)
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,347 @@
# Natural Language Toolkit: Classifier Utility Functions
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Utility functions and classes for classifiers.
"""
import math
# from nltk.util import Deprecated
import nltk.classify.util # for accuracy & log_likelihood
from nltk.util import LazyMap
######################################################################
# { Helper Functions
######################################################################
# alternative name possibility: 'map_featurefunc()'?
# alternative name possibility: 'detect_features()'?
# alternative name possibility: 'map_featuredetect()'?
# or.. just have users use LazyMap directly?
def apply_features(feature_func, toks, labeled=None):
"""
Use the ``LazyMap`` class to construct a lazy list-like
object that is analogous to ``map(feature_func, toks)``. In
particular, if ``labeled=False``, then the returned list-like
object's values are equal to::
[feature_func(tok) for tok in toks]
If ``labeled=True``, then the returned list-like object's values
are equal to::
[(feature_func(tok), label) for (tok, label) in toks]
The primary purpose of this function is to avoid the memory
overhead involved in storing all the featuresets for every token
in a corpus. Instead, these featuresets are constructed lazily,
as-needed. The reduction in memory overhead can be especially
significant when the underlying list of tokens is itself lazy (as
is the case with many corpus readers).
:param feature_func: The function that will be applied to each
token. It should return a featureset -- i.e., a dict
mapping feature names to feature values.
:param toks: The list of tokens to which ``feature_func`` should be
applied. If ``labeled=True``, then the list elements will be
passed directly to ``feature_func()``. If ``labeled=False``,
then the list elements should be tuples ``(tok,label)``, and
``tok`` will be passed to ``feature_func()``.
:param labeled: If true, then ``toks`` contains labeled tokens --
i.e., tuples of the form ``(tok, label)``. (Default:
auto-detect based on types.)
"""
if labeled is None:
labeled = toks and isinstance(toks[0], (tuple, list))
if labeled:
def lazy_func(labeled_token):
return (feature_func(labeled_token[0]), labeled_token[1])
return LazyMap(lazy_func, toks)
else:
return LazyMap(feature_func, toks)
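# --- Editorial example (not part of the original module) ---------------------
# A tiny sketch of apply_features() on a labeled toy corpus. The featuresets
# are built lazily by LazyMap, so the feature function only runs when an
# element is actually accessed.
def _apply_features_sketch():
    def detect(tok):
        return {"len": len(tok)}

    lazy = apply_features(detect, [("spam", "yes"), ("eggs", "no")])
    return lazy[0]  # -> ({'len': 4}, 'yes')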
def attested_labels(tokens):
"""
    :return: A tuple of all labels that are attested in the given list
        of tokens.
    :rtype: tuple of (immutable)
:param tokens: The list of classified tokens from which to extract
labels. A classified token has the form ``(token, label)``.
:type tokens: list
"""
return tuple({label for (tok, label) in tokens})
def log_likelihood(classifier, gold):
results = classifier.prob_classify_many([fs for (fs, l) in gold])
ll = [pdist.prob(l) for ((fs, l), pdist) in zip(gold, results)]
return math.log(sum(ll) / len(ll))
def accuracy(classifier, gold):
results = classifier.classify_many([fs for (fs, l) in gold])
correct = [l == r for ((fs, l), r) in zip(gold, results)]
if correct:
return sum(correct) / len(correct)
else:
return 0
class CutoffChecker:
"""
A helper class that implements cutoff checks based on number of
iterations and log likelihood.
Accuracy cutoffs are also implemented, but they're almost never
a good idea to use.
"""
def __init__(self, cutoffs):
self.cutoffs = cutoffs.copy()
if "min_ll" in cutoffs:
cutoffs["min_ll"] = -abs(cutoffs["min_ll"])
if "min_lldelta" in cutoffs:
cutoffs["min_lldelta"] = abs(cutoffs["min_lldelta"])
self.ll = None
self.acc = None
self.iter = 1
def check(self, classifier, train_toks):
cutoffs = self.cutoffs
self.iter += 1
if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
return True # iteration cutoff.
new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
if math.isnan(new_ll):
return True
if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
return True # log likelihood cutoff
if (
"min_lldelta" in cutoffs
and self.ll
and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"]))
):
return True # log likelihood delta cutoff
self.ll = new_ll
if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
            # Accuracy cutoffs need the accuracy itself, not the log likelihood.
            new_acc = nltk.classify.util.accuracy(classifier, train_toks)
            if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
                return True  # accuracy cutoff
            if (
                "min_accdelta" in cutoffs
                and self.acc
                and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"]))
            ):
                return True  # accuracy delta cutoff
self.acc = new_acc
return False # no cutoff reached.
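# --- Editorial example (not part of the original module) ---------------------
# A sketch of how an iterative trainer would typically drive CutoffChecker;
# ``train_one_iteration`` is a hypothetical callable that performs one
# training pass and returns the updated classifier.
def _train_until_cutoff(train_one_iteration, train_toks, cutoffs):
    checker = CutoffChecker(cutoffs)
    classifier = train_one_iteration()
    while not checker.check(classifier, train_toks):
        classifier = train_one_iteration()
    return classifier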
######################################################################
# { Demos
######################################################################
def names_demo_features(name):
features = {}
features["alwayson"] = True
features["startswith"] = name[0].lower()
features["endswith"] = name[-1].lower()
for letter in "abcdefghijklmnopqrstuvwxyz":
features["count(%s)" % letter] = name.lower().count(letter)
features["has(%s)" % letter] = letter in name.lower()
return features
def binary_names_demo_features(name):
features = {}
features["alwayson"] = True
features["startswith(vowel)"] = name[0].lower() in "aeiouy"
features["endswith(vowel)"] = name[-1].lower() in "aeiouy"
for letter in "abcdefghijklmnopqrstuvwxyz":
features["count(%s)" % letter] = name.lower().count(letter)
features["has(%s)" % letter] = letter in name.lower()
features["startswith(%s)" % letter] = letter == name[0].lower()
features["endswith(%s)" % letter] = letter == name[-1].lower()
return features
def names_demo(trainer, features=names_demo_features):
import random
from nltk.corpus import names
# Construct a list of classified names, using the names corpus.
namelist = [(name, "male") for name in names.words("male.txt")] + [
(name, "female") for name in names.words("female.txt")
]
# Randomly split the names into a test & train set.
random.seed(123456)
random.shuffle(namelist)
train = namelist[:5000]
test = namelist[5000:5500]
# Train up a classifier.
print("Training classifier...")
classifier = trainer([(features(n), g) for (n, g) in train])
# Run the classifier on the test data.
print("Testing classifier...")
acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
print("Accuracy: %6.4f" % acc)
# For classifiers that can find probabilities, show the log
# likelihood and some sample probability distributions.
try:
test_featuresets = [features(n) for (n, g) in test]
pdists = classifier.prob_classify_many(test_featuresets)
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
print()
print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
for (name, gender), pdist in list(zip(test, pdists))[:5]:
if gender == "male":
fmt = " %-15s *%6.4f %6.4f"
else:
fmt = " %-15s %6.4f *%6.4f"
print(fmt % (name, pdist.prob("male"), pdist.prob("female")))
except NotImplementedError:
pass
# Return the classifier
return classifier
def partial_names_demo(trainer, features=names_demo_features):
import random
from nltk.corpus import names
male_names = names.words("male.txt")
female_names = names.words("female.txt")
random.seed(654321)
random.shuffle(male_names)
random.shuffle(female_names)
# Create a list of male names to be used as positive-labeled examples for training
positive = map(features, male_names[:2000])
# Create a list of male and female names to be used as unlabeled examples
unlabeled = map(features, male_names[2000:2500] + female_names[:500])
# Create a test set with correctly-labeled male and female names
test = [(name, True) for name in male_names[2500:2750]] + [
(name, False) for name in female_names[500:750]
]
random.shuffle(test)
# Train up a classifier.
print("Training classifier...")
classifier = trainer(positive, unlabeled)
# Run the classifier on the test data.
print("Testing classifier...")
acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
print("Accuracy: %6.4f" % acc)
# For classifiers that can find probabilities, show the log
# likelihood and some sample probability distributions.
try:
test_featuresets = [features(n) for (n, m) in test]
pdists = classifier.prob_classify_many(test_featuresets)
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
print()
print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
        for (name, is_male), pdist in list(zip(test, pdists))[:5]:
            if is_male:
fmt = " %-15s *%6.4f %6.4f"
else:
fmt = " %-15s %6.4f *%6.4f"
print(fmt % (name, pdist.prob(True), pdist.prob(False)))
except NotImplementedError:
pass
# Return the classifier
return classifier
_inst_cache = {}
def wsd_demo(trainer, word, features, n=1000):
import random
from nltk.corpus import senseval
# Get the instances.
print("Reading data...")
global _inst_cache
if word not in _inst_cache:
_inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
instances = _inst_cache[word][:]
if n > len(instances):
n = len(instances)
senses = list({l for (i, l) in instances})
print(" Senses: " + " ".join(senses))
# Randomly split the names into a test & train set.
print("Splitting into test & train...")
random.seed(123456)
random.shuffle(instances)
train = instances[: int(0.8 * n)]
test = instances[int(0.8 * n) : n]
# Train up a classifier.
print("Training classifier...")
classifier = trainer([(features(i), l) for (i, l) in train])
# Run the classifier on the test data.
print("Testing classifier...")
acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
print("Accuracy: %6.4f" % acc)
# For classifiers that can find probabilities, show the log
# likelihood and some sample probability distributions.
try:
        test_featuresets = [features(i) for (i, label) in test]
pdists = classifier.prob_classify_many(test_featuresets)
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
except NotImplementedError:
pass
# Return the classifier
return classifier
def check_megam_config():
"""
Checks whether the MEGAM binary is configured.
"""
try:
_megam_bin
except NameError as e:
        err_msg = (
            "Please configure your megam binary first, e.g.\n"
            ">>> nltk.config_megam('/usr/bin/local/megam')"
        )
raise NameError(err_msg) from e

View File

@@ -0,0 +1,377 @@
# Natural Language Toolkit: Interface to Weka Classifiers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Classifiers that make use of the external 'Weka' package.
"""
import os
import re
import subprocess
import tempfile
import time
import zipfile
from sys import stdin
from nltk.classify.api import ClassifierI
from nltk.internals import config_java, java
from nltk.probability import DictionaryProbDist
_weka_classpath = None
_weka_search = [
".",
"/usr/share/weka",
"/usr/local/share/weka",
"/usr/lib/weka",
"/usr/local/lib/weka",
]
def config_weka(classpath=None):
global _weka_classpath
# Make sure java's configured first.
config_java()
if classpath is not None:
_weka_classpath = classpath
if _weka_classpath is None:
        # Copy the default search path so repeated calls don't keep
        # prepending WEKAHOME to the module-level list.
        searchpath = list(_weka_search)
if "WEKAHOME" in os.environ:
searchpath.insert(0, os.environ["WEKAHOME"])
for path in searchpath:
if os.path.exists(os.path.join(path, "weka.jar")):
_weka_classpath = os.path.join(path, "weka.jar")
version = _check_weka_version(_weka_classpath)
if version:
print(f"[Found Weka: {_weka_classpath} (version {version})]")
else:
print("[Found Weka: %s]" % _weka_classpath)
_check_weka_version(_weka_classpath)
if _weka_classpath is None:
raise LookupError(
"Unable to find weka.jar! Use config_weka() "
"or set the WEKAHOME environment variable. "
"For more information about Weka, please see "
"https://www.cs.waikato.ac.nz/ml/weka/"
)
def _check_weka_version(jar):
try:
zf = zipfile.ZipFile(jar)
except (SystemExit, KeyboardInterrupt):
raise
except:
return None
try:
try:
            return zf.read("weka/core/version.txt").decode(errors="replace").strip()
except KeyError:
return None
finally:
zf.close()
class WekaClassifier(ClassifierI):
def __init__(self, formatter, model_filename):
self._formatter = formatter
self._model = model_filename
def prob_classify_many(self, featuresets):
return self._classify_many(featuresets, ["-p", "0", "-distribution"])
def classify_many(self, featuresets):
return self._classify_many(featuresets, ["-p", "0"])
def _classify_many(self, featuresets, options):
# Make sure we can find java & weka.
config_weka()
temp_dir = tempfile.mkdtemp()
try:
# Write the test data file.
test_filename = os.path.join(temp_dir, "test.arff")
self._formatter.write(test_filename, featuresets)
# Call weka to classify the data.
cmd = [
"weka.classifiers.bayes.NaiveBayes",
"-l",
self._model,
"-T",
test_filename,
] + options
(stdout, stderr) = java(
cmd,
classpath=_weka_classpath,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
# Check if something went wrong:
if stderr and not stdout:
if "Illegal options: -distribution" in stderr:
raise ValueError(
"The installed version of weka does "
"not support probability distribution "
"output."
)
else:
raise ValueError("Weka failed to generate output:\n%s" % stderr)
# Parse weka's output.
return self.parse_weka_output(stdout.decode(stdin.encoding).split("\n"))
finally:
for f in os.listdir(temp_dir):
os.remove(os.path.join(temp_dir, f))
os.rmdir(temp_dir)
def parse_weka_distribution(self, s):
probs = [float(v) for v in re.split("[*,]+", s) if v.strip()]
probs = dict(zip(self._formatter.labels(), probs))
return DictionaryProbDist(probs)
def parse_weka_output(self, lines):
# Strip unwanted text from stdout
for i, line in enumerate(lines):
if line.strip().startswith("inst#"):
lines = lines[i:]
break
if lines[0].split() == ["inst#", "actual", "predicted", "error", "prediction"]:
return [line.split()[2].split(":")[1] for line in lines[1:] if line.strip()]
elif lines[0].split() == [
"inst#",
"actual",
"predicted",
"error",
"distribution",
]:
return [
self.parse_weka_distribution(line.split()[-1])
for line in lines[1:]
if line.strip()
]
        # Fallback for a headerless, single-prediction output format
        # (it is unclear whether this check is safe for all Weka versions).
elif re.match(r"^0 \w+ [01]\.[0-9]* \?\s*$", lines[0]):
return [line.split()[1] for line in lines if line.strip()]
else:
for line in lines[:10]:
print(line)
raise ValueError(
"Unhandled output format -- your version "
"of weka may not be supported.\n"
" Header: %s" % lines[0]
)
# [xx] full list of classifiers (some may be abstract?):
# ADTree, AODE, BayesNet, ComplementNaiveBayes, ConjunctiveRule,
# DecisionStump, DecisionTable, HyperPipes, IB1, IBk, Id3, J48,
# JRip, KStar, LBR, LeastMedSq, LinearRegression, LMT, Logistic,
# LogisticBase, M5Base, MultilayerPerceptron,
# MultipleClassifiersCombiner, NaiveBayes, NaiveBayesMultinomial,
# NaiveBayesSimple, NBTree, NNge, OneR, PaceRegression, PART,
# PreConstructedLinearModel, Prism, RandomForest,
# RandomizableClassifier, RandomTree, RBFNetwork, REPTree, Ridor,
# RuleNode, SimpleLinearRegression, SimpleLogistic,
# SingleClassifierEnhancer, SMO, SMOreg, UserClassifier, VFI,
# VotedPerceptron, Winnow, ZeroR
_CLASSIFIER_CLASS = {
"naivebayes": "weka.classifiers.bayes.NaiveBayes",
"C4.5": "weka.classifiers.trees.J48",
"log_regression": "weka.classifiers.functions.Logistic",
"svm": "weka.classifiers.functions.SMO",
"kstar": "weka.classifiers.lazy.KStar",
"ripper": "weka.classifiers.rules.JRip",
}
@classmethod
def train(
cls,
model_filename,
featuresets,
classifier="naivebayes",
options=[],
quiet=True,
):
# Make sure we can find java & weka.
config_weka()
# Build an ARFF formatter.
formatter = ARFF_Formatter.from_train(featuresets)
temp_dir = tempfile.mkdtemp()
try:
# Write the training data file.
train_filename = os.path.join(temp_dir, "train.arff")
formatter.write(train_filename, featuresets)
if classifier in cls._CLASSIFIER_CLASS:
javaclass = cls._CLASSIFIER_CLASS[classifier]
elif classifier in cls._CLASSIFIER_CLASS.values():
javaclass = classifier
else:
raise ValueError("Unknown classifier %s" % classifier)
# Train the weka model.
cmd = [javaclass, "-d", model_filename, "-t", train_filename]
cmd += list(options)
if quiet:
stdout = subprocess.PIPE
else:
stdout = None
java(cmd, classpath=_weka_classpath, stdout=stdout)
# Return the new classifier.
return WekaClassifier(formatter, model_filename)
finally:
for f in os.listdir(temp_dir):
os.remove(os.path.join(temp_dir, f))
os.rmdir(temp_dir)
class ARFF_Formatter:
"""
Converts featuresets and labeled featuresets to ARFF-formatted
strings, appropriate for input into Weka.
Features and classes can be specified manually in the constructor, or may
be determined from data using ``from_train``.
"""
def __init__(self, labels, features):
"""
:param labels: A list of all class labels that can be generated.
:param features: A list of feature specifications, where
each feature specification is a tuple (fname, ftype);
and ftype is an ARFF type string such as NUMERIC or
STRING.
"""
self._labels = labels
self._features = features
def format(self, tokens):
"""Returns a string representation of ARFF output for the given data."""
return self.header_section() + self.data_section(tokens)
def labels(self):
"""Returns the list of classes."""
return list(self._labels)
def write(self, outfile, tokens):
"""Writes ARFF data to a file for the given data."""
if not hasattr(outfile, "write"):
outfile = open(outfile, "w")
outfile.write(self.format(tokens))
outfile.close()
@staticmethod
def from_train(tokens):
"""
Constructs an ARFF_Formatter instance with class labels and feature
types determined from the given data. Handles boolean, numeric and
string (note: not nominal) types.
"""
# Find the set of all attested labels.
labels = {label for (tok, label) in tokens}
# Determine the types of all features.
features = {}
for tok, label in tokens:
for fname, fval in tok.items():
if issubclass(type(fval), bool):
ftype = "{True, False}"
elif issubclass(type(fval), (int, float, bool)):
ftype = "NUMERIC"
elif issubclass(type(fval), str):
ftype = "STRING"
elif fval is None:
continue # can't tell the type.
else:
                    raise ValueError("Unsupported value type %r" % type(fval))
if features.get(fname, ftype) != ftype:
raise ValueError("Inconsistent type for %s" % fname)
features[fname] = ftype
features = sorted(features.items())
return ARFF_Formatter(labels, features)
def header_section(self):
"""Returns an ARFF header as a string."""
# Header comment.
s = (
"% Weka ARFF file\n"
+ "% Generated automatically by NLTK\n"
+ "%% %s\n\n" % time.ctime()
)
# Relation name
s += "@RELATION rel\n\n"
# Input attribute specifications
for fname, ftype in self._features:
s += "@ATTRIBUTE %-30r %s\n" % (fname, ftype)
# Label attribute specification
s += "@ATTRIBUTE %-30r {%s}\n" % ("-label-", ",".join(self._labels))
return s
def data_section(self, tokens, labeled=None):
"""
Returns the ARFF data section for the given data.
:param tokens: a list of featuresets (dicts) or labelled featuresets
which are tuples (featureset, label).
:param labeled: Indicates whether the given tokens are labeled
or not. If None, then the tokens will be assumed to be
labeled if the first token's value is a tuple or list.
"""
# Check if the tokens are labeled or unlabeled. If unlabeled,
# then use 'None'
if labeled is None:
labeled = tokens and isinstance(tokens[0], (tuple, list))
if not labeled:
tokens = [(tok, None) for tok in tokens]
# Data section
s = "\n@DATA\n"
for tok, label in tokens:
for fname, ftype in self._features:
s += "%s," % self._fmt_arff_val(tok.get(fname))
s += "%s\n" % self._fmt_arff_val(label)
return s
    def _fmt_arff_val(self, fval):
        if fval is None:
            return "?"
        elif isinstance(fval, (bool, int)):
            return "%s" % fval
        else:
            # Floats and everything else (e.g. strings) use repr().
            return "%r" % fval
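# --- Editorial example (not part of the original module) ---------------------
# A small sketch of ARFF_Formatter on a toy labeled corpus: from_train() infers
# the attribute types, and format() returns the header plus the data section.
def _arff_formatter_sketch():
    toks = [({"len": 4, "cap": True}, "yes"), ({"len": 3, "cap": False}, "no")]
    formatter = ARFF_Formatter.from_train(toks)
    return formatter.format(toks)  # ARFF text with @ATTRIBUTE and @DATA lines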
if __name__ == "__main__":
from nltk.classify.util import binary_names_demo_features, names_demo
def make_classifier(featuresets):
return WekaClassifier.train("/tmp/name.model", featuresets, "C4.5")
classifier = names_demo(make_classifier, binary_names_demo_features)