updates
Backend/venv/lib/python3.12/site-packages/nltk/classify/__init__.py (new file, 101 lines)
@@ -0,0 +1,101 @@
|
||||
# Natural Language Toolkit: Classifiers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Classes and interfaces for labeling tokens with category labels (or
|
||||
"class labels"). Typically, labels are represented with strings
|
||||
(such as ``'health'`` or ``'sports'``). Classifiers can be used to
|
||||
perform a wide range of classification tasks. For example,
|
||||
classifiers can be used...
|
||||
|
||||
- to classify documents by topic
|
||||
- to classify ambiguous words by which word sense is intended
|
||||
- to classify acoustic signals by which phoneme they represent
|
||||
- to classify sentences by their author
|
||||
|
||||
Features
|
||||
========
|
||||
In order to decide which category label is appropriate for a given
|
||||
token, classifiers examine one or more 'features' of the token. These
|
||||
"features" are typically chosen by hand, and indicate which aspects
|
||||
of the token are relevant to the classification decision. For
|
||||
example, a document classifier might use a separate feature for each
|
||||
word, recording how often that word occurred in the document.
|
||||
|
||||
Featuresets
|
||||
===========
|
||||
The features describing a token are encoded using a "featureset",
|
||||
which is a dictionary that maps from "feature names" to "feature
|
||||
values". Feature names are unique strings that indicate what aspect
|
||||
of the token is encoded by the feature. Examples include
|
||||
``'prevword'``, for a feature whose value is the previous word; and
|
||||
``'contains-word(library)'`` for a feature that is true when a document
|
||||
contains the word ``'library'``. Feature values are typically
|
||||
booleans, numbers, or strings, depending on which feature they
|
||||
describe.
|
||||
|
||||
Featuresets are typically constructed using a "feature detector"
|
||||
(also known as a "feature extractor"). A feature detector is a
|
||||
function that takes a token (and sometimes information about its
|
||||
context) as its input, and returns a featureset describing that token.
|
||||
For example, the following feature detector converts a document
|
||||
(stored as a list of words) to a featureset describing the set of
|
||||
words included in the document:
|
||||
|
||||
>>> # Define a feature detector function.
|
||||
>>> def document_features(document):
|
||||
... return dict([('contains-word(%s)' % w, True) for w in document])
|
||||
|
||||
Feature detectors are typically applied to each token before it is fed
|
||||
to the classifier:
|
||||
|
||||
>>> # Classify each Gutenberg document.
|
||||
>>> from nltk.corpus import gutenberg
|
||||
>>> for fileid in gutenberg.fileids(): # doctest: +SKIP
|
||||
... doc = gutenberg.words(fileid) # doctest: +SKIP
|
||||
... print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP
|
||||
|
||||
The parameters that a feature detector expects will vary, depending on
|
||||
the task and the needs of the feature detector. For example, a
|
||||
feature detector for word sense disambiguation (WSD) might take as its
|
||||
input a sentence, and the index of a word that should be classified,
|
||||
and return a featureset for that word. The following feature detector
|
||||
for WSD includes features describing the left and right contexts of
|
||||
the target word:
|
||||
|
||||
>>> def wsd_features(sentence, index):
|
||||
... featureset = {}
|
||||
... for i in range(max(0, index-3), index):
|
||||
... featureset['left-context(%s)' % sentence[i]] = True
|
||||
...     for i in range(index, min(index+3, len(sentence))):
|
||||
... featureset['right-context(%s)' % sentence[i]] = True
|
||||
... return featureset
|
||||
|
||||
Training Classifiers
|
||||
====================
|
||||
Most classifiers are built by training them on a list of hand-labeled
|
||||
examples, known as the "training set". Training sets are represented
|
||||
as lists of ``(featuredict, label)`` tuples.
|
||||
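
For example (a toy sketch; the featuresets and labels below are invented
purely for illustration):

    >>> train_set = [({'contains-word(ball)': True}, 'sports'),
    ...              ({'contains-word(election)': True}, 'politics')]
    >>> from nltk.classify import NaiveBayesClassifier # doctest: +SKIP
    >>> classifier = NaiveBayesClassifier.train(train_set) # doctest: +SKIP
    >>> classifier.classify({'contains-word(ball)': True}) # doctest: +SKIP
    'sports'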
"""
|
||||
|
||||
from nltk.classify.api import ClassifierI, MultiClassifierI
|
||||
from nltk.classify.decisiontree import DecisionTreeClassifier
|
||||
from nltk.classify.maxent import (
|
||||
BinaryMaxentFeatureEncoding,
|
||||
ConditionalExponentialClassifier,
|
||||
MaxentClassifier,
|
||||
TypedMaxentFeatureEncoding,
|
||||
)
|
||||
from nltk.classify.megam import call_megam, config_megam
|
||||
from nltk.classify.naivebayes import NaiveBayesClassifier
|
||||
from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier
|
||||
from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features
|
||||
from nltk.classify.scikitlearn import SklearnClassifier
|
||||
from nltk.classify.senna import Senna
|
||||
from nltk.classify.textcat import TextCat
|
||||
from nltk.classify.util import accuracy, apply_features, log_likelihood
|
||||
from nltk.classify.weka import WekaClassifier, config_weka
|
||||
Backend/venv/lib/python3.12/site-packages/nltk/classify/api.py (new file, 195 lines)
@@ -0,0 +1,195 @@
|
||||
# Natural Language Toolkit: Classifier Interface
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Interfaces for labeling tokens with category labels (or "class labels").
|
||||
|
||||
``ClassifierI`` is a standard interface for "single-category
|
||||
classification", in which the set of categories is known, the number
|
||||
of categories is finite, and each text belongs to exactly one
|
||||
category.
|
||||
|
||||
``MultiClassifierI`` is a standard interface for "multi-category
|
||||
classification", which is like single-category classification except
|
||||
that each text belongs to zero or more categories.
|
||||
"""
|
||||
from nltk.internals import overridden
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
# { Classification Interfaces
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class ClassifierI:
|
||||
"""
|
||||
A processing interface for labeling tokens with a single category
|
||||
label (or "class"). Labels are typically strs or
|
||||
ints, but can be any immutable type. The set of labels
|
||||
that the classifier chooses from must be fixed and finite.
|
||||
|
||||
Subclasses must define:
|
||||
- ``labels()``
|
||||
- either ``classify()`` or ``classify_many()`` (or both)
|
||||
|
||||
Subclasses may define:
|
||||
- either ``prob_classify()`` or ``prob_classify_many()`` (or both)
|
||||
"""
|
||||
|
||||
def labels(self):
|
||||
"""
|
||||
:return: the list of category labels used by this classifier.
|
||||
:rtype: list of (immutable)
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def classify(self, featureset):
|
||||
"""
|
||||
:return: the most appropriate label for the given featureset.
|
||||
:rtype: label
|
||||
"""
|
||||
if overridden(self.classify_many):
|
||||
return self.classify_many([featureset])[0]
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
def prob_classify(self, featureset):
|
||||
"""
|
||||
:return: a probability distribution over labels for the given
|
||||
featureset.
|
||||
:rtype: ProbDistI
|
||||
"""
|
||||
if overridden(self.prob_classify_many):
|
||||
return self.prob_classify_many([featureset])[0]
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
def classify_many(self, featuresets):
|
||||
"""
|
||||
Apply ``self.classify()`` to each element of ``featuresets``. I.e.:
|
||||
|
||||
return [self.classify(fs) for fs in featuresets]
|
||||
|
||||
:rtype: list(label)
|
||||
"""
|
||||
return [self.classify(fs) for fs in featuresets]
|
||||
|
||||
def prob_classify_many(self, featuresets):
|
||||
"""
|
||||
Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.:
|
||||
|
||||
return [self.prob_classify(fs) for fs in featuresets]
|
||||
|
||||
:rtype: list(ProbDistI)
|
||||
"""
|
||||
return [self.prob_classify(fs) for fs in featuresets]
|
||||
|
||||
|
||||
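# A minimal sketch of a ``ClassifierI`` implementation (illustrative only;
# this class is not part of NLTK). Subclasses need ``labels()`` plus
# ``classify()`` or ``classify_many()``:
#
#     class ConstantClassifier(ClassifierI):
#         """Assigns the same label to every featureset."""
#
#         def __init__(self, label):
#             self._label = label
#
#         def labels(self):
#             return [self._label]
#
#         def classify(self, featureset):
#             return self._label
#
# With this, ``classify_many()`` works through the default implementation
# above, while the probabilistic methods remain optional and unimplemented.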
class MultiClassifierI:
|
||||
"""
|
||||
A processing interface for labeling tokens with zero or more
|
||||
category labels (or "labels"). Labels are typically strs
|
||||
or ints, but can be any immutable type. The set of labels
|
||||
that the multi-classifier chooses from must be fixed and finite.
|
||||
|
||||
Subclasses must define:
|
||||
- ``labels()``
|
||||
- either ``classify()`` or ``classify_many()`` (or both)
|
||||
|
||||
Subclasses may define:
|
||||
- either ``prob_classify()`` or ``prob_classify_many()`` (or both)
|
||||
"""
|
||||
|
||||
def labels(self):
|
||||
"""
|
||||
:return: the list of category labels used by this classifier.
|
||||
:rtype: list of (immutable)
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def classify(self, featureset):
|
||||
"""
|
||||
:return: the most appropriate set of labels for the given featureset.
|
||||
:rtype: set(label)
|
||||
"""
|
||||
if overridden(self.classify_many):
|
||||
return self.classify_many([featureset])[0]
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
def prob_classify(self, featureset):
|
||||
"""
|
||||
:return: a probability distribution over sets of labels for the
|
||||
given featureset.
|
||||
:rtype: ProbDistI
|
||||
"""
|
||||
if overridden(self.prob_classify_many):
|
||||
return self.prob_classify_many([featureset])[0]
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
def classify_many(self, featuresets):
|
||||
"""
|
||||
Apply ``self.classify()`` to each element of ``featuresets``. I.e.:
|
||||
|
||||
return [self.classify(fs) for fs in featuresets]
|
||||
|
||||
:rtype: list(set(label))
|
||||
"""
|
||||
return [self.classify(fs) for fs in featuresets]
|
||||
|
||||
def prob_classify_many(self, featuresets):
|
||||
"""
|
||||
Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.:
|
||||
|
||||
return [self.prob_classify(fs) for fs in featuresets]
|
||||
|
||||
:rtype: list(ProbDistI)
|
||||
"""
|
||||
return [self.prob_classify(fs) for fs in featuresets]
|
||||
|
||||
|
||||
# # [XX] IN PROGRESS:
|
||||
# class SequenceClassifierI:
|
||||
# """
|
||||
# A processing interface for labeling sequences of tokens with a
|
||||
# single category label (or "class"). Labels are typically
|
||||
# strs or ints, but can be any immutable type. The set
|
||||
# of labels that the classifier chooses from must be fixed and
|
||||
# finite.
|
||||
# """
|
||||
# def labels(self):
|
||||
# """
|
||||
# :return: the list of category labels used by this classifier.
|
||||
# :rtype: list of (immutable)
|
||||
# """
|
||||
# raise NotImplementedError()
|
||||
|
||||
# def prob_classify(self, featureset):
|
||||
# """
|
||||
# Return a probability distribution over labels for the given
|
||||
# featureset.
|
||||
|
||||
# If ``featureset`` is a list of featuresets, then return a
|
||||
# corresponding list containing the probability distribution
|
||||
# over labels for each of the given featuresets, where the
|
||||
# *i*\ th element of this list is the most appropriate label for
|
||||
# the *i*\ th element of ``featuresets``.
|
||||
# """
|
||||
# raise NotImplementedError()
|
||||
|
||||
# def classify(self, featureset):
|
||||
# """
|
||||
# Return the most appropriate label for the given featureset.
|
||||
|
||||
# If ``featureset`` is a list of featuresets, then return a
|
||||
# corresponding list containing the most appropriate label for
|
||||
# each of the given featuresets, where the *i*\ th element of
|
||||
# this list is the most appropriate label for the *i*\ th element
|
||||
# of ``featuresets``.
|
||||
# """
|
||||
# raise NotImplementedError()
|
||||
Backend/venv/lib/python3.12/site-packages/nltk/classify/decisiontree.py (new file, 349 lines)
@@ -0,0 +1,349 @@
|
||||
# Natural Language Toolkit: Decision Tree Classifiers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A classifier model that decides which label to assign to a token on
|
||||
the basis of a tree structure, where branches correspond to conditions
|
||||
on feature values, and leaves correspond to label assignments.
|
||||
"""
|
||||
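# Typical usage goes through the ``train()`` static method defined below.
# A hedged sketch with toy data (not a doctest from this module; the
# non-default cutoffs are needed because the toy set is tiny):
#
#     from nltk.classify import DecisionTreeClassifier
#     toy = [({'size': 'big', 'striped': True}, 'tiger'),
#            ({'size': 'big', 'striped': False}, 'lion'),
#            ({'size': 'small', 'striped': True}, 'cat')]
#     tree = DecisionTreeClassifier.train(toy, entropy_cutoff=0.0, support_cutoff=0)
#     tree.classify({'size': 'big', 'striped': True})   # expected: 'tiger'
#     print(tree.pretty_format())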
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.classify.api import ClassifierI
|
||||
from nltk.probability import FreqDist, MLEProbDist, entropy
|
||||
|
||||
|
||||
class DecisionTreeClassifier(ClassifierI):
|
||||
def __init__(self, label, feature_name=None, decisions=None, default=None):
|
||||
"""
|
||||
:param label: The most likely label for tokens that reach
|
||||
this node in the decision tree. If this decision tree
|
||||
has no children, then this label will be assigned to
|
||||
any token that reaches this decision tree.
|
||||
:param feature_name: The name of the feature that this
|
||||
decision tree selects for.
|
||||
:param decisions: A dictionary mapping from feature values
|
||||
for the feature identified by ``feature_name`` to
|
||||
child decision trees.
|
||||
:param default: The child that will be used if the value of
|
||||
feature ``feature_name`` does not match any of the keys in
|
||||
``decisions``. This is used when constructing binary
|
||||
decision trees.
|
||||
"""
|
||||
self._label = label
|
||||
self._fname = feature_name
|
||||
self._decisions = decisions
|
||||
self._default = default
|
||||
|
||||
def labels(self):
|
||||
labels = [self._label]
|
||||
if self._decisions is not None:
|
||||
for dt in self._decisions.values():
|
||||
labels.extend(dt.labels())
|
||||
if self._default is not None:
|
||||
labels.extend(self._default.labels())
|
||||
return list(set(labels))
|
||||
|
||||
def classify(self, featureset):
|
||||
# Decision leaf:
|
||||
if self._fname is None:
|
||||
return self._label
|
||||
|
||||
# Decision tree:
|
||||
fval = featureset.get(self._fname)
|
||||
if fval in self._decisions:
|
||||
return self._decisions[fval].classify(featureset)
|
||||
elif self._default is not None:
|
||||
return self._default.classify(featureset)
|
||||
else:
|
||||
return self._label
|
||||
|
||||
def error(self, labeled_featuresets):
|
||||
errors = 0
|
||||
for featureset, label in labeled_featuresets:
|
||||
if self.classify(featureset) != label:
|
||||
errors += 1
|
||||
return errors / len(labeled_featuresets)
|
||||
|
||||
def pretty_format(self, width=70, prefix="", depth=4):
|
||||
"""
|
||||
Return a string containing a pretty-printed version of this
|
||||
decision tree. Each line in this string corresponds to a
|
||||
single decision tree node or leaf, and indentation is used to
|
||||
display the structure of the decision tree.
|
||||
"""
|
||||
# [xx] display default!!
|
||||
if self._fname is None:
|
||||
n = width - len(prefix) - 15
|
||||
return "{}{} {}\n".format(prefix, "." * n, self._label)
|
||||
s = ""
|
||||
for i, (fval, result) in enumerate(
|
||||
sorted(
|
||||
self._decisions.items(),
|
||||
key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
|
||||
)
|
||||
):
|
||||
hdr = f"{prefix}{self._fname}={fval}? "
|
||||
n = width - 15 - len(hdr)
|
||||
s += "{}{} {}\n".format(hdr, "." * (n), result._label)
|
||||
if result._fname is not None and depth > 1:
|
||||
s += result.pretty_format(width, prefix + " ", depth - 1)
|
||||
if self._default is not None:
|
||||
n = width - len(prefix) - 21
|
||||
s += "{}else: {} {}\n".format(prefix, "." * n, self._default._label)
|
||||
if self._default._fname is not None and depth > 1:
|
||||
s += self._default.pretty_format(width, prefix + " ", depth - 1)
|
||||
return s
|
||||
|
||||
def pseudocode(self, prefix="", depth=4):
|
||||
"""
|
||||
Return a string representation of this decision tree that
|
||||
expresses the decisions it makes as a nested set of pseudocode
|
||||
if statements.
|
||||
"""
|
||||
if self._fname is None:
|
||||
return f"{prefix}return {self._label!r}\n"
|
||||
s = ""
|
||||
for fval, result in sorted(
|
||||
self._decisions.items(),
|
||||
key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
|
||||
):
|
||||
s += f"{prefix}if {self._fname} == {fval!r}: "
|
||||
if result._fname is not None and depth > 1:
|
||||
s += "\n" + result.pseudocode(prefix + " ", depth - 1)
|
||||
else:
|
||||
s += f"return {result._label!r}\n"
|
||||
if self._default is not None:
|
||||
if len(self._decisions) == 1:
|
||||
s += "{}if {} != {!r}: ".format(
|
||||
prefix, self._fname, list(self._decisions.keys())[0]
|
||||
)
|
||||
else:
|
||||
s += f"{prefix}else: "
|
||||
if self._default._fname is not None and depth > 1:
|
||||
s += "\n" + self._default.pseudocode(prefix + " ", depth - 1)
|
||||
else:
|
||||
s += f"return {self._default._label!r}\n"
|
||||
return s
|
||||
|
||||
def __str__(self):
|
||||
return self.pretty_format()
|
||||
|
||||
@staticmethod
|
||||
def train(
|
||||
labeled_featuresets,
|
||||
entropy_cutoff=0.05,
|
||||
depth_cutoff=100,
|
||||
support_cutoff=10,
|
||||
binary=False,
|
||||
feature_values=None,
|
||||
verbose=False,
|
||||
):
|
||||
"""
|
||||
:param binary: If true, then treat all feature/value pairs as
|
||||
individual binary features, rather than using a single n-way
|
||||
branch for each feature.
|
||||
"""
|
||||
# Collect a list of all feature names.
|
||||
feature_names = set()
|
||||
for featureset, label in labeled_featuresets:
|
||||
for fname in featureset:
|
||||
feature_names.add(fname)
|
||||
|
||||
# Collect a list of the values each feature can take.
|
||||
if feature_values is None and binary:
|
||||
feature_values = defaultdict(set)
|
||||
for featureset, label in labeled_featuresets:
|
||||
for fname, fval in featureset.items():
|
||||
feature_values[fname].add(fval)
|
||||
|
||||
# Start with a stump.
|
||||
if not binary:
|
||||
tree = DecisionTreeClassifier.best_stump(
|
||||
feature_names, labeled_featuresets, verbose
|
||||
)
|
||||
else:
|
||||
tree = DecisionTreeClassifier.best_binary_stump(
|
||||
feature_names, labeled_featuresets, feature_values, verbose
|
||||
)
|
||||
|
||||
# Refine the stump.
|
||||
tree.refine(
|
||||
labeled_featuresets,
|
||||
entropy_cutoff,
|
||||
depth_cutoff - 1,
|
||||
support_cutoff,
|
||||
binary,
|
||||
feature_values,
|
||||
verbose,
|
||||
)
|
||||
|
||||
# Return it
|
||||
return tree
|
||||
|
||||
@staticmethod
|
||||
def leaf(labeled_featuresets):
|
||||
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
|
||||
return DecisionTreeClassifier(label)
|
||||
|
||||
@staticmethod
|
||||
def stump(feature_name, labeled_featuresets):
|
||||
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
|
||||
|
||||
# Find the best label for each value.
|
||||
freqs = defaultdict(FreqDist) # freq(label|value)
|
||||
for featureset, label in labeled_featuresets:
|
||||
feature_value = featureset.get(feature_name)
|
||||
freqs[feature_value][label] += 1
|
||||
|
||||
decisions = {val: DecisionTreeClassifier(freqs[val].max()) for val in freqs}
|
||||
return DecisionTreeClassifier(label, feature_name, decisions)
|
||||
|
||||
def refine(
|
||||
self,
|
||||
labeled_featuresets,
|
||||
entropy_cutoff,
|
||||
depth_cutoff,
|
||||
support_cutoff,
|
||||
binary=False,
|
||||
feature_values=None,
|
||||
verbose=False,
|
||||
):
|
||||
if len(labeled_featuresets) <= support_cutoff:
|
||||
return
|
||||
if self._fname is None:
|
||||
return
|
||||
if depth_cutoff <= 0:
|
||||
return
|
||||
for fval in self._decisions:
|
||||
fval_featuresets = [
|
||||
(featureset, label)
|
||||
for (featureset, label) in labeled_featuresets
|
||||
if featureset.get(self._fname) == fval
|
||||
]
|
||||
|
||||
label_freqs = FreqDist(label for (featureset, label) in fval_featuresets)
|
||||
if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
|
||||
self._decisions[fval] = DecisionTreeClassifier.train(
|
||||
fval_featuresets,
|
||||
entropy_cutoff,
|
||||
depth_cutoff,
|
||||
support_cutoff,
|
||||
binary,
|
||||
feature_values,
|
||||
verbose,
|
||||
)
|
||||
if self._default is not None:
|
||||
default_featuresets = [
|
||||
(featureset, label)
|
||||
for (featureset, label) in labeled_featuresets
|
||||
if featureset.get(self._fname) not in self._decisions
|
||||
]
|
||||
label_freqs = FreqDist(label for (featureset, label) in default_featuresets)
|
||||
if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
|
||||
self._default = DecisionTreeClassifier.train(
|
||||
default_featuresets,
|
||||
entropy_cutoff,
|
||||
depth_cutoff,
|
||||
support_cutoff,
|
||||
binary,
|
||||
feature_values,
|
||||
verbose,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def best_stump(feature_names, labeled_featuresets, verbose=False):
|
||||
best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
|
||||
best_error = best_stump.error(labeled_featuresets)
|
||||
for fname in feature_names:
|
||||
stump = DecisionTreeClassifier.stump(fname, labeled_featuresets)
|
||||
stump_error = stump.error(labeled_featuresets)
|
||||
if stump_error < best_error:
|
||||
best_error = stump_error
|
||||
best_stump = stump
|
||||
if verbose:
|
||||
print(
|
||||
"best stump for {:6d} toks uses {:20} err={:6.4f}".format(
|
||||
len(labeled_featuresets), best_stump._fname, best_error
|
||||
)
|
||||
)
|
||||
return best_stump
|
||||
|
||||
@staticmethod
|
||||
def binary_stump(feature_name, feature_value, labeled_featuresets):
|
||||
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
|
||||
|
||||
# Find the best label for each value.
|
||||
pos_fdist = FreqDist()
|
||||
neg_fdist = FreqDist()
|
||||
for featureset, label in labeled_featuresets:
|
||||
if featureset.get(feature_name) == feature_value:
|
||||
pos_fdist[label] += 1
|
||||
else:
|
||||
neg_fdist[label] += 1
|
||||
|
||||
decisions = {}
|
||||
default = label
|
||||
# But hopefully we have observations!
|
||||
if pos_fdist.N() > 0:
|
||||
decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
|
||||
if neg_fdist.N() > 0:
|
||||
default = DecisionTreeClassifier(neg_fdist.max())
|
||||
|
||||
return DecisionTreeClassifier(label, feature_name, decisions, default)
|
||||
|
||||
@staticmethod
|
||||
def best_binary_stump(
|
||||
feature_names, labeled_featuresets, feature_values, verbose=False
|
||||
):
|
||||
best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
|
||||
best_error = best_stump.error(labeled_featuresets)
|
||||
for fname in feature_names:
|
||||
for fval in feature_values[fname]:
|
||||
stump = DecisionTreeClassifier.binary_stump(
|
||||
fname, fval, labeled_featuresets
|
||||
)
|
||||
stump_error = stump.error(labeled_featuresets)
|
||||
if stump_error < best_error:
|
||||
best_error = stump_error
|
||||
best_stump = stump
|
||||
if verbose:
|
||||
if best_stump._decisions:
|
||||
descr = "{}={}".format(
|
||||
best_stump._fname, list(best_stump._decisions.keys())[0]
|
||||
)
|
||||
else:
|
||||
descr = "(default)"
|
||||
print(
|
||||
"best stump for {:6d} toks uses {:20} err={:6.4f}".format(
|
||||
len(labeled_featuresets), descr, best_error
|
||||
)
|
||||
)
|
||||
return best_stump
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Demo
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def f(x):
|
||||
return DecisionTreeClassifier.train(x, binary=True, verbose=True)
|
||||
|
||||
|
||||
def demo():
|
||||
from nltk.classify.util import binary_names_demo_features, names_demo
|
||||
|
||||
classifier = names_demo(
|
||||
f, binary_names_demo_features # DecisionTreeClassifier.train,
|
||||
)
|
||||
print(classifier.pretty_format(depth=7))
|
||||
print(classifier.pseudocode(depth=7))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
Backend/venv/lib/python3.12/site-packages/nltk/classify/maxent.py (new file, 1631 lines)
File diff suppressed because it is too large.
Backend/venv/lib/python3.12/site-packages/nltk/classify/megam.py (new file, 184 lines)
@@ -0,0 +1,184 @@
|
||||
# Natural Language Toolkit: Interface to Megam Classifier
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A set of functions used to interface with the external megam_ maxent
|
||||
optimization package. Before megam can be used, you should tell NLTK where it
|
||||
can find the megam binary, using the ``config_megam()`` function. Typical
|
||||
usage:
|
||||
|
||||
>>> from nltk.classify import megam
|
||||
>>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP
|
||||
[Found megam: ...]
|
||||
|
||||
Use with MaxentClassifier. Example below, see MaxentClassifier documentation
|
||||
for details.
|
||||
|
||||
nltk.classify.MaxentClassifier.train(corpus, 'megam')
|
||||
|
||||
.. _megam: https://www.umiacs.umd.edu/~hal/megam/index.html
|
||||
"""
|
||||
import subprocess
|
||||
|
||||
from nltk.internals import find_binary
|
||||
|
||||
try:
|
||||
import numpy
|
||||
except ImportError:
|
||||
numpy = None
|
||||
|
||||
######################################################################
|
||||
# { Configuration
|
||||
######################################################################
|
||||
|
||||
_megam_bin = None
|
||||
|
||||
|
||||
def config_megam(bin=None):
|
||||
"""
|
||||
Configure NLTK's interface to the ``megam`` maxent optimization
|
||||
package.
|
||||
|
||||
:param bin: The full path to the ``megam`` binary. If not specified,
|
||||
then nltk will search the system for a ``megam`` binary; and if
|
||||
one is not found, it will raise a ``LookupError`` exception.
|
||||
:type bin: str
|
||||
"""
|
||||
global _megam_bin
|
||||
_megam_bin = find_binary(
|
||||
"megam",
|
||||
bin,
|
||||
env_vars=["MEGAM"],
|
||||
binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"],
|
||||
url="https://www.umiacs.umd.edu/~hal/megam/index.html",
|
||||
)
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Megam Interface Functions
|
||||
######################################################################
|
||||
|
||||
|
||||
def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True):
|
||||
"""
|
||||
Generate an input file for ``megam`` based on the given corpus of
|
||||
classified tokens.
|
||||
|
||||
:type train_toks: list(tuple(dict, str))
|
||||
:param train_toks: Training data, represented as a list of
|
||||
pairs, the first member of which is a feature dictionary,
|
||||
and the second of which is a classification label.
|
||||
|
||||
:type encoding: MaxentFeatureEncodingI
|
||||
:param encoding: A feature encoding, used to convert featuresets
|
||||
into feature vectors. May optionally implement a cost() method
|
||||
in order to assign different costs to different class predictions.
|
||||
|
||||
:type stream: stream
|
||||
:param stream: The stream to which the megam input file should be
|
||||
written.
|
||||
|
||||
:param bernoulli: If true, then use the 'bernoulli' format. I.e.,
|
||||
all joint features have binary values, and are listed iff they
|
||||
are true. Otherwise, list feature values explicitly. If
|
||||
``bernoulli=False``, then you must call ``megam`` with the
|
||||
``-fvals`` option.
|
||||
|
||||
:param explicit: If true, then use the 'explicit' format. I.e.,
|
||||
list the features that would fire for any of the possible
|
||||
labels, for each token. If ``explicit=True``, then you must
|
||||
call ``megam`` with the ``-explicit`` option.
|
||||
"""
|
||||
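# Illustrative note (inferred from the logic below, not from the megam
# documentation): with ``explicit=True`` and ``bernoulli=True``, and no
# cost() method on the encoding, each instance becomes one line of the form
#
#     <label number> # <ids of features firing for label 0> # <ids for label 1> ...
#
# e.g. "1 # 0 7 12 # 0 9 12".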
# Look up the set of labels.
|
||||
labels = encoding.labels()
|
||||
labelnum = {label: i for (i, label) in enumerate(labels)}
|
||||
|
||||
# Write the file, which contains one line per instance.
|
||||
for featureset, label in train_toks:
|
||||
# First, the label number (or, in the weighted multiclass case, the cost of each label).
|
||||
if hasattr(encoding, "cost"):
|
||||
stream.write(
|
||||
":".join(str(encoding.cost(featureset, label, l)) for l in labels)
|
||||
)
|
||||
else:
|
||||
stream.write("%d" % labelnum[label])
|
||||
|
||||
# For implicit file formats, just list the features that fire
|
||||
# for this instance's actual label.
|
||||
if not explicit:
|
||||
_write_megam_features(encoding.encode(featureset, label), stream, bernoulli)
|
||||
|
||||
# For explicit formats, list the features that would fire for
|
||||
# any of the possible labels.
|
||||
else:
|
||||
for l in labels:
|
||||
stream.write(" #")
|
||||
_write_megam_features(encoding.encode(featureset, l), stream, bernoulli)
|
||||
|
||||
# End of the instance.
|
||||
stream.write("\n")
|
||||
|
||||
|
||||
def parse_megam_weights(s, features_count, explicit=True):
|
||||
"""
|
||||
Given the stdout output generated by ``megam`` when training a
|
||||
model, return a ``numpy`` array containing the corresponding weight
|
||||
vector. This function does not currently handle bias features.
|
||||
"""
|
||||
if numpy is None:
|
||||
raise ValueError("This function requires that numpy be installed")
|
||||
assert explicit, "non-explicit not supported yet"
|
||||
lines = s.strip().split("\n")
|
||||
weights = numpy.zeros(features_count, "d")
|
||||
for line in lines:
|
||||
if line.strip():
|
||||
fid, weight = line.split()
|
||||
weights[int(fid)] = float(weight)
|
||||
return weights
|
||||
|
||||
|
||||
def _write_megam_features(vector, stream, bernoulli):
|
||||
if not vector:
|
||||
raise ValueError(
|
||||
"MEGAM classifier requires the use of an " "always-on feature."
|
||||
)
|
||||
for fid, fval in vector:
|
||||
if bernoulli:
|
||||
if fval == 1:
|
||||
stream.write(" %s" % fid)
|
||||
elif fval != 0:
|
||||
raise ValueError(
|
||||
"If bernoulli=True, then all" "features must be binary."
|
||||
)
|
||||
else:
|
||||
stream.write(f" {fid} {fval}")
|
||||
|
||||
|
||||
def call_megam(args):
|
||||
"""
|
||||
Call the ``megam`` binary with the given arguments.
|
||||
"""
|
||||
if isinstance(args, str):
|
||||
raise TypeError("args should be a list of strings")
|
||||
if _megam_bin is None:
|
||||
config_megam()
|
||||
|
||||
# Call megam via a subprocess
|
||||
cmd = [_megam_bin] + args
|
||||
p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
|
||||
(stdout, stderr) = p.communicate()
|
||||
|
||||
# Check the return code.
|
||||
if p.returncode != 0:
|
||||
print()
|
||||
print(stderr)
|
||||
raise OSError("megam command failed!")
|
||||
|
||||
if isinstance(stdout, str):
|
||||
return stdout
|
||||
else:
|
||||
return stdout.decode("utf-8")
|
||||
Backend/venv/lib/python3.12/site-packages/nltk/classify/naivebayes.py (new file, 260 lines)
@@ -0,0 +1,260 @@
|
||||
# Natural Language Toolkit: Naive Bayes Classifiers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A classifier based on the Naive Bayes algorithm. In order to find the
|
||||
probability for a label, this algorithm first uses the Bayes rule to
|
||||
express P(label|features) in terms of P(label) and P(features|label):
|
||||
|
||||
| P(label) * P(features|label)
|
||||
| P(label|features) = ------------------------------
|
||||
| P(features)
|
||||
|
||||
The algorithm then makes the 'naive' assumption that all features are
|
||||
independent, given the label:
|
||||
|
||||
| P(label) * P(f1|label) * ... * P(fn|label)
|
||||
| P(label|features) = --------------------------------------------
|
||||
| P(features)
|
||||
|
||||
Rather than computing P(features) explicitly, the algorithm just
|
||||
calculates the numerator for each label, and normalizes them so they
|
||||
sum to one:
|
||||
|
||||
| P(label) * P(f1|label) * ... * P(fn|label)
|
||||
| P(label|features) = --------------------------------------------
|
||||
| SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
|
||||
"""
|
||||
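# Worked illustration of the normalization above (made-up numbers): if the
# numerators come out as P('spam') * P(f1|'spam') * P(f2|'spam') = 0.006 and
# P('ham') * P(f1|'ham') * P(f2|'ham') = 0.002, the reported distribution is
# P('spam'|features) = 0.006 / (0.006 + 0.002) = 0.75 and
# P('ham'|features) = 0.002 / 0.008 = 0.25.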
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.classify.api import ClassifierI
|
||||
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Naive Bayes Classifier
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class NaiveBayesClassifier(ClassifierI):
|
||||
"""
|
||||
A Naive Bayes classifier. Naive Bayes classifiers are
|
||||
parameterized by two probability distributions:
|
||||
|
||||
- P(label) gives the probability that an input will receive each
|
||||
label, given no information about the input's features.
|
||||
|
||||
- P(fname=fval|label) gives the probability that a given feature
|
||||
(fname) will receive a given value (fval), given the
|
||||
label (label).
|
||||
|
||||
If the classifier encounters an input with a feature that has
|
||||
never been seen with any label, then rather than assigning a
|
||||
probability of 0 to all labels, it will ignore that feature.
|
||||
|
||||
The feature value 'None' is reserved for unseen feature values;
|
||||
you generally should not use 'None' as a feature value for one of
|
||||
your own features.
|
||||
"""
|
||||
|
||||
def __init__(self, label_probdist, feature_probdist):
|
||||
"""
|
||||
:param label_probdist: P(label), the probability distribution
|
||||
over labels. It is expressed as a ``ProbDistI`` whose
|
||||
samples are labels. I.e., P(label) =
|
||||
``label_probdist.prob(label)``.
|
||||
|
||||
:param feature_probdist: P(fname=fval|label), the probability
|
||||
distribution for feature values, given labels. It is
|
||||
expressed as a dictionary whose keys are ``(label, fname)``
|
||||
pairs and whose values are ``ProbDistI`` objects over feature
|
||||
values. I.e., P(fname=fval|label) =
|
||||
``feature_probdist[label,fname].prob(fval)``. If a given
|
||||
``(label,fname)`` is not a key in ``feature_probdist``, then
|
||||
it is assumed that the corresponding P(fname=fval|label)
|
||||
is 0 for all values of ``fval``.
|
||||
"""
|
||||
self._label_probdist = label_probdist
|
||||
self._feature_probdist = feature_probdist
|
||||
self._labels = list(label_probdist.samples())
|
||||
|
||||
def labels(self):
|
||||
return self._labels
|
||||
|
||||
def classify(self, featureset):
|
||||
return self.prob_classify(featureset).max()
|
||||
|
||||
def prob_classify(self, featureset):
|
||||
# Discard any feature names that we've never seen before.
|
||||
# Otherwise, we'll just assign a probability of 0 to
|
||||
# everything.
|
||||
featureset = featureset.copy()
|
||||
for fname in list(featureset.keys()):
|
||||
for label in self._labels:
|
||||
if (label, fname) in self._feature_probdist:
|
||||
break
|
||||
else:
|
||||
# print('Ignoring unseen feature %s' % fname)
|
||||
del featureset[fname]
|
||||
|
||||
# Find the log probability of each label, given the features.
|
||||
# Start with the log probability of the label itself.
|
||||
logprob = {}
|
||||
for label in self._labels:
|
||||
logprob[label] = self._label_probdist.logprob(label)
|
||||
|
||||
# Then add in the log probability of features given labels.
|
||||
for label in self._labels:
|
||||
for fname, fval in featureset.items():
|
||||
if (label, fname) in self._feature_probdist:
|
||||
feature_probs = self._feature_probdist[label, fname]
|
||||
logprob[label] += feature_probs.logprob(fval)
|
||||
else:
|
||||
# nb: This case will never come up if the
|
||||
# classifier was created by
|
||||
# NaiveBayesClassifier.train().
|
||||
logprob[label] += sum_logs([]) # = -INF.
|
||||
|
||||
return DictionaryProbDist(logprob, normalize=True, log=True)
|
||||
|
||||
def show_most_informative_features(self, n=10):
|
||||
# Determine the most relevant features, and display them.
|
||||
cpdist = self._feature_probdist
|
||||
print("Most Informative Features")
|
||||
|
||||
for fname, fval in self.most_informative_features(n):
|
||||
|
||||
def labelprob(l):
|
||||
return cpdist[l, fname].prob(fval)
|
||||
|
||||
labels = sorted(
|
||||
(l for l in self._labels if fval in cpdist[l, fname].samples()),
|
||||
key=lambda element: (-labelprob(element), element),
|
||||
reverse=True,
|
||||
)
|
||||
if len(labels) == 1:
|
||||
continue
|
||||
l0 = labels[0]
|
||||
l1 = labels[-1]
|
||||
if cpdist[l0, fname].prob(fval) == 0:
|
||||
ratio = "INF"
|
||||
else:
|
||||
ratio = "%8.1f" % (
|
||||
cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
|
||||
)
|
||||
print(
|
||||
"%24s = %-14r %6s : %-6s = %s : 1.0"
|
||||
% (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
|
||||
)
|
||||
|
||||
def most_informative_features(self, n=100):
|
||||
"""
|
||||
Return a list of the 'most informative' features used by this
|
||||
classifier. For the purpose of this function, the
|
||||
informativeness of a feature ``(fname,fval)`` is equal to the
|
||||
highest value of P(fname=fval|label), for any label, divided by
|
||||
the lowest value of P(fname=fval|label), for any label:
|
||||
|
||||
| max[ P(fname=fval|label1) / P(fname=fval|label2) ]
|
||||
"""
|
||||
if hasattr(self, "_most_informative_features"):
|
||||
return self._most_informative_features[:n]
|
||||
else:
|
||||
# The set of (fname, fval) pairs used by this classifier.
|
||||
features = set()
|
||||
# The max & min probability associated w/ each (fname, fval)
|
||||
# pair. Maps (fname,fval) -> float.
|
||||
maxprob = defaultdict(float)
|
||||
minprob = defaultdict(lambda: 1.0)
|
||||
|
||||
for (label, fname), probdist in self._feature_probdist.items():
|
||||
for fval in probdist.samples():
|
||||
feature = (fname, fval)
|
||||
features.add(feature)
|
||||
p = probdist.prob(fval)
|
||||
maxprob[feature] = max(p, maxprob[feature])
|
||||
minprob[feature] = min(p, minprob[feature])
|
||||
if minprob[feature] == 0:
|
||||
features.discard(feature)
|
||||
|
||||
# Convert features to a list, & sort it by how informative
|
||||
# features are.
|
||||
self._most_informative_features = sorted(
|
||||
features,
|
||||
key=lambda feature_: (
|
||||
minprob[feature_] / maxprob[feature_],
|
||||
feature_[0],
|
||||
feature_[1] in [None, False, True],
|
||||
str(feature_[1]).lower(),
|
||||
),
|
||||
)
|
||||
return self._most_informative_features[:n]
|
||||
|
||||
@classmethod
|
||||
def train(cls, labeled_featuresets, estimator=ELEProbDist):
|
||||
"""
|
||||
:param labeled_featuresets: A list of classified featuresets,
|
||||
i.e., a list of tuples ``(featureset, label)``.
|
||||
"""
|
||||
label_freqdist = FreqDist()
|
||||
feature_freqdist = defaultdict(FreqDist)
|
||||
feature_values = defaultdict(set)
|
||||
fnames = set()
|
||||
|
||||
# Count up how many times each feature value occurred, given
|
||||
# the label and featurename.
|
||||
for featureset, label in labeled_featuresets:
|
||||
label_freqdist[label] += 1
|
||||
for fname, fval in featureset.items():
|
||||
# Increment freq(fval|label, fname)
|
||||
feature_freqdist[label, fname][fval] += 1
|
||||
# Record that fname can take the value fval.
|
||||
feature_values[fname].add(fval)
|
||||
# Keep a list of all feature names.
|
||||
fnames.add(fname)
|
||||
|
||||
# If a feature didn't have a value given for an instance, then
|
||||
# we assume that it gets the implicit value 'None.' This loop
|
||||
# counts up the number of 'missing' feature values for each
|
||||
# (label,fname) pair, and increments the count of the fval
|
||||
# 'None' by that amount.
|
||||
for label in label_freqdist:
|
||||
num_samples = label_freqdist[label]
|
||||
for fname in fnames:
|
||||
count = feature_freqdist[label, fname].N()
|
||||
# Only add a None key when necessary, i.e. if there are
|
||||
# any samples with feature 'fname' missing.
|
||||
if num_samples - count > 0:
|
||||
feature_freqdist[label, fname][None] += num_samples - count
|
||||
feature_values[fname].add(None)
|
||||
|
||||
# Create the P(label) distribution
|
||||
label_probdist = estimator(label_freqdist)
|
||||
|
||||
# Create the P(fval|label, fname) distribution
|
||||
feature_probdist = {}
|
||||
for (label, fname), freqdist in feature_freqdist.items():
|
||||
probdist = estimator(freqdist, bins=len(feature_values[fname]))
|
||||
feature_probdist[label, fname] = probdist
|
||||
|
||||
return cls(label_probdist, feature_probdist)
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Demo
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo():
|
||||
from nltk.classify.util import names_demo
|
||||
|
||||
classifier = names_demo(NaiveBayesClassifier.train)
|
||||
classifier.show_most_informative_features()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
Backend/venv/lib/python3.12/site-packages/nltk/classify/positivenaivebayes.py (new file, 180 lines)
@@ -0,0 +1,180 @@
|
||||
# Natural Language Toolkit: Positive Naive Bayes Classifier
|
||||
#
|
||||
# Copyright (C) 2012 NLTK Project
|
||||
# Author: Alessandro Presta <alessandro.presta@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A variant of the Naive Bayes Classifier that performs binary classification with
|
||||
partially-labeled training sets. In other words, assume we want to build a classifier
|
||||
that assigns each example to one of two complementary classes (e.g., male names and
|
||||
female names).
|
||||
If we have a training set with labeled examples for both classes, we can use a
|
||||
standard Naive Bayes Classifier. However, consider the case when we only have labeled
|
||||
examples for one of the classes, and other, unlabeled, examples.
|
||||
Then, assuming a prior distribution on the two labels, we can use the unlabeled set
|
||||
to estimate the frequencies of the various features.
|
||||
|
||||
Let the two possible labels be 1 and 0, and let's say we only have examples labeled 1
|
||||
and unlabeled examples. We are also given an estimate of P(1).
|
||||
|
||||
We compute P(feature|1) exactly as in the standard case.
|
||||
|
||||
To compute P(feature|0), we first estimate P(feature) from the unlabeled set (we are
|
||||
assuming that the unlabeled examples are drawn according to the given prior distribution)
|
||||
and then express the conditional probability as:
|
||||
|
||||
| P(feature) - P(feature|1) * P(1)
|
||||
| P(feature|0) = ----------------------------------
|
||||
| P(0)
|
||||
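
For instance, with made-up numbers: if the unlabeled set gives P(feature) = 0.4,
and P(feature|1) = 0.6 with P(1) = 0.5, then
P(feature|0) = (0.4 - 0.6 * 0.5) / 0.5 = 0.2.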
|
||||
Example:
|
||||
|
||||
>>> from nltk.classify import PositiveNaiveBayesClassifier
|
||||
|
||||
Some sentences about sports:
|
||||
|
||||
>>> sports_sentences = [ 'The team dominated the game',
|
||||
... 'They lost the ball',
|
||||
... 'The game was intense',
|
||||
... 'The goalkeeper catched the ball',
|
||||
... 'The other team controlled the ball' ]
|
||||
|
||||
Mixed topics, including sports:
|
||||
|
||||
>>> various_sentences = [ 'The President did not comment',
|
||||
... 'I lost the keys',
|
||||
... 'The team won the game',
|
||||
... 'Sara has two kids',
|
||||
... 'The ball went off the court',
|
||||
... 'They had the ball for the whole game',
|
||||
... 'The show is over' ]
|
||||
|
||||
The features of a sentence are simply the words it contains:
|
||||
|
||||
>>> def features(sentence):
|
||||
... words = sentence.lower().split()
|
||||
... return dict(('contains(%s)' % w, True) for w in words)
|
||||
|
||||
We use the sports sentences as positive examples, the mixed ones as unlabeled examples:
|
||||
|
||||
>>> positive_featuresets = map(features, sports_sentences)
|
||||
>>> unlabeled_featuresets = map(features, various_sentences)
|
||||
>>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
|
||||
... unlabeled_featuresets)
|
||||
|
||||
Is the following sentence about sports?
|
||||
|
||||
>>> classifier.classify(features('The cat is on the table'))
|
||||
False
|
||||
|
||||
What about this one?
|
||||
|
||||
>>> classifier.classify(features('My team lost the game'))
|
||||
True
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.classify.naivebayes import NaiveBayesClassifier
|
||||
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Positive Naive Bayes Classifier
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class PositiveNaiveBayesClassifier(NaiveBayesClassifier):
|
||||
@staticmethod
|
||||
def train(
|
||||
positive_featuresets,
|
||||
unlabeled_featuresets,
|
||||
positive_prob_prior=0.5,
|
||||
estimator=ELEProbDist,
|
||||
):
|
||||
"""
|
||||
:param positive_featuresets: An iterable of featuresets that are known as positive
|
||||
examples (i.e., their label is ``True``).
|
||||
|
||||
:param unlabeled_featuresets: An iterable of featuresets whose label is unknown.
|
||||
|
||||
:param positive_prob_prior: A prior estimate of the probability of the label
|
||||
``True`` (default 0.5).
|
||||
"""
|
||||
positive_feature_freqdist = defaultdict(FreqDist)
|
||||
unlabeled_feature_freqdist = defaultdict(FreqDist)
|
||||
feature_values = defaultdict(set)
|
||||
fnames = set()
|
||||
|
||||
# Count up how many times each feature value occurred in positive examples.
|
||||
num_positive_examples = 0
|
||||
for featureset in positive_featuresets:
|
||||
for fname, fval in featureset.items():
|
||||
positive_feature_freqdist[fname][fval] += 1
|
||||
feature_values[fname].add(fval)
|
||||
fnames.add(fname)
|
||||
num_positive_examples += 1
|
||||
|
||||
# Count up how many times each feature value occurred in unlabeled examples.
|
||||
num_unlabeled_examples = 0
|
||||
for featureset in unlabeled_featuresets:
|
||||
for fname, fval in featureset.items():
|
||||
unlabeled_feature_freqdist[fname][fval] += 1
|
||||
feature_values[fname].add(fval)
|
||||
fnames.add(fname)
|
||||
num_unlabeled_examples += 1
|
||||
|
||||
# If a feature didn't have a value given for an instance, then we assume that
|
||||
# it gets the implicit value 'None'.
|
||||
for fname in fnames:
|
||||
count = positive_feature_freqdist[fname].N()
|
||||
positive_feature_freqdist[fname][None] += num_positive_examples - count
|
||||
feature_values[fname].add(None)
|
||||
|
||||
for fname in fnames:
|
||||
count = unlabeled_feature_freqdist[fname].N()
|
||||
unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count
|
||||
feature_values[fname].add(None)
|
||||
|
||||
negative_prob_prior = 1.0 - positive_prob_prior
|
||||
|
||||
# Create the P(label) distribution.
|
||||
label_probdist = DictionaryProbDist(
|
||||
{True: positive_prob_prior, False: negative_prob_prior}
|
||||
)
|
||||
|
||||
# Create the P(fval|label, fname) distribution.
|
||||
feature_probdist = {}
|
||||
for fname, freqdist in positive_feature_freqdist.items():
|
||||
probdist = estimator(freqdist, bins=len(feature_values[fname]))
|
||||
feature_probdist[True, fname] = probdist
|
||||
|
||||
for fname, freqdist in unlabeled_feature_freqdist.items():
|
||||
global_probdist = estimator(freqdist, bins=len(feature_values[fname]))
|
||||
negative_feature_probs = {}
|
||||
for fval in feature_values[fname]:
|
||||
prob = (
|
||||
global_probdist.prob(fval)
|
||||
- positive_prob_prior * feature_probdist[True, fname].prob(fval)
|
||||
) / negative_prob_prior
|
||||
# TODO: We need to add some kind of smoothing here, instead of
|
||||
# setting negative probabilities to zero and normalizing.
|
||||
negative_feature_probs[fval] = max(prob, 0.0)
|
||||
feature_probdist[False, fname] = DictionaryProbDist(
|
||||
negative_feature_probs, normalize=True
|
||||
)
|
||||
|
||||
return PositiveNaiveBayesClassifier(label_probdist, feature_probdist)
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Demo
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo():
|
||||
from nltk.classify.util import partial_names_demo
|
||||
|
||||
classifier = partial_names_demo(PositiveNaiveBayesClassifier.train)
|
||||
classifier.show_most_informative_features()
|
||||
Backend/venv/lib/python3.12/site-packages/nltk/classify/rte_classify.py (new file, 183 lines)
@@ -0,0 +1,183 @@
|
||||
# Natural Language Toolkit: RTE Classifier
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Simple classifier for RTE corpus.
|
||||
|
||||
It calculates the overlap in words and named entities between text and
|
||||
hypothesis, and also whether there are words / named entities in the
|
||||
hypothesis which fail to occur in the text, since this is an indicator that
|
||||
the hypothesis is more informative than (i.e., not entailed by) the text.
|
||||
|
||||
TO DO: better Named Entity classification
|
||||
TO DO: add lemmatization
|
||||
"""
|
||||
|
||||
from nltk.classify.maxent import MaxentClassifier
|
||||
from nltk.classify.util import accuracy
|
||||
from nltk.tokenize import RegexpTokenizer
|
||||
|
||||
|
||||
class RTEFeatureExtractor:
|
||||
"""
|
||||
This builds a bag of words for both the text and the hypothesis after
|
||||
throwing away some stopwords, then calculates overlap and difference.
|
||||
"""
|
||||
|
||||
def __init__(self, rtepair, stop=True, use_lemmatize=False):
|
||||
"""
|
||||
:param rtepair: a ``RTEPair`` from which features should be extracted
|
||||
:param stop: if ``True``, stopwords are thrown away.
|
||||
:type stop: bool
|
||||
"""
|
||||
self.stop = stop
|
||||
self.stopwords = {
|
||||
"a",
|
||||
"the",
|
||||
"it",
|
||||
"they",
|
||||
"of",
|
||||
"in",
|
||||
"to",
|
||||
"is",
|
||||
"have",
|
||||
"are",
|
||||
"were",
|
||||
"and",
|
||||
"very",
|
||||
".",
|
||||
",",
|
||||
}
|
||||
|
||||
self.negwords = {"no", "not", "never", "failed", "rejected", "denied"}
|
||||
# Try to tokenize so that abbreviations, monetary amounts, email
|
||||
# addresses, URLs are single tokens.
|
||||
tokenizer = RegexpTokenizer(r"[\w.@:/]+|\w+|\$[\d.]+")
|
||||
|
||||
# Get the set of word types for text and hypothesis
|
||||
self.text_tokens = tokenizer.tokenize(rtepair.text)
|
||||
self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
|
||||
self.text_words = set(self.text_tokens)
|
||||
self.hyp_words = set(self.hyp_tokens)
|
||||
|
||||
if use_lemmatize:
|
||||
self.text_words = {self._lemmatize(token) for token in self.text_tokens}
|
||||
self.hyp_words = {self._lemmatize(token) for token in self.hyp_tokens}
|
||||
|
||||
if self.stop:
|
||||
self.text_words = self.text_words - self.stopwords
|
||||
self.hyp_words = self.hyp_words - self.stopwords
|
||||
|
||||
self._overlap = self.hyp_words & self.text_words
|
||||
self._hyp_extra = self.hyp_words - self.text_words
|
||||
self._txt_extra = self.text_words - self.hyp_words
|
||||
|
||||
def overlap(self, toktype, debug=False):
|
||||
"""
|
||||
Compute the overlap between text and hypothesis.
|
||||
|
||||
:param toktype: distinguish Named Entities from ordinary words
|
||||
:type toktype: 'ne' or 'word'
|
||||
"""
|
||||
ne_overlap = {token for token in self._overlap if self._ne(token)}
|
||||
if toktype == "ne":
|
||||
if debug:
|
||||
print("ne overlap", ne_overlap)
|
||||
return ne_overlap
|
||||
elif toktype == "word":
|
||||
if debug:
|
||||
print("word overlap", self._overlap - ne_overlap)
|
||||
return self._overlap - ne_overlap
|
||||
else:
|
||||
raise ValueError("Type not recognized:'%s'" % toktype)
|
||||
|
||||
def hyp_extra(self, toktype, debug=True):
|
||||
"""
|
||||
Compute the extraneous material in the hypothesis.
|
||||
|
||||
:param toktype: distinguish Named Entities from ordinary words
|
||||
:type toktype: 'ne' or 'word'
|
||||
"""
|
||||
ne_extra = {token for token in self._hyp_extra if self._ne(token)}
|
||||
if toktype == "ne":
|
||||
return ne_extra
|
||||
elif toktype == "word":
|
||||
return self._hyp_extra - ne_extra
|
||||
else:
|
||||
raise ValueError("Type not recognized: '%s'" % toktype)
|
||||
|
||||
@staticmethod
|
||||
def _ne(token):
|
||||
"""
|
||||
This just assumes that words in all caps or titles are
|
||||
named entities.
|
||||
|
||||
:type token: str
|
||||
"""
|
||||
if token.istitle() or token.isupper():
|
||||
return True
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def _lemmatize(word):
|
||||
"""
|
||||
Use morphy from WordNet to find the base form of verbs.
|
||||
"""
|
||||
from nltk.corpus import wordnet as wn
|
||||
|
||||
lemma = wn.morphy(word, pos=wn.VERB)
|
||||
if lemma is not None:
|
||||
return lemma
|
||||
return word
|
||||
|
||||
|
||||
def rte_features(rtepair):
|
||||
extractor = RTEFeatureExtractor(rtepair)
|
||||
features = {}
|
||||
features["alwayson"] = True
|
||||
features["word_overlap"] = len(extractor.overlap("word"))
|
||||
features["word_hyp_extra"] = len(extractor.hyp_extra("word"))
|
||||
features["ne_overlap"] = len(extractor.overlap("ne"))
|
||||
features["ne_hyp_extra"] = len(extractor.hyp_extra("ne"))
|
||||
features["neg_txt"] = len(extractor.negwords & extractor.text_words)
|
||||
features["neg_hyp"] = len(extractor.negwords & extractor.hyp_words)
|
||||
return features
|
||||
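# A hedged sketch of what the function above produces (hypothetical pair, not
# taken from the RTE corpus; only ``text`` and ``hyp`` attributes are needed):
#
#     class _FakePair:
#         text = "Google acquired YouTube in 2006"
#         hyp = "Google owns YouTube"
#     rte_features(_FakePair())
#     # -> {'alwayson': True, 'word_overlap': ..., 'word_hyp_extra': ...,
#     #     'ne_overlap': ..., 'ne_hyp_extra': ..., 'neg_txt': ..., 'neg_hyp': ...}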
|
||||
|
||||
def rte_featurize(rte_pairs):
|
||||
return [(rte_features(pair), pair.value) for pair in rte_pairs]
|
||||
|
||||
|
||||
def rte_classifier(algorithm, sample_N=None):
|
||||
from nltk.corpus import rte as rte_corpus
|
||||
|
||||
train_set = rte_corpus.pairs(["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"])
|
||||
test_set = rte_corpus.pairs(["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"])
|
||||
|
||||
if sample_N is not None:
|
||||
train_set = train_set[:sample_N]
|
||||
test_set = test_set[:sample_N]
|
||||
|
||||
featurized_train_set = rte_featurize(train_set)
|
||||
featurized_test_set = rte_featurize(test_set)
|
||||
|
||||
# Train the classifier
|
||||
print("Training classifier...")
|
||||
if algorithm in ["megam"]: # MEGAM based algorithms.
|
||||
clf = MaxentClassifier.train(featurized_train_set, algorithm)
|
||||
elif algorithm in ["GIS", "IIS"]: # Use default GIS/IIS MaxEnt algorithm
|
||||
clf = MaxentClassifier.train(featurized_train_set, algorithm)
|
||||
else:
|
||||
err_msg = str(
|
||||
"RTEClassifier only supports these algorithms:\n "
|
||||
"'megam', 'GIS', 'IIS'.\n"
|
||||
)
|
||||
raise Exception(err_msg)
|
||||
print("Testing classifier...")
|
||||
acc = accuracy(clf, featurized_test_set)
|
||||
print("Accuracy: %6.4f" % acc)
|
||||
return clf
|
||||
Backend/venv/lib/python3.12/site-packages/nltk/classify/scikitlearn.py (new file, 143 lines)
@@ -0,0 +1,143 @@
|
||||
# Natural Language Toolkit: Interface to scikit-learn classifiers
|
||||
#
|
||||
# Author: Lars Buitinck <L.J.Buitinck@uva.nl>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""
|
||||
scikit-learn (https://scikit-learn.org) is a machine learning library for
|
||||
Python. It supports many classification algorithms, including SVMs,
|
||||
Naive Bayes, logistic regression (MaxEnt) and decision trees.
|
||||
|
||||
This package implements a wrapper around scikit-learn classifiers. To use this
|
||||
wrapper, construct a scikit-learn estimator object, then use that to construct
|
||||
a SklearnClassifier. E.g., to wrap a linear SVM with default settings:
|
||||
|
||||
>>> from sklearn.svm import LinearSVC
|
||||
>>> from nltk.classify.scikitlearn import SklearnClassifier
|
||||
>>> classif = SklearnClassifier(LinearSVC())
|
||||
|
||||
A scikit-learn classifier may include preprocessing steps when it's wrapped
|
||||
in a Pipeline object. The following constructs and wraps a Naive Bayes text
|
||||
classifier with tf-idf weighting and chi-square feature selection to get the
|
||||
best 1000 features:
|
||||
|
||||
>>> from sklearn.feature_extraction.text import TfidfTransformer
|
||||
>>> from sklearn.feature_selection import SelectKBest, chi2
|
||||
>>> from sklearn.naive_bayes import MultinomialNB
|
||||
>>> from sklearn.pipeline import Pipeline
|
||||
>>> pipeline = Pipeline([('tfidf', TfidfTransformer()),
|
||||
... ('chi2', SelectKBest(chi2, k=1000)),
|
||||
... ('nb', MultinomialNB())])
|
||||
>>> classif = SklearnClassifier(pipeline)
|
||||
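
The wrapper is trained on the usual list of ``(featureset, label)`` pairs.
A toy sketch (invented data; outputs shown only for illustration, hence the
skipped doctests):

>>> train_data = [({'a': 4, 'b': 1, 'c': 0}, 'ham'),
...               ({'a': 5, 'b': 2, 'c': 1}, 'ham'),
...               ({'a': 0, 'b': 3, 'c': 4}, 'spam')]
>>> classif = SklearnClassifier(LinearSVC()).train(train_data) # doctest: +SKIP
>>> classif.classify_many([{'a': 3, 'b': 1, 'c': 1}, {'a': 0, 'b': 4, 'c': 3}]) # doctest: +SKIP
['ham', 'spam']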
"""
|
||||
|
||||
from nltk.classify.api import ClassifierI
|
||||
from nltk.probability import DictionaryProbDist
|
||||
|
||||
try:
|
||||
from sklearn.feature_extraction import DictVectorizer
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
__all__ = ["SklearnClassifier"]
|
||||
|
||||
|
||||
class SklearnClassifier(ClassifierI):
|
||||
"""Wrapper for scikit-learn classifiers."""
|
||||
|
||||
def __init__(self, estimator, dtype=float, sparse=True):
|
||||
"""
|
||||
:param estimator: scikit-learn classifier object.
|
||||
|
||||
:param dtype: data type used when building feature array.
|
||||
scikit-learn estimators work exclusively on numeric data. The
|
||||
default value should be fine for almost all situations.
|
||||
|
||||
:param sparse: Whether to use sparse matrices internally.
|
||||
The estimator must support these; not all scikit-learn classifiers
|
||||
do (see their respective documentation and look for "sparse
|
||||
matrix"). The default value is True, since most NLP problems
|
||||
involve sparse feature sets. Setting this to False may take a
|
||||
great amount of memory.
|
||||
:type sparse: boolean.
|
||||
"""
|
||||
self._clf = estimator
|
||||
self._encoder = LabelEncoder()
|
||||
self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
|
||||
|
||||
def __repr__(self):
|
||||
return "<SklearnClassifier(%r)>" % self._clf
|
||||
|
||||
def classify_many(self, featuresets):
|
||||
"""Classify a batch of samples.
|
||||
|
||||
:param featuresets: An iterable over featuresets, each a dict mapping
|
||||
strings to either numbers, booleans or strings.
|
||||
:return: The predicted class label for each input sample.
|
||||
:rtype: list
|
||||
"""
|
||||
X = self._vectorizer.transform(featuresets)
|
||||
classes = self._encoder.classes_
|
||||
return [classes[i] for i in self._clf.predict(X)]
|
||||
|
||||
def prob_classify_many(self, featuresets):
|
||||
"""Compute per-class probabilities for a batch of samples.
|
||||
|
||||
:param featuresets: An iterable over featuresets, each a dict mapping
|
||||
strings to either numbers, booleans or strings.
|
||||
:rtype: list of ``ProbDistI``
|
||||
"""
|
||||
X = self._vectorizer.transform(featuresets)
|
||||
y_proba_list = self._clf.predict_proba(X)
|
||||
return [self._make_probdist(y_proba) for y_proba in y_proba_list]
|
||||
|
||||
def labels(self):
|
||||
"""The class labels used by this classifier.
|
||||
|
||||
:rtype: list
|
||||
"""
|
||||
return list(self._encoder.classes_)
|
||||
|
||||
def train(self, labeled_featuresets):
|
||||
"""
|
||||
Train (fit) the scikit-learn estimator.
|
||||
|
||||
:param labeled_featuresets: A list of ``(featureset, label)``
|
||||
where each ``featureset`` is a dict mapping strings to either
|
||||
numbers, booleans or strings.
|
||||
"""
|
||||
|
||||
X, y = list(zip(*labeled_featuresets))
|
||||
X = self._vectorizer.fit_transform(X)
|
||||
y = self._encoder.fit_transform(y)
|
||||
self._clf.fit(X, y)
|
||||
|
||||
return self
|
||||
|
||||
def _make_probdist(self, y_proba):
|
||||
classes = self._encoder.classes_
|
||||
return DictionaryProbDist({classes[i]: p for i, p in enumerate(y_proba)})
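# Illustrative sketch: training the wrapper on a few hand-made featuresets and
# classifying new samples. The feature names ("a", "b", "c") and the labels
# ("ham"/"spam") below are invented purely for demonstration.
#
#     >>> from sklearn.svm import LinearSVC  # doctest: +SKIP
#     >>> train_data = [({"a": 4, "b": 1, "c": 0}, "ham"),
#     ...               ({"a": 5, "b": 2, "c": 1}, "ham"),
#     ...               ({"a": 0, "b": 3, "c": 4}, "spam"),
#     ...               ({"a": 5, "b": 1, "c": 1}, "ham"),
#     ...               ({"a": 1, "b": 4, "c": 3}, "spam")]
#     >>> classif = SklearnClassifier(LinearSVC()).train(train_data)  # doctest: +SKIP
#     >>> classif.classify_many([{"a": 3, "b": 2, "c": 1},
#     ...                        {"a": 0, "b": 3, "c": 7}])  # doctest: +SKIP
#     ['ham', 'spam']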
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.naive_bayes import BernoulliNB
|
||||
|
||||
from nltk.classify.util import names_demo, names_demo_features
|
||||
|
||||
# Bernoulli Naive Bayes is designed for binary classification. We set the
|
||||
# binarize option to False since we know we're passing boolean features.
|
||||
print("scikit-learn Naive Bayes:")
|
||||
names_demo(
|
||||
SklearnClassifier(BernoulliNB(binarize=False)).train,
|
||||
features=names_demo_features,
|
||||
)
|
||||
|
||||
# The C parameter on logistic regression (MaxEnt) controls regularization.
|
||||
# The higher it's set, the less regularized the classifier is.
|
||||
print("\n\nscikit-learn logistic regression:")
|
||||
names_demo(
|
||||
SklearnClassifier(LogisticRegression(C=1000)).train,
|
||||
features=names_demo_features,
|
||||
)
|
||||
Backend/venv/lib/python3.12/site-packages/nltk/classify/senna.py
@@ -0,0 +1,175 @@
|
||||
# Natural Language Toolkit: Senna Interface
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A general interface to the SENNA pipeline that supports any of the
|
||||
operations specified in SUPPORTED_OPERATIONS.
|
||||
|
||||
Applying multiple operations at once has a speed advantage. For example,
|
||||
Senna will automatically determine POS tags if you are extracting named
|
||||
entities. Applying both of the operations will cost only the time of
|
||||
extracting the named entities.
|
||||
|
||||
The SENNA pipeline has a fixed maximum size for the sentences that it can read.
By default it is 1024 tokens per sentence. If you have larger sentences, consider
changing the MAX_SENTENCE_SIZE value in SENNA_main.c and rebuilding your
system-specific binary. Otherwise this could introduce misalignment errors.
|
||||
|
||||
The input is:
|
||||
|
||||
- path to the directory that contains the SENNA executables. If the path is incorrect,
  Senna will automatically search for the executable file specified in the SENNA environment variable
- List of the operations to be performed.
- (optionally) the encoding of the input data (default: utf-8)
|
||||
|
||||
Note: Unit tests for this module can be found in test/unit/test_senna.py
|
||||
|
||||
>>> from nltk.classify import Senna
|
||||
>>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner']) # doctest: +SKIP
|
||||
>>> sent = 'Dusseldorf is an international business center'.split()
|
||||
>>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP
|
||||
[('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'),
|
||||
('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
|
||||
"""
|
||||
|
||||
from os import environ, path, sep
|
||||
from platform import architecture, system
|
||||
from subprocess import PIPE, Popen
|
||||
|
||||
from nltk.tag.api import TaggerI
|
||||
|
||||
|
||||
class Senna(TaggerI):
|
||||
SUPPORTED_OPERATIONS = ["pos", "chk", "ner"]
|
||||
|
||||
def __init__(self, senna_path, operations, encoding="utf-8"):
|
||||
self._encoding = encoding
|
||||
self._path = path.normpath(senna_path) + sep
|
||||
|
||||
# Verifies the existence of the executable on the self._path first
|
||||
# senna_binary_file_1 = self.executable(self._path)
|
||||
exe_file_1 = self.executable(self._path)
|
||||
if not path.isfile(exe_file_1):
|
||||
# Check for the system environment
|
||||
if "SENNA" in environ:
|
||||
# self._path = path.join(environ['SENNA'],'')
|
||||
self._path = path.normpath(environ["SENNA"]) + sep
|
||||
exe_file_2 = self.executable(self._path)
|
||||
if not path.isfile(exe_file_2):
|
||||
raise LookupError(
|
||||
"Senna executable expected at %s or %s but not found"
|
||||
% (exe_file_1, exe_file_2)
|
||||
)
|
||||
|
||||
self.operations = operations
|
||||
|
||||
def executable(self, base_path):
|
||||
"""
|
||||
        The function that determines the system-specific binary that should be
        used in the pipeline. If the system is not known, the default senna binary
        will be used.
|
||||
"""
|
||||
os_name = system()
|
||||
if os_name == "Linux":
|
||||
bits = architecture()[0]
|
||||
if bits == "64bit":
|
||||
return path.join(base_path, "senna-linux64")
|
||||
return path.join(base_path, "senna-linux32")
|
||||
if os_name == "Windows":
|
||||
return path.join(base_path, "senna-win32.exe")
|
||||
if os_name == "Darwin":
|
||||
return path.join(base_path, "senna-osx")
|
||||
return path.join(base_path, "senna")
|
||||
|
||||
def _map(self):
|
||||
"""
|
||||
        A method that calculates the order of the columns that the SENNA pipeline
        will output the tags into. The order follows SUPPORTED_OPERATIONS, restricted
        to the operations that were requested.
|
||||
"""
|
||||
_map = {}
|
||||
i = 1
|
||||
for operation in Senna.SUPPORTED_OPERATIONS:
|
||||
if operation in self.operations:
|
||||
_map[operation] = i
|
||||
i += 1
|
||||
return _map
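    # Illustrative example: with operations=["pos", "ner"], the supported
    # operations are scanned in their fixed order ("pos", "chk", "ner"), so
    # _map() returns {"pos": 1, "ner": 2}; indices start at 1 because column 0
    # of Senna's output holds the token itself.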
|
||||
|
||||
def tag(self, tokens):
|
||||
"""
|
||||
Applies the specified operation(s) on a list of tokens.
|
||||
"""
|
||||
return self.tag_sents([tokens])[0]
|
||||
|
||||
def tag_sents(self, sentences):
|
||||
"""
|
||||
        Applies the tag method over a list of sentences. This method will return,
        for each sentence, a list of dictionaries. Every dictionary will contain a
        word with its calculated annotations/tags.
|
||||
"""
|
||||
encoding = self._encoding
|
||||
|
||||
if not path.isfile(self.executable(self._path)):
|
||||
raise LookupError(
|
||||
"Senna executable expected at %s but not found"
|
||||
% self.executable(self._path)
|
||||
)
|
||||
|
||||
# Build the senna command to run the tagger
|
||||
_senna_cmd = [
|
||||
self.executable(self._path),
|
||||
"-path",
|
||||
self._path,
|
||||
"-usrtokens",
|
||||
"-iobtags",
|
||||
]
|
||||
_senna_cmd.extend(["-" + op for op in self.operations])
|
||||
|
||||
# Serialize the actual sentences to a temporary string
|
||||
_input = "\n".join(" ".join(x) for x in sentences) + "\n"
|
||||
if isinstance(_input, str) and encoding:
|
||||
_input = _input.encode(encoding)
|
||||
|
||||
# Run the tagger and get the output
|
||||
p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
|
||||
(stdout, stderr) = p.communicate(input=_input)
|
||||
senna_output = stdout
|
||||
|
||||
# Check the return code.
|
||||
if p.returncode != 0:
|
||||
raise RuntimeError("Senna command failed! Details: %s" % stderr)
|
||||
|
||||
if encoding:
|
||||
senna_output = stdout.decode(encoding)
|
||||
|
||||
# Output the tagged sentences
|
||||
map_ = self._map()
|
||||
tagged_sentences = [[]]
|
||||
sentence_index = 0
|
||||
token_index = 0
|
||||
for tagged_word in senna_output.strip().split("\n"):
|
||||
if not tagged_word:
|
||||
tagged_sentences.append([])
|
||||
sentence_index += 1
|
||||
token_index = 0
|
||||
continue
|
||||
tags = tagged_word.split("\t")
|
||||
result = {}
|
||||
for tag in map_:
|
||||
result[tag] = tags[map_[tag]].strip()
|
||||
try:
|
||||
result["word"] = sentences[sentence_index][token_index]
|
||||
except IndexError as e:
|
||||
raise IndexError(
|
||||
"Misalignment error occurred at sentence number %d. Possible reason"
|
||||
" is that the sentence size exceeded the maximum size. Check the "
|
||||
"documentation of Senna class for more information."
|
||||
% sentence_index
|
||||
) from e
|
||||
tagged_sentences[-1].append(result)
|
||||
token_index += 1
|
||||
return tagged_sentences
|
||||
@@ -0,0 +1,17 @@
|
||||
# Natural Language Toolkit: SVM-based classifier
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Leon Derczynski <leon@dcs.shef.ac.uk>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
nltk.classify.svm was deprecated. For classification based
on support vector machines (SVMs), use nltk.classify.scikitlearn
(or `scikit-learn <https://scikit-learn.org>`_ directly).
"""


class SvmClassifier:
    def __init__(self, *args, **kwargs):
        raise NotImplementedError(__doc__)
|
||||
Backend/venv/lib/python3.12/site-packages/nltk/classify/tadm.py
@@ -0,0 +1,122 @@
|
||||
# Natural Language Toolkit: Interface to TADM Classifier
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Joseph Frazee <jfrazee@mail.utexas.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from nltk.internals import find_binary
|
||||
|
||||
try:
|
||||
import numpy
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
_tadm_bin = None
|
||||
|
||||
|
||||
def config_tadm(bin=None):
|
||||
global _tadm_bin
|
||||
_tadm_bin = find_binary(
|
||||
"tadm", bin, env_vars=["TADM"], binary_names=["tadm"], url="http://tadm.sf.net"
|
||||
)
|
||||
|
||||
|
||||
def write_tadm_file(train_toks, encoding, stream):
|
||||
"""
|
||||
Generate an input file for ``tadm`` based on the given corpus of
|
||||
classified tokens.
|
||||
|
||||
:type train_toks: list(tuple(dict, str))
|
||||
:param train_toks: Training data, represented as a list of
|
||||
pairs, the first member of which is a feature dictionary,
|
||||
and the second of which is a classification label.
|
||||
:type encoding: TadmEventMaxentFeatureEncoding
|
||||
:param encoding: A feature encoding, used to convert featuresets
|
||||
into feature vectors.
|
||||
:type stream: stream
|
||||
:param stream: The stream to which the ``tadm`` input file should be
|
||||
written.
|
||||
"""
|
||||
# See the following for a file format description:
|
||||
#
|
||||
# https://sf.net/forum/forum.php?thread_id=1391502&forum_id=473054
|
||||
# https://sf.net/forum/forum.php?thread_id=1675097&forum_id=473054
|
||||
labels = encoding.labels()
|
||||
for featureset, label in train_toks:
|
||||
length_line = "%d\n" % len(labels)
|
||||
stream.write(length_line)
|
||||
for known_label in labels:
|
||||
v = encoding.encode(featureset, known_label)
|
||||
line = "%d %d %s\n" % (
|
||||
int(label == known_label),
|
||||
len(v),
|
||||
" ".join("%d %d" % u for u in v),
|
||||
)
|
||||
stream.write(line)
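# Illustrative sketch of the emitted format: for an encoding whose labels are
# ["A", "B"], a training token whose true label is "A" and which encodes to
# [(0, 1), (3, 1)] for "A" and [(0, 1), (4, 1)] for "B" (feature ids invented
# for illustration) is written as three lines:
#
#     2
#     1 2 0 1 3 1
#     0 2 0 1 4 1
#
# i.e. the number of labels, then one line per label giving whether it is the
# correct label, the number of active features, and the feature id/value pairs.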
|
||||
|
||||
|
||||
def parse_tadm_weights(paramfile):
|
||||
"""
|
||||
Given the stdout output generated by ``tadm`` when training a
|
||||
model, return a ``numpy`` array containing the corresponding weight
|
||||
vector.
|
||||
"""
|
||||
weights = []
|
||||
for line in paramfile:
|
||||
weights.append(float(line.strip()))
|
||||
return numpy.array(weights, "d")
|
||||
|
||||
|
||||
def call_tadm(args):
|
||||
"""
|
||||
Call the ``tadm`` binary with the given arguments.
|
||||
"""
|
||||
if isinstance(args, str):
|
||||
raise TypeError("args should be a list of strings")
|
||||
if _tadm_bin is None:
|
||||
config_tadm()
|
||||
|
||||
# Call tadm via a subprocess
|
||||
cmd = [_tadm_bin] + args
|
||||
p = subprocess.Popen(cmd, stdout=sys.stdout)
|
||||
(stdout, stderr) = p.communicate()
|
||||
|
||||
# Check the return code.
|
||||
if p.returncode != 0:
|
||||
print()
|
||||
print(stderr)
|
||||
raise OSError("tadm command failed!")
|
||||
|
||||
|
||||
def names_demo():
|
||||
from nltk.classify.maxent import TadmMaxentClassifier
|
||||
from nltk.classify.util import names_demo
|
||||
|
||||
classifier = names_demo(TadmMaxentClassifier.train)
|
||||
|
||||
|
||||
def encoding_demo():
|
||||
import sys
|
||||
|
||||
from nltk.classify.maxent import TadmEventMaxentFeatureEncoding
|
||||
|
||||
tokens = [
|
||||
({"f0": 1, "f1": 1, "f3": 1}, "A"),
|
||||
({"f0": 1, "f2": 1, "f4": 1}, "B"),
|
||||
({"f0": 2, "f2": 1, "f3": 1, "f4": 1}, "A"),
|
||||
]
|
||||
encoding = TadmEventMaxentFeatureEncoding.train(tokens)
|
||||
write_tadm_file(tokens, encoding, sys.stdout)
|
||||
print()
|
||||
for i in range(encoding.length()):
|
||||
print("%s --> %d" % (encoding.describe(i), i))
|
||||
print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
encoding_demo()
|
||||
names_demo()
|
||||
@@ -0,0 +1,193 @@
|
||||
# Natural Language Toolkit: Language ID module using TextCat algorithm
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Avital Pekker <avital.pekker@utoronto.ca>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A module for language identification using the TextCat algorithm.
|
||||
An implementation of the text categorization algorithm
|
||||
presented in Cavnar, W. B. and J. M. Trenkle,
|
||||
"N-Gram-Based Text Categorization".
|
||||
|
||||
The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and the text to be
identified, then compares them using a distance measure.
|
||||
|
||||
Language n-grams are provided by the "An Crubadan"
|
||||
project. A corpus reader was created separately to read
|
||||
those files.
|
||||
|
||||
For details regarding the algorithm, see:
|
||||
https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
|
||||
|
||||
For details about An Crubadan, see:
|
||||
https://borel.slu.edu/crubadan/index.html
|
||||
"""
|
||||
|
||||
from sys import maxsize
|
||||
|
||||
from nltk.util import trigrams
|
||||
|
||||
# Note: this is NOT "re" you're likely used to. The regex module
|
||||
# is an alternative to the standard re module that supports
|
||||
# Unicode codepoint properties with the \p{} syntax.
|
||||
# You may have to "pip install regex"
|
||||
try:
|
||||
import regex as re
|
||||
except ImportError:
|
||||
re = None
|
||||
######################################################################
|
||||
## Language identification using TextCat
|
||||
######################################################################
|
||||
|
||||
|
||||
class TextCat:
|
||||
_corpus = None
|
||||
fingerprints = {}
|
||||
_START_CHAR = "<"
|
||||
_END_CHAR = ">"
|
||||
|
||||
last_distances = {}
|
||||
|
||||
def __init__(self):
|
||||
if not re:
|
||||
raise OSError(
|
||||
"classify.textcat requires the regex module that "
|
||||
"supports unicode. Try '$ pip install regex' and "
|
||||
"see https://pypi.python.org/pypi/regex for "
|
||||
"further details."
|
||||
)
|
||||
|
||||
from nltk.corpus import crubadan
|
||||
|
||||
self._corpus = crubadan
|
||||
# Load all language ngrams into cache
|
||||
for lang in self._corpus.langs():
|
||||
self._corpus.lang_freq(lang)
|
||||
|
||||
def remove_punctuation(self, text):
|
||||
"""Get rid of punctuation except apostrophes"""
|
||||
return re.sub(r"[^\P{P}\']+", "", text)
|
||||
|
||||
def profile(self, text):
|
||||
"""Create FreqDist of trigrams within text"""
|
||||
from nltk import FreqDist, word_tokenize
|
||||
|
||||
clean_text = self.remove_punctuation(text)
|
||||
tokens = word_tokenize(clean_text)
|
||||
|
||||
fingerprint = FreqDist()
|
||||
for t in tokens:
|
||||
token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
|
||||
token_trigrams = ["".join(tri) for tri in token_trigram_tuples]
|
||||
|
||||
for cur_trigram in token_trigrams:
|
||||
if cur_trigram in fingerprint:
|
||||
fingerprint[cur_trigram] += 1
|
||||
else:
|
||||
fingerprint[cur_trigram] = 1
|
||||
|
||||
return fingerprint
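    # Illustrative example of the padding and trigram extraction above: the
    # token "cat" becomes "<cat>", whose character trigrams are "<ca", "cat"
    # and "at>", so each of those keys is incremented in the FreqDist for
    # every occurrence of the token.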
|
||||
|
||||
def calc_dist(self, lang, trigram, text_profile):
|
||||
"""Calculate the "out-of-place" measure between the
|
||||
text and language profile for a single trigram"""
|
||||
|
||||
lang_fd = self._corpus.lang_freq(lang)
|
||||
dist = 0
|
||||
|
||||
if trigram in lang_fd:
|
||||
idx_lang_profile = list(lang_fd.keys()).index(trigram)
|
||||
idx_text = list(text_profile.keys()).index(trigram)
|
||||
|
||||
# print(idx_lang_profile, ", ", idx_text)
|
||||
dist = abs(idx_lang_profile - idx_text)
|
||||
else:
|
||||
# Arbitrary but should be larger than
|
||||
# any possible trigram file length
|
||||
# in terms of total lines
|
||||
dist = maxsize
|
||||
|
||||
return dist
|
||||
|
||||
def lang_dists(self, text):
|
||||
"""Calculate the "out-of-place" measure between
|
||||
the text and all languages"""
|
||||
|
||||
distances = {}
|
||||
profile = self.profile(text)
|
||||
# For all the languages
|
||||
for lang in self._corpus._all_lang_freq.keys():
|
||||
# Calculate distance metric for every trigram in
|
||||
# input text to be identified
|
||||
lang_dist = 0
|
||||
for trigram in profile:
|
||||
lang_dist += self.calc_dist(lang, trigram, profile)
|
||||
|
||||
distances[lang] = lang_dist
|
||||
|
||||
return distances
|
||||
|
||||
def guess_language(self, text):
|
||||
"""Find the language with the min distance
|
||||
to the text and return its ISO 639-3 code"""
|
||||
self.last_distances = self.lang_dists(text)
|
||||
|
||||
return min(self.last_distances, key=self.last_distances.get)
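# Illustrative usage sketch (requires the "crubadan" and "punkt" corpus data to
# be downloaded; the exact code returned depends on the installed profiles):
#
#     >>> tc = TextCat()  # doctest: +SKIP
#     >>> tc.guess_language("Wie geht es dir heute?")  # doctest: +SKIP
#     'deu'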
|
||||
|
||||
|
||||
|
||||
def demo():
|
||||
from nltk.corpus import udhr
|
||||
|
||||
langs = [
|
||||
"Kurdish-UTF8",
|
||||
"Abkhaz-UTF8",
|
||||
"Farsi_Persian-UTF8",
|
||||
"Hindi-UTF8",
|
||||
"Hawaiian-UTF8",
|
||||
"Russian-UTF8",
|
||||
"Vietnamese-UTF8",
|
||||
"Serbian_Srpski-UTF8",
|
||||
"Esperanto-UTF8",
|
||||
]
|
||||
|
||||
friendly = {
|
||||
"kmr": "Northern Kurdish",
|
||||
"abk": "Abkhazian",
|
||||
"pes": "Iranian Persian",
|
||||
"hin": "Hindi",
|
||||
"haw": "Hawaiian",
|
||||
"rus": "Russian",
|
||||
"vie": "Vietnamese",
|
||||
"srp": "Serbian",
|
||||
"epo": "Esperanto",
|
||||
}
|
||||
|
||||
tc = TextCat()
|
||||
|
||||
for cur_lang in langs:
|
||||
# Get raw data from UDHR corpus
|
||||
raw_sentences = udhr.sents(cur_lang)
|
||||
rows = len(raw_sentences) - 1
|
||||
cols = list(map(len, raw_sentences))
|
||||
|
||||
sample = ""
|
||||
|
||||
# Generate a sample text of the language
|
||||
for i in range(0, rows):
|
||||
cur_sent = " " + " ".join([raw_sentences[i][j] for j in range(0, cols[i])])
|
||||
sample += cur_sent
|
||||
|
||||
# Try to detect what it is
|
||||
print("Language snippet: " + sample[0:140] + "...")
|
||||
guess = tc.guess_language(sample)
|
||||
print(f"Language detection: {guess} ({friendly[guess]})")
|
||||
print("#" * 140)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
Backend/venv/lib/python3.12/site-packages/nltk/classify/util.py
@@ -0,0 +1,347 @@
|
||||
# Natural Language Toolkit: Classifier Utility Functions
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Utility functions and classes for classifiers.
|
||||
"""
|
||||
|
||||
import math
|
||||
|
||||
# from nltk.util import Deprecated
|
||||
import nltk.classify.util # for accuracy & log_likelihood
|
||||
from nltk.util import LazyMap
|
||||
|
||||
######################################################################
|
||||
# { Helper Functions
|
||||
######################################################################
|
||||
|
||||
|
||||
# alternative name possibility: 'map_featurefunc()'?
|
||||
# alternative name possibility: 'detect_features()'?
|
||||
# alternative name possibility: 'map_featuredetect()'?
|
||||
# or.. just have users use LazyMap directly?
|
||||
def apply_features(feature_func, toks, labeled=None):
|
||||
"""
|
||||
Use the ``LazyMap`` class to construct a lazy list-like
|
||||
object that is analogous to ``map(feature_func, toks)``. In
|
||||
particular, if ``labeled=False``, then the returned list-like
|
||||
object's values are equal to::
|
||||
|
||||
[feature_func(tok) for tok in toks]
|
||||
|
||||
If ``labeled=True``, then the returned list-like object's values
|
||||
are equal to::
|
||||
|
||||
[(feature_func(tok), label) for (tok, label) in toks]
|
||||
|
||||
The primary purpose of this function is to avoid the memory
|
||||
overhead involved in storing all the featuresets for every token
|
||||
in a corpus. Instead, these featuresets are constructed lazily,
|
||||
as-needed. The reduction in memory overhead can be especially
|
||||
significant when the underlying list of tokens is itself lazy (as
|
||||
is the case with many corpus readers).
|
||||
|
||||
:param feature_func: The function that will be applied to each
|
||||
token. It should return a featureset -- i.e., a dict
|
||||
mapping feature names to feature values.
|
||||
    :param toks: The list of tokens to which ``feature_func`` should be
        applied. If ``labeled=False``, then the list elements will be
        passed directly to ``feature_func()``. If ``labeled=True``,
        then the list elements should be tuples ``(tok, label)``, and
        ``tok`` will be passed to ``feature_func()``.
|
||||
:param labeled: If true, then ``toks`` contains labeled tokens --
|
||||
i.e., tuples of the form ``(tok, label)``. (Default:
|
||||
auto-detect based on types.)
|
||||
"""
|
||||
if labeled is None:
|
||||
labeled = toks and isinstance(toks[0], (tuple, list))
|
||||
if labeled:
|
||||
|
||||
def lazy_func(labeled_token):
|
||||
return (feature_func(labeled_token[0]), labeled_token[1])
|
||||
|
||||
return LazyMap(lazy_func, toks)
|
||||
else:
|
||||
return LazyMap(feature_func, toks)
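# Illustrative sketch: lazily mapping a toy feature detector over labeled
# tokens (the detector below is invented for demonstration):
#
#     >>> def detect(word):
#     ...     return {"len": len(word), "last": word[-1]}
#     >>> lazy = apply_features(detect, [("spam", "bad"), ("eggs", "good")])
#     >>> lazy[0]
#     ({'len': 4, 'last': 'm'}, 'bad')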
|
||||
|
||||
|
||||
def attested_labels(tokens):
|
||||
"""
|
||||
:return: A list of all labels that are attested in the given list
|
||||
of tokens.
|
||||
:rtype: list of (immutable)
|
||||
:param tokens: The list of classified tokens from which to extract
|
||||
labels. A classified token has the form ``(token, label)``.
|
||||
:type tokens: list
|
||||
"""
|
||||
return tuple({label for (tok, label) in tokens})
|
||||
|
||||
|
||||
def log_likelihood(classifier, gold):
|
||||
results = classifier.prob_classify_many([fs for (fs, l) in gold])
|
||||
ll = [pdist.prob(l) for ((fs, l), pdist) in zip(gold, results)]
|
||||
return math.log(sum(ll) / len(ll))
|
||||
|
||||
|
||||
def accuracy(classifier, gold):
|
||||
results = classifier.classify_many([fs for (fs, l) in gold])
|
||||
correct = [l == r for ((fs, l), r) in zip(gold, results)]
|
||||
if correct:
|
||||
return sum(correct) / len(correct)
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
class CutoffChecker:
|
||||
"""
|
||||
A helper class that implements cutoff checks based on number of
|
||||
iterations and log likelihood.
|
||||
|
||||
Accuracy cutoffs are also implemented, but they're almost never
|
||||
a good idea to use.
|
||||
"""
|
||||
|
||||
def __init__(self, cutoffs):
|
||||
        self.cutoffs = cutoffs.copy()
        if "min_ll" in self.cutoffs:
            self.cutoffs["min_ll"] = -abs(self.cutoffs["min_ll"])
        if "min_lldelta" in self.cutoffs:
            self.cutoffs["min_lldelta"] = abs(self.cutoffs["min_lldelta"])
|
||||
self.ll = None
|
||||
self.acc = None
|
||||
self.iter = 1
|
||||
|
||||
def check(self, classifier, train_toks):
|
||||
cutoffs = self.cutoffs
|
||||
self.iter += 1
|
||||
if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
|
||||
return True # iteration cutoff.
|
||||
|
||||
new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
|
||||
if math.isnan(new_ll):
|
||||
return True
|
||||
|
||||
if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
|
||||
if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
|
||||
return True # log likelihood cutoff
|
||||
if (
|
||||
"min_lldelta" in cutoffs
|
||||
and self.ll
|
||||
and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"]))
|
||||
):
|
||||
return True # log likelihood delta cutoff
|
||||
self.ll = new_ll
|
||||
|
||||
if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
|
||||
new_acc = nltk.classify.util.log_likelihood(classifier, train_toks)
|
||||
if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
|
||||
return True # log likelihood cutoff
|
||||
if (
|
||||
"min_accdelta" in cutoffs
|
||||
and self.acc
|
||||
and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"]))
|
||||
):
|
||||
return True # log likelihood delta cutoff
|
||||
self.acc = new_acc
|
||||
|
||||
return False # no cutoff reached.
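# Illustrative sketch of how a trainer might use CutoffChecker in its training
# loop; ``classifier``, ``train_toks`` and ``improve`` are placeholders here:
#
#     >>> checker = CutoffChecker({"max_iter": 100, "min_lldelta": 0.001})  # doctest: +SKIP
#     >>> while not checker.check(classifier, train_toks):  # doctest: +SKIP
#     ...     classifier = improve(classifier, train_toks)  # one more training pass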
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Demos
|
||||
######################################################################
|
||||
|
||||
|
||||
def names_demo_features(name):
|
||||
features = {}
|
||||
features["alwayson"] = True
|
||||
features["startswith"] = name[0].lower()
|
||||
features["endswith"] = name[-1].lower()
|
||||
for letter in "abcdefghijklmnopqrstuvwxyz":
|
||||
features["count(%s)" % letter] = name.lower().count(letter)
|
||||
features["has(%s)" % letter] = letter in name.lower()
|
||||
return features
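# Illustrative example: names_demo_features("Alex") contains, among the
# per-letter entries, {"alwayson": True, "startswith": "a", "endswith": "x",
# "count(a)": 1, "has(a)": True, "count(b)": 0, "has(b)": False, ...}.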
|
||||
|
||||
|
||||
def binary_names_demo_features(name):
|
||||
features = {}
|
||||
features["alwayson"] = True
|
||||
features["startswith(vowel)"] = name[0].lower() in "aeiouy"
|
||||
features["endswith(vowel)"] = name[-1].lower() in "aeiouy"
|
||||
for letter in "abcdefghijklmnopqrstuvwxyz":
|
||||
features["count(%s)" % letter] = name.lower().count(letter)
|
||||
features["has(%s)" % letter] = letter in name.lower()
|
||||
features["startswith(%s)" % letter] = letter == name[0].lower()
|
||||
features["endswith(%s)" % letter] = letter == name[-1].lower()
|
||||
return features
|
||||
|
||||
|
||||
def names_demo(trainer, features=names_demo_features):
|
||||
import random
|
||||
|
||||
from nltk.corpus import names
|
||||
|
||||
# Construct a list of classified names, using the names corpus.
|
||||
namelist = [(name, "male") for name in names.words("male.txt")] + [
|
||||
(name, "female") for name in names.words("female.txt")
|
||||
]
|
||||
|
||||
# Randomly split the names into a test & train set.
|
||||
random.seed(123456)
|
||||
random.shuffle(namelist)
|
||||
train = namelist[:5000]
|
||||
test = namelist[5000:5500]
|
||||
|
||||
# Train up a classifier.
|
||||
print("Training classifier...")
|
||||
classifier = trainer([(features(n), g) for (n, g) in train])
|
||||
|
||||
# Run the classifier on the test data.
|
||||
print("Testing classifier...")
|
||||
acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
|
||||
print("Accuracy: %6.4f" % acc)
|
||||
|
||||
# For classifiers that can find probabilities, show the log
|
||||
# likelihood and some sample probability distributions.
|
||||
try:
|
||||
test_featuresets = [features(n) for (n, g) in test]
|
||||
pdists = classifier.prob_classify_many(test_featuresets)
|
||||
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
|
||||
print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
|
||||
print()
|
||||
print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
|
||||
for (name, gender), pdist in list(zip(test, pdists))[:5]:
|
||||
if gender == "male":
|
||||
fmt = " %-15s *%6.4f %6.4f"
|
||||
else:
|
||||
fmt = " %-15s %6.4f *%6.4f"
|
||||
print(fmt % (name, pdist.prob("male"), pdist.prob("female")))
|
||||
except NotImplementedError:
|
||||
pass
|
||||
|
||||
# Return the classifier
|
||||
return classifier
|
||||
|
||||
|
||||
def partial_names_demo(trainer, features=names_demo_features):
|
||||
import random
|
||||
|
||||
from nltk.corpus import names
|
||||
|
||||
male_names = names.words("male.txt")
|
||||
female_names = names.words("female.txt")
|
||||
|
||||
random.seed(654321)
|
||||
random.shuffle(male_names)
|
||||
random.shuffle(female_names)
|
||||
|
||||
# Create a list of male names to be used as positive-labeled examples for training
|
||||
positive = map(features, male_names[:2000])
|
||||
|
||||
# Create a list of male and female names to be used as unlabeled examples
|
||||
unlabeled = map(features, male_names[2000:2500] + female_names[:500])
|
||||
|
||||
# Create a test set with correctly-labeled male and female names
|
||||
test = [(name, True) for name in male_names[2500:2750]] + [
|
||||
(name, False) for name in female_names[500:750]
|
||||
]
|
||||
|
||||
random.shuffle(test)
|
||||
|
||||
# Train up a classifier.
|
||||
print("Training classifier...")
|
||||
classifier = trainer(positive, unlabeled)
|
||||
|
||||
# Run the classifier on the test data.
|
||||
print("Testing classifier...")
|
||||
acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
|
||||
print("Accuracy: %6.4f" % acc)
|
||||
|
||||
# For classifiers that can find probabilities, show the log
|
||||
# likelihood and some sample probability distributions.
|
||||
try:
|
||||
test_featuresets = [features(n) for (n, m) in test]
|
||||
pdists = classifier.prob_classify_many(test_featuresets)
|
||||
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
|
||||
print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
|
||||
print()
|
||||
print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
|
||||
        for (name, is_male), pdist in list(zip(test, pdists))[:5]:
            if is_male:
|
||||
fmt = " %-15s *%6.4f %6.4f"
|
||||
else:
|
||||
fmt = " %-15s %6.4f *%6.4f"
|
||||
print(fmt % (name, pdist.prob(True), pdist.prob(False)))
|
||||
except NotImplementedError:
|
||||
pass
|
||||
|
||||
# Return the classifier
|
||||
return classifier
|
||||
|
||||
|
||||
_inst_cache = {}
|
||||
|
||||
|
||||
def wsd_demo(trainer, word, features, n=1000):
|
||||
import random
|
||||
|
||||
from nltk.corpus import senseval
|
||||
|
||||
# Get the instances.
|
||||
print("Reading data...")
|
||||
global _inst_cache
|
||||
if word not in _inst_cache:
|
||||
_inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
|
||||
instances = _inst_cache[word][:]
|
||||
if n > len(instances):
|
||||
n = len(instances)
|
||||
senses = list({l for (i, l) in instances})
|
||||
print(" Senses: " + " ".join(senses))
|
||||
|
||||
# Randomly split the names into a test & train set.
|
||||
print("Splitting into test & train...")
|
||||
random.seed(123456)
|
||||
random.shuffle(instances)
|
||||
train = instances[: int(0.8 * n)]
|
||||
test = instances[int(0.8 * n) : n]
|
||||
|
||||
# Train up a classifier.
|
||||
print("Training classifier...")
|
||||
classifier = trainer([(features(i), l) for (i, l) in train])
|
||||
|
||||
# Run the classifier on the test data.
|
||||
print("Testing classifier...")
|
||||
acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
|
||||
print("Accuracy: %6.4f" % acc)
|
||||
|
||||
# For classifiers that can find probabilities, show the log
|
||||
# likelihood and some sample probability distributions.
|
||||
try:
|
||||
test_featuresets = [features(i) for (i, n) in test]
|
||||
pdists = classifier.prob_classify_many(test_featuresets)
|
||||
ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
|
||||
print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
|
||||
except NotImplementedError:
|
||||
pass
|
||||
|
||||
# Return the classifier
|
||||
return classifier
|
||||
|
||||
|
||||
def check_megam_config():
|
||||
"""
|
||||
Checks whether the MEGAM binary is configured.
|
||||
"""
|
||||
try:
|
||||
_megam_bin
|
||||
except NameError as e:
|
||||
err_msg = str(
|
||||
"Please configure your megam binary first, e.g.\n"
|
||||
">>> nltk.config_megam('/usr/bin/local/megam')"
|
||||
)
|
||||
raise NameError(err_msg) from e
|
||||
Backend/venv/lib/python3.12/site-packages/nltk/classify/weka.py
@@ -0,0 +1,377 @@
|
||||
# Natural Language Toolkit: Interface to Weka Classifiers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Classifiers that make use of the external 'Weka' package.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
import zipfile
|
||||
from sys import stdin
|
||||
|
||||
from nltk.classify.api import ClassifierI
|
||||
from nltk.internals import config_java, java
|
||||
from nltk.probability import DictionaryProbDist
|
||||
|
||||
_weka_classpath = None
|
||||
_weka_search = [
|
||||
".",
|
||||
"/usr/share/weka",
|
||||
"/usr/local/share/weka",
|
||||
"/usr/lib/weka",
|
||||
"/usr/local/lib/weka",
|
||||
]
|
||||
|
||||
|
||||
def config_weka(classpath=None):
|
||||
global _weka_classpath
|
||||
|
||||
# Make sure java's configured first.
|
||||
config_java()
|
||||
|
||||
if classpath is not None:
|
||||
_weka_classpath = classpath
|
||||
|
||||
if _weka_classpath is None:
|
||||
searchpath = _weka_search
|
||||
if "WEKAHOME" in os.environ:
|
||||
searchpath.insert(0, os.environ["WEKAHOME"])
|
||||
|
||||
for path in searchpath:
|
||||
if os.path.exists(os.path.join(path, "weka.jar")):
|
||||
_weka_classpath = os.path.join(path, "weka.jar")
|
||||
version = _check_weka_version(_weka_classpath)
|
||||
if version:
|
||||
print(f"[Found Weka: {_weka_classpath} (version {version})]")
|
||||
else:
|
||||
print("[Found Weka: %s]" % _weka_classpath)
|
||||
_check_weka_version(_weka_classpath)
|
||||
|
||||
if _weka_classpath is None:
|
||||
raise LookupError(
|
||||
"Unable to find weka.jar! Use config_weka() "
|
||||
"or set the WEKAHOME environment variable. "
|
||||
"For more information about Weka, please see "
|
||||
"https://www.cs.waikato.ac.nz/ml/weka/"
|
||||
)
|
||||
|
||||
|
||||
def _check_weka_version(jar):
|
||||
try:
|
||||
zf = zipfile.ZipFile(jar)
|
||||
except (SystemExit, KeyboardInterrupt):
|
||||
raise
|
||||
except:
|
||||
return None
|
||||
try:
|
||||
try:
|
||||
return zf.read("weka/core/version.txt")
|
||||
except KeyError:
|
||||
return None
|
||||
finally:
|
||||
zf.close()
|
||||
|
||||
|
||||
class WekaClassifier(ClassifierI):
|
||||
def __init__(self, formatter, model_filename):
|
||||
self._formatter = formatter
|
||||
self._model = model_filename
|
||||
|
||||
def prob_classify_many(self, featuresets):
|
||||
return self._classify_many(featuresets, ["-p", "0", "-distribution"])
|
||||
|
||||
def classify_many(self, featuresets):
|
||||
return self._classify_many(featuresets, ["-p", "0"])
|
||||
|
||||
def _classify_many(self, featuresets, options):
|
||||
# Make sure we can find java & weka.
|
||||
config_weka()
|
||||
|
||||
temp_dir = tempfile.mkdtemp()
|
||||
try:
|
||||
# Write the test data file.
|
||||
test_filename = os.path.join(temp_dir, "test.arff")
|
||||
self._formatter.write(test_filename, featuresets)
|
||||
|
||||
# Call weka to classify the data.
|
||||
cmd = [
|
||||
"weka.classifiers.bayes.NaiveBayes",
|
||||
"-l",
|
||||
self._model,
|
||||
"-T",
|
||||
test_filename,
|
||||
] + options
|
||||
(stdout, stderr) = java(
|
||||
cmd,
|
||||
classpath=_weka_classpath,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
|
||||
# Check if something went wrong:
|
||||
if stderr and not stdout:
|
||||
if "Illegal options: -distribution" in stderr:
|
||||
raise ValueError(
|
||||
"The installed version of weka does "
|
||||
"not support probability distribution "
|
||||
"output."
|
||||
)
|
||||
else:
|
||||
raise ValueError("Weka failed to generate output:\n%s" % stderr)
|
||||
|
||||
# Parse weka's output.
|
||||
return self.parse_weka_output(stdout.decode(stdin.encoding).split("\n"))
|
||||
|
||||
finally:
|
||||
for f in os.listdir(temp_dir):
|
||||
os.remove(os.path.join(temp_dir, f))
|
||||
os.rmdir(temp_dir)
|
||||
|
||||
def parse_weka_distribution(self, s):
|
||||
probs = [float(v) for v in re.split("[*,]+", s) if v.strip()]
|
||||
probs = dict(zip(self._formatter.labels(), probs))
|
||||
return DictionaryProbDist(probs)
|
||||
|
||||
def parse_weka_output(self, lines):
|
||||
# Strip unwanted text from stdout
|
||||
for i, line in enumerate(lines):
|
||||
if line.strip().startswith("inst#"):
|
||||
lines = lines[i:]
|
||||
break
|
||||
|
||||
if lines[0].split() == ["inst#", "actual", "predicted", "error", "prediction"]:
|
||||
return [line.split()[2].split(":")[1] for line in lines[1:] if line.strip()]
|
||||
elif lines[0].split() == [
|
||||
"inst#",
|
||||
"actual",
|
||||
"predicted",
|
||||
"error",
|
||||
"distribution",
|
||||
]:
|
||||
return [
|
||||
self.parse_weka_distribution(line.split()[-1])
|
||||
for line in lines[1:]
|
||||
if line.strip()
|
||||
]
|
||||
|
||||
# is this safe:?
|
||||
elif re.match(r"^0 \w+ [01]\.[0-9]* \?\s*$", lines[0]):
|
||||
return [line.split()[1] for line in lines if line.strip()]
|
||||
|
||||
else:
|
||||
for line in lines[:10]:
|
||||
print(line)
|
||||
raise ValueError(
|
||||
"Unhandled output format -- your version "
|
||||
"of weka may not be supported.\n"
|
||||
" Header: %s" % lines[0]
|
||||
)
|
||||
|
||||
# [xx] full list of classifiers (some may be abstract?):
|
||||
# ADTree, AODE, BayesNet, ComplementNaiveBayes, ConjunctiveRule,
|
||||
# DecisionStump, DecisionTable, HyperPipes, IB1, IBk, Id3, J48,
|
||||
# JRip, KStar, LBR, LeastMedSq, LinearRegression, LMT, Logistic,
|
||||
# LogisticBase, M5Base, MultilayerPerceptron,
|
||||
# MultipleClassifiersCombiner, NaiveBayes, NaiveBayesMultinomial,
|
||||
# NaiveBayesSimple, NBTree, NNge, OneR, PaceRegression, PART,
|
||||
# PreConstructedLinearModel, Prism, RandomForest,
|
||||
# RandomizableClassifier, RandomTree, RBFNetwork, REPTree, Ridor,
|
||||
# RuleNode, SimpleLinearRegression, SimpleLogistic,
|
||||
# SingleClassifierEnhancer, SMO, SMOreg, UserClassifier, VFI,
|
||||
# VotedPerceptron, Winnow, ZeroR
|
||||
|
||||
_CLASSIFIER_CLASS = {
|
||||
"naivebayes": "weka.classifiers.bayes.NaiveBayes",
|
||||
"C4.5": "weka.classifiers.trees.J48",
|
||||
"log_regression": "weka.classifiers.functions.Logistic",
|
||||
"svm": "weka.classifiers.functions.SMO",
|
||||
"kstar": "weka.classifiers.lazy.KStar",
|
||||
"ripper": "weka.classifiers.rules.JRip",
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def train(
|
||||
cls,
|
||||
model_filename,
|
||||
featuresets,
|
||||
classifier="naivebayes",
|
||||
options=[],
|
||||
quiet=True,
|
||||
):
|
||||
# Make sure we can find java & weka.
|
||||
config_weka()
|
||||
|
||||
# Build an ARFF formatter.
|
||||
formatter = ARFF_Formatter.from_train(featuresets)
|
||||
|
||||
temp_dir = tempfile.mkdtemp()
|
||||
try:
|
||||
# Write the training data file.
|
||||
train_filename = os.path.join(temp_dir, "train.arff")
|
||||
formatter.write(train_filename, featuresets)
|
||||
|
||||
if classifier in cls._CLASSIFIER_CLASS:
|
||||
javaclass = cls._CLASSIFIER_CLASS[classifier]
|
||||
elif classifier in cls._CLASSIFIER_CLASS.values():
|
||||
javaclass = classifier
|
||||
else:
|
||||
raise ValueError("Unknown classifier %s" % classifier)
|
||||
|
||||
# Train the weka model.
|
||||
cmd = [javaclass, "-d", model_filename, "-t", train_filename]
|
||||
cmd += list(options)
|
||||
if quiet:
|
||||
stdout = subprocess.PIPE
|
||||
else:
|
||||
stdout = None
|
||||
java(cmd, classpath=_weka_classpath, stdout=stdout)
|
||||
|
||||
# Return the new classifier.
|
||||
return WekaClassifier(formatter, model_filename)
|
||||
|
||||
finally:
|
||||
for f in os.listdir(temp_dir):
|
||||
os.remove(os.path.join(temp_dir, f))
|
||||
os.rmdir(temp_dir)
|
||||
|
||||
|
||||
class ARFF_Formatter:
|
||||
"""
|
||||
Converts featuresets and labeled featuresets to ARFF-formatted
|
||||
strings, appropriate for input into Weka.
|
||||
|
||||
Features and classes can be specified manually in the constructor, or may
|
||||
be determined from data using ``from_train``.
|
||||
"""
|
||||
|
||||
def __init__(self, labels, features):
|
||||
"""
|
||||
:param labels: A list of all class labels that can be generated.
|
||||
:param features: A list of feature specifications, where
|
||||
each feature specification is a tuple (fname, ftype);
|
||||
and ftype is an ARFF type string such as NUMERIC or
|
||||
STRING.
|
||||
"""
|
||||
self._labels = labels
|
||||
self._features = features
|
||||
|
||||
def format(self, tokens):
|
||||
"""Returns a string representation of ARFF output for the given data."""
|
||||
return self.header_section() + self.data_section(tokens)
|
||||
|
||||
def labels(self):
|
||||
"""Returns the list of classes."""
|
||||
return list(self._labels)
|
||||
|
||||
def write(self, outfile, tokens):
|
||||
"""Writes ARFF data to a file for the given data."""
|
||||
if not hasattr(outfile, "write"):
|
||||
outfile = open(outfile, "w")
|
||||
outfile.write(self.format(tokens))
|
||||
outfile.close()
|
||||
|
||||
@staticmethod
|
||||
def from_train(tokens):
|
||||
"""
|
||||
Constructs an ARFF_Formatter instance with class labels and feature
|
||||
types determined from the given data. Handles boolean, numeric and
|
||||
string (note: not nominal) types.
|
||||
"""
|
||||
# Find the set of all attested labels.
|
||||
labels = {label for (tok, label) in tokens}
|
||||
|
||||
# Determine the types of all features.
|
||||
features = {}
|
||||
for tok, label in tokens:
|
||||
for fname, fval in tok.items():
|
||||
if issubclass(type(fval), bool):
|
||||
ftype = "{True, False}"
|
||||
elif issubclass(type(fval), (int, float, bool)):
|
||||
ftype = "NUMERIC"
|
||||
elif issubclass(type(fval), str):
|
||||
ftype = "STRING"
|
||||
elif fval is None:
|
||||
continue # can't tell the type.
|
||||
else:
|
||||
raise ValueError("Unsupported value type %r" % ftype)
|
||||
|
||||
if features.get(fname, ftype) != ftype:
|
||||
raise ValueError("Inconsistent type for %s" % fname)
|
||||
features[fname] = ftype
|
||||
features = sorted(features.items())
|
||||
|
||||
return ARFF_Formatter(labels, features)
|
||||
|
||||
def header_section(self):
|
||||
"""Returns an ARFF header as a string."""
|
||||
# Header comment.
|
||||
s = (
|
||||
"% Weka ARFF file\n"
|
||||
+ "% Generated automatically by NLTK\n"
|
||||
+ "%% %s\n\n" % time.ctime()
|
||||
)
|
||||
|
||||
# Relation name
|
||||
s += "@RELATION rel\n\n"
|
||||
|
||||
# Input attribute specifications
|
||||
for fname, ftype in self._features:
|
||||
s += "@ATTRIBUTE %-30r %s\n" % (fname, ftype)
|
||||
|
||||
# Label attribute specification
|
||||
s += "@ATTRIBUTE %-30r {%s}\n" % ("-label-", ",".join(self._labels))
|
||||
|
||||
return s
|
||||
|
||||
def data_section(self, tokens, labeled=None):
|
||||
"""
|
||||
Returns the ARFF data section for the given data.
|
||||
|
||||
:param tokens: a list of featuresets (dicts) or labelled featuresets
|
||||
which are tuples (featureset, label).
|
||||
:param labeled: Indicates whether the given tokens are labeled
|
||||
or not. If None, then the tokens will be assumed to be
|
||||
labeled if the first token's value is a tuple or list.
|
||||
"""
|
||||
# Check if the tokens are labeled or unlabeled. If unlabeled,
|
||||
# then use 'None'
|
||||
if labeled is None:
|
||||
labeled = tokens and isinstance(tokens[0], (tuple, list))
|
||||
if not labeled:
|
||||
tokens = [(tok, None) for tok in tokens]
|
||||
|
||||
# Data section
|
||||
s = "\n@DATA\n"
|
||||
for tok, label in tokens:
|
||||
for fname, ftype in self._features:
|
||||
s += "%s," % self._fmt_arff_val(tok.get(fname))
|
||||
s += "%s\n" % self._fmt_arff_val(label)
|
||||
|
||||
return s
|
||||
|
||||
def _fmt_arff_val(self, fval):
|
||||
if fval is None:
|
||||
return "?"
|
||||
elif isinstance(fval, (bool, int)):
|
||||
return "%s" % fval
|
||||
elif isinstance(fval, float):
|
||||
return "%r" % fval
|
||||
else:
|
||||
return "%r" % fval
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from nltk.classify.util import binary_names_demo_features, names_demo
|
||||
|
||||
def make_classifier(featuresets):
|
||||
return WekaClassifier.train("/tmp/name.model", featuresets, "C4.5")
|
||||
|
||||
classifier = names_demo(make_classifier, binary_names_demo_features)
|
||||