# Natural Language Toolkit: Naive Bayes Classifiers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
A classifier based on the Naive Bayes algorithm.  In order to find the
probability for a label, this algorithm first uses the Bayes rule to
express P(label|features) in terms of P(label) and P(features|label):

|                       P(label) * P(features|label)
|  P(label|features) = ------------------------------
|                              P(features)

The algorithm then makes the 'naive' assumption that all features are
independent, given the label:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                                   P(features)

Rather than computing P(features) explicitly, the algorithm just
calculates the numerator for each label, and normalizes them so they
sum to one:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                       SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
"""

from collections import defaultdict

from nltk.classify.api import ClassifierI
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs

##//////////////////////////////////////////////////////
## Naive Bayes Classifier
##//////////////////////////////////////////////////////


class NaiveBayesClassifier(ClassifierI):
    """
    A Naive Bayes classifier.  Naive Bayes classifiers are
    parameterized by two probability distributions:

      - P(label) gives the probability that an input will receive each
        label, given no information about the input's features.

      - P(fname=fval|label) gives the probability that a given feature
        (fname) will receive a given value (fval), given the label
        (label).

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
    """

    def __init__(self, label_probdist, feature_probdist):
        """
        :param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a ``ProbDistI`` whose
            samples are labels.  I.e., P(label) =
            ``label_probdist.prob(label)``.

        :param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are ``(label, fname)``
            pairs and whose values are ``ProbDistI`` objects over feature
            values.  I.e., P(fname=fval|label) =
            ``feature_probdist[label,fname].prob(fval)``.  If a given
            ``(label,fname)`` is not a key in ``feature_probdist``, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of ``fval``.
        """
        self._label_probdist = label_probdist
        self._feature_probdist = feature_probdist
        self._labels = list(label_probdist.samples())
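    # A hypothetical construction by hand (the counts and feature name below
    # are illustrative only; in practice both distributions are built by
    # NaiveBayesClassifier.train()):
    #
    #     label_probdist = ELEProbDist(FreqDist({'spam': 60, 'ham': 40}))
    #     feature_probdist = {
    #         ('spam', 'contains(offer)'): ELEProbDist(
    #             FreqDist({True: 50, None: 10}), bins=2),
    #         ('ham', 'contains(offer)'): ELEProbDist(
    #             FreqDist({True: 5, None: 35}), bins=2),
    #     }
    #     classifier = NaiveBayesClassifier(label_probdist, feature_probdist)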

    def labels(self):
        return self._labels

    def classify(self, featureset):
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        # Discard any feature names that we've never seen before.
        # Otherwise, we'll just assign a probability of 0 to
        # everything.
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                # print('Ignoring unseen feature %s' % fname)
                del featureset[fname]

        # Find the log probability of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        # Then add in the log probability of features given labels.
        for label in self._labels:
            for fname, fval in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label, fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # nb: This case will never come up if the
                    # classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([])  # = -INF.

        return DictionaryProbDist(logprob, normalize=True, log=True)
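    # Sketch of reading per-label probabilities out of the returned
    # ``DictionaryProbDist`` (assumes a trained classifier ``classifier`` and a
    # featureset ``feats``; illustrative, not part of the original module):
    #
    #     dist = classifier.prob_classify(feats)
    #     for label in sorted(dist.samples(), key=dist.prob, reverse=True):
    #         print(label, dist.prob(label))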

    def show_most_informative_features(self, n=10):
        # Determine the most relevant features, and display them.
        cpdist = self._feature_probdist
        print("Most Informative Features")

        for fname, fval in self.most_informative_features(n):

            def labelprob(l):
                return cpdist[l, fname].prob(fval)

            labels = sorted(
                (l for l in self._labels if fval in cpdist[l, fname].samples()),
                key=lambda element: (-labelprob(element), element),
                reverse=True,
            )
            if len(labels) == 1:
                continue
            l0 = labels[0]
            l1 = labels[-1]
            if cpdist[l0, fname].prob(fval) == 0:
                ratio = "INF"
            else:
                ratio = "%8.1f" % (
                    cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
                )
            print(
                "%24s = %-14r %6s : %-6s = %s : 1.0"
                % (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
            )

    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature ``(fname,fval)`` is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label:

        |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        """
        if hasattr(self, "_most_informative_features"):
            return self._most_informative_features[:n]
        else:
            # The set of (fname, fval) pairs used by this classifier.
            features = set()
            # The max & min probability associated w/ each (fname, fval)
            # pair.  Maps (fname,fval) -> float.
            maxprob = defaultdict(float)
            minprob = defaultdict(lambda: 1.0)

            for (label, fname), probdist in self._feature_probdist.items():
                for fval in probdist.samples():
                    feature = (fname, fval)
                    features.add(feature)
                    p = probdist.prob(fval)
                    maxprob[feature] = max(p, maxprob[feature])
                    minprob[feature] = min(p, minprob[feature])
                    if minprob[feature] == 0:
                        features.discard(feature)

            # Convert features to a list, & sort it by how informative
            # features are.
            self._most_informative_features = sorted(
                features,
                key=lambda feature_: (
                    minprob[feature_] / maxprob[feature_],
                    feature_[0],
                    feature_[1] in [None, False, True],
                    str(feature_[1]).lower(),
                ),
            )
            return self._most_informative_features[:n]

    @classmethod
    def train(cls, labeled_featuresets, estimator=ELEProbDist):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and featurename.
        for featureset, label in labeled_featuresets:
            label_freqdist[label] += 1
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname][fval] += 1
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None.'  This loop
        # counts up the number of 'missing' feature values for each
        # (label,fname) pair, and increments the count of the fval
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                # Only add a None key when necessary, i.e. if there are
                # any samples with feature 'fname' missing.
                if num_samples - count > 0:
                    feature_freqdist[label, fname][None] += num_samples - count
                    feature_values[fname].add(None)
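        # For instance (made-up counts): if label 'spam' has 60 training
        # instances but feature_freqdist['spam', 'contains(offer)'] only
        # covers 50 of them, the loop above adds 10 counts of None for
        # ('spam', 'contains(offer)').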

        # Create the P(label) distribution
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distribution
        feature_probdist = {}
        for (label, fname), freqdist in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label, fname] = probdist

        return cls(label_probdist, feature_probdist)
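    # A minimal usage sketch for train() (toy data, not part of the original
    # module):
    #
    #     train_set = [({'last_letter': 'a'}, 'female'),
    #                  ({'last_letter': 'k'}, 'male')]
    #     classifier = NaiveBayesClassifier.train(train_set)
    #     classifier.classify({'last_letter': 'a'})   # -> 'female'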


##//////////////////////////////////////////////////////
## Demo
##//////////////////////////////////////////////////////


def demo():
    from nltk.classify.util import names_demo

    classifier = names_demo(NaiveBayesClassifier.train)
    classifier.show_most_informative_features()


if __name__ == "__main__":
    demo()