# Natural Language Toolkit: Naive Bayes Classifiers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
A classifier based on the Naive Bayes algorithm.  In order to find the
probability for a label, this algorithm first uses the Bayes rule to
express P(label|features) in terms of P(label) and P(features|label):

|                       P(label) * P(features|label)
|  P(label|features) = ------------------------------
|                              P(features)

The algorithm then makes the 'naive' assumption that all features are
independent, given the label:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                                   P(features)

Rather than computing P(features) explicitly, the algorithm just
calculates the numerator for each label, and normalizes them so they
sum to one:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                       SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
"""

from collections import defaultdict

from nltk.classify.api import ClassifierI
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs

##//////////////////////////////////////////////////////
## Naive Bayes Classifier
##//////////////////////////////////////////////////////


class NaiveBayesClassifier(ClassifierI):
    """
    A Naive Bayes classifier.  Naive Bayes classifiers are
    parameterized by two probability distributions:

      - P(label) gives the probability that an input will receive each
        label, given no information about the input's features.

      - P(fname=fval|label) gives the probability that a given feature
        (fname) will receive a given value (fval), given the label
        (label).

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
    """

    def __init__(self, label_probdist, feature_probdist):
        """
        :param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a ``ProbDistI`` whose
            samples are labels.  I.e., P(label) =
            ``label_probdist.prob(label)``.

        :param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are ``(label, fname)``
            pairs and whose values are ``ProbDistI`` objects over feature
            values.  I.e., P(fname=fval|label) =
            ``feature_probdist[label,fname].prob(fval)``.  If a given
            ``(label,fname)`` is not a key in ``feature_probdist``, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of ``fval``.
        """
        self._label_probdist = label_probdist
        self._feature_probdist = feature_probdist
        self._labels = list(label_probdist.samples())
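    # A hypothetical construction by hand (the counts and feature name below
    # are illustrative only; in practice both distributions are built by
    # NaiveBayesClassifier.train()):
    #
    #     label_probdist = ELEProbDist(FreqDist({'spam': 60, 'ham': 40}))
    #     feature_probdist = {
    #         ('spam', 'contains(offer)'): ELEProbDist(
    #             FreqDist({True: 50, None: 10}), bins=2),
    #         ('ham', 'contains(offer)'): ELEProbDist(
    #             FreqDist({True: 5, None: 35}), bins=2),
    #     }
    #     classifier = NaiveBayesClassifier(label_probdist, feature_probdist)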

    def labels(self):
        return self._labels

    def classify(self, featureset):
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        # Discard any feature names that we've never seen before.
        # Otherwise, we'll just assign a probability of 0 to
        # everything.
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                # print('Ignoring unseen feature %s' % fname)
                del featureset[fname]

        # Find the log probability of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        # Then add in the log probability of features given labels.
        for label in self._labels:
            for fname, fval in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label, fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # nb: This case will never come up if the
                    # classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([])  # = -INF.

        return DictionaryProbDist(logprob, normalize=True, log=True)
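    # Sketch of reading per-label probabilities out of the returned
    # ``DictionaryProbDist`` (assumes a trained classifier ``classifier`` and a
    # featureset ``feats``; illustrative, not part of the original module):
    #
    #     dist = classifier.prob_classify(feats)
    #     for label in sorted(dist.samples(), key=dist.prob, reverse=True):
    #         print(label, dist.prob(label))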

    def show_most_informative_features(self, n=10):
        # Determine the most relevant features, and display them.
        cpdist = self._feature_probdist
        print("Most Informative Features")

        for fname, fval in self.most_informative_features(n):

            def labelprob(l):
                return cpdist[l, fname].prob(fval)

            labels = sorted(
                (l for l in self._labels if fval in cpdist[l, fname].samples()),
                key=lambda element: (-labelprob(element), element),
                reverse=True,
            )
            if len(labels) == 1:
                continue
            l0 = labels[0]
            l1 = labels[-1]
            if cpdist[l0, fname].prob(fval) == 0:
                ratio = "INF"
            else:
                ratio = "%8.1f" % (
                    cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
                )
            print(
                "%24s = %-14r %6s : %-6s = %s : 1.0"
                % (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
            )

    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature ``(fname,fval)`` is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label:

        |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        """
        if hasattr(self, "_most_informative_features"):
            return self._most_informative_features[:n]
        else:
            # The set of (fname, fval) pairs used by this classifier.
            features = set()
            # The max & min probability associated w/ each (fname, fval)
            # pair.  Maps (fname,fval) -> float.
            maxprob = defaultdict(float)
            minprob = defaultdict(lambda: 1.0)

            for (label, fname), probdist in self._feature_probdist.items():
                for fval in probdist.samples():
                    feature = (fname, fval)
                    features.add(feature)
                    p = probdist.prob(fval)
                    maxprob[feature] = max(p, maxprob[feature])
                    minprob[feature] = min(p, minprob[feature])
                    if minprob[feature] == 0:
                        features.discard(feature)

            # Convert features to a list, & sort it by how informative
            # features are.
            self._most_informative_features = sorted(
                features,
                key=lambda feature_: (
                    minprob[feature_] / maxprob[feature_],
                    feature_[0],
                    feature_[1] in [None, False, True],
                    str(feature_[1]).lower(),
                ),
            )
            return self._most_informative_features[:n]

    @classmethod
    def train(cls, labeled_featuresets, estimator=ELEProbDist):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and featurename.
        for featureset, label in labeled_featuresets:
            label_freqdist[label] += 1
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname][fval] += 1
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None.'  This loop
        # counts up the number of 'missing' feature values for each
        # (label,fname) pair, and increments the count of the fval
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                # Only add a None key when necessary, i.e. if there are
                # any samples with feature 'fname' missing.
                if num_samples - count > 0:
                    feature_freqdist[label, fname][None] += num_samples - count
                    feature_values[fname].add(None)
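        # For instance (made-up counts): if label 'spam' has 60 training
        # instances but feature_freqdist['spam', 'contains(offer)'] only
        # covers 50 of them, the loop above adds 10 counts of None for
        # ('spam', 'contains(offer)').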

        # Create the P(label) distribution
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distribution
        feature_probdist = {}
        for (label, fname), freqdist in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label, fname] = probdist

        return cls(label_probdist, feature_probdist)
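    # A minimal usage sketch for train() (toy data, not part of the original
    # module):
    #
    #     train_set = [({'last_letter': 'a'}, 'female'),
    #                  ({'last_letter': 'k'}, 'male')]
    #     classifier = NaiveBayesClassifier.train(train_set)
    #     classifier.classify({'last_letter': 'a'})   # -> 'female'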


##//////////////////////////////////////////////////////
## Demo
##//////////////////////////////////////////////////////


def demo():
    from nltk.classify.util import names_demo

    classifier = names_demo(NaiveBayesClassifier.train)
    classifier.show_most_informative_features()


if __name__ == "__main__":
    demo()