updates

2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions
--- a/Backend/venv/lib/python3.12/site-packages/nltk/classify/scikitlearn.py
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/classify/scikitlearn.py
@@ -0,0 +1,143 @@
+# Natural Language Toolkit: Interface to scikit-learn classifiers
+#
+# Author: Lars Buitinck <L.J.Buitinck@uva.nl>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+"""
+scikit-learn (https://scikit-learn.org) is a machine learning library for
+Python. It supports many classification algorithms, including SVMs,
+Naive Bayes, logistic regression (MaxEnt) and decision trees.
+
+This module implements a wrapper around scikit-learn classifiers. To use this
+wrapper, construct a scikit-learn estimator object, then use that to construct
+a SklearnClassifier. E.g., to wrap a linear SVM with default settings:
+
+>>> from sklearn.svm import LinearSVC
+>>> from nltk.classify.scikitlearn import SklearnClassifier
+>>> classif = SklearnClassifier(LinearSVC())
+
+A scikit-learn classifier may include preprocessing steps when it's wrapped
+in a Pipeline object. The following constructs and wraps a Naive Bayes text
+classifier with tf-idf weighting and chi-square feature selection to get the
+best 1000 features:
+
+>>> from sklearn.feature_extraction.text import TfidfTransformer
+>>> from sklearn.feature_selection import SelectKBest, chi2
+>>> from sklearn.naive_bayes import MultinomialNB
+>>> from sklearn.pipeline import Pipeline
+>>> pipeline = Pipeline([('tfidf', TfidfTransformer()),
+...                      ('chi2', SelectKBest(chi2, k=1000)),
+...                      ('nb', MultinomialNB())])
+>>> classif = SklearnClassifier(pipeline)
+"""
+
+from nltk.classify.api import ClassifierI
+from nltk.probability import DictionaryProbDist
+
+try:
+    from sklearn.feature_extraction import DictVectorizer
+    from sklearn.preprocessing import LabelEncoder
+except ImportError:
+    pass
+
+__all__ = ["SklearnClassifier"]
+
+
+class SklearnClassifier(ClassifierI):
+    """Wrapper for scikit-learn classifiers.
+
+    Featuresets (dicts) are turned into a feature matrix by a
+    ``DictVectorizer`` and labels into integer codes by a ``LabelEncoder``.
+    Both are fitted in ``train`` and reused by the prediction methods, so
+    ``train`` must be called before classifying.
+    """
+
+    def __init__(self, estimator, dtype=float, sparse=True):
+        """
+        :param estimator: scikit-learn classifier object.
+
+        :param dtype: data type used when building feature array.
+            scikit-learn estimators work exclusively on numeric data. The
+            default value should be fine for almost all situations.
+
+        :param sparse: Whether to use sparse matrices internally.
+            The estimator must support these; not all scikit-learn classifiers
+            do (see their respective documentation and look for "sparse
+            matrix"). The default value is True, since most NLP problems
+            involve sparse feature sets. Setting this to False may take a
+            great amount of memory.
+        :type sparse: boolean.
+
+        :raise NameError: if scikit-learn could not be imported; the
+            module-level import is optional and fails silently, so the
+            helper classes used below are then undefined.
+        """
+        self._clf = estimator
+        self._encoder = LabelEncoder()
+        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
+
+    def __repr__(self):
+        # Wrap the operand in a 1-tuple so %-formatting stays correct even if
+        # the wrapped object happens to be a tuple.
+        return "<SklearnClassifier(%r)>" % (self._clf,)
+
+    def classify_many(self, featuresets):
+        """Classify a batch of samples.
+
+        :param featuresets: An iterable over featuresets, each a dict mapping
+            strings to either numbers, booleans or strings.
+        :return: The predicted class label for each input sample.
+        :rtype: list
+        """
+        X = self._vectorizer.transform(featuresets)
+        classes = self._encoder.classes_
+        # The estimator was fitted on encoded labels, so its predictions are
+        # used as indices into the encoder's class array.
+        return [classes[i] for i in self._clf.predict(X)]
+
+    def prob_classify_many(self, featuresets):
+        """Compute per-class probabilities for a batch of samples.
+
+        The wrapped estimator must provide ``predict_proba``.
+
+        :param featuresets: An iterable over featuresets, each a dict mapping
+            strings to either numbers, booleans or strings.
+        :return: One probability distribution over the class labels for each
+            input sample.
+        :rtype: list of ``ProbDistI``
+        """
+        X = self._vectorizer.transform(featuresets)
+        y_proba_list = self._clf.predict_proba(X)
+        return [self._make_probdist(y_proba) for y_proba in y_proba_list]
+
+    def labels(self):
+        """The class labels used by this classifier.
+
+        :return: The labels seen by the label encoder during ``train``.
+        :rtype: list
+        """
+        return list(self._encoder.classes_)
+
+    def train(self, labeled_featuresets):
+        """
+        Train (fit) the scikit-learn estimator.
+
+        :param labeled_featuresets: A list of ``(featureset, label)``
+            where each ``featureset`` is a dict mapping strings to either
+            numbers, booleans or strings.
+        :return: This classifier, so the call can be chained.
+        :raise ValueError: if ``labeled_featuresets`` yields no
+            ``(featureset, label)`` pairs.
+        """
+
+        # Transpose [(featureset, label), ...] into (featuresets, labels).
+        columns = list(zip(*labeled_featuresets))
+        if not columns:
+            # Without this guard the unpacking below fails with an opaque
+            # "not enough values to unpack" message.
+            raise ValueError(
+                "labeled_featuresets must be a non-empty iterable of "
+                "(featureset, label) pairs"
+            )
+        X, y = columns
+        X = self._vectorizer.fit_transform(X)
+        y = self._encoder.fit_transform(y)
+        self._clf.fit(X, y)
+
+        return self
+
+    def _make_probdist(self, y_proba):
+        """Build a ``DictionaryProbDist`` keyed by class label from one row of
+        ``predict_proba`` output (position ``i`` belongs to ``classes_[i]``).
+        """
+        classes = self._encoder.classes_
+        return DictionaryProbDist({classes[i]: p for i, p in enumerate(y_proba)})
+
+
+if __name__ == "__main__":
+    # Demo entry point: run NLTK's names_demo with two wrapped estimators.
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.naive_bayes import BernoulliNB
+
+    from nltk.classify.util import names_demo, names_demo_features
+
+    # Bernoulli Naive Bayes models binary (boolean) features. The binarize
+    # option is set to False because the features passed in are already
+    # boolean.
+    # NOTE(review): scikit-learn documents ``binarize`` as "float or None";
+    # None may be the intended "already binary" value -- confirm.
+    print("scikit-learn Naive Bayes:")
+    naive_bayes_trainer = SklearnClassifier(BernoulliNB(binarize=False)).train
+    names_demo(naive_bayes_trainer, features=names_demo_features)
+
+    # C on logistic regression (MaxEnt) controls regularization strength:
+    # the larger the value, the weaker the regularization.
+    print("\n\nscikit-learn logistic regression:")
+    maxent_trainer = SklearnClassifier(LogisticRegression(C=1000)).train
+    names_demo(maxent_trainer, features=names_demo_features)