updates

2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions
--- a/Backend/venv/lib/python3.12/site-packages/nltk/classify/util.py
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/classify/util.py
@@ -0,0 +1,347 @@
+# Natural Language Toolkit: Classifier Utility Functions
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com> (minor additions)
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Utility functions and classes for classifiers.
+"""
+
+import math
+
+# from nltk.util import Deprecated
+import nltk.classify.util  # for accuracy & log_likelihood
+from nltk.util import LazyMap
+
+######################################################################
+# { Helper Functions
+######################################################################
+
+
+# alternative name possibility: 'map_featurefunc()'?
+# alternative name possibility: 'detect_features()'?
+# alternative name possibility: 'map_featuredetect()'?
+# or.. just have users use LazyMap directly?
+def apply_features(feature_func, toks, labeled=None):
+    """
+    Wrap ``toks`` in a ``LazyMap`` so that featuresets are computed
+    on demand instead of being stored for every token up front.
+
+    With ``labeled=False`` the result behaves like::
+
+        [feature_func(tok) for tok in toks]
+
+    With ``labeled=True`` the result behaves like::
+
+        [(feature_func(tok), label) for (tok, label) in toks]
+
+    Deferring the computation keeps memory use low, which matters most
+    when ``toks`` is itself a lazy sequence (as with many corpus
+    readers).
+
+    :param feature_func: The function applied to each token.  It should
+        return a featureset -- i.e., a dict mapping feature names to
+        feature values.
+    :param toks: The tokens to featurize.  If ``labeled=False``, each
+        element is passed directly to ``feature_func()``.  If
+        ``labeled=True``, each element should be a ``(tok, label)``
+        pair; only ``tok`` is passed to ``feature_func()`` and the
+        label is carried through unchanged.
+    :param labeled: Whether ``toks`` holds ``(tok, label)`` pairs.
+        When left as ``None``, this is guessed: ``toks`` counts as
+        labeled if it is non-empty and its first element is a tuple
+        or a list.
+    :return: A ``LazyMap`` over ``toks``.
+    """
+    if labeled is None:
+        # Auto-detect from the first element; empty input is unlabeled.
+        labeled = toks and isinstance(toks[0], (tuple, list))
+
+    if not labeled:
+        return LazyMap(feature_func, toks)
+
+    def featurize_pair(pair):
+        # Featurize the token, pass the label through untouched.
+        return (feature_func(pair[0]), pair[1])
+
+    return LazyMap(featurize_pair, toks)
+
+
+def attested_labels(tokens):
+    """
+    Collect the distinct labels that occur in a list of classified
+    tokens.
+
+    :return: Every label that appears in ``tokens``, each exactly once.
+        The order of the labels is unspecified.
+    :rtype: tuple of (immutable)
+    :param tokens: The list of classified tokens from which to extract
+        labels.  A classified token has the form ``(token, label)``.
+    :type tokens: list
+    """
+    distinct = set()
+    for _tok, label in tokens:
+        distinct.add(label)
+    return tuple(distinct)
+
+
+def log_likelihood(classifier, gold):
+    """
+    Return the natural log of the mean probability that ``classifier``
+    assigns to the correct label of each item in ``gold``.
+
+    :param classifier: An object providing ``prob_classify_many()``,
+        which returns one probability distribution (with a ``prob()``
+        method) per featureset.
+    :param gold: A sequence of ``(featureset, label)`` pairs.  It is
+        iterated twice, so it must not be a one-shot iterator.
+    :raises ZeroDivisionError: If ``gold`` is empty.
+    :raises ValueError: If the mean probability is zero (``math.log``
+        rejects it).
+    """
+    featuresets = [fs for (fs, label) in gold]
+    pdists = classifier.prob_classify_many(featuresets)
+
+    gold_probs = []
+    for (_fs, label), pdist in zip(gold, pdists):
+        gold_probs.append(pdist.prob(label))
+
+    mean_prob = sum(gold_probs) / len(gold_probs)
+    return math.log(mean_prob)
+
+
+def accuracy(classifier, gold):
+    """
+    Return the fraction of items in ``gold`` whose label matches the
+    label predicted by ``classifier``.
+
+    :param classifier: An object providing ``classify_many()``, which
+        returns one predicted label per featureset.
+    :param gold: A sequence of ``(featureset, label)`` pairs.  It is
+        iterated twice, so it must not be a one-shot iterator.
+    :return: The proportion of correct predictions, or ``0`` when there
+        is nothing to score.
+    """
+    predictions = classifier.classify_many([fs for (fs, label) in gold])
+    hits = [label == guess for ((fs, label), guess) in zip(gold, predictions)]
+    if not hits:
+        return 0
+    return sum(hits) / len(hits)
+
+
+class CutoffChecker:
+    """
+    A helper class that implements cutoff checks based on number of
+    iterations and log likelihood.
+
+    Accuracy cutoffs are also implemented, but they're almost never
+    a good idea to use.
+
+    Recognised cutoff keys (all optional):
+
+    - ``max_iter``: stop once the iteration counter reaches this value.
+    - ``min_ll``: stop once the log likelihood is at least this value
+      (stored as a non-positive number).
+    - ``min_lldelta``: stop once the log likelihood improves by no more
+      than this amount between two checks (stored as non-negative).
+    - ``max_acc``: stop once the accuracy is at least this value.
+    - ``min_accdelta``: stop once the accuracy improves by no more than
+      this amount between two checks.
+    """
+
+    def __init__(self, cutoffs):
+        """
+        :param cutoffs: A dict mapping cutoff names to thresholds.  It
+            is copied; the caller's dict is left untouched.
+        """
+        self.cutoffs = cutoffs.copy()
+        # Normalise signs on the private copy, so that ``check`` sees
+        # the normalised thresholds and the caller's dict is not mutated.
+        if "min_ll" in self.cutoffs:
+            self.cutoffs["min_ll"] = -abs(self.cutoffs["min_ll"])
+        if "min_lldelta" in self.cutoffs:
+            self.cutoffs["min_lldelta"] = abs(self.cutoffs["min_lldelta"])
+        self.ll = None  # log likelihood seen at the previous check
+        self.acc = None  # accuracy seen at the previous check
+        self.iter = 1
+
+    def check(self, classifier, train_toks):
+        """
+        Decide whether training should stop.
+
+        The iteration counter is incremented on every call, before it
+        is compared against ``max_iter``.
+
+        :param classifier: The classifier in its current state.
+        :param train_toks: The ``(featureset, label)`` pairs used to
+            measure log likelihood and accuracy.
+        :return: ``True`` if any configured cutoff has been reached or
+            the log likelihood is NaN, ``False`` otherwise.
+        :rtype: bool
+        """
+        cutoffs = self.cutoffs
+        self.iter += 1
+        if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
+            return True  # iteration cutoff.
+
+        new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
+        if math.isnan(new_ll):
+            return True
+
+        if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
+            if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
+                return True  # log likelihood cutoff
+            if (
+                "min_lldelta" in cutoffs
+                # Compare against None: a previous value of 0.0 is valid.
+                and self.ll is not None
+                and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"]))
+            ):
+                return True  # log likelihood delta cutoff
+            self.ll = new_ll
+
+        if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
+            new_acc = nltk.classify.util.accuracy(classifier, train_toks)
+            if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
+                return True  # accuracy cutoff
+            if (
+                "min_accdelta" in cutoffs
+                # Compare against None: a previous accuracy of 0 is valid.
+                and self.acc is not None
+                and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"]))
+            ):
+                return True  # accuracy delta cutoff
+            self.acc = new_acc
+
+        return False  # no cutoff reached.
+
+
+######################################################################
+# { Demos
+######################################################################
+
+
+def names_demo_features(name):
+    """
+    Build the demo featureset for a single name.
+
+    The featureset holds a constant ``alwayson`` feature, the
+    lower-cased first and last characters, and, for each letter
+    ``a``-``z``, how often it occurs in the lower-cased name and
+    whether it occurs at all.
+
+    :param name: A non-empty string.
+    :return: A dict mapping feature names to feature values.
+    """
+    lowered = name.lower()
+    features = {
+        "alwayson": True,
+        "startswith": name[0].lower(),
+        "endswith": name[-1].lower(),
+    }
+    for letter in "abcdefghijklmnopqrstuvwxyz":
+        features["count(%s)" % letter] = lowered.count(letter)
+        features["has(%s)" % letter] = letter in lowered
+    return features
+
+
+def binary_names_demo_features(name):
+    """
+    Build a demo featureset for a single name.
+
+    Besides a constant ``alwayson`` feature, it records whether the
+    name starts or ends with a vowel (``y`` counts as one) and, for
+    each letter ``a``-``z``, its count in the lower-cased name,
+    whether it occurs, and whether the name starts or ends with it.
+
+    :param name: A non-empty string.
+    :return: A dict mapping feature names to feature values.
+    """
+    vowels = "aeiouy"
+    first = name[0].lower()
+    last = name[-1].lower()
+    lowered = name.lower()
+
+    features = {
+        "alwayson": True,
+        "startswith(vowel)": first in vowels,
+        "endswith(vowel)": last in vowels,
+    }
+    for letter in "abcdefghijklmnopqrstuvwxyz":
+        features["count(%s)" % letter] = lowered.count(letter)
+        features["has(%s)" % letter] = letter in lowered
+        features["startswith(%s)" % letter] = letter == first
+        features["endswith(%s)" % letter] = letter == last
+    return features
+
+
+def names_demo(trainer, features=names_demo_features):
+    """
+    Train and evaluate a gender classifier on the NLTK ``names`` corpus.
+
+    The labeled names are shuffled with a fixed seed; the first 5000
+    are used for training and the following 500 for testing.  The
+    accuracy is printed, followed by the average log likelihood and
+    five sample probability distributions.  A ``NotImplementedError``
+    raised while computing the probabilities is ignored.
+
+    :param trainer: A callable that takes a list of
+        ``(featureset, label)`` pairs and returns a classifier.
+    :param features: The function mapping a name to a featureset.
+    :return: The trained classifier.
+    """
+    import random
+
+    from nltk.corpus import names
+
+    # Label every name with the gender of the file it comes from.
+    males = [(name, "male") for name in names.words("male.txt")]
+    females = [(name, "female") for name in names.words("female.txt")]
+    namelist = males + females
+
+    # A fixed seed keeps the train/test split reproducible.
+    random.seed(123456)
+    random.shuffle(namelist)
+    train, test = namelist[:5000], namelist[5000:5500]
+
+    print("Training classifier...")
+    train_set = [(features(n), g) for (n, g) in train]
+    classifier = trainer(train_set)
+
+    print("Testing classifier...")
+    test_set = [(features(n), g) for (n, g) in test]
+    acc = accuracy(classifier, test_set)
+    print("Accuracy: %6.4f" % acc)
+
+    # Probability output is only possible for probabilistic classifiers.
+    try:
+        pdists = classifier.prob_classify_many([features(n) for (n, g) in test])
+        total_ll = sum(
+            pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)
+        )
+        print("Avg. log likelihood: %6.4f" % (total_ll / len(test)))
+        print()
+        print("Unseen Names      P(Male)  P(Female)\n" + "-" * 40)
+        for (name, gender), pdist in list(zip(test, pdists))[:5]:
+            # The asterisk marks the column of the gold label.
+            if gender != "male":
+                fmt = "  %-15s  %6.4f  *%6.4f"
+            else:
+                fmt = "  %-15s *%6.4f   %6.4f"
+            print(fmt % (name, pdist.prob("male"), pdist.prob("female")))
+    except NotImplementedError:
+        pass
+
+    return classifier
+
+
+def partial_names_demo(trainer, features=names_demo_features):
+    """
+    Train and evaluate a classifier from positive and unlabeled
+    examples drawn from the NLTK ``names`` corpus.
+
+    Male names are the positive class.  After a seeded shuffle, 2000
+    male names form the positive set, 500 further male names plus 500
+    female names form the unlabeled set, and 250 further names of each
+    gender form the test set.  The accuracy is printed, followed by the
+    average log likelihood and five sample probability distributions.
+    A ``NotImplementedError`` raised while computing the probabilities
+    is ignored.
+
+    :param trainer: A callable taking ``(positive_featuresets,
+        unlabeled_featuresets)`` and returning a classifier whose
+        labels are ``True`` (male) and ``False``.
+    :param features: The function mapping a name to a featureset.
+    :return: The trained classifier.
+    """
+    import random
+
+    from nltk.corpus import names
+
+    # Copy into plain lists: they are shuffled in place and sliced below.
+    male_names = list(names.words("male.txt"))
+    female_names = list(names.words("female.txt"))
+
+    random.seed(654321)
+    random.shuffle(male_names)
+    random.shuffle(female_names)
+
+    # Create a list of male names to be used as positive-labeled examples for training
+    # (a real list, not a one-shot ``map`` iterator, so the trainer can re-read it).
+    positive = [features(name) for name in male_names[:2000]]
+
+    # Create a list of male and female names to be used as unlabeled examples
+    unlabeled = [
+        features(name) for name in male_names[2000:2500] + female_names[:500]
+    ]
+
+    # Create a test set with correctly-labeled male and female names
+    test = [(name, True) for name in male_names[2500:2750]] + [
+        (name, False) for name in female_names[500:750]
+    ]
+
+    random.shuffle(test)
+
+    # Train up a classifier.
+    print("Training classifier...")
+    classifier = trainer(positive, unlabeled)
+
+    # Run the classifier on the test data.
+    print("Testing classifier...")
+    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
+    print("Accuracy: %6.4f" % acc)
+
+    # For classifiers that can find probabilities, show the log
+    # likelihood and some sample probability distributions.
+    try:
+        test_featuresets = [features(n) for (n, m) in test]
+        pdists = classifier.prob_classify_many(test_featuresets)
+        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
+        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
+        print()
+        print("Unseen Names      P(Male)  P(Female)\n" + "-" * 40)
+        # ``zip`` returns an iterator, so materialise it before slicing.
+        for (name, is_male), pdist in list(zip(test, pdists))[:5]:
+            # The asterisk marks the column of the gold label.
+            if is_male:
+                fmt = "  %-15s *%6.4f   %6.4f"
+            else:
+                fmt = "  %-15s  %6.4f  *%6.4f"
+            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
+    except NotImplementedError:
+        pass
+
+    # Return the classifier
+    return classifier
+
+
+# Cache of senseval instances, keyed by word, shared across calls to
+# ``wsd_demo`` so that each word's data is only read once.
+_inst_cache = {}
+
+
+def wsd_demo(trainer, word, features, n=1000):
+    """
+    Train and evaluate a word sense classifier on the NLTK ``senseval``
+    corpus.
+
+    Each instance of ``word`` is labeled with its first sense.  The
+    instances are shuffled with a fixed seed; of the first ``n``, the
+    leading 80% are used for training and the rest for testing.  The
+    accuracy is printed, followed by the average log likelihood.  A
+    ``NotImplementedError`` raised while computing the probabilities
+    is ignored.
+
+    :param trainer: A callable that takes a list of
+        ``(featureset, label)`` pairs and returns a classifier.
+    :param word: The senseval item whose instances are loaded.
+    :param features: The function mapping an instance to a featureset.
+    :param n: The maximum number of instances to use; it is capped at
+        the number of instances available.
+    :return: The trained classifier.
+    """
+    import random
+
+    from nltk.corpus import senseval
+
+    # Get the instances, reading them from the corpus only on first use.
+    print("Reading data...")
+    cached = _inst_cache.get(word)
+    if cached is None:
+        cached = [(inst, inst.senses[0]) for inst in senseval.instances(word)]
+        _inst_cache[word] = cached
+    # Work on a copy so that shuffling does not disturb the cache.
+    instances = list(cached)
+    n = min(n, len(instances))
+    senses = list({label for (inst, label) in instances})
+    print("  Senses: " + " ".join(senses))
+
+    # Randomly split the instances into a test & train set.
+    print("Splitting into test & train...")
+    random.seed(123456)
+    random.shuffle(instances)
+    split_at = int(0.8 * n)
+    train = instances[:split_at]
+    test = instances[split_at:n]
+
+    print("Training classifier...")
+    train_set = [(features(inst), label) for (inst, label) in train]
+    classifier = trainer(train_set)
+
+    print("Testing classifier...")
+    test_set = [(features(inst), label) for (inst, label) in test]
+    acc = accuracy(classifier, test_set)
+    print("Accuracy: %6.4f" % acc)
+
+    # Probability output is only possible for probabilistic classifiers.
+    try:
+        pdists = classifier.prob_classify_many(
+            [features(inst) for (inst, label) in test]
+        )
+        total_ll = sum(
+            pdist.logprob(gold) for ((inst, gold), pdist) in zip(test, pdists)
+        )
+        print("Avg. log likelihood: %6.4f" % (total_ll / len(test)))
+    except NotImplementedError:
+        pass
+
+    return classifier
+
+
+def check_megam_config():
+    """
+    Checks whether the MEGAM binary is configured.
+
+    The check looks up the global name ``_megam_bin``.
+
+    :raises NameError: If ``_megam_bin`` is not defined; the message
+        tells the user how to configure the binary, and the original
+        lookup error is chained as the cause.
+    """
+    try:
+        _megam_bin
+    except NameError as lookup_error:
+        message = (
+            "Please configure your megam binary first, e.g.\n"
+            ">>> nltk.config_megam('/usr/bin/local/megam')"
+        )
+        raise NameError(message) from lookup_error