updates

2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions
--- a/Backend/venv/lib/python3.12/site-packages/nltk/tbl/template.py
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/tbl/template.py
@@ -0,0 +1,325 @@
+# Natural Language Toolkit: Transformation-based learning
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: Marcus Uneson <marcus.uneson@gmail.com>
+#   based on previous (nltk2) version by
+#   Christopher Maloof, Edward Loper, Steven Bird
+# URL: <https://www.nltk.org/>
+# For license information, see  LICENSE.TXT
+
+import itertools as it
+from abc import ABCMeta, abstractmethod
+
+from nltk.tbl.feature import Feature
+from nltk.tbl.rule import Rule
+
+
+class BrillTemplateI(metaclass=ABCMeta):
+    """
+    An interface for generating lists of transformational rules that
+    apply at given sentence positions.  ``BrillTemplateI`` is used by
+    ``Brill`` training algorithms to generate candidate rules.
+    """
+
+    @abstractmethod
+    def applicable_rules(self, tokens, i, correctTag):
+        """
+        Return a list of the transformational rules that would correct
+        the ``i``-th subtoken's tag in the given token.  In particular,
+        return a list of zero or more rules that would change
+        ``tokens[i][1]`` to ``correctTag``, if applied to ``token[i]``.
+
+        If the ``i``-th token already has the correct tag (i.e., if
+        ``tagged_tokens[i][1] == correctTag``), then
+        ``applicable_rules()`` should return the empty list.
+
+        :param tokens: The tagged tokens being tagged.
+        :type tokens: list(tuple)
+        :param i: The index of the token whose tag should be corrected.
+        :type i: int
+        :param correctTag: The correct tag for the ``i``-th token.
+        :type correctTag: any
+        :rtype: list(BrillRule)
+        """
+
+    @abstractmethod
+    def get_neighborhood(self, token, index):
+        """
+        Returns the set of indices *i* such that
+        ``applicable_rules(token, i, ...)`` depends on the value of
+        the *index*th token of *token*.
+
+        This method is used by the "fast" Brill tagger trainer.
+
+        :param token: The tokens being tagged.
+        :type token: list(tuple)
+        :param index: The index whose neighborhood should be returned.
+        :type index: int
+        :rtype: set
+        """
+
+
+class Template(BrillTemplateI):
+    """
+    A tbl Template that generates a list of L{Rule}s that apply at a given sentence
+    position.  In particular, each C{Template} is parameterized by a list of
+    independent features (a combination of a specific
+    property to extract and a list C{L} of relative positions at which to extract
+    it) and generates all Rules that:
+
+      - use the given features, each at its own independent position; and
+      - are applicable to the given token.
+    """
+
+    ALLTEMPLATES = []
+    # record a unique id of form "001", for each template created
+    # _ids = it.count(0)
+
+    def __init__(self, *features):
+        """
+        Construct a Template for generating Rules.
+
+        Takes a list of Features. A C{Feature} is a combination
+        of a specific property and its relative positions and should be
+        a subclass of L{nltk.tbl.feature.Feature}.
+
+        An alternative calling convention (kept for backwards compatibility,
+        but less expressive as it only permits one feature type) is
+        Template(Feature, (start1, end1), (start2, end2), ...)
+        In new code, that would be better written
+        Template(Feature(start1, end1), Feature(start2, end2), ...)
+
+        For instance, importing some features
+
+        >>> from nltk.tbl.template import Template
+        >>> from nltk.tag.brill import Word, Pos
+
+        Create some features
+
+        >>> wfeat1, wfeat2, pfeat = (Word([-1]), Word([1,2]), Pos([-2,-1]))
+
+        Create a single-feature template
+
+        >>> Template(wfeat1)
+        Template(Word([-1]))
+
+        Or a two-feature one
+
+        >>> Template(wfeat1, wfeat2)
+        Template(Word([-1]),Word([1, 2]))
+
+        Or a three-feature one with two different feature types
+
+        >>> Template(wfeat1, wfeat2, pfeat)
+        Template(Word([-1]),Word([1, 2]),Pos([-2, -1]))
+
+        deprecated api: Feature subclass, followed by list of (start,end) pairs
+        (permits only a single Feature)
+
+        >>> Template(Word, (-2,-1), (0,0))
+        Template(Word([-2, -1]),Word([0]))
+
+        Incorrect specification raises TypeError
+
+        >>> Template(Word, (-2,-1), Pos, (0,0))
+        Traceback (most recent call last):
+          File "<stdin>", line 1, in <module>
+          File "nltk/tag/tbl/template.py", line 143, in __init__
+            raise TypeError(
+        TypeError: expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ...
+
+        :type features: list of Features
+        :param features: the features to build this Template on
+        """
+        # determine the calling form: either
+        # Template(Feature, args1, [args2, ...)]
+        # Template(Feature1(args),  Feature2(args), ...)
+        if all(isinstance(f, Feature) for f in features):
+            self._features = features
+        elif issubclass(features[0], Feature) and all(
+            isinstance(a, tuple) for a in features[1:]
+        ):
+            self._features = [features[0](*tp) for tp in features[1:]]
+        else:
+            raise TypeError(
+                "expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ..."
+            )
+        self.id = f"{len(self.ALLTEMPLATES):03d}"
+        self.ALLTEMPLATES.append(self)
+
+    def __repr__(self):
+        return "{}({})".format(
+            self.__class__.__name__,
+            ",".join([str(f) for f in self._features]),
+        )
+
+    def applicable_rules(self, tokens, index, correct_tag):
+        if tokens[index][1] == correct_tag:
+            return []
+
+        # For each of this Template's features, find the conditions
+        # that are applicable for the given token.
+        # Then, generate one Rule for each combination of features
+        # (the crossproduct of the conditions).
+
+        applicable_conditions = self._applicable_conditions(tokens, index)
+        xs = list(it.product(*applicable_conditions))
+        return [Rule(self.id, tokens[index][1], correct_tag, tuple(x)) for x in xs]
+
+    def _applicable_conditions(self, tokens, index):
+        """
+        :returns: A set of all conditions for rules
+        that are applicable to C{tokens[index]}.
+        """
+        conditions = []
+
+        for feature in self._features:
+            conditions.append([])
+            for pos in feature.positions:
+                if not (0 <= index + pos < len(tokens)):
+                    continue
+                value = feature.extract_property(tokens, index + pos)
+                conditions[-1].append((feature, value))
+        return conditions
+
+    def get_neighborhood(self, tokens, index):
+        # inherit docs from BrillTemplateI
+
+        # applicable_rules(tokens, index, ...) depends on index.
+        neighborhood = {index}  # set literal for python 2.7+
+
+        # applicable_rules(tokens, i, ...) depends on index if
+        # i+start < index <= i+end.
+
+        allpositions = [0] + [p for feat in self._features for p in feat.positions]
+        start, end = min(allpositions), max(allpositions)
+        s = max(0, index + (-end))
+        e = min(index + (-start) + 1, len(tokens))
+        for i in range(s, e):
+            neighborhood.add(i)
+        return neighborhood
+
+    @classmethod
+    def expand(cls, featurelists, combinations=None, skipintersecting=True):
+        """
+        Factory method to mass generate Templates from a list L of lists of  Features.
+
+        #With combinations=(k1, k2), the function will in all possible ways choose k1 ... k2
+        #of the sublists in L; it will output all Templates formed by the Cartesian product
+        #of this selection, with duplicates and other semantically equivalent
+        #forms removed. Default for combinations is (1, len(L)).
+
+        The feature lists may have been specified
+        manually, or generated from Feature.expand(). For instance,
+
+        >>> from nltk.tbl.template import Template
+        >>> from nltk.tag.brill import Word, Pos
+
+        #creating some features
+        >>> (wd_0, wd_01) = (Word([0]), Word([0,1]))
+
+        >>> (pos_m2, pos_m33) = (Pos([-2]), Pos([3-2,-1,0,1,2,3]))
+
+        >>> list(Template.expand([[wd_0], [pos_m2]]))
+        [Template(Word([0])), Template(Pos([-2])), Template(Pos([-2]),Word([0]))]
+
+        >>> list(Template.expand([[wd_0, wd_01], [pos_m2]]))
+        [Template(Word([0])), Template(Word([0, 1])), Template(Pos([-2])), Template(Pos([-2]),Word([0])), Template(Pos([-2]),Word([0, 1]))]
+
+        #note: with Feature.expand(), it is very easy to generate more templates
+        #than your system can handle -- for instance,
+        >>> wordtpls = Word.expand([-2,-1,0,1], [1,2], excludezero=False)
+        >>> len(wordtpls)
+        7
+
+        >>> postpls = Pos.expand([-3,-2,-1,0,1,2], [1,2,3], excludezero=True)
+        >>> len(postpls)
+        9
+
+        #and now the Cartesian product of all non-empty combinations of two wordtpls and
+        #two postpls, with semantic equivalents removed
+        >>> templates = list(Template.expand([wordtpls, wordtpls, postpls, postpls]))
+        >>> len(templates)
+        713
+
+
+          will return a list of eight templates
+              Template(Word([0])),
+              Template(Word([0, 1])),
+              Template(Pos([-2])),
+              Template(Pos([-1])),
+              Template(Pos([-2]),Word([0])),
+              Template(Pos([-1]),Word([0])),
+              Template(Pos([-2]),Word([0, 1])),
+              Template(Pos([-1]),Word([0, 1]))]
+
+
+        #Templates where one feature is a subset of another, such as
+        #Template(Word([0,1]), Word([1]), will not appear in the output.
+        #By default, this non-subset constraint is tightened to disjointness:
+        #Templates of type Template(Word([0,1]), Word([1,2]) will also be filtered out.
+        #With skipintersecting=False, then such Templates are allowed
+
+        WARNING: this method makes it very easy to fill all your memory when training
+        generated templates on any real-world corpus
+
+        :param featurelists: lists of Features, whose Cartesian product will return a set of Templates
+        :type featurelists: list of (list of Features)
+        :param combinations: given n featurelists: if combinations=k, all generated Templates will have
+                k features; if combinations=(k1,k2) they will have k1..k2 features; if None, defaults to 1..n
+        :type combinations: None, int, or (int, int)
+        :param skipintersecting: if True, do not output intersecting Templates (non-disjoint positions for some feature)
+        :type skipintersecting: bool
+        :returns: generator of Templates
+
+        """
+
+        def nonempty_powerset(xs):  # xs is a list
+            # itertools docnonempty_powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
+
+            # find the correct tuple given combinations, one of {None, k, (k1,k2)}
+            k = combinations  # for brevity
+            combrange = (
+                (1, len(xs) + 1)
+                if k is None
+                else (
+                    (k, k + 1)  # n over 1 .. n over n (all non-empty combinations)
+                    if isinstance(k, int)
+                    else (k[0], k[1] + 1)
+                )  # n over k (only
+            )  # n over k1, n over k1+1... n over k2
+            return it.chain.from_iterable(
+                it.combinations(xs, r) for r in range(*combrange)
+            )
+
+        seentemplates = set()
+        for picks in nonempty_powerset(featurelists):
+            for pick in it.product(*picks):
+                if any(
+                    i != j and x.issuperset(y)
+                    for (i, x) in enumerate(pick)
+                    for (j, y) in enumerate(pick)
+                ):
+                    continue
+                if skipintersecting and any(
+                    i != j and x.intersects(y)
+                    for (i, x) in enumerate(pick)
+                    for (j, y) in enumerate(pick)
+                ):
+                    continue
+                thistemplate = cls(*sorted(pick))
+                strpick = str(thistemplate)
+                #!!FIXME --this is hackish
+                if strpick in seentemplates:  # already added
+                    cls._poptemplate()
+                    continue
+                seentemplates.add(strpick)
+                yield thistemplate
+
+    @classmethod
+    def _cleartemplates(cls):
+        cls.ALLTEMPLATES = []
+
+    @classmethod
+    def _poptemplate(cls):
+        return cls.ALLTEMPLATES.pop() if cls.ALLTEMPLATES else None