updates

2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions
--- a/Backend/venv/lib/python3.12/site-packages/nltk/parse/evaluate.py
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/parse/evaluate.py
@@ -0,0 +1,129 @@
+# Natural Language Toolkit: evaluation of dependency parser
+#
+# Author: Long Duong <longdt219@gmail.com>
+#
+# Copyright (C) 2001-2025 NLTK Project
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+import unicodedata
+
+
+class DependencyEvaluator:
+    """
+    Class for measuring labelled and unlabelled attachment score for
+    dependency parsing. Note that the evaluation ignores punctuation.
+
+    >>> from nltk.parse import DependencyGraph, DependencyEvaluator
+
+    >>> gold_sent = DependencyGraph(\"""
+    ... Pierre  NNP     2       NMOD
+    ... Vinken  NNP     8       SUB
+    ... ,       ,       2       P
+    ... 61      CD      5       NMOD
+    ... years   NNS     6       AMOD
+    ... old     JJ      2       NMOD
+    ... ,       ,       2       P
+    ... will    MD      0       ROOT
+    ... join    VB      8       VC
+    ... the     DT      11      NMOD
+    ... board   NN      9       OBJ
+    ... as      IN      9       VMOD
+    ... a       DT      15      NMOD
+    ... nonexecutive    JJ      15      NMOD
+    ... director        NN      12      PMOD
+    ... Nov.    NNP     9       VMOD
+    ... 29      CD      16      NMOD
+    ... .       .       9       VMOD
+    ... \""")
+
+    >>> parsed_sent = DependencyGraph(\"""
+    ... Pierre  NNP     8       NMOD
+    ... Vinken  NNP     1       SUB
+    ... ,       ,       3       P
+    ... 61      CD      6       NMOD
+    ... years   NNS     6       AMOD
+    ... old     JJ      2       NMOD
+    ... ,       ,       3       AMOD
+    ... will    MD      0       ROOT
+    ... join    VB      8       VC
+    ... the     DT      11      AMOD
+    ... board   NN      9       OBJECT
+    ... as      IN      9       NMOD
+    ... a       DT      15      NMOD
+    ... nonexecutive    JJ      15      NMOD
+    ... director        NN      12      PMOD
+    ... Nov.    NNP     9       VMOD
+    ... 29      CD      16      NMOD
+    ... .       .       9       VMOD
+    ... \""")
+
+    >>> de = DependencyEvaluator([parsed_sent],[gold_sent])
+    >>> las, uas = de.eval()
+    >>> las
+    0.6
+    >>> uas
+    0.8
+    >>> abs(uas - 0.8) < 0.00001
+    True
+    """
+
+    # Unicode general categories treated as punctuation when scoring.
+    _PUNCT_CATEGORIES = frozenset({"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"})
+
+    def __init__(self, parsed_sents, gold_sents):
+        """
+        :param parsed_sents: the list of parsed_sents as the output of parser
+        :type parsed_sents: list(DependencyGraph)
+        :param gold_sents: the list of gold-standard sentences, aligned
+            one-to-one with ``parsed_sents``
+        :type gold_sents: list(DependencyGraph)
+        """
+        self._parsed_sents = parsed_sents
+        self._gold_sents = gold_sents
+
+    def _remove_punct(self, inStr):
+        """
+        Remove punctuation from a Unicode string.
+
+        :param inStr: the input string
+        :return: ``inStr`` with every punctuation character removed
+        """
+        punct = self._PUNCT_CATEGORIES
+        return "".join(ch for ch in inStr if unicodedata.category(ch) not in punct)
+
+    def eval(self):
+        """
+        Return the Labeled Attachment Score (LAS) and Unlabeled Attachment
+        Score (UAS), computed over every token that is not pure punctuation.
+
+        :return: ``(las, uas)``; ``(0.0, 0.0)`` when no token can be scored
+            (no sentences, or only punctuation / word-less nodes)
+        :rtype: tuple(float, float)
+        :raise ValueError: if the number of parsed and gold sentences differ,
+            if a sentence pair has a different number of nodes, or if the
+            words at the same address do not match
+        """
+        if len(self._parsed_sents) != len(self._gold_sents):
+            raise ValueError(
+                " Number of parsed sentence is different with number of gold sentence."
+            )
+
+        corr = 0  # tokens whose head matches the gold head
+        corrL = 0  # tokens whose head and relation label both match
+        total = 0  # tokens that were scored
+
+        for parsed_sent, gold_sent in zip(self._parsed_sents, self._gold_sents):
+            parsed_sent_nodes = parsed_sent.nodes
+            gold_sent_nodes = gold_sent.nodes
+
+            if len(parsed_sent_nodes) != len(gold_sent_nodes):
+                raise ValueError("Sentences must have equal length.")
+
+            for parsed_node_address, parsed_node in parsed_sent_nodes.items():
+                gold_node = gold_sent_nodes[parsed_node_address]
+                word = parsed_node["word"]
+
+                # Nodes that carry no word are not scored.
+                if word is None:
+                    continue
+                if word != gold_node["word"]:
+                    raise ValueError("Sentence sequence is not matched.")
+
+                # Tokens made up solely of punctuation are ignored.
+                if self._remove_punct(word) == "":
+                    continue
+
+                total += 1
+                if parsed_node["head"] == gold_node["head"]:
+                    corr += 1
+                    # A label only counts when the head is correct as well.
+                    if parsed_node["rel"] == gold_node["rel"]:
+                        corrL += 1
+
+        # With nothing to score, report zero rather than failing with a
+        # ZeroDivisionError on the divisions below.
+        if total == 0:
+            return 0.0, 0.0
+        return corrL / total, corr / total