updates
This commit is contained in:
539
Backend/venv/lib/python3.12/site-packages/nltk/sem/relextract.py
Normal file
539
Backend/venv/lib/python3.12/site-packages/nltk/sem/relextract.py
Normal file
@@ -0,0 +1,539 @@
|
||||
# Natural Language Toolkit: Relation Extraction
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Code for extracting relational triples from the ieer and conll2002 corpora.
|
||||
|
||||
Relations are stored internally as dictionaries ('reldicts').
|
||||
|
||||
The two serialization outputs are "rtuple" and "clause".
|
||||
|
||||
- An rtuple is a tuple of the form ``(subj, filler, obj)``,
|
||||
where ``subj`` and ``obj`` are pairs of Named Entity mentions, and ``filler`` is the string of words
|
||||
occurring between ``sub`` and ``obj`` (with no intervening NEs). Strings are printed via ``repr()`` to
|
||||
circumvent locale variations in rendering utf-8 encoded strings.
|
||||
- A clause is an atom of the form ``relsym(subjsym, objsym)``,
|
||||
where the relation, subject and object have been canonicalized to single strings.
|
||||
"""
|
||||
|
||||
# todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs?
|
||||
|
||||
import html
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
# Dictionary that associates corpora with NE classes
NE_CLASSES = {
    "ieer": [
        "LOCATION",
        "ORGANIZATION",
        "PERSON",
        "DURATION",
        "DATE",
        "CARDINAL",
        "PERCENT",
        "MONEY",
        "MEASURE",
    ],
    "conll2002": ["LOC", "PER", "ORG"],
    "ace": [
        "LOCATION",
        "ORGANIZATION",
        "PERSON",
        "DURATION",
        "DATE",
        "CARDINAL",
        "PERCENT",
        "MONEY",
        "MEASURE",
        "FACILITY",
        "GPE",
    ],
}

# Allow abbreviated class labels (maps between the short CoNLL-style
# labels and the long IEER/ACE-style labels).
short2long = {"LOC": "LOCATION", "ORG": "ORGANIZATION", "PER": "PERSON"}
long2short = {"LOCATION": "LOC", "ORGANIZATION": "ORG", "PERSON": "PER"}
|
||||
|
||||
|
||||
def _expand(type):
    """
    Expand an NE class name.

    Returns the long form of an abbreviated label (e.g. ``'PER'`` ->
    ``'PERSON'``); unknown labels are passed through unchanged.

    :type type: str
    :rtype: str
    """
    return short2long.get(type, type)
|
||||
|
||||
|
||||
def class_abbrev(type):
    """
    Abbreviate an NE class name.

    Returns the short form of a long label (e.g. ``'PERSON'`` -> ``'PER'``);
    unknown labels are passed through unchanged.

    :type type: str
    :rtype: str
    """
    return long2short.get(type, type)
|
||||
|
||||
|
||||
def _join(lst, sep=" ", untag=False):
|
||||
"""
|
||||
Join a list into a string, turning tags tuples into tag strings or just words.
|
||||
:param untag: if ``True``, omit the tag from tagged input strings.
|
||||
:type lst: list
|
||||
:rtype: str
|
||||
"""
|
||||
try:
|
||||
return sep.join(lst)
|
||||
except TypeError:
|
||||
if untag:
|
||||
return sep.join(tup[0] for tup in lst)
|
||||
from nltk.tag import tuple2str
|
||||
|
||||
return sep.join(tuple2str(tup) for tup in lst)
|
||||
|
||||
|
||||
def descape_entity(m, defs=html.entities.entitydefs):
    """
    Translate one HTML entity match to its ISO Latin value.

    Intended as a replacement callback for ``re.sub`` with a pattern like
    ``&(\\w+?);``; an unknown entity name is returned unchanged.
    Inspired by example from effbot.org
    """
    # Group 1 is the entity name; fall back to the full match verbatim.
    return defs.get(m.group(1), m.group(0))
|
||||
|
||||
|
||||
def list2sym(lst):
    """
    Convert a list of strings into a canonical symbol.

    Words are joined with underscores, lowercased, HTML entities are
    descaped, and periods are stripped.

    :type lst: list
    :return: a Unicode string without whitespace
    :rtype: unicode
    """
    ent_pattern = re.compile(r"&(\w+?);")
    sym = _join(lst, "_", untag=True).lower()
    sym = ent_pattern.sub(descape_entity, sym)
    return sym.replace(".", "")
|
||||
|
||||
|
||||
def tree2semi_rel(tree):
    """
    Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).

    In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).

    Any trailing terminals after the last NE subtree are discarded.

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """

    from nltk.tree import Tree

    pairs = []
    preceding = []

    for child in tree:
        if isinstance(child, Tree):
            # An NE subtree closes off the current (words, NE) pair.
            pairs.append([preceding, child])
            preceding = []
        else:
            # Accumulate terminal material until the next NE subtree.
            preceding.append(child)
    return pairs
|
||||
|
||||
|
||||
def semi_rel2reldict(pairs, window=5, trace=False):
    """
    Converts the pairs generated by ``tree2semi_rel`` into a 'reldict': a dictionary which
    stores information about the subject and object NEs plus the filler between them.
    Additionally, a left and right context of length =< window are captured (within
    a given input sentence).

    :param pairs: a list of (list(str), ``Tree``) pairs, as generated by ``tree2semi_rel``
    :param window: a threshold for the number of items to include in the left and right context
    :type window: int
    :param trace: if ``True``, print each candidate relation as it is built
    :return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext', 'subjsym', 'filler', 'objclass', 'objtext', 'objsym' and 'rcon'
    :rtype: list(defaultdict)
    """
    result = []
    # Three consecutive pairs are consumed per reldict: pairs[0] supplies the
    # left context and subject NE, pairs[1] the filler and object NE, and
    # pairs[2] the right context -- hence the `> 2` guard (the last two pairs
    # never serve as subjects).
    while len(pairs) > 2:
        # defaultdict(str) means absent keys render as '' when formatted.
        reldict = defaultdict(str)
        reldict["lcon"] = _join(pairs[0][0][-window:])
        reldict["subjclass"] = pairs[0][1].label()
        reldict["subjtext"] = _join(pairs[0][1].leaves())
        reldict["subjsym"] = list2sym(pairs[0][1].leaves())
        reldict["filler"] = _join(pairs[1][0])
        reldict["untagged_filler"] = _join(pairs[1][0], untag=True)
        reldict["objclass"] = pairs[1][1].label()
        reldict["objtext"] = _join(pairs[1][1].leaves())
        reldict["objsym"] = list2sym(pairs[1][1].leaves())
        reldict["rcon"] = _join(pairs[2][0][:window])
        if trace:
            # NOTE(review): the format string "(%s(%s, %s)" has unbalanced
            # parentheses; trace output is cosmetic only.
            print(
                "(%s(%s, %s)"
                % (
                    reldict["untagged_filler"],
                    reldict["subjclass"],
                    reldict["objclass"],
                )
            )
        result.append(reldict)
        # Slide the window forward by one pair, so every NE (except the last
        # two) gets a turn as the subject.
        pairs = pairs[1:]
    return result
|
||||
|
||||
|
||||
def extract_rels(subjclass, objclass, doc, corpus="ace", pattern=None, window=10):
    """
    Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern.

    The parameters ``subjclass`` and ``objclass`` can be used to restrict the
    Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION',
    'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE').

    :param subjclass: the class of the subject Named Entity.
    :type subjclass: str
    :param objclass: the class of the object Named Entity.
    :type objclass: str
    :param doc: input document
    :type doc: ieer document or a list of chunk trees
    :param corpus: name of the corpus to take as input; possible values are
        'ieer', 'conll2002' and 'ace'
    :type corpus: str
    :param pattern: a regular expression for filtering the fillers of
        retrieved triples; if ``None``, no filler-pattern filtering is applied.
    :type pattern: SRE_Pattern
    :param window: filters out fillers which exceed this threshold
    :type window: int
    :return: see ``semi_rel2reldict``
    :rtype: list(defaultdict)
    :raises ValueError: if ``corpus``, ``subjclass`` or ``objclass`` is not recognized
    """

    # Validate the corpus name up front. Previously an unknown corpus combined
    # with a truthy subjclass raised a bare KeyError from NE_CLASSES[corpus]
    # instead of the documented ValueError.
    if corpus not in NE_CLASSES:
        raise ValueError("corpus type not recognized")

    if subjclass and subjclass not in NE_CLASSES[corpus]:
        # Accept abbreviated labels ('PER', 'ORG', ...) by expanding them.
        if _expand(subjclass) in NE_CLASSES[corpus]:
            subjclass = _expand(subjclass)
        else:
            raise ValueError(
                "your value for the subject type has not been recognized: %s"
                % subjclass
            )
    if objclass and objclass not in NE_CLASSES[corpus]:
        if _expand(objclass) in NE_CLASSES[corpus]:
            objclass = _expand(objclass)
        else:
            raise ValueError(
                "your value for the object type has not been recognized: %s" % objclass
            )

    if corpus == "ace" or corpus == "conll2002":
        pairs = tree2semi_rel(doc)
    elif corpus == "ieer":
        # IEER documents carry two chunk trees: the body text and the headline.
        pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline)
    else:
        raise ValueError("corpus type not recognized")

    reldicts = semi_rel2reldict(pairs)

    # Keep reldicts whose NE classes match, whose filler is at most `window`
    # words, and whose filler matches `pattern`. Bug fix: the default
    # pattern=None used to crash with AttributeError on pattern.match();
    # None is now treated as "match everything".
    relfilter = lambda x: (
        x["subjclass"] == subjclass
        and len(x["filler"].split()) <= window
        and (pattern is None or pattern.match(x["filler"]))
        and x["objclass"] == objclass
    )

    return list(filter(relfilter, reldicts))
|
||||
|
||||
|
||||
def rtuple(reldict, lcon=False, rcon=False):
    """
    Pretty print the reldict as an rtuple.

    :param reldict: a relation dictionary
    :type reldict: defaultdict
    :param lcon: if ``True``, prepend the left context
    :param rcon: if ``True``, append the right context
    :rtype: str
    """
    fields = [
        class_abbrev(reldict["subjclass"]),
        reldict["subjtext"],
        reldict["filler"],
        class_abbrev(reldict["objclass"]),
        reldict["objtext"],
    ]
    template = "[%s: %r] %r [%s: %r]"
    if lcon:
        fields.insert(0, reldict["lcon"])
        template = "...%r)" + template
    if rcon:
        fields.append(reldict["rcon"])
        template = template + "(%r..."
    # Strings are rendered via %r to sidestep locale issues with utf-8.
    return template % tuple(fields)
|
||||
|
||||
|
||||
def clause(reldict, relsym):
    """
    Print the relation in clausal form, e.g. ``IN('org', 'loc')``.

    :param reldict: a relation dictionary
    :type reldict: defaultdict
    :param relsym: a label for the relation
    :type relsym: str
    :rtype: str
    """
    return "{}({!r}, {!r})".format(relsym, reldict["subjsym"], reldict["objsym"])
|
||||
|
||||
|
||||
#######################################################
|
||||
# Demos of relation extraction with regular expressions
|
||||
#######################################################
|
||||
|
||||
|
||||
############################################
|
||||
# Example of in(ORG, LOC)
|
||||
############################################
|
||||
def in_demo(trace=0, sql=True):
    """
    Select pairs of organizations and locations whose mentions occur with an
    intervening occurrence of the preposition "in".

    If the sql parameter is set to True, then the entity pairs are loaded into
    an in-memory database, and subsequently pulled out using an SQL "SELECT"
    query.

    :param trace: if truthy, print each document number as it is processed
    :param sql: if ``True``, also store the pairs in an in-memory sqlite table
    """
    from nltk.corpus import ieer

    if sql:
        try:
            import sqlite3

            connection = sqlite3.connect(":memory:")
            cur = connection.cursor()
            cur.execute(
                """create table Locations
                (OrgName text, LocationName text, DocID text)"""
            )
        except ImportError:
            import warnings

            # NOTE(review): if this branch runs, `cur` and `connection` are
            # never bound; the NameError handlers below rely on that to skip
            # all SQL work instead of crashing.
            warnings.warn("Cannot import sqlite; sql flag will be ignored.")

    # Filler must contain "in"; the lookahead appears intended to reject
    # "-ing" forms (e.g. "including") -- TODO confirm the exact cases covered.
    IN = re.compile(r".*\bin\b(?!\b.+ing)")

    print()
    print("IEER: in(ORG, LOC) -- just the clauses:")
    print("=" * 45)

    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            if trace:
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=IN):
                print(clause(rel, relsym="IN"))
                if sql:
                    try:
                        # NOTE(review): this local `rtuple` tuple shadows the
                        # module-level rtuple() function within this scope.
                        rtuple = (rel["subjtext"], rel["objtext"], doc.docno)
                        cur.execute(
                            """insert into Locations
                            values (?, ?, ?)""",
                            rtuple,
                        )
                        connection.commit()
                    except NameError:
                        # sqlite3 was unavailable; silently skip the insert.
                        pass

    if sql:
        try:
            cur.execute(
                """select OrgName from Locations
                where LocationName = 'Atlanta'"""
            )
            print()
            print("Extract data from SQL table: ORGs in Atlanta")
            print("-" * 15)
            for row in cur:
                print(row)
        except NameError:
            # sqlite3 was unavailable; skip the query demo.
            pass
|
||||
|
||||
|
||||
############################################
|
||||
# Example of has_role(PER, LOC)
|
||||
############################################
|
||||
|
||||
|
||||
def roles_demo(trace=0):
    """
    Demo: find has_role(PER, ORG) relations in the IEER corpus by matching
    the filler against a verbose regex of job-title words.

    :param trace: if truthy, print document numbers and include left/right
        context in the printed rtuples
    """
    from nltk.corpus import ieer

    # Verbose-mode pattern; whitespace and # comments inside it are ignored
    # by re.VERBOSE.
    roles = r"""
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s* # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print()
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)

    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            # Context is only shown when tracing is enabled.
            lcon = rcon = False
            if trace:
                print(doc.docno)
                print("=" * 15)
                lcon = rcon = True
            for rel in extract_rels("PER", "ORG", doc, corpus="ieer", pattern=ROLES):
                print(rtuple(rel, lcon=lcon, rcon=rcon))
|
||||
|
||||
|
||||
##############################################
|
||||
### Show what's in the IEER Headlines
|
||||
##############################################
|
||||
|
||||
|
||||
def ieer_headlines():
    """
    Demo: print the docno and headline chunk tree of the first 20 IEER
    documents.
    """
    from nltk.corpus import ieer
    from nltk.tree import Tree  # NOTE(review): unused import, kept as-is

    print("IEER: First 20 Headlines")
    print("=" * 45)

    trees = [
        (doc.docno, doc.headline)
        for file in ieer.fileids()
        for doc in ieer.parsed_docs(file)
    ]
    for tree in trees[:20]:
        print()
        # `tree` is a (docno, headline) 2-tuple feeding the two % slots.
        print("%s:\n%s" % tree)
|
||||
|
||||
|
||||
#############################################
|
||||
## Dutch CONLL2002: take_on_role(PER, ORG
|
||||
#############################################
|
||||
|
||||
|
||||
def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.

    :param trace: if truthy, include left/right context in the printed rtuples
    """

    from nltk.corpus import conll2002

    # Verbose-mode pattern over "word/TAG" filler strings: a form of
    # zijn/worden, anything, then the preposition van.
    vnv = """
    (
    is/V|    # 3rd sing present and
    was/V|   # past forms of the verb zijn ('be')
    werd/V|  # and also present
    wordt/V  # past of worden ('become)
    )
    .*       # followed by anything
    van/Prep # followed by van ('of')
    """
    VAN = re.compile(vnv, re.VERBOSE)

    print()
    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)

    for doc in conll2002.chunked_sents("ned.train"):
        lcon = rcon = False
        if trace:
            lcon = rcon = True
        for rel in extract_rels(
            "PER", "ORG", doc, corpus="conll2002", pattern=VAN, window=10
        ):
            print(rtuple(rel, lcon=lcon, rcon=rcon))
|
||||
|
||||
|
||||
#############################################
|
||||
## Spanish CONLL2002: (PER, ORG)
|
||||
#############################################
|
||||
|
||||
|
||||
def conllesp():
    """
    Demo: find de(ORG, LOC) relations ('of') in the Spanish CoNLL 2002
    training corpus and print the first 10 as clauses.
    """
    from nltk.corpus import conll2002

    # Verbose-mode pattern: filler ends with de/SP or del/SP.
    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)
    rels = [
        rel
        for doc in conll2002.chunked_sents("esp.train")
        for rel in extract_rels("ORG", "LOC", doc, corpus="conll2002", pattern=DE)
    ]
    for r in rels[:10]:
        print(clause(r, relsym="DE"))
    print()
|
||||
|
||||
|
||||
def ne_chunked():
    """
    Demo: run the NLTK NE chunker over the first 1500 tagged Penn Treebank
    sentences and print PER-role-ORG rtuples whose filler mentions a job title.
    """
    # Bug fix: import nltk locally. The original relied on `import nltk`
    # performed only inside the `if __name__ == "__main__"` guard, so calling
    # ne_chunked() from an importing module raised NameError.
    import nltk

    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(
        r".*(chairman|president|trader|scientist|economist|analyst|partner).*"
    )
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        # Fillers longer than 7 words are discarded.
        rels = extract_rels("PER", "ORG", sent, corpus="ace", pattern=ROLE, window=7)
        for rel in rels:
            print(f"{i:<5}{rtuple(rel)}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import nltk
    from nltk.sem import relextract  # NOTE(review): unused import, kept as-is

    # Run every demo in turn; each downloads/reads its corpus via nltk.corpus.
    in_demo(trace=0)
    roles_demo(trace=0)
    conllned()
    conllesp()
    ieer_headlines()
    ne_chunked()
|
||||
Reference in New Issue
Block a user