# Natural Language Toolkit: Chunk parsing API
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Eric Kafe <kafe.eric@gmail.com> (tab-format models)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Named entity chunker
"""

import os
import re
from xml.etree import ElementTree as ET

from nltk.tag import ClassifierBasedTagger, pos_tag

try:
    from nltk.classify import MaxentClassifier
except ImportError:
    pass

from nltk.chunk.api import ChunkParserI
from nltk.chunk.util import ChunkScore
from nltk.data import find
from nltk.tokenize import word_tokenize
from nltk.tree import Tree


class NEChunkParserTagger(ClassifierBasedTagger):
    """
    The IOB tagger used by the chunk parser.
    """

    def __init__(self, train=None, classifier=None):
        ClassifierBasedTagger.__init__(
            self,
            train=train,
            classifier_builder=self._classifier_builder,
            classifier=classifier,
        )

    def _classifier_builder(self, train):
        # "megam" cannot be the default algorithm, since it requires compiling with ocaml.
        return MaxentClassifier.train(
            train,
            algorithm="iis",
            gaussian_prior_sigma=1,
            trace=2,
        )

    def _english_wordlist(self):
        try:
            wl = self._en_wordlist
        except AttributeError:
            # Build the wordlist lazily, on first use.
            from nltk.corpus import words

            self._en_wordlist = set(words.words("en-basic"))
            wl = self._en_wordlist
        return wl

    def _feature_detector(self, tokens, index, history):
        word = tokens[index][0]
        pos = simplify_pos(tokens[index][1])
        if index == 0:
            prevword = prevprevword = None
            prevpos = prevprevpos = None
            prevshape = prevtag = prevprevtag = None
        elif index == 1:
            prevword = tokens[index - 1][0].lower()
            prevprevword = None
            prevpos = simplify_pos(tokens[index - 1][1])
            prevprevpos = None
            prevtag = history[index - 1]
            prevshape = prevprevtag = None
        else:
            prevword = tokens[index - 1][0].lower()
            prevprevword = tokens[index - 2][0].lower()
            prevpos = simplify_pos(tokens[index - 1][1])
            prevprevpos = simplify_pos(tokens[index - 2][1])
            prevtag = history[index - 1]
            prevprevtag = history[index - 2]
            prevshape = shape(prevword)
        if index == len(tokens) - 1:
            nextword = nextnextword = None
            nextpos = nextnextpos = None
        elif index == len(tokens) - 2:
            nextword = tokens[index + 1][0].lower()
            nextpos = tokens[index + 1][1].lower()
            nextnextword = None
            nextnextpos = None
        else:
            nextword = tokens[index + 1][0].lower()
            nextpos = tokens[index + 1][1].lower()
            nextnextword = tokens[index + 2][0].lower()
            nextnextpos = tokens[index + 2][1].lower()

        # 89.6
        features = {
            "bias": True,
            "shape": shape(word),
            "wordlen": len(word),
            "prefix3": word[:3].lower(),
            "suffix3": word[-3:].lower(),
            "pos": pos,
            "word": word,
            "en-wordlist": (word in self._english_wordlist()),
            "prevtag": prevtag,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevword": prevword,
            "nextword": nextword,
            "word+nextpos": f"{word.lower()}+{nextpos}",
            "pos+prevtag": f"{pos}+{prevtag}",
            "shape+prevtag": f"{prevshape}+{prevtag}",
        }

        return features
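

# The two sketches below are illustrative additions, not part of the NLTK API.
# The first trains a MaxentClassifier on a toy feature set, mirroring the
# _classifier_builder call above; the second bypasses __init__ (which expects
# training data or a classifier) just to show the feature dictionary produced
# by _feature_detector.  The second assumes the "words" corpus data package
# is installed, for the en-wordlist feature.
def _demo_classifier_builder():
    train = [
        ({"bias": True, "shape": "upcase"}, "B-NE"),
        ({"bias": True, "shape": "downcase"}, "O"),
    ]
    classif = MaxentClassifier.train(train, algorithm="iis", trace=0)
    print(classif.classify({"bias": True, "shape": "upcase"}))  # expected: "B-NE"


def _demo_feature_detector():
    tagger = NEChunkParserTagger.__new__(NEChunkParserTagger)  # skip __init__
    tokens = [("Mary", "NNP"), ("lives", "VBZ"), ("in", "IN"), ("London", "NNP")]
    feats = tagger._feature_detector(tokens, 3, ["O", "O", "O"])
    for name in ("shape", "pos", "prevtag", "word+nextpos"):
        print(name, "=", feats[name])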


class NEChunkParser(ChunkParserI):
    """
    Expected input: list of pos-tagged words
    """

    def __init__(self, train):
        self._train(train)

    def parse(self, tokens):
        """
        Each token should be a pos-tagged word.
        """
        tagged = self._tagger.tag(tokens)
        tree = self._tagged_to_parse(tagged)
        return tree

    def _train(self, corpus):
        # Convert each chunk tree to a tagged (IOB) sequence.
        corpus = [self._parse_to_tagged(s) for s in corpus]

        self._tagger = NEChunkParserTagger(train=corpus)

    def _tagged_to_parse(self, tagged_tokens):
        """
        Convert a list of tagged tokens to a chunk-parse tree.
        """
        sent = Tree("S", [])

        for tok, tag in tagged_tokens:
            if tag == "O":
                sent.append(tok)
            elif tag.startswith("B-"):
                sent.append(Tree(tag[2:], [tok]))
            elif tag.startswith("I-"):
                # Continue the current chunk only if its label matches;
                # otherwise start a new chunk.
                if sent and isinstance(sent[-1], Tree) and sent[-1].label() == tag[2:]:
                    sent[-1].append(tok)
                else:
                    sent.append(Tree(tag[2:], [tok]))
        return sent

    @staticmethod
    def _parse_to_tagged(sent):
        """
        Convert a chunk-parse tree to a list of tagged tokens.
        """
        toks = []
        for child in sent:
            if isinstance(child, Tree):
                if len(child) == 0:
                    print("Warning -- empty chunk in sentence")
                    continue
                toks.append((child[0], f"B-{child.label()}"))
                for tok in child[1:]:
                    toks.append((tok, f"I-{child.label()}"))
            else:
                toks.append((child, "O"))
        return toks
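

# Illustrative sketch, not part of the NLTK API: round-trip a sentence between
# the IOB tag encoding and the chunk-tree encoding defined above.  __init__ is
# bypassed because no trained tagger is needed for the conversions themselves.
def _demo_iob_roundtrip():
    tagged = [
        ("Mary", "B-PER"),
        ("Smith", "I-PER"),
        ("lives", "O"),
        ("in", "O"),
        ("London", "B-GPE"),
    ]
    parser = NEChunkParser.__new__(NEChunkParser)  # skip training
    tree = parser._tagged_to_parse(tagged)
    print(tree)  # (S (PER Mary Smith) lives in (GPE London))
    assert NEChunkParser._parse_to_tagged(tree) == tagged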


def shape(word):
    if re.match(r"[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word, re.UNICODE):
        return "number"
    elif re.match(r"\W+$", word, re.UNICODE):
        return "punct"
    elif re.match(r"\w+$", word, re.UNICODE):
        if word.istitle():
            return "upcase"
        elif word.islower():
            return "downcase"
        else:
            return "mixedcase"
    else:
        return "other"


def simplify_pos(s):
    if s.startswith("V"):
        return "V"
    else:
        return s.split("-")[0]


def postag_tree(tree):
    # Part-of-speech tagging.
    words = tree.leaves()
    tag_iter = (pos for (word, pos) in pos_tag(words))
    newtree = Tree("S", [])
    for child in tree:
        if isinstance(child, Tree):
            newtree.append(Tree(child.label(), []))
            for subchild in child:
                newtree[-1].append((subchild, next(tag_iter)))
        else:
            newtree.append((child, next(tag_iter)))
    return newtree
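

# Illustrative sketch, not part of the NLTK API: rebuild an untagged chunk
# tree with POS-tagged leaves.  Assumes the data package for nltk.pos_tag's
# default averaged-perceptron tagger is installed.
def _demo_postag_tree():
    t = Tree("S", [Tree("NE", ["London"]), "is", "big", "."])
    print(postag_tree(t))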


def load_ace_data(roots, fmt="binary", skip_bnews=True):
    for root in roots:
        # Walk each root directory, yielding the parses for every .sgm file.
        for dirpath, dirs, files in os.walk(root):
            if dirpath.endswith("bnews") and skip_bnews:
                continue
            for f in files:
                if f.endswith(".sgm"):
                    yield from load_ace_file(os.path.join(dirpath, f), fmt)
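

# Illustrative sketch, not part of the NLTK API: the ACE corpus is licensed
# and not distributed with NLTK, so this assumes a local copy installed under
# corpora/ace_data (the layout used by the deprecated build_model at the
# bottom of this file).
def _demo_load_ace():
    roots = [find("corpora/ace_data/ace.dev")]
    for tree in load_ace_data(roots, fmt="multiclass"):
        print(tree)
        break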


def load_ace_file(textfile, fmt):
    print(f"  - {os.path.split(textfile)[1]}")
    annfile = textfile + ".tmx.rdc.xml"

    # Read the xml file, and get a list of entities
    entities = []
    with open(annfile) as infile:
        xml = ET.parse(infile).getroot()
    for entity in xml.findall("document/entity"):
        typ = entity.find("entity_type").text
        for mention in entity.findall("entity_mention"):
            if mention.get("TYPE") != "NAME":
                continue  # only NEs
            s = int(mention.find("head/charseq/start").text)
            e = int(mention.find("head/charseq/end").text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile) as infile:
        text = infile.read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub("<(?!/?TEXT)[^>]+>", "", text)

    # Blank out anything before/after <TEXT>.  The replacement has the same
    # length as the match minus the six characters of the "<TEXT>" tag, so
    # the entity character offsets stay aligned.
    def subfunc(m):
        return " " * (m.end() - m.start() - 6)

    text = re.sub(r"[\s\S]*<TEXT>", subfunc, text)
    text = re.sub(r"</TEXT>[\s\S]*", "", text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    entity_types = {typ for (s, e, typ) in entities}

    # Binary distinction (NE or not NE)
    if fmt == "binary":
        i = 0
        toks = Tree("S", [])
        for s, e, typ in sorted(entities):
            if s < i:
                s = i  # Overlapping!  Deal with this better?
            if e <= s:
                continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree("NE", text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    # Multiclass distinction (NE type)
    elif fmt == "multiclass":
        i = 0
        toks = Tree("S", [])
        for s, e, typ in sorted(entities):
            if s < i:
                s = i  # Overlapping!  Deal with this better?
            if e <= s:
                continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree(typ, text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    else:
        raise ValueError("bad fmt value")


# This probably belongs in a more general-purpose location (as does
# the parse_to_tagged function).
def cmp_chunks(correct, guessed):
    correct = NEChunkParser._parse_to_tagged(correct)
    guessed = NEChunkParser._parse_to_tagged(guessed)
    ellipsis = False
    for (w, ct), (_, gt) in zip(correct, guessed):
        if ct == gt == "O":
            # Print the first agreeing "O" token, then elide the rest of the run.
            if not ellipsis:
                print(f"  {ct:15} {gt:15} {w}")
                print("  {:15} {:15} {}".format("...", "...", "..."))
                ellipsis = True
        else:
            ellipsis = False
            print(f"  {ct:15} {gt:15} {w}")


# ======================================================================================


class Maxent_NE_Chunker(NEChunkParser):
    """
    Expected input: list of pos-tagged words
    """

    def __init__(self, fmt="multiclass"):
        # Note: unlike NEChunkParser, no training is done here; the tagger is
        # rebuilt from the distributed tab-format parameter files.
        self._fmt = fmt
        self._tab_dir = find(f"chunkers/maxent_ne_chunker_tab/english_ace_{fmt}/")
        self.load_params()

    def load_params(self):
        from nltk.classify.maxent import BinaryMaxentFeatureEncoding, load_maxent_params

        # Reassemble the classifier from its weights, feature mapping,
        # labels, and always-on features.
        wgt, mpg, lab, aon = load_maxent_params(self._tab_dir)
        mc = MaxentClassifier(
            BinaryMaxentFeatureEncoding(lab, mpg, alwayson_features=aon), wgt
        )
        self._tagger = NEChunkParserTagger(classifier=mc)

    def save_params(self):
        from nltk.classify.maxent import save_maxent_params

        classif = self._tagger._classifier
        ecg = classif._encoding
        wgt = classif._weights
        mpg = ecg._mapping
        lab = ecg._labels
        aon = ecg._alwayson
        fmt = self._fmt
        save_maxent_params(wgt, mpg, lab, aon, tab_dir=f"/tmp/english_ace_{fmt}/")
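

# Illustrative sketch, not part of the NLTK API: typical end-to-end usage.
# Assumes the "maxent_ne_chunker_tab" data package is installed, along with
# the tokenizer and tagger data used by word_tokenize and pos_tag.
def _demo_chunker():
    chunker = Maxent_NE_Chunker("multiclass")
    sent = pos_tag(word_tokenize("John works at IBM in New York."))
    print(chunker.parse(sent))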


def build_model(fmt="multiclass"):
    chunker = Maxent_NE_Chunker(fmt)
    chunker.save_params()
    return chunker


# ======================================================================================

"""
2024 update: pickles are not supported anymore.

Deprecated:

def build_model(fmt="binary"):
    print("Loading training data...")
    train_paths = [
        find("corpora/ace_data/ace.dev"),
        find("corpora/ace_data/ace.heldout"),
        find("corpora/ace_data/bbn.dev"),
        find("corpora/ace_data/muc.dev"),
    ]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print("Training...")
    cp = NEChunkParser(train_data)
    del train_data

    print("Loading eval data...")
    eval_paths = [find("corpora/ace_data/ace.eval")]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print("Evaluating...")
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = f"/tmp/ne_chunker_{fmt}.pickle"
    print(f"Saving chunker to {outfilename}...")

    with open(outfilename, "wb") as outfile:
        pickle.dump(cp, outfile, -1)

    return cp
"""

if __name__ == "__main__":
    # Rebuild the tab-format models from the installed data package and
    # save their parameters under /tmp:
    build_model("binary")
    build_model("multiclass")