# Natural Language Toolkit: Chunk parsing API
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Eric Kafe <kafe.eric@gmail.com> (tab-format models)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Named entity chunker
"""

import os
import re
from xml.etree import ElementTree as ET

from nltk.tag import ClassifierBasedTagger, pos_tag

try:
    from nltk.classify import MaxentClassifier
except ImportError:
    pass

from nltk.chunk.api import ChunkParserI
from nltk.chunk.util import ChunkScore
from nltk.data import find
from nltk.tokenize import word_tokenize
from nltk.tree import Tree


class NEChunkParserTagger(ClassifierBasedTagger):
    """
    The IOB tagger used by the chunk parser.
    """

    def __init__(self, train=None, classifier=None):
        ClassifierBasedTagger.__init__(
            self,
            train=train,
            classifier_builder=self._classifier_builder,
            classifier=classifier,
        )

    def _classifier_builder(self, train):
        # "megam" cannot be the default algorithm, since it requires compiling with ocaml.
        return MaxentClassifier.train(
            train,
            algorithm="iis",
            gaussian_prior_sigma=1,
            trace=2,
        )

    def _english_wordlist(self):
        try:
            wl = self._en_wordlist
        except AttributeError:
            # Build the wordlist lazily, on first use.
            from nltk.corpus import words

            self._en_wordlist = set(words.words("en-basic"))
            wl = self._en_wordlist
        return wl

    def _feature_detector(self, tokens, index, history):
        word = tokens[index][0]
        pos = simplify_pos(tokens[index][1])
        if index == 0:
            prevword = prevprevword = None
            prevpos = prevprevpos = None
            prevshape = prevtag = prevprevtag = None
        elif index == 1:
            prevword = tokens[index - 1][0].lower()
            prevprevword = None
            prevpos = simplify_pos(tokens[index - 1][1])
            prevprevpos = None
            prevtag = history[index - 1]
            prevshape = prevprevtag = None
        else:
            prevword = tokens[index - 1][0].lower()
            prevprevword = tokens[index - 2][0].lower()
            prevpos = simplify_pos(tokens[index - 1][1])
            prevprevpos = simplify_pos(tokens[index - 2][1])
            prevtag = history[index - 1]
            prevprevtag = history[index - 2]
            prevshape = shape(prevword)
        if index == len(tokens) - 1:
            nextword = nextnextword = None
            nextpos = nextnextpos = None
        elif index == len(tokens) - 2:
            nextword = tokens[index + 1][0].lower()
            nextpos = tokens[index + 1][1].lower()
            nextnextword = None
            nextnextpos = None
        else:
            nextword = tokens[index + 1][0].lower()
            nextpos = tokens[index + 1][1].lower()
            nextnextword = tokens[index + 2][0].lower()
            nextnextpos = tokens[index + 2][1].lower()

        # 89.6
        features = {
            "bias": True,
            "shape": shape(word),
            "wordlen": len(word),
            "prefix3": word[:3].lower(),
            "suffix3": word[-3:].lower(),
            "pos": pos,
            "word": word,
            "en-wordlist": (word in self._english_wordlist()),
            "prevtag": prevtag,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevword": prevword,
            "nextword": nextword,
            "word+nextpos": f"{word.lower()}+{nextpos}",
            "pos+prevtag": f"{pos}+{prevtag}",
            "shape+prevtag": f"{prevshape}+{prevtag}",
        }

        return features
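

# The two sketches below are illustrative additions, not part of the NLTK API.
# The first trains a MaxentClassifier on a toy feature set, mirroring the
# _classifier_builder call above; the second bypasses __init__ (which expects
# training data or a classifier) just to show the feature dictionary produced
# by _feature_detector.  The second assumes the "words" corpus data package
# is installed, for the en-wordlist feature.
def _demo_classifier_builder():
    train = [
        ({"bias": True, "shape": "upcase"}, "B-NE"),
        ({"bias": True, "shape": "downcase"}, "O"),
    ]
    classif = MaxentClassifier.train(train, algorithm="iis", trace=0)
    print(classif.classify({"bias": True, "shape": "upcase"}))  # expected: "B-NE"


def _demo_feature_detector():
    tagger = NEChunkParserTagger.__new__(NEChunkParserTagger)  # skip __init__
    tokens = [("Mary", "NNP"), ("lives", "VBZ"), ("in", "IN"), ("London", "NNP")]
    feats = tagger._feature_detector(tokens, 3, ["O", "O", "O"])
    for name in ("shape", "pos", "prevtag", "word+nextpos"):
        print(name, "=", feats[name])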


class NEChunkParser(ChunkParserI):
    """
    Expected input: list of pos-tagged words
    """

    def __init__(self, train):
        self._train(train)

    def parse(self, tokens):
        """
        Each token should be a pos-tagged word.
        """
        tagged = self._tagger.tag(tokens)
        tree = self._tagged_to_parse(tagged)
        return tree

    def _train(self, corpus):
        # Convert each chunk tree to a tagged (IOB) sequence.
        corpus = [self._parse_to_tagged(s) for s in corpus]

        self._tagger = NEChunkParserTagger(train=corpus)

    def _tagged_to_parse(self, tagged_tokens):
        """
        Convert a list of tagged tokens to a chunk-parse tree.
        """
        sent = Tree("S", [])

        for tok, tag in tagged_tokens:
            if tag == "O":
                sent.append(tok)
            elif tag.startswith("B-"):
                sent.append(Tree(tag[2:], [tok]))
            elif tag.startswith("I-"):
                # Continue the current chunk only if its label matches;
                # otherwise start a new chunk.
                if sent and isinstance(sent[-1], Tree) and sent[-1].label() == tag[2:]:
                    sent[-1].append(tok)
                else:
                    sent.append(Tree(tag[2:], [tok]))
        return sent

    @staticmethod
    def _parse_to_tagged(sent):
        """
        Convert a chunk-parse tree to a list of tagged tokens.
        """
        toks = []
        for child in sent:
            if isinstance(child, Tree):
                if len(child) == 0:
                    print("Warning -- empty chunk in sentence")
                    continue
                toks.append((child[0], f"B-{child.label()}"))
                for tok in child[1:]:
                    toks.append((tok, f"I-{child.label()}"))
            else:
                toks.append((child, "O"))
        return toks
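

# Illustrative sketch, not part of the NLTK API: round-trip a sentence between
# the IOB tag encoding and the chunk-tree encoding defined above.  __init__ is
# bypassed because no trained tagger is needed for the conversions themselves.
def _demo_iob_roundtrip():
    tagged = [
        ("Mary", "B-PER"),
        ("Smith", "I-PER"),
        ("lives", "O"),
        ("in", "O"),
        ("London", "B-GPE"),
    ]
    parser = NEChunkParser.__new__(NEChunkParser)  # skip training
    tree = parser._tagged_to_parse(tagged)
    print(tree)  # (S (PER Mary Smith) lives in (GPE London))
    assert NEChunkParser._parse_to_tagged(tree) == tagged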


def shape(word):
    if re.match(r"[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word, re.UNICODE):
        return "number"
    elif re.match(r"\W+$", word, re.UNICODE):
        return "punct"
    elif re.match(r"\w+$", word, re.UNICODE):
        if word.istitle():
            return "upcase"
        elif word.islower():
            return "downcase"
        else:
            return "mixedcase"
    else:
        return "other"


def simplify_pos(s):
    if s.startswith("V"):
        return "V"
    else:
        return s.split("-")[0]


def postag_tree(tree):
    # Part-of-speech tagging.
    words = tree.leaves()
    tag_iter = (pos for (word, pos) in pos_tag(words))
    newtree = Tree("S", [])
    for child in tree:
        if isinstance(child, Tree):
            newtree.append(Tree(child.label(), []))
            for subchild in child:
                newtree[-1].append((subchild, next(tag_iter)))
        else:
            newtree.append((child, next(tag_iter)))
    return newtree
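

# Illustrative sketch, not part of the NLTK API: rebuild an untagged chunk
# tree with POS-tagged leaves.  Assumes the data package for nltk.pos_tag's
# default averaged-perceptron tagger is installed.
def _demo_postag_tree():
    t = Tree("S", [Tree("NE", ["London"]), "is", "big", "."])
    print(postag_tree(t))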


def load_ace_data(roots, fmt="binary", skip_bnews=True):
    for root in roots:
        # Walk each root directory, yielding the parses for every .sgm file.
        for dirpath, dirs, files in os.walk(root):
            if dirpath.endswith("bnews") and skip_bnews:
                continue
            for f in files:
                if f.endswith(".sgm"):
                    yield from load_ace_file(os.path.join(dirpath, f), fmt)
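

# Illustrative sketch, not part of the NLTK API: the ACE corpus is licensed
# and not distributed with NLTK, so this assumes a local copy installed under
# corpora/ace_data (the layout used by the deprecated build_model at the
# bottom of this file).
def _demo_load_ace():
    roots = [find("corpora/ace_data/ace.dev")]
    for tree in load_ace_data(roots, fmt="multiclass"):
        print(tree)
        break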


def load_ace_file(textfile, fmt):
    print(f"  - {os.path.split(textfile)[1]}")
    annfile = textfile + ".tmx.rdc.xml"

    # Read the xml file, and get a list of entities
    entities = []
    with open(annfile) as infile:
        xml = ET.parse(infile).getroot()
    for entity in xml.findall("document/entity"):
        typ = entity.find("entity_type").text
        for mention in entity.findall("entity_mention"):
            if mention.get("TYPE") != "NAME":
                continue  # only NEs
            s = int(mention.find("head/charseq/start").text)
            e = int(mention.find("head/charseq/end").text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile) as infile:
        text = infile.read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub("<(?!/?TEXT)[^>]+>", "", text)

    # Blank out anything before/after <TEXT>.  The replacement has the same
    # length as the match minus the six characters of the "<TEXT>" tag, so
    # the entity character offsets stay aligned.
    def subfunc(m):
        return " " * (m.end() - m.start() - 6)

    text = re.sub(r"[\s\S]*<TEXT>", subfunc, text)
    text = re.sub(r"</TEXT>[\s\S]*", "", text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    entity_types = {typ for (s, e, typ) in entities}

    # Binary distinction (NE or not NE)
    if fmt == "binary":
        i = 0
        toks = Tree("S", [])
        for s, e, typ in sorted(entities):
            if s < i:
                s = i  # Overlapping!  Deal with this better?
            if e <= s:
                continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree("NE", text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    # Multiclass distinction (NE type)
    elif fmt == "multiclass":
        i = 0
        toks = Tree("S", [])
        for s, e, typ in sorted(entities):
            if s < i:
                s = i  # Overlapping!  Deal with this better?
            if e <= s:
                continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree(typ, text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    else:
        raise ValueError("bad fmt value")


# This probably belongs in a more general-purpose location (as does
# the parse_to_tagged function).
def cmp_chunks(correct, guessed):
    correct = NEChunkParser._parse_to_tagged(correct)
    guessed = NEChunkParser._parse_to_tagged(guessed)
    ellipsis = False
    for (w, ct), (_, gt) in zip(correct, guessed):
        if ct == gt == "O":
            # Print the first agreeing "O" token, then elide the rest of the run.
            if not ellipsis:
                print(f"  {ct:15} {gt:15} {w}")
                print("  {:15} {:15} {}".format("...", "...", "..."))
                ellipsis = True
        else:
            ellipsis = False
            print(f"  {ct:15} {gt:15} {w}")


# ======================================================================================


class Maxent_NE_Chunker(NEChunkParser):
    """
    Expected input: list of pos-tagged words
    """

    def __init__(self, fmt="multiclass"):
        # Note: unlike NEChunkParser, no training is done here; the tagger is
        # rebuilt from the distributed tab-format parameter files.
        self._fmt = fmt
        self._tab_dir = find(f"chunkers/maxent_ne_chunker_tab/english_ace_{fmt}/")
        self.load_params()

    def load_params(self):
        from nltk.classify.maxent import BinaryMaxentFeatureEncoding, load_maxent_params

        # Reassemble the classifier from its weights, feature mapping,
        # labels, and always-on features.
        wgt, mpg, lab, aon = load_maxent_params(self._tab_dir)
        mc = MaxentClassifier(
            BinaryMaxentFeatureEncoding(lab, mpg, alwayson_features=aon), wgt
        )
        self._tagger = NEChunkParserTagger(classifier=mc)

    def save_params(self):
        from nltk.classify.maxent import save_maxent_params

        classif = self._tagger._classifier
        ecg = classif._encoding
        wgt = classif._weights
        mpg = ecg._mapping
        lab = ecg._labels
        aon = ecg._alwayson
        fmt = self._fmt
        save_maxent_params(wgt, mpg, lab, aon, tab_dir=f"/tmp/english_ace_{fmt}/")
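

# Illustrative sketch, not part of the NLTK API: typical end-to-end usage.
# Assumes the "maxent_ne_chunker_tab" data package is installed, along with
# the tokenizer and tagger data used by word_tokenize and pos_tag.
def _demo_chunker():
    chunker = Maxent_NE_Chunker("multiclass")
    sent = pos_tag(word_tokenize("John works at IBM in New York."))
    print(chunker.parse(sent))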


def build_model(fmt="multiclass"):
    chunker = Maxent_NE_Chunker(fmt)
    chunker.save_params()
    return chunker


# ======================================================================================

"""
2024 update: pickles are not supported anymore.

Deprecated:

def build_model(fmt="binary"):
    print("Loading training data...")
    train_paths = [
        find("corpora/ace_data/ace.dev"),
        find("corpora/ace_data/ace.heldout"),
        find("corpora/ace_data/bbn.dev"),
        find("corpora/ace_data/muc.dev"),
    ]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print("Training...")
    cp = NEChunkParser(train_data)
    del train_data

    print("Loading eval data...")
    eval_paths = [find("corpora/ace_data/ace.eval")]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print("Evaluating...")
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = f"/tmp/ne_chunker_{fmt}.pickle"
    print(f"Saving chunker to {outfilename}...")

    with open(outfilename, "wb") as outfile:
        pickle.dump(cp, outfile, -1)

    return cp
"""

if __name__ == "__main__":
    # Rebuild the tab-format models from the installed data package and
    # save their parameters under /tmp:
    build_model("binary")
    build_model("multiclass")