updates
This commit is contained in:
539
Backend/venv/lib/python3.12/site-packages/nltk/sem/relextract.py
Normal file
539
Backend/venv/lib/python3.12/site-packages/nltk/sem/relextract.py
Normal file
@@ -0,0 +1,539 @@
|
||||
# Natural Language Toolkit: Relation Extraction
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Code for extracting relational triples from the ieer and conll2002 corpora.
|
||||
|
||||
Relations are stored internally as dictionaries ('reldicts').
|
||||
|
||||
The two serialization outputs are "rtuple" and "clause".
|
||||
|
||||
- An rtuple is a tuple of the form ``(subj, filler, obj)``,
|
||||
where ``subj`` and ``obj`` are pairs of Named Entity mentions, and ``filler`` is the string of words
|
||||
occurring between ``sub`` and ``obj`` (with no intervening NEs). Strings are printed via ``repr()`` to
|
||||
circumvent locale variations in rendering utf-8 encoded strings.
|
||||
- A clause is an atom of the form ``relsym(subjsym, objsym)``,
|
||||
where the relation, subject and object have been canonicalized to single strings.
|
||||
"""
|
||||
|
||||
# todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs?
|
||||
|
||||
import html
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
# Dictionary that associates corpora with NE classes
NE_CLASSES = {
    "ieer": [
        "LOCATION",
        "ORGANIZATION",
        "PERSON",
        "DURATION",
        "DATE",
        "CARDINAL",
        "PERCENT",
        "MONEY",
        "MEASURE",
    ],
    "conll2002": ["LOC", "PER", "ORG"],
    "ace": [
        "LOCATION",
        "ORGANIZATION",
        "PERSON",
        "DURATION",
        "DATE",
        "CARDINAL",
        "PERCENT",
        "MONEY",
        "MEASURE",
        "FACILITY",
        "GPE",
    ],
}

# Allow abbreviated class labels (maps between the short CoNLL-style
# labels and the long IEER/ACE-style labels).
short2long = {"LOC": "LOCATION", "ORG": "ORGANIZATION", "PER": "PERSON"}
long2short = {"LOCATION": "LOC", "ORGANIZATION": "ORG", "PERSON": "PER"}
|
||||
|
||||
|
||||
def _expand(type):
    """
    Expand an NE class name.

    Returns the long form of an abbreviated label (e.g. ``'PER'`` ->
    ``'PERSON'``); unknown labels are passed through unchanged.

    :type type: str
    :rtype: str
    """
    return short2long.get(type, type)
|
||||
|
||||
|
||||
def class_abbrev(type):
    """
    Abbreviate an NE class name.

    Returns the short form of a long label (e.g. ``'PERSON'`` -> ``'PER'``);
    unknown labels are passed through unchanged.

    :type type: str
    :rtype: str
    """
    return long2short.get(type, type)
|
||||
|
||||
|
||||
def _join(lst, sep=" ", untag=False):
|
||||
"""
|
||||
Join a list into a string, turning tags tuples into tag strings or just words.
|
||||
:param untag: if ``True``, omit the tag from tagged input strings.
|
||||
:type lst: list
|
||||
:rtype: str
|
||||
"""
|
||||
try:
|
||||
return sep.join(lst)
|
||||
except TypeError:
|
||||
if untag:
|
||||
return sep.join(tup[0] for tup in lst)
|
||||
from nltk.tag import tuple2str
|
||||
|
||||
return sep.join(tuple2str(tup) for tup in lst)
|
||||
|
||||
|
||||
def descape_entity(m, defs=html.entities.entitydefs):
    """
    Translate one HTML entity match to its ISO Latin value.

    Intended as a replacement callback for ``re.sub`` with a pattern like
    ``&(\\w+?);``; an unknown entity name is returned unchanged.
    Inspired by example from effbot.org
    """
    # Group 1 is the entity name; fall back to the full match verbatim.
    return defs.get(m.group(1), m.group(0))
|
||||
|
||||
|
||||
def list2sym(lst):
    """
    Convert a list of strings into a canonical symbol.

    Words are joined with underscores, lowercased, HTML entities are
    descaped, and periods are stripped.

    :type lst: list
    :return: a Unicode string without whitespace
    :rtype: unicode
    """
    ent_pattern = re.compile(r"&(\w+?);")
    sym = _join(lst, "_", untag=True).lower()
    sym = ent_pattern.sub(descape_entity, sym)
    return sym.replace(".", "")
|
||||
|
||||
|
||||
def tree2semi_rel(tree):
    """
    Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).

    In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).

    Any trailing terminals after the last NE subtree are discarded.

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """

    from nltk.tree import Tree

    pairs = []
    preceding = []

    for child in tree:
        if isinstance(child, Tree):
            # An NE subtree closes off the current (words, NE) pair.
            pairs.append([preceding, child])
            preceding = []
        else:
            # Accumulate terminal material until the next NE subtree.
            preceding.append(child)
    return pairs
|
||||
|
||||
|
||||
def semi_rel2reldict(pairs, window=5, trace=False):
    """
    Converts the pairs generated by ``tree2semi_rel`` into a 'reldict': a dictionary which
    stores information about the subject and object NEs plus the filler between them.
    Additionally, a left and right context of length =< window are captured (within
    a given input sentence).

    :param pairs: a list of (list(str), ``Tree``) pairs, as generated by ``tree2semi_rel``
    :param window: a threshold for the number of items to include in the left and right context
    :type window: int
    :param trace: if ``True``, print each candidate relation as it is built
    :return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext', 'subjsym', 'filler', 'objclass', 'objtext', 'objsym' and 'rcon'
    :rtype: list(defaultdict)
    """
    result = []
    # Three consecutive pairs are consumed per reldict: pairs[0] supplies the
    # left context and subject NE, pairs[1] the filler and object NE, and
    # pairs[2] the right context -- hence the `> 2` guard (the last two pairs
    # never serve as subjects).
    while len(pairs) > 2:
        # defaultdict(str) means absent keys render as '' when formatted.
        reldict = defaultdict(str)
        reldict["lcon"] = _join(pairs[0][0][-window:])
        reldict["subjclass"] = pairs[0][1].label()
        reldict["subjtext"] = _join(pairs[0][1].leaves())
        reldict["subjsym"] = list2sym(pairs[0][1].leaves())
        reldict["filler"] = _join(pairs[1][0])
        reldict["untagged_filler"] = _join(pairs[1][0], untag=True)
        reldict["objclass"] = pairs[1][1].label()
        reldict["objtext"] = _join(pairs[1][1].leaves())
        reldict["objsym"] = list2sym(pairs[1][1].leaves())
        reldict["rcon"] = _join(pairs[2][0][:window])
        if trace:
            # NOTE(review): the format string "(%s(%s, %s)" has unbalanced
            # parentheses; trace output is cosmetic only.
            print(
                "(%s(%s, %s)"
                % (
                    reldict["untagged_filler"],
                    reldict["subjclass"],
                    reldict["objclass"],
                )
            )
        result.append(reldict)
        # Slide the window forward by one pair, so every NE (except the last
        # two) gets a turn as the subject.
        pairs = pairs[1:]
    return result
|
||||
|
||||
|
||||
def extract_rels(subjclass, objclass, doc, corpus="ace", pattern=None, window=10):
    """
    Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern.

    The parameters ``subjclass`` and ``objclass`` can be used to restrict the
    Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION',
    'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE').

    :param subjclass: the class of the subject Named Entity.
    :type subjclass: str
    :param objclass: the class of the object Named Entity.
    :type objclass: str
    :param doc: input document
    :type doc: ieer document or a list of chunk trees
    :param corpus: name of the corpus to take as input; possible values are
        'ieer', 'conll2002' and 'ace'
    :type corpus: str
    :param pattern: a regular expression for filtering the fillers of
        retrieved triples; if ``None``, no filler-pattern filtering is applied.
    :type pattern: SRE_Pattern
    :param window: filters out fillers which exceed this threshold
    :type window: int
    :return: see ``semi_rel2reldict``
    :rtype: list(defaultdict)
    :raises ValueError: if ``corpus``, ``subjclass`` or ``objclass`` is not recognized
    """

    # Validate the corpus name up front. Previously an unknown corpus combined
    # with a truthy subjclass raised a bare KeyError from NE_CLASSES[corpus]
    # instead of the documented ValueError.
    if corpus not in NE_CLASSES:
        raise ValueError("corpus type not recognized")

    if subjclass and subjclass not in NE_CLASSES[corpus]:
        # Accept abbreviated labels ('PER', 'ORG', ...) by expanding them.
        if _expand(subjclass) in NE_CLASSES[corpus]:
            subjclass = _expand(subjclass)
        else:
            raise ValueError(
                "your value for the subject type has not been recognized: %s"
                % subjclass
            )
    if objclass and objclass not in NE_CLASSES[corpus]:
        if _expand(objclass) in NE_CLASSES[corpus]:
            objclass = _expand(objclass)
        else:
            raise ValueError(
                "your value for the object type has not been recognized: %s" % objclass
            )

    if corpus == "ace" or corpus == "conll2002":
        pairs = tree2semi_rel(doc)
    elif corpus == "ieer":
        # IEER documents carry two chunk trees: the body text and the headline.
        pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline)
    else:
        raise ValueError("corpus type not recognized")

    reldicts = semi_rel2reldict(pairs)

    # Keep reldicts whose NE classes match, whose filler is at most `window`
    # words, and whose filler matches `pattern`. Bug fix: the default
    # pattern=None used to crash with AttributeError on pattern.match();
    # None is now treated as "match everything".
    relfilter = lambda x: (
        x["subjclass"] == subjclass
        and len(x["filler"].split()) <= window
        and (pattern is None or pattern.match(x["filler"]))
        and x["objclass"] == objclass
    )

    return list(filter(relfilter, reldicts))
|
||||
|
||||
|
||||
def rtuple(reldict, lcon=False, rcon=False):
    """
    Pretty print the reldict as an rtuple.

    :param reldict: a relation dictionary
    :type reldict: defaultdict
    :param lcon: if ``True``, prepend the left context
    :param rcon: if ``True``, append the right context
    :rtype: str
    """
    fields = [
        class_abbrev(reldict["subjclass"]),
        reldict["subjtext"],
        reldict["filler"],
        class_abbrev(reldict["objclass"]),
        reldict["objtext"],
    ]
    template = "[%s: %r] %r [%s: %r]"
    if lcon:
        fields.insert(0, reldict["lcon"])
        template = "...%r)" + template
    if rcon:
        fields.append(reldict["rcon"])
        template = template + "(%r..."
    # Strings are rendered via %r to sidestep locale issues with utf-8.
    return template % tuple(fields)
|
||||
|
||||
|
||||
def clause(reldict, relsym):
    """
    Print the relation in clausal form, e.g. ``IN('org', 'loc')``.

    :param reldict: a relation dictionary
    :type reldict: defaultdict
    :param relsym: a label for the relation
    :type relsym: str
    :rtype: str
    """
    return "{}({!r}, {!r})".format(relsym, reldict["subjsym"], reldict["objsym"])
|
||||
|
||||
|
||||
#######################################################
|
||||
# Demos of relation extraction with regular expressions
|
||||
#######################################################
|
||||
|
||||
|
||||
############################################
|
||||
# Example of in(ORG, LOC)
|
||||
############################################
|
||||
def in_demo(trace=0, sql=True):
    """
    Select pairs of organizations and locations whose mentions occur with an
    intervening occurrence of the preposition "in".

    If the sql parameter is set to True, then the entity pairs are loaded into
    an in-memory database, and subsequently pulled out using an SQL "SELECT"
    query.

    :param trace: if truthy, print each document number as it is processed
    :param sql: if ``True``, also store the pairs in an in-memory sqlite table
    """
    from nltk.corpus import ieer

    if sql:
        try:
            import sqlite3

            connection = sqlite3.connect(":memory:")
            cur = connection.cursor()
            cur.execute(
                """create table Locations
                (OrgName text, LocationName text, DocID text)"""
            )
        except ImportError:
            import warnings

            # NOTE(review): if this branch runs, `cur` and `connection` are
            # never bound; the NameError handlers below rely on that to skip
            # all SQL work instead of crashing.
            warnings.warn("Cannot import sqlite; sql flag will be ignored.")

    # Filler must contain "in"; the lookahead appears intended to reject
    # "-ing" forms (e.g. "including") -- TODO confirm the exact cases covered.
    IN = re.compile(r".*\bin\b(?!\b.+ing)")

    print()
    print("IEER: in(ORG, LOC) -- just the clauses:")
    print("=" * 45)

    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            if trace:
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=IN):
                print(clause(rel, relsym="IN"))
                if sql:
                    try:
                        # NOTE(review): this local `rtuple` tuple shadows the
                        # module-level rtuple() function within this scope.
                        rtuple = (rel["subjtext"], rel["objtext"], doc.docno)
                        cur.execute(
                            """insert into Locations
                            values (?, ?, ?)""",
                            rtuple,
                        )
                        connection.commit()
                    except NameError:
                        # sqlite3 was unavailable; silently skip the insert.
                        pass

    if sql:
        try:
            cur.execute(
                """select OrgName from Locations
                where LocationName = 'Atlanta'"""
            )
            print()
            print("Extract data from SQL table: ORGs in Atlanta")
            print("-" * 15)
            for row in cur:
                print(row)
        except NameError:
            # sqlite3 was unavailable; skip the query demo.
            pass
|
||||
|
||||
|
||||
############################################
|
||||
# Example of has_role(PER, LOC)
|
||||
############################################
|
||||
|
||||
|
||||
def roles_demo(trace=0):
    """
    Demo: find has_role(PER, ORG) relations in the IEER corpus by matching
    the filler against a verbose regex of job-title words.

    :param trace: if truthy, print document numbers and include left/right
        context in the printed rtuples
    """
    from nltk.corpus import ieer

    # Verbose-mode pattern; whitespace and # comments inside it are ignored
    # by re.VERBOSE.
    roles = r"""
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s* # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print()
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)

    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            # Context is only shown when tracing is enabled.
            lcon = rcon = False
            if trace:
                print(doc.docno)
                print("=" * 15)
                lcon = rcon = True
            for rel in extract_rels("PER", "ORG", doc, corpus="ieer", pattern=ROLES):
                print(rtuple(rel, lcon=lcon, rcon=rcon))
|
||||
|
||||
|
||||
##############################################
|
||||
### Show what's in the IEER Headlines
|
||||
##############################################
|
||||
|
||||
|
||||
def ieer_headlines():
    """
    Demo: print the docno and headline chunk tree of the first 20 IEER
    documents.
    """
    from nltk.corpus import ieer
    from nltk.tree import Tree  # NOTE(review): unused import, kept as-is

    print("IEER: First 20 Headlines")
    print("=" * 45)

    trees = [
        (doc.docno, doc.headline)
        for file in ieer.fileids()
        for doc in ieer.parsed_docs(file)
    ]
    for tree in trees[:20]:
        print()
        # `tree` is a (docno, headline) 2-tuple feeding the two % slots.
        print("%s:\n%s" % tree)
|
||||
|
||||
|
||||
#############################################
|
||||
## Dutch CONLL2002: take_on_role(PER, ORG
|
||||
#############################################
|
||||
|
||||
|
||||
def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.

    :param trace: if truthy, include left/right context in the printed rtuples
    """

    from nltk.corpus import conll2002

    # Verbose-mode pattern over "word/TAG" filler strings: a form of
    # zijn/worden, anything, then the preposition van.
    vnv = """
    (
    is/V|    # 3rd sing present and
    was/V|   # past forms of the verb zijn ('be')
    werd/V|  # and also present
    wordt/V  # past of worden ('become)
    )
    .*       # followed by anything
    van/Prep # followed by van ('of')
    """
    VAN = re.compile(vnv, re.VERBOSE)

    print()
    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)

    for doc in conll2002.chunked_sents("ned.train"):
        lcon = rcon = False
        if trace:
            lcon = rcon = True
        for rel in extract_rels(
            "PER", "ORG", doc, corpus="conll2002", pattern=VAN, window=10
        ):
            print(rtuple(rel, lcon=lcon, rcon=rcon))
|
||||
|
||||
|
||||
#############################################
|
||||
## Spanish CONLL2002: (PER, ORG)
|
||||
#############################################
|
||||
|
||||
|
||||
def conllesp():
    """
    Demo: find de(ORG, LOC) relations ('of') in the Spanish CoNLL 2002
    training corpus and print the first 10 as clauses.
    """
    from nltk.corpus import conll2002

    # Verbose-mode pattern: filler ends with de/SP or del/SP.
    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)
    rels = [
        rel
        for doc in conll2002.chunked_sents("esp.train")
        for rel in extract_rels("ORG", "LOC", doc, corpus="conll2002", pattern=DE)
    ]
    for r in rels[:10]:
        print(clause(r, relsym="DE"))
    print()
|
||||
|
||||
|
||||
def ne_chunked():
    """
    Demo: run the NLTK NE chunker over the first 1500 tagged Penn Treebank
    sentences and print PER-role-ORG rtuples whose filler mentions a job title.
    """
    # Bug fix: import nltk locally. The original relied on `import nltk`
    # performed only inside the `if __name__ == "__main__"` guard, so calling
    # ne_chunked() from an importing module raised NameError.
    import nltk

    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(
        r".*(chairman|president|trader|scientist|economist|analyst|partner).*"
    )
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        # Fillers longer than 7 words are discarded.
        rels = extract_rels("PER", "ORG", sent, corpus="ace", pattern=ROLE, window=7)
        for rel in rels:
            print(f"{i:<5}{rtuple(rel)}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import nltk
    from nltk.sem import relextract  # NOTE(review): unused import, kept as-is

    # Run every demo in turn; each downloads/reads its corpus via nltk.corpus.
    in_demo(trace=0)
    roles_demo(trace=0)
    conllned()
    conllesp()
    ieer_headlines()
    ne_chunked()
|
||||
Reference in New Issue
Block a user