updates
This commit is contained in:
642
Backend/venv/lib/python3.12/site-packages/nltk/chunk/util.py
Normal file
642
Backend/venv/lib/python3.12/site-packages/nltk/chunk/util.py
Normal file
@@ -0,0 +1,642 @@
|
||||
# Natural Language Toolkit: Chunk format conversions
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import re
|
||||
|
||||
from nltk.metrics import accuracy as _accuracy
|
||||
from nltk.tag.mapping import map_tag
|
||||
from nltk.tag.util import str2tuple
|
||||
from nltk.tree import Tree
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## EVALUATION
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def accuracy(chunker, gold):
    """
    Compute the tag-level accuracy of ``chunker`` on a gold standard.

    Each gold tree is flattened (which discards its chunk structure),
    re-chunked with ``chunker.parse``, and both the gold tree and the
    chunker's output are converted to CoNLL tag tuples with
    ``tree2conlltags``.  The tag sequences accumulated over all trees
    are then compared with ``nltk.metrics.accuracy``.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """
    reference_tags = []
    predicted_tags = []
    for gold_tree in gold:
        rechunked = chunker.parse(gold_tree.flatten())
        reference_tags.extend(tree2conlltags(gold_tree))
        predicted_tags.extend(tree2conlltags(rechunked))
    return _accuracy(reference_tags, predicted_tags)
|
||||
|
||||
|
||||
# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
|
||||
# -- statistics are evaluated only on demand, instead of at every sentence evaluation
|
||||
#
|
||||
# SB: use nltk.metrics for precision/recall scoring?
|
||||
#
|
||||
class ChunkScore:
    """
    A utility class for scoring chunk parsers.  ``ChunkScore`` can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, missed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it significantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the ``score`` method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as ``precision`` and ``f_measure``.  A typical use of the
    ``ChunkScore`` class is::

        >>> chunkscore = ChunkScore()           # doctest: +SKIP
        >>> for correct in correct_sentences:   # doctest: +SKIP
        ...     guess = chunkparser.parse(correct.leaves())   # doctest: +SKIP
        ...     chunkscore.score(correct, guess)              # doctest: +SKIP
        >>> print('F Measure:', chunkscore.f_measure())       # doctest: +SKIP
        F Measure: 0.823

    :ivar kwargs: Keyword arguments:

        - max_tp_examples, max_fp_examples, max_fn_examples: Intended
          upper bounds on the number of true positive / false positive /
          false negative examples to keep (default 100 each).  The
          values are stored on the instance, but the accessor methods
          of this class do not truncate their results, and the values
          never affect the numerical metrics.

        - chunk_label: A regular expression indicating which chunks
          should be compared.  Defaults to ``'.*'`` (i.e., all chunks).
          It is applied with ``re.match``, i.e. anchored at the start
          of the chunk's label.

    Every recorded chunk is stored as a pair
    ``((sentence_index, position), chunk)``, where ``sentence_index``
    counts calls to ``score`` and ``position`` is the index of the
    chunk's first word within its sentence.

    :type _correct: set
    :ivar _correct: All (matching) chunks seen in gold structures
    :type _guessed: set
    :ivar _guessed: All (matching) chunks seen in guessed structures
    :type _tp: set
    :ivar _tp: Set of true positives
    :type _fp: set
    :ivar _fp: Set of false positives
    :type _fn: set
    :ivar _fn: Set of false negatives

    :type _tp_num: int
    :ivar _tp_num: Number of true positives
    :type _fp_num: int
    :ivar _fp_num: Number of false positives
    :type _fn_num: int
    :ivar _fn_num: Number of false negatives.
    """

    def __init__(self, **kwargs):
        self._correct = set()
        self._guessed = set()
        self._tp = set()
        self._fp = set()
        self._fn = set()
        self._max_tp = kwargs.get("max_tp_examples", 100)
        self._max_fp = kwargs.get("max_fp_examples", 100)
        self._max_fn = kwargs.get("max_fn_examples", 100)
        self._chunk_label = kwargs.get("chunk_label", ".*")
        self._tp_num = 0
        self._fp_num = 0
        self._fn_num = 0
        self._count = 0
        self._tags_correct = 0.0
        self._tags_total = 0.0

        # The tp/fp/fn sets are derived lazily: ``score`` only marks them
        # stale, and ``_updateMeasures`` recomputes them on demand.
        self._measuresNeedUpdate = False

    def _updateMeasures(self):
        """Recompute the tp/fp/fn sets and their sizes if they are stale."""
        if self._measuresNeedUpdate:
            self._tp = self._guessed & self._correct
            self._fn = self._correct - self._guessed
            self._fp = self._guessed - self._correct
            self._tp_num = len(self._tp)
            self._fp_num = len(self._fp)
            self._fn_num = len(self._fn)
            self._measuresNeedUpdate = False

    @staticmethod
    def _in_input_order(chunks):
        """
        Return the chunk part of each ``((sentence_index, position), chunk)``
        entry, ordered by that position key.

        Sorting uses the key only, so the chunks themselves never need
        to be comparable with each other.
        """
        ordered = sorted(chunks, key=lambda entry: entry[0])
        return [entry[1] for entry in ordered]  # discard position information

    def score(self, correct, guessed):
        """
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.

        :type correct: chunk structure
        :param correct: The known-correct ("gold standard") chunked
            sentence.
        :type guessed: chunk structure
        :param guessed: The chunked sentence to be scored.
        """
        self._correct |= _chunksets(correct, self._count, self._chunk_label)
        self._guessed |= _chunksets(guessed, self._count, self._chunk_label)
        self._count += 1
        self._measuresNeedUpdate = True
        # Keep track of per-tag accuracy (if possible)
        try:
            correct_tags = tree2conlltags(correct)
            guessed_tags = tree2conlltags(guessed)
        except ValueError:
            # This exception case is for nested chunk structures,
            # where tree2conlltags will fail with a ValueError: "Tree
            # is too deeply nested to be printed in CoNLL format."
            correct_tags = guessed_tags = ()
        self._tags_total += len(correct_tags)
        self._tags_correct += sum(
            1 for (t, g) in zip(guessed_tags, correct_tags) if t == g
        )

    def accuracy(self):
        """
        Return the overall tag-based accuracy for all text that have
        been scored by this ``ChunkScore``, using the IOB (conll2000)
        tag encoding.  If no tags have been scored, 1 is returned.

        :rtype: float
        """
        if self._tags_total == 0:
            return 1
        return self._tags_correct / self._tags_total

    def precision(self):
        """
        Return the overall precision for all texts that have been
        scored by this ``ChunkScore``.  If nothing was guessed, 0 is
        returned.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fp_num
        if div == 0:
            return 0
        else:
            return self._tp_num / div

    def recall(self):
        """
        Return the overall recall for all texts that have been
        scored by this ``ChunkScore``.  If there are no correct
        chunks, 0 is returned.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fn_num
        if div == 0:
            return 0
        else:
            return self._tp_num / div

    def f_measure(self, alpha=0.5):
        """
        Return the overall F measure for all texts that have been
        scored by this ``ChunkScore``.

        :param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  ``alpha`` should have a value in the range [0,1].
        :type alpha: float
        :rtype: float
        """
        self._updateMeasures()
        p = self.precision()
        r = self.recall()
        # Precision and recall share the numerator ``_tp_num``, so this
        # guard covers the "no true positives" case and protects both
        # divisions below.
        if p == 0 or r == 0:
            return 0
        return 1 / (alpha / p + (1 - alpha) / r)

    def missed(self):
        """
        Return the chunks which were included in the
        correct chunk structures, but not in the guessed chunk
        structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        return self._in_input_order(self._fn)

    def incorrect(self):
        """
        Return the chunks which were included in the guessed chunk structures,
        but not in the correct chunk structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        return self._in_input_order(self._fp)

    def correct(self):
        """
        Return the chunks which were included in the correct
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        return self._in_input_order(self._correct)

    def guessed(self):
        """
        Return the chunks which were included in the guessed
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        return self._in_input_order(self._guessed)

    def __len__(self):
        """Return the number of correct (gold) chunks scored so far."""
        self._updateMeasures()
        return self._tp_num + self._fn_num

    def __repr__(self):
        """
        Return a concise representation of this ``ChunkScoring``.

        :rtype: str
        """
        return "<ChunkScoring of " + repr(len(self)) + " chunks>"

    def __str__(self):
        """
        Return a verbose representation of this ``ChunkScoring``.
        This representation includes the precision, recall, and
        f-measure scores.  For other information about the score,
        use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

        :rtype: str
        """
        return (
            "ChunkParse score:\n"
            + f" IOB Accuracy: {self.accuracy() * 100:5.1f}%\n"
            + f" Precision: {self.precision() * 100:5.1f}%\n"
            + f" Recall: {self.recall() * 100:5.1f}%\n"
            + f" F-Measure: {self.f_measure() * 100:5.1f}%"
        )
|
||||
|
||||
|
||||
# extract chunks, and assign unique id, the absolute position of
|
||||
# the first word of the chunk
|
||||
def _chunksets(t, count, chunk_label):
    """
    Collect the top-level chunks of ``t`` whose label matches ``chunk_label``.

    Each chunk is returned as ``((count, position), frozen_chunk)``, where
    ``position`` is the index of the chunk's first word within ``t`` and
    ``count`` is passed through unchanged by the caller to tell
    sentences apart.

    :param t: The chunk structure to scan.
    :param count: An identifier for the sentence being scanned.
    :param chunk_label: A regular expression, applied with ``re.match``
        to each chunk's label.
    :rtype: set
    """
    found = set()
    offset = 0
    for node in t:
        if not isinstance(node, Tree):
            # A bare token occupies exactly one word position.
            offset += 1
            continue
        if re.match(chunk_label, node.label()):
            found.add(((count, offset), node.freeze()))
        # Matching or not, the chunk consumes one position per leaf.
        offset += len(node.leaves())
    return found
|
||||
|
||||
|
||||
def tagstr2tree(
    s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None
):
    """
    Build a Tree from a string of bracketed, tagged text.

    Chunks are marked by square brackets (``[...]``) and may not be
    nested.  Words are delimited by whitespace, and each word is split
    into a ``(word, tag)`` pair with ``str2tuple`` using ``sep``.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :param sep: The separator between a word and its tag.  If None,
        words are not split and are added to the tree as plain strings.
    :type sep: str or None
    :param source_tagset: If both this and ``target_tagset`` are given,
        every tag is converted with ``map_tag(source_tagset,
        target_tagset, tag)``.
    :param target_tagset: See ``source_tagset``.
    :raise ValueError: If a ``[`` occurs inside a chunk, a ``]`` occurs
        outside of one, or a chunk is still open at the end of ``s``.
    :rtype: Tree
    """
    token_re = re.compile(r"\[|\]|[^\[\]\s]+")

    root = Tree(root_label, [])
    open_chunk = None  # the chunk currently being filled, if any
    convert_tags = bool(source_tagset and target_tagset)

    for found in token_re.finditer(s):
        token = found.group()

        if token == "[":
            if open_chunk is not None:
                raise ValueError(f"Unexpected [ at char {found.start():d}")
            open_chunk = Tree(chunk_label, [])
            root.append(open_chunk)
            continue

        if token == "]":
            if open_chunk is None:
                raise ValueError(f"Unexpected ] at char {found.start():d}")
            open_chunk = None
            continue

        target = root if open_chunk is None else open_chunk
        if sep is None:
            target.append(token)
            continue
        word, tag = str2tuple(token, sep)
        if convert_tags:
            tag = map_tag(source_tagset, target_tagset, tag)
        target.append((word, tag))

    if open_chunk is not None:
        raise ValueError(f"Expected ] at char {len(s):d}")
    return root
|
||||
|
||||
|
||||
### CONLL
|
||||
|
||||
# One line of CoNLL-style input: two whitespace-separated non-blank fields,
# then an IOB state letter optionally followed by "-" and a chunk type.
# ``conllstr2tree`` unpacks the four groups as (word, tag, state, chunk_type);
# the last group is None when the line carries no chunk type.
_LINE_RE = re.compile(r"(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")
|
||||
|
||||
|
||||
def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
    """
    Convert a single sentence in CoNLL 2000 style IOB format to a tree.

    Every non-blank line of ``s`` must match ``_LINE_RE`` (word, tag,
    IOB state and optional chunk type).  Chunks whose type is not
    listed in ``chunk_types`` are treated as if they were tagged ``O``.
    An ``I`` tag whose chunk type differs from the label of the node
    currently being filled starts a new chunk, just like a ``B`` tag.

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.  If None, all
        chunk types are kept.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :raise ValueError: If a non-blank line cannot be parsed; the message
        carries the zero-based line index.
    :rtype: Tree
    """
    root = Tree(root_label, [])
    # ``stack`` is [root] outside a chunk and [root, chunk] inside one.
    stack = [root]

    for lineno, line in enumerate(s.split("\n")):
        if line.strip() == "":
            continue

        parsed = _LINE_RE.match(line)
        if parsed is None:
            raise ValueError(f"Error on line {lineno:d}")
        word, tag, state, chunk_type = parsed.groups()

        # Chunk types the caller did not ask for are demoted to "outside".
        if not (chunk_types is None or chunk_type in chunk_types):
            state = "O"

        # An "I" that does not continue the node on top of the stack
        # implicitly begins a new chunk.
        implicit_begin = state == "I" and chunk_type != stack[-1].label()

        # "B", "O" and an implicit begin all end the chunk in progress.
        if (implicit_begin or state in "BO") and len(stack) == 2:
            stack.pop()

        if implicit_begin or state == "B":
            new_chunk = Tree(chunk_type, [])
            stack[-1].append(new_chunk)
            stack.append(new_chunk)

        stack[-1].append((word, tag))

    return root
|
||||
|
||||
|
||||
def tree2conlltags(t):
    """
    Convert a chunk tree to a list of ``(word, tag, IOB-tag)`` 3-tuples.

    Children of ``t`` that have a ``label()`` method are treated as
    chunks: their first token is tagged ``B-<label>`` and the remaining
    tokens ``I-<label>``.  All other children are tagged ``O``.

    :param t: The tree to be converted.
    :type t: Tree
    :raise ValueError: If a chunk itself contains a Tree.
    :rtype: list(tuple)
    """
    rows = []
    for node in t:
        try:
            chunk_type = node.label()
            for index, token in enumerate(node):
                if isinstance(token, Tree):
                    raise ValueError(
                        "Tree is too deeply nested to be printed in CoNLL format"
                    )
                marker = "B-" if index == 0 else "I-"
                rows.append((token[0], token[1], marker + chunk_type))
        except AttributeError:
            # No ``label`` attribute: a plain (word, tag) token outside any chunk.
            rows.append((node[0], node[1], "O"))
    return rows
|
||||
|
||||
|
||||
def conlltags2tree(
    sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False
):
    """
    Convert a sequence of ``(word, postag, chunktag)`` triples in CoNLL
    IOB format to a tree.

    :param sentence: An iterable of ``(word, postag, chunktag)`` triples.
    :param chunk_types: Accepted for interface compatibility; this
        function does not consult it, so chunks of every type are kept.
    :param root_label: The node label to use for the root.
    :param strict: If true, a ``None`` chunk tag, or an ``I-`` tag that
        does not continue the preceding chunk, raises ``ValueError``.
        Otherwise ``None`` is treated as ``O`` and such an ``I-`` tag is
        treated as ``B-``.
    :raise ValueError: For the strict-mode violations above, and (in
        either mode) for a chunk tag that is not ``O`` and does not start
        with ``B-`` or ``I-``.
    :rtype: Tree
    """
    root = Tree(root_label, [])
    for word, postag, chunktag in sentence:
        token = (word, postag)

        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            # Treat as O
            root.append(token)
            continue

        if chunktag.startswith("B-"):
            root.append(Tree(chunktag[2:], [token]))
            continue

        if chunktag.startswith("I-"):
            label = chunktag[2:]
            previous = root[-1] if len(root) > 0 else None
            if isinstance(previous, Tree) and previous.label() == label:
                previous.append(token)
            elif strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as B-*
                root.append(Tree(label, [token]))
            continue

        if chunktag != "O":
            raise ValueError(f"Bad conll tag {chunktag!r}")
        root.append(token)

    return root
|
||||
|
||||
|
||||
def tree2conllstr(t):
    """
    Convert a tree to the CoNLL IOB string format.

    The result has one line per token; each line is the token's word,
    tag and IOB tag (as produced by ``tree2conlltags``) joined by single
    spaces.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    """
    return "\n".join(" ".join(row) for row in tree2conlltags(t))
|
||||
|
||||
|
||||
### IEER
|
||||
|
||||
# A whole IEER document.  DOCNO, DOCTYPE, DATE_TIME and HEADLINE are optional
# (their named groups are None when absent); DOTALL lets the captured text
# span multiple lines.
_IEER_DOC_RE = re.compile(
    r"<DOC>\s*"
    # optional header fields
    r"(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?"
    r"(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?"
    r"(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?"
    # body: optional headline followed by the text proper
    r"<BODY>\s*"
    r"(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?"
    r"<TEXT>(?P<text>.*?)</TEXT>\s*"
    r"</BODY>\s*</DOC>\s*",
    flags=re.DOTALL,
)
|
||||
|
||||
# Matches the start of an IEER begin tag (``<b_...``) and captures the value of
# its ``type="..."`` attribute in the named group ``type``.
_IEER_TYPE_RE = re.compile(r'<b_\w+\s+[^>]*?type="(?P<type>\w+)"')
|
||||
|
||||
|
||||
def _ieer_read_text(s, root_label):
    """
    Parse IEER-annotated text into a tree of named-entity chunks.

    The text is split into markup tags and whitespace-delimited words.
    A ``<b_...>`` tag opens a chunk labelled with the tag's ``type``
    attribute, an ``<e_...>`` tag closes the innermost open chunk, and
    every other piece (words as well as any other tag) is appended
    verbatim as a leaf.

    :param s: The text to parse, or None.
    :type s: str or None
    :param root_label: The label to use for the root of the tree.
    :type root_label: str
    :return: The parsed tree, or an empty list if ``s`` is None (e.g.
        when a document has no headline).
    :raise ValueError: If a begin tag has no ``type`` attribute, if an
        end tag has no matching begin tag, or if a chunk is left open.
    """
    # s will be None if there is no headline in the text;
    # return the empty list in place of a Tree.
    if s is None:
        return []

    stack = [Tree(root_label, [])]
    for piece_m in re.finditer(r"<[^>]+>|[^\s<]+", s):
        piece = piece_m.group()
        try:
            if piece.startswith("<b_"):
                m = _IEER_TYPE_RE.match(piece)
                if m is None:
                    # Raised inside the ``try`` so that it is reported,
                    # like every other malformed input, with its position.
                    raise ValueError(f"Begin tag without a type: {piece!r}")
                chunk = Tree(m.group("type"), [])
                stack[-1].append(chunk)
                stack.append(chunk)
            elif piece.startswith("<e_"):
                stack.pop()
            else:
                stack[-1].append(piece)
        except (IndexError, ValueError) as e:
            # IndexError: an end tag emptied the stack (unbalanced markup).
            raise ValueError(
                f"Bad IEER string (error at character {piece_m.start():d})"
            ) from e
    if len(stack) != 1:
        raise ValueError("Bad IEER string")
    return stack[0]
|
||||
|
||||
|
||||
def ieerstr2tree(
    s,
    chunk_types=(
        "LOCATION",
        "ORGANIZATION",
        "PERSON",
        "DURATION",
        "DATE",
        "CARDINAL",
        "PERCENT",
        "MONEY",
        "MEASURE",
    ),
    root_label="S",
):
    """
    Return a chunk structure containing the chunked tagged text that is
    encoded in the given IEER style string.
    Convert a string of chunked tagged text in the IEER named
    entity format into a chunk structure.  Chunks are of several
    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    :param s: The IEER string to be converted.
    :type s: str
    :param chunk_types: Accepted for interface compatibility; this
        function does not consult it.
    :param root_label: The label to use for the root of each tree.
    :type root_label: str
    :return: If ``s`` is a complete ``<DOC>`` document, a dict with the
        keys ``text`` and ``headline`` (parsed trees; ``headline`` is an
        empty list when the document has none) and ``docno``,
        ``doctype`` and ``date_time`` (strings, or None when absent).
        Otherwise, the tree obtained by parsing all of ``s`` as text.
    :rtype: dict or Tree
    """
    # Try looking for a single document.  If that doesn't work, then just
    # treat everything as if it was within the <TEXT>...</TEXT>.
    m = _IEER_DOC_RE.match(s)
    if not m:
        return _ieer_read_text(s, root_label)

    return {
        "text": _ieer_read_text(m.group("text"), root_label),
        "docno": m.group("docno"),
        "doctype": m.group("doctype"),
        "date_time": m.group("date_time"),
        # The headline is parsed too, so that named entities in it are captured.
        "headline": _ieer_read_text(m.group("headline"), root_label),
    }
|
||||
|
||||
|
||||
def demo():
    """
    Print a short demonstration of the conversion functions: parse a
    bracketed tagged string, parse a CoNLL string (keeping only NP and PP
    chunks), and print the latter back out in CoNLL format.
    """
    import nltk

    bracketed = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
    nltk.chunk.tagstr2tree(bracketed, chunk_label="NP").pprint()
    print()

    conll_text = """
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
"""

    conll_tree = conllstr2tree(conll_text, chunk_types=("NP", "PP"))
    conll_tree.pprint()

    # Demonstrate CoNLL output
    print("CoNLL output:")
    print(nltk.chunk.tree2conllstr(conll_tree))
    print()


if __name__ == "__main__":
    demo()
|
||||
Reference in New Issue
Block a user