updates
102  Backend/venv/lib/python3.12/site-packages/nltk/parse/__init__.py  Normal file
@@ -0,0 +1,102 @@
# Natural Language Toolkit: Parsers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#

"""
NLTK Parsers

Classes and interfaces for producing tree structures that represent
the internal organization of a text. This task is known as "parsing"
the text, and the resulting tree structures are called the text's
"parses". Typically, the text is a single sentence, and the tree
structure represents the syntactic structure of the sentence.
However, parsers can also be used in other domains. For example,
parsers can be used to derive the morphological structure of the
morphemes that make up a word, or to derive the discourse structure
for a set of utterances.

Sometimes, a single piece of text can be represented by more than one
tree structure. Texts represented by more than one tree structure are
called "ambiguous" texts. Note that there are actually two ways in
which a text can be ambiguous:

    - The text has multiple correct parses.
    - There is not enough information to decide which of several
      candidate parses is correct.

However, the parser module does *not* distinguish these two types of
ambiguity.

The parser module defines ``ParserI``, a standard interface for parsing
texts; and two simple implementations of that interface,
``ShiftReduceParser`` and ``RecursiveDescentParser``. It also contains
three sub-modules for specialized kinds of parsing:

  - ``nltk.parse.chart`` defines chart parsing, which uses dynamic
    programming to efficiently parse texts.
  - ``nltk.parse.pchart`` defines probabilistic parsing, which
    associates a probability with each parse.
"""

from nltk.parse.api import ParserI
from nltk.parse.bllip import BllipParser
from nltk.parse.chart import (
    BottomUpChartParser,
    BottomUpLeftCornerChartParser,
    ChartParser,
    LeftCornerChartParser,
    SteppingChartParser,
    TopDownChartParser,
)
from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser
from nltk.parse.dependencygraph import DependencyGraph
from nltk.parse.earleychart import (
    EarleyChartParser,
    FeatureEarleyChartParser,
    FeatureIncrementalBottomUpChartParser,
    FeatureIncrementalBottomUpLeftCornerChartParser,
    FeatureIncrementalChartParser,
    FeatureIncrementalTopDownChartParser,
    IncrementalBottomUpChartParser,
    IncrementalBottomUpLeftCornerChartParser,
    IncrementalChartParser,
    IncrementalLeftCornerChartParser,
    IncrementalTopDownChartParser,
)
from nltk.parse.evaluate import DependencyEvaluator
from nltk.parse.featurechart import (
    FeatureBottomUpChartParser,
    FeatureBottomUpLeftCornerChartParser,
    FeatureChartParser,
    FeatureTopDownChartParser,
)
from nltk.parse.malt import MaltParser
from nltk.parse.nonprojectivedependencyparser import (
    NaiveBayesDependencyScorer,
    NonprojectiveDependencyParser,
    ProbabilisticNonprojectiveParser,
)
from nltk.parse.pchart import (
    BottomUpProbabilisticChartParser,
    InsideChartParser,
    LongestChartParser,
    RandomChartParser,
    UnsortedChartParser,
)
from nltk.parse.projectivedependencyparser import (
    ProbabilisticProjectiveDependencyParser,
    ProjectiveDependencyParser,
)
from nltk.parse.recursivedescent import (
    RecursiveDescentParser,
    SteppingRecursiveDescentParser,
)
from nltk.parse.shiftreduce import ShiftReduceParser, SteppingShiftReduceParser
from nltk.parse.transitionparser import TransitionParser
from nltk.parse.util import TestGrammar, extract_test_sentences, load_parser
from nltk.parse.viterbi import ViterbiParser
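As a quick illustration of the chart parsers exported above (a minimal sketch with a toy grammar and sentence, not part of this commit; any CFG works the same way):

    import nltk

    # Toy grammar covering one sentence; ChartParser is re-exported as nltk.ChartParser.
    grammar = nltk.CFG.fromstring("""
        S -> NP VP
        NP -> Det N
        VP -> V NP
        Det -> 'the'
        N -> 'dog' | 'cat'
        V -> 'chased'
    """)

    parser = nltk.ChartParser(grammar)
    for tree in parser.parse("the dog chased the cat".split()):
        tree.pretty_print()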
20 binary files not shown.
72  Backend/venv/lib/python3.12/site-packages/nltk/parse/api.py  Normal file
@@ -0,0 +1,72 @@
# Natural Language Toolkit: Parser API
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#

import itertools

from nltk.internals import overridden


class ParserI:
    """
    A processing class for deriving trees that represent possible
    structures for a sequence of tokens. These tree structures are
    known as "parses". Typically, parsers are used to derive syntax
    trees for sentences. But parsers can also be used to derive other
    kinds of tree structure, such as morphological trees and discourse
    structures.

    Subclasses must define:
      - at least one of: ``parse()``, ``parse_sents()``.

    Subclasses may define:
      - ``grammar()``
    """

    def grammar(self):
        """
        :return: The grammar used by this parser.
        """
        raise NotImplementedError()

    def parse(self, sent, *args, **kwargs):
        """
        :return: An iterator that generates parse trees for the sentence.
            When possible this list is sorted from most likely to least likely.

        :param sent: The sentence to be parsed
        :type sent: list(str)
        :rtype: iter(Tree)
        """
        if overridden(self.parse_sents):
            return next(self.parse_sents([sent], *args, **kwargs))
        elif overridden(self.parse_one):
            return (
                tree
                for tree in [self.parse_one(sent, *args, **kwargs)]
                if tree is not None
            )
        elif overridden(self.parse_all):
            return iter(self.parse_all(sent, *args, **kwargs))
        else:
            raise NotImplementedError()

    def parse_sents(self, sents, *args, **kwargs):
        """
        Apply ``self.parse()`` to each element of ``sents``.
        :rtype: iter(iter(Tree))
        """
        return (self.parse(sent, *args, **kwargs) for sent in sents)

    def parse_all(self, sent, *args, **kwargs):
        """:rtype: list(Tree)"""
        return list(self.parse(sent, *args, **kwargs))

    def parse_one(self, sent, *args, **kwargs):
        """:rtype: Tree or None"""
        return next(self.parse(sent, *args, **kwargs), None)
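``ParserI`` derives ``parse_sents()``, ``parse_all()`` and ``parse_one()`` from ``parse()``, so a subclass only needs to override one entry point. A minimal sketch (the ``EchoParser`` class is hypothetical, for illustration only):

    from nltk.parse.api import ParserI
    from nltk.tree import Tree

    class EchoParser(ParserI):
        """Toy parser that wraps every sentence in a flat (S ...) tree."""

        def parse(self, sent, *args, **kwargs):
            # Yield a single, trivially flat parse for the token list.
            yield Tree("S", list(sent))

    p = EchoParser()
    print(p.parse_one("a toy sentence".split()))        # (S a toy sentence)
    print(len(p.parse_all("a toy sentence".split())))   # 1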
299  Backend/venv/lib/python3.12/site-packages/nltk/parse/bllip.py  Normal file
@@ -0,0 +1,299 @@
# Natural Language Toolkit: Interface to BLLIP Parser
#
# Author: David McClosky <dmcc@bigasterisk.com>
#
# Copyright (C) 2001-2025 NLTK Project
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.parse.api import ParserI
from nltk.tree import Tree

"""
Interface for parsing with BLLIP Parser. Requires the Python
bllipparser module. BllipParser objects can be constructed with the
``BllipParser.from_unified_model_dir`` class method or manually using the
``BllipParser`` constructor. The former is generally easier if you have
a BLLIP Parser unified model directory -- a basic model can be obtained
from NLTK's downloader. More unified parsing models can be obtained with
BLLIP Parser's ModelFetcher (run ``python -m bllipparser.ModelFetcher``
or see docs for ``bllipparser.ModelFetcher.download_and_install_model``).

Basic usage::

    # download and install a basic unified parsing model (Wall Street Journal)
    # sudo python -m nltk.downloader bllip_wsj_no_aux

    >>> from nltk.data import find
    >>> model_dir = find('models/bllip_wsj_no_aux').path
    >>> bllip = BllipParser.from_unified_model_dir(model_dir)

    # 1-best parsing
    >>> sentence1 = 'British left waffles on Falklands .'.split()
    >>> top_parse = bllip.parse_one(sentence1)
    >>> print(top_parse)
    (S1
      (S
        (NP (JJ British) (NN left))
        (VP (VBZ waffles) (PP (IN on) (NP (NNP Falklands))))
        (. .)))

    # n-best parsing
    >>> sentence2 = 'Time flies'.split()
    >>> all_parses = bllip.parse_all(sentence2)
    >>> print(len(all_parses))
    50
    >>> print(all_parses[0])
    (S1 (S (NP (NNP Time)) (VP (VBZ flies))))

    # incorporating external tagging constraints (None means unconstrained tag)
    >>> constrained1 = bllip.tagged_parse([('Time', 'VB'), ('flies', 'NNS')])
    >>> print(next(constrained1))
    (S1 (NP (VB Time) (NNS flies)))
    >>> constrained2 = bllip.tagged_parse([('Time', 'NN'), ('flies', None)])
    >>> print(next(constrained2))
    (S1 (NP (NN Time) (VBZ flies)))

References
----------

- Charniak, Eugene. "A maximum-entropy-inspired parser." Proceedings of
  the 1st North American chapter of the Association for Computational
  Linguistics conference. Association for Computational Linguistics,
  2000.

- Charniak, Eugene, and Mark Johnson. "Coarse-to-fine n-best parsing
  and MaxEnt discriminative reranking." Proceedings of the 43rd Annual
  Meeting on Association for Computational Linguistics. Association
  for Computational Linguistics, 2005.

Known issues
------------

Note that BLLIP Parser is not currently threadsafe. Since this module
uses a SWIG interface, it is potentially unsafe to create multiple
``BllipParser`` objects in the same process. BLLIP Parser currently
has issues with non-ASCII text and will raise an error if given any.

See https://pypi.python.org/pypi/bllipparser/ for more information
on BLLIP Parser's Python interface.
"""

__all__ = ["BllipParser"]

# this block allows this module to be imported even if bllipparser isn't
# available
try:
    from bllipparser import RerankingParser
    from bllipparser.RerankingParser import get_unified_model_parameters

    def _ensure_bllip_import_or_error():
        pass

except ImportError as ie:

    def _ensure_bllip_import_or_error(ie=ie):
        raise ImportError("Couldn't import bllipparser module: %s" % ie)


def _ensure_ascii(words):
    try:
        for i, word in enumerate(words):
            word.encode("ascii")
    except UnicodeEncodeError as e:
        raise ValueError(
            f"Token {i} ({word!r}) is non-ASCII. BLLIP Parser "
            "currently doesn't support non-ASCII inputs."
        ) from e


def _scored_parse_to_nltk_tree(scored_parse):
    return Tree.fromstring(str(scored_parse.ptb_parse))


class BllipParser(ParserI):
    """
    Interface for parsing with BLLIP Parser. BllipParser objects can be
    constructed with the ``BllipParser.from_unified_model_dir`` class
    method or manually using the ``BllipParser`` constructor.
    """

    def __init__(
        self,
        parser_model=None,
        reranker_features=None,
        reranker_weights=None,
        parser_options=None,
        reranker_options=None,
    ):
        """
        Load a BLLIP Parser model from scratch. You'll typically want to
        use the ``from_unified_model_dir()`` class method to construct
        this object.

        :param parser_model: Path to parser model directory
        :type parser_model: str

        :param reranker_features: Path to the reranker model's features file
        :type reranker_features: str

        :param reranker_weights: Path to the reranker model's weights file
        :type reranker_weights: str

        :param parser_options: optional dictionary of parser options, see
            ``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
            for more information.
        :type parser_options: dict(str)

        :param reranker_options: optional
            dictionary of reranker options, see
            ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
            for more information.
        :type reranker_options: dict(str)
        """
        _ensure_bllip_import_or_error()

        parser_options = parser_options or {}
        reranker_options = reranker_options or {}

        self.rrp = RerankingParser()
        self.rrp.load_parser_model(parser_model, **parser_options)
        if reranker_features and reranker_weights:
            self.rrp.load_reranker_model(
                features_filename=reranker_features,
                weights_filename=reranker_weights,
                **reranker_options,
            )

    def parse(self, sentence):
        """
        Use BLLIP Parser to parse a sentence. Takes a sentence as a list
        of words; it will be automatically tagged with this BLLIP Parser
        instance's tagger.

        :return: An iterator that generates parse trees for the sentence
            from most likely to least likely.

        :param sentence: The sentence to be parsed
        :type sentence: list(str)
        :rtype: iter(Tree)
        """
        _ensure_ascii(sentence)
        nbest_list = self.rrp.parse(sentence)
        for scored_parse in nbest_list:
            yield _scored_parse_to_nltk_tree(scored_parse)

    def tagged_parse(self, word_and_tag_pairs):
        """
        Use BLLIP to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized
        and tagged. BLLIP will attempt to use the tags provided but may
        use others if it can't come up with a complete parse subject
        to those constraints. You may also specify a tag as ``None``
        to leave a token's tag unconstrained.

        :return: An iterator that generates parse trees for the sentence
            from most likely to least likely.

        :param sentence: Input sentence to parse as (word, tag) pairs
        :type sentence: list(tuple(str, str))
        :rtype: iter(Tree)
        """
        words = []
        tag_map = {}
        for i, (word, tag) in enumerate(word_and_tag_pairs):
            words.append(word)
            if tag is not None:
                tag_map[i] = tag

        _ensure_ascii(words)
        nbest_list = self.rrp.parse_tagged(words, tag_map)
        for scored_parse in nbest_list:
            yield _scored_parse_to_nltk_tree(scored_parse)

    @classmethod
    def from_unified_model_dir(
        cls, model_dir, parser_options=None, reranker_options=None
    ):
        """
        Create a ``BllipParser`` object from a unified parsing model
        directory. Unified parsing model directories are a standardized
        way of storing BLLIP parser and reranker models together on disk.
        See ``bllipparser.RerankingParser.get_unified_model_parameters()``
        for more information about unified model directories.

        :return: A ``BllipParser`` object using the parser and reranker
            models in the model directory.

        :param model_dir: Path to the unified model directory.
        :type model_dir: str
        :param parser_options: optional dictionary of parser options, see
            ``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
            for more information.
        :type parser_options: dict(str)
        :param reranker_options: optional dictionary of reranker options, see
            ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
            for more information.
        :type reranker_options: dict(str)
        :rtype: BllipParser
        """
        (
            parser_model_dir,
            reranker_features_filename,
            reranker_weights_filename,
        ) = get_unified_model_parameters(model_dir)
        return cls(
            parser_model_dir,
            reranker_features_filename,
            reranker_weights_filename,
            parser_options,
            reranker_options,
        )


def demo():
    """This assumes the Python module bllipparser is installed."""

    # download and install a basic unified parsing model (Wall Street Journal)
    # sudo python -m nltk.downloader bllip_wsj_no_aux

    from nltk.data import find

    model_dir = find("models/bllip_wsj_no_aux").path

    print("Loading BLLIP Parsing models...")
    # the easiest way to get started is to use a unified model
    bllip = BllipParser.from_unified_model_dir(model_dir)
    print("Done.")

    sentence1 = "British left waffles on Falklands .".split()
    sentence2 = "I saw the man with the telescope .".split()
    # this sentence is known to fail under the WSJ parsing model
    fail1 = "# ! ? : -".split()
    for sentence in (sentence1, sentence2, fail1):
        print("Sentence: %r" % " ".join(sentence))
        try:
            tree = next(bllip.parse(sentence))
            print(tree)
        except StopIteration:
            print("(parse failed)")

    # n-best parsing demo
    for i, parse in enumerate(bllip.parse(sentence1)):
        print("parse %d:\n%s" % (i, parse))

    # using external POS tag constraints
    print(
        "forcing 'tree' to be 'NN':",
        next(bllip.tagged_parse([("A", None), ("tree", "NN")])),
    )
    print(
        "forcing 'A' to be 'DT' and 'tree' to be 'NNP':",
        next(bllip.tagged_parse([("A", "DT"), ("tree", "NNP")])),
    )
    # constraints don't have to make sense... (though on more complicated
    # sentences, they may cause the parse to fail)
    print(
        "forcing 'A' to be 'NNP':",
        next(bllip.tagged_parse([("A", "NNP"), ("tree", None)])),
    )
1848  Backend/venv/lib/python3.12/site-packages/nltk/parse/chart.py  Normal file
File diff suppressed because it is too large
805  Backend/venv/lib/python3.12/site-packages/nltk/parse/corenlp.py  Normal file
@@ -0,0 +1,805 @@
|
||||
# Natural Language Toolkit: Interface to the CoreNLP REST API.
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Dmitrijs Milajevs <dimazest@gmail.com>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import socket
|
||||
import time
|
||||
from typing import List, Tuple
|
||||
|
||||
from nltk.internals import _java_options, config_java, find_jar_iter, java
|
||||
from nltk.parse.api import ParserI
|
||||
from nltk.parse.dependencygraph import DependencyGraph
|
||||
from nltk.tag.api import TaggerI
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
from nltk.tree import Tree
|
||||
|
||||
_stanford_url = "https://stanfordnlp.github.io/CoreNLP/"
|
||||
|
||||
|
||||
class CoreNLPServerError(EnvironmentError):
|
||||
"""Exceptions associated with the Core NLP server."""
|
||||
|
||||
|
||||
def try_port(port=0):
|
||||
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
sock.bind(("", port))
|
||||
|
||||
p = sock.getsockname()[1]
|
||||
sock.close()
|
||||
|
||||
return p
|
||||
|
||||
|
||||
class CoreNLPServer:
|
||||
_MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar"
|
||||
_JAR = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path_to_jar=None,
|
||||
path_to_models_jar=None,
|
||||
verbose=False,
|
||||
java_options=None,
|
||||
corenlp_options=None,
|
||||
port=None,
|
||||
):
|
||||
if corenlp_options is None:
|
||||
corenlp_options = ["-preload", "tokenize,ssplit,pos,lemma,parse,depparse"]
|
||||
|
||||
jars = list(
|
||||
find_jar_iter(
|
||||
self._JAR,
|
||||
path_to_jar,
|
||||
env_vars=("CORENLP",),
|
||||
searchpath=(),
|
||||
url=_stanford_url,
|
||||
verbose=verbose,
|
||||
is_regex=True,
|
||||
)
|
||||
)
|
||||
|
||||
# find the most recent code and model jar
|
||||
stanford_jar = max(jars, key=lambda model_name: re.match(self._JAR, model_name))
|
||||
|
||||
if port is None:
|
||||
try:
|
||||
port = try_port(9000)
|
||||
except OSError:
|
||||
port = try_port()
|
||||
corenlp_options.extend(["-port", str(port)])
|
||||
else:
|
||||
try_port(port)
|
||||
corenlp_options.extend(["-port", str(port)])
|
||||
|
||||
self.url = f"http://localhost:{port}"
|
||||
|
||||
model_jar = max(
|
||||
find_jar_iter(
|
||||
self._MODEL_JAR_PATTERN,
|
||||
path_to_models_jar,
|
||||
env_vars=("CORENLP_MODELS",),
|
||||
searchpath=(),
|
||||
url=_stanford_url,
|
||||
verbose=verbose,
|
||||
is_regex=True,
|
||||
),
|
||||
key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name),
|
||||
)
|
||||
|
||||
self.verbose = verbose
|
||||
|
||||
self._classpath = stanford_jar, model_jar
|
||||
|
||||
self.corenlp_options = corenlp_options
|
||||
self.java_options = java_options or ["-mx2g"]
|
||||
|
||||
def start(self, stdout="devnull", stderr="devnull"):
|
||||
"""Starts the CoreNLP server
|
||||
|
||||
:param stdout, stderr: Specifies where CoreNLP output is redirected. Valid values are 'devnull', 'stdout', 'pipe'
|
||||
"""
|
||||
import requests
|
||||
|
||||
cmd = ["edu.stanford.nlp.pipeline.StanfordCoreNLPServer"]
|
||||
|
||||
if self.corenlp_options:
|
||||
cmd.extend(self.corenlp_options)
|
||||
|
||||
# Configure java.
|
||||
default_options = " ".join(_java_options)
|
||||
config_java(options=self.java_options, verbose=self.verbose)
|
||||
|
||||
try:
|
||||
self.popen = java(
|
||||
cmd,
|
||||
classpath=self._classpath,
|
||||
blocking=False,
|
||||
stdout=stdout,
|
||||
stderr=stderr,
|
||||
)
|
||||
finally:
|
||||
# Return java configurations to their default values.
|
||||
config_java(options=default_options, verbose=self.verbose)
|
||||
|
||||
# Check that the server is still running.
|
||||
returncode = self.popen.poll()
|
||||
if returncode is not None:
|
||||
_, stderrdata = self.popen.communicate()
|
||||
raise CoreNLPServerError(
|
||||
returncode,
|
||||
"Could not start the server. "
|
||||
"The error was: {}".format(stderrdata.decode("ascii")),
|
||||
)
|
||||
|
||||
for i in range(30):
|
||||
try:
|
||||
response = requests.get(requests.compat.urljoin(self.url, "live"))
|
||||
except requests.exceptions.ConnectionError:
|
||||
time.sleep(1)
|
||||
else:
|
||||
if response.ok:
|
||||
break
|
||||
else:
|
||||
raise CoreNLPServerError("Could not connect to the server.")
|
||||
|
||||
for i in range(60):
|
||||
try:
|
||||
response = requests.get(requests.compat.urljoin(self.url, "ready"))
|
||||
except requests.exceptions.ConnectionError:
|
||||
time.sleep(1)
|
||||
else:
|
||||
if response.ok:
|
||||
break
|
||||
else:
|
||||
raise CoreNLPServerError("The server is not ready.")
|
||||
|
||||
def stop(self):
|
||||
self.popen.terminate()
|
||||
self.popen.wait()
|
||||
|
||||
def __enter__(self):
|
||||
self.start()
|
||||
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.stop()
|
||||
return False
|
||||
|
||||
|
||||
class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
|
||||
"""Interface to the CoreNLP Parser."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
url="http://localhost:9000",
|
||||
encoding="utf8",
|
||||
tagtype=None,
|
||||
strict_json=True,
|
||||
):
|
||||
import requests
|
||||
|
||||
self.url = url
|
||||
self.encoding = encoding
|
||||
|
||||
if tagtype not in ["pos", "ner", None]:
|
||||
raise ValueError("tagtype must be either 'pos', 'ner' or None")
|
||||
|
||||
self.tagtype = tagtype
|
||||
self.strict_json = strict_json
|
||||
|
||||
self.session = requests.Session()
|
||||
|
||||
def parse_sents(self, sentences, *args, **kwargs):
|
||||
"""Parse multiple sentences.
|
||||
|
||||
Takes multiple sentences as a list where each sentence is a list of
|
||||
words. Each sentence will be automatically tagged with this
|
||||
CoreNLPParser instance's tagger.
|
||||
|
||||
If a whitespace exists inside a token, then the token will be treated as
|
||||
several tokens.
|
||||
|
||||
:param sentences: Input sentences to parse
|
||||
:type sentences: list(list(str))
|
||||
:rtype: iter(iter(Tree))
|
||||
"""
|
||||
# Converting list(list(str)) -> list(str)
|
||||
sentences = (" ".join(words) for words in sentences)
|
||||
return self.raw_parse_sents(sentences, *args, **kwargs)
|
||||
|
||||
def raw_parse(self, sentence, properties=None, *args, **kwargs):
|
||||
"""Parse a sentence.
|
||||
|
||||
Takes a sentence as a string; before parsing, it will be automatically
|
||||
tokenized and tagged by the CoreNLP Parser.
|
||||
|
||||
:param sentence: Input sentence to parse
|
||||
:type sentence: str
|
||||
:rtype: iter(Tree)
|
||||
"""
|
||||
default_properties = {"tokenize.whitespace": "false"}
|
||||
default_properties.update(properties or {})
|
||||
|
||||
return next(
|
||||
self.raw_parse_sents(
|
||||
[sentence], properties=default_properties, *args, **kwargs
|
||||
)
|
||||
)
|
||||
|
||||
def api_call(self, data, properties=None, timeout=60):
|
||||
default_properties = {
|
||||
"outputFormat": "json",
|
||||
"annotators": "tokenize,pos,lemma,ssplit,{parser_annotator}".format(
|
||||
parser_annotator=self.parser_annotator
|
||||
),
|
||||
}
|
||||
|
||||
default_properties.update(properties or {})
|
||||
|
||||
response = self.session.post(
|
||||
self.url,
|
||||
params={"properties": json.dumps(default_properties)},
|
||||
data=data.encode(self.encoding),
|
||||
headers={"Content-Type": f"text/plain; charset={self.encoding}"},
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
response.raise_for_status()
|
||||
|
||||
return response.json(strict=self.strict_json)
|
||||
|
||||
def raw_parse_sents(
|
||||
self, sentences, verbose=False, properties=None, *args, **kwargs
|
||||
):
|
||||
"""Parse multiple sentences.
|
||||
|
||||
Takes multiple sentences as a list of strings. Each sentence will be
|
||||
automatically tokenized and tagged.
|
||||
|
||||
:param sentences: Input sentences to parse.
|
||||
:type sentences: list(str)
|
||||
:rtype: iter(iter(Tree))
|
||||
|
||||
"""
|
||||
default_properties = {
|
||||
# Only splits on '\n', never inside the sentence.
|
||||
"ssplit.eolonly": "true"
|
||||
}
|
||||
|
||||
default_properties.update(properties or {})
|
||||
|
||||
"""
|
||||
for sentence in sentences:
|
||||
parsed_data = self.api_call(sentence, properties=default_properties)
|
||||
|
||||
assert len(parsed_data['sentences']) == 1
|
||||
|
||||
for parse in parsed_data['sentences']:
|
||||
tree = self.make_tree(parse)
|
||||
yield iter([tree])
|
||||
"""
|
||||
parsed_data = self.api_call("\n".join(sentences), properties=default_properties)
|
||||
for parsed_sent in parsed_data["sentences"]:
|
||||
tree = self.make_tree(parsed_sent)
|
||||
yield iter([tree])
|
||||
|
||||
def parse_text(self, text, *args, **kwargs):
|
||||
"""Parse a piece of text.
|
||||
|
||||
The text might contain several sentences which will be split by CoreNLP.
|
||||
|
||||
:param str text: text to be split.
|
||||
:returns: an iterable of syntactic structures. # TODO: should it be an iterable of iterables?
|
||||
|
||||
"""
|
||||
parsed_data = self.api_call(text, *args, **kwargs)
|
||||
|
||||
for parse in parsed_data["sentences"]:
|
||||
yield self.make_tree(parse)
|
||||
|
||||
def tokenize(self, text, properties=None):
|
||||
"""Tokenize a string of text.
|
||||
|
||||
Skip these tests if CoreNLP is likely not ready.
|
||||
>>> from nltk.test.setup_fixt import check_jar
|
||||
>>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True)
|
||||
|
||||
The CoreNLP server can be started using the following notation, although
|
||||
we recommend the `with CoreNLPServer() as server:` context manager notation
|
||||
to ensure that the server is always stopped.
|
||||
>>> server = CoreNLPServer()
|
||||
>>> server.start()
|
||||
>>> parser = CoreNLPParser(url=server.url)
|
||||
|
||||
>>> text = 'Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.'
|
||||
>>> list(parser.tokenize(text))
|
||||
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
|
||||
|
||||
>>> s = "The colour of the wall is blue."
|
||||
>>> list(
|
||||
... parser.tokenize(
|
||||
... 'The colour of the wall is blue.',
|
||||
... properties={'tokenize.options': 'americanize=true'},
|
||||
... )
|
||||
... )
|
||||
['The', 'colour', 'of', 'the', 'wall', 'is', 'blue', '.']
|
||||
>>> server.stop()
|
||||
|
||||
"""
|
||||
default_properties = {"annotators": "tokenize,ssplit"}
|
||||
|
||||
default_properties.update(properties or {})
|
||||
|
||||
result = self.api_call(text, properties=default_properties)
|
||||
|
||||
for sentence in result["sentences"]:
|
||||
for token in sentence["tokens"]:
|
||||
yield token["originalText"] or token["word"]
|
||||
|
||||
def tag_sents(self, sentences, properties=None):
|
||||
"""
|
||||
Tag multiple sentences.
|
||||
|
||||
Takes multiple sentences as a list where each sentence is a list of
|
||||
tokens.
|
||||
|
||||
:param sentences: Input sentences to tag
|
||||
:type sentences: list(list(str))
|
||||
:rtype: list(list(tuple(str, str))
|
||||
"""
|
||||
|
||||
# Converting list(list(str)) -> list(str)
|
||||
sentences = (" ".join(words) for words in sentences)
|
||||
|
||||
if properties is None:
|
||||
properties = {"tokenize.whitespace": "true", "ner.useSUTime": "false"}
|
||||
|
||||
return [sentences[0] for sentences in self.raw_tag_sents(sentences, properties)]
|
||||
|
||||
def tag(self, sentence: str, properties=None) -> List[Tuple[str, str]]:
|
||||
"""
|
||||
Tag a list of tokens.
|
||||
|
||||
:rtype: list(tuple(str, str))
|
||||
|
||||
Skip these tests if CoreNLP is likely not ready.
|
||||
>>> from nltk.test.setup_fixt import check_jar
|
||||
>>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True)
|
||||
|
||||
The CoreNLP server can be started using the following notation, although
|
||||
we recommend the `with CoreNLPServer() as server:` context manager notation
|
||||
to ensure that the server is always stopped.
|
||||
>>> server = CoreNLPServer()
|
||||
>>> server.start()
|
||||
>>> parser = CoreNLPParser(url=server.url, tagtype='ner')
|
||||
>>> tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
|
||||
>>> parser.tag(tokens) # doctest: +NORMALIZE_WHITESPACE
|
||||
[('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'),
|
||||
('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'STATE_OR_PROVINCE')]
|
||||
|
||||
>>> parser = CoreNLPParser(url=server.url, tagtype='pos')
|
||||
>>> tokens = "What is the airspeed of an unladen swallow ?".split()
|
||||
>>> parser.tag(tokens) # doctest: +NORMALIZE_WHITESPACE
|
||||
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
|
||||
('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'),
|
||||
('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
|
||||
>>> server.stop()
|
||||
"""
|
||||
return self.tag_sents([sentence], properties)[0]
|
||||
|
||||
def raw_tag_sents(self, sentences, properties=None):
|
||||
"""
|
||||
Tag multiple sentences.
|
||||
|
||||
Takes multiple sentences as a list where each sentence is a string.
|
||||
|
||||
:param sentences: Input sentences to tag
|
||||
:type sentences: list(str)
|
||||
:rtype: list(list(list(tuple(str, str)))
|
||||
"""
|
||||
default_properties = {
|
||||
"ssplit.isOneSentence": "true",
|
||||
"annotators": "tokenize,ssplit,",
|
||||
}
|
||||
default_properties.update(properties or {})
|
||||
|
||||
# Supports only 'pos' or 'ner' tags.
|
||||
assert self.tagtype in [
|
||||
"pos",
|
||||
"ner",
|
||||
], "CoreNLP tagger supports only 'pos' or 'ner' tags."
|
||||
default_properties["annotators"] += self.tagtype
|
||||
for sentence in sentences:
|
||||
tagged_data = self.api_call(sentence, properties=default_properties)
|
||||
yield [
|
||||
[
|
||||
(token["word"], token[self.tagtype])
|
||||
for token in tagged_sentence["tokens"]
|
||||
]
|
||||
for tagged_sentence in tagged_data["sentences"]
|
||||
]
|
||||
|
||||
|
||||
class CoreNLPParser(GenericCoreNLPParser):
|
||||
"""
|
||||
Skip these tests if CoreNLP is likely not ready.
|
||||
>>> from nltk.test.setup_fixt import check_jar
|
||||
>>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True)
|
||||
|
||||
The recommended usage of `CoreNLPParser` is using the context manager notation:
|
||||
>>> with CoreNLPServer() as server:
|
||||
... parser = CoreNLPParser(url=server.url)
|
||||
... next(
|
||||
... parser.raw_parse('The quick brown fox jumps over the lazy dog.')
|
||||
... ).pretty_print() # doctest: +NORMALIZE_WHITESPACE
|
||||
ROOT
|
||||
|
|
||||
S
|
||||
_______________|__________________________
|
||||
| VP |
|
||||
| _________|___ |
|
||||
| | PP |
|
||||
| | ________|___ |
|
||||
NP | | NP |
|
||||
____|__________ | | _______|____ |
|
||||
DT JJ JJ NN VBZ IN DT JJ NN .
|
||||
| | | | | | | | | |
|
||||
The quick brown fox jumps over the lazy dog .
|
||||
|
||||
Alternatively, the server can be started using the following notation.
|
||||
Note that `CoreNLPServer` does not need to be used if the CoreNLP server is started
|
||||
outside of Python.
|
||||
>>> server = CoreNLPServer()
|
||||
>>> server.start()
|
||||
>>> parser = CoreNLPParser(url=server.url)
|
||||
|
||||
>>> (parse_fox, ), (parse_wolf, ) = parser.raw_parse_sents(
|
||||
... [
|
||||
... 'The quick brown fox jumps over the lazy dog.',
|
||||
... 'The quick grey wolf jumps over the lazy fox.',
|
||||
... ]
|
||||
... )
|
||||
|
||||
>>> parse_fox.pretty_print() # doctest: +NORMALIZE_WHITESPACE
|
||||
ROOT
|
||||
|
|
||||
S
|
||||
_______________|__________________________
|
||||
| VP |
|
||||
| _________|___ |
|
||||
| | PP |
|
||||
| | ________|___ |
|
||||
NP | | NP |
|
||||
____|__________ | | _______|____ |
|
||||
DT JJ JJ NN VBZ IN DT JJ NN .
|
||||
| | | | | | | | | |
|
||||
The quick brown fox jumps over the lazy dog .
|
||||
|
||||
>>> parse_wolf.pretty_print() # doctest: +NORMALIZE_WHITESPACE
|
||||
ROOT
|
||||
|
|
||||
S
|
||||
_______________|__________________________
|
||||
| VP |
|
||||
| _________|___ |
|
||||
| | PP |
|
||||
| | ________|___ |
|
||||
NP | | NP |
|
||||
____|_________ | | _______|____ |
|
||||
DT JJ JJ NN VBZ IN DT JJ NN .
|
||||
| | | | | | | | | |
|
||||
The quick grey wolf jumps over the lazy fox .
|
||||
|
||||
>>> (parse_dog, ), (parse_friends, ) = parser.parse_sents(
|
||||
... [
|
||||
... "I 'm a dog".split(),
|
||||
... "This is my friends ' cat ( the tabby )".split(),
|
||||
... ]
|
||||
... )
|
||||
|
||||
>>> parse_dog.pretty_print() # doctest: +NORMALIZE_WHITESPACE
|
||||
ROOT
|
||||
|
|
||||
S
|
||||
_______|____
|
||||
| VP
|
||||
| ________|___
|
||||
NP | NP
|
||||
| | ___|___
|
||||
PRP VBP DT NN
|
||||
| | | |
|
||||
I 'm a dog
|
||||
|
||||
>>> parse_friends.pretty_print() # doctest: +NORMALIZE_WHITESPACE
|
||||
ROOT
|
||||
|
|
||||
S
|
||||
____|___________
|
||||
| VP
|
||||
| ___________|_____________
|
||||
| | NP
|
||||
| | _______|________________________
|
||||
| | NP | | |
|
||||
| | _____|_______ | | |
|
||||
NP | NP | | NP |
|
||||
| | ______|_________ | | ___|____ |
|
||||
DT VBZ PRP$ NNS POS NN -LRB- DT NN -RRB-
|
||||
| | | | | | | | | |
|
||||
This is my friends ' cat -LRB- the tabby -RRB-
|
||||
|
||||
>>> parse_john, parse_mary, = parser.parse_text(
|
||||
... 'John loves Mary. Mary walks.'
|
||||
... )
|
||||
|
||||
>>> parse_john.pretty_print() # doctest: +NORMALIZE_WHITESPACE
|
||||
ROOT
|
||||
|
|
||||
S
|
||||
_____|_____________
|
||||
| VP |
|
||||
| ____|___ |
|
||||
NP | NP |
|
||||
| | | |
|
||||
NNP VBZ NNP .
|
||||
| | | |
|
||||
John loves Mary .
|
||||
|
||||
>>> parse_mary.pretty_print() # doctest: +NORMALIZE_WHITESPACE
|
||||
ROOT
|
||||
|
|
||||
S
|
||||
_____|____
|
||||
NP VP |
|
||||
| | |
|
||||
NNP VBZ .
|
||||
| | |
|
||||
Mary walks .
|
||||
|
||||
Special cases
|
||||
|
||||
>>> next(
|
||||
... parser.raw_parse(
|
||||
... 'NASIRIYA, Iraq—Iraqi doctors who treated former prisoner of war '
|
||||
... 'Jessica Lynch have angrily dismissed claims made in her biography '
|
||||
... 'that she was raped by her Iraqi captors.'
|
||||
... )
|
||||
... ).height()
|
||||
14
|
||||
|
||||
>>> next(
|
||||
... parser.raw_parse(
|
||||
... "The broader Standard & Poor's 500 Index <.SPX> was 0.46 points lower, or "
|
||||
... '0.05 percent, at 997.02.'
|
||||
... )
|
||||
... ).height()
|
||||
11
|
||||
|
||||
>>> server.stop()
|
||||
"""
|
||||
|
||||
_OUTPUT_FORMAT = "penn"
|
||||
parser_annotator = "parse"
|
||||
|
||||
def make_tree(self, result):
|
||||
return Tree.fromstring(result["parse"])
|
||||
|
||||
|
||||
class CoreNLPDependencyParser(GenericCoreNLPParser):
|
||||
"""Dependency parser.
|
||||
|
||||
Skip these tests if CoreNLP is likely not ready.
|
||||
>>> from nltk.test.setup_fixt import check_jar
|
||||
>>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True)
|
||||
|
||||
The recommended usage of `CoreNLPParser` is using the context manager notation:
|
||||
>>> with CoreNLPServer() as server:
|
||||
... dep_parser = CoreNLPDependencyParser(url=server.url)
|
||||
... parse, = dep_parser.raw_parse(
|
||||
... 'The quick brown fox jumps over the lazy dog.'
|
||||
... )
|
||||
... print(parse.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
|
||||
The DT 4 det
|
||||
quick JJ 4 amod
|
||||
brown JJ 4 amod
|
||||
fox NN 5 nsubj
|
||||
jumps VBZ 0 ROOT
|
||||
over IN 9 case
|
||||
the DT 9 det
|
||||
lazy JJ 9 amod
|
||||
dog NN 5 obl
|
||||
. . 5 punct
|
||||
|
||||
Alternatively, the server can be started using the following notation.
|
||||
Note that `CoreNLPServer` does not need to be used if the CoreNLP server is started
|
||||
outside of Python.
|
||||
>>> server = CoreNLPServer()
|
||||
>>> server.start()
|
||||
>>> dep_parser = CoreNLPDependencyParser(url=server.url)
|
||||
>>> parse, = dep_parser.raw_parse('The quick brown fox jumps over the lazy dog.')
|
||||
>>> print(parse.tree()) # doctest: +NORMALIZE_WHITESPACE
|
||||
(jumps (fox The quick brown) (dog over the lazy) .)
|
||||
|
||||
>>> for governor, dep, dependent in parse.triples():
|
||||
... print(governor, dep, dependent) # doctest: +NORMALIZE_WHITESPACE
|
||||
('jumps', 'VBZ') nsubj ('fox', 'NN')
|
||||
('fox', 'NN') det ('The', 'DT')
|
||||
('fox', 'NN') amod ('quick', 'JJ')
|
||||
('fox', 'NN') amod ('brown', 'JJ')
|
||||
('jumps', 'VBZ') obl ('dog', 'NN')
|
||||
('dog', 'NN') case ('over', 'IN')
|
||||
('dog', 'NN') det ('the', 'DT')
|
||||
('dog', 'NN') amod ('lazy', 'JJ')
|
||||
('jumps', 'VBZ') punct ('.', '.')
|
||||
|
||||
>>> (parse_fox, ), (parse_dog, ) = dep_parser.raw_parse_sents(
|
||||
... [
|
||||
... 'The quick brown fox jumps over the lazy dog.',
|
||||
... 'The quick grey wolf jumps over the lazy fox.',
|
||||
... ]
|
||||
... )
|
||||
>>> print(parse_fox.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
|
||||
The DT 4 det
|
||||
quick JJ 4 amod
|
||||
brown JJ 4 amod
|
||||
fox NN 5 nsubj
|
||||
jumps VBZ 0 ROOT
|
||||
over IN 9 case
|
||||
the DT 9 det
|
||||
lazy JJ 9 amod
|
||||
dog NN 5 obl
|
||||
. . 5 punct
|
||||
|
||||
>>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
|
||||
The DT 4 det
|
||||
quick JJ 4 amod
|
||||
grey JJ 4 amod
|
||||
wolf NN 5 nsubj
|
||||
jumps VBZ 0 ROOT
|
||||
over IN 9 case
|
||||
the DT 9 det
|
||||
lazy JJ 9 amod
|
||||
fox NN 5 obl
|
||||
. . 5 punct
|
||||
|
||||
>>> (parse_dog, ), (parse_friends, ) = dep_parser.parse_sents(
|
||||
... [
|
||||
... "I 'm a dog".split(),
|
||||
... "This is my friends ' cat ( the tabby )".split(),
|
||||
... ]
|
||||
... )
|
||||
>>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
|
||||
I PRP 4 nsubj
|
||||
'm VBP 4 cop
|
||||
a DT 4 det
|
||||
dog NN 0 ROOT
|
||||
|
||||
>>> print(parse_friends.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
|
||||
This DT 6 nsubj
|
||||
is VBZ 6 cop
|
||||
my PRP$ 4 nmod:poss
|
||||
friends NNS 6 nmod:poss
|
||||
' POS 4 case
|
||||
cat NN 0 ROOT
|
||||
( -LRB- 9 punct
|
||||
the DT 9 det
|
||||
tabby NN 6 dep
|
||||
) -RRB- 9 punct
|
||||
|
||||
>>> parse_john, parse_mary, = dep_parser.parse_text(
|
||||
... 'John loves Mary. Mary walks.'
|
||||
... )
|
||||
|
||||
>>> print(parse_john.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
|
||||
John NNP 2 nsubj
|
||||
loves VBZ 0 ROOT
|
||||
Mary NNP 2 obj
|
||||
. . 2 punct
|
||||
|
||||
>>> print(parse_mary.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
|
||||
Mary NNP 2 nsubj
|
||||
walks VBZ 0 ROOT
|
||||
. . 2 punct
|
||||
|
||||
Special cases
|
||||
|
||||
Non-breaking space inside of a token.
|
||||
|
||||
>>> len(
|
||||
... next(
|
||||
... dep_parser.raw_parse(
|
||||
... 'Anhalt said children typically treat a 20-ounce soda bottle as one '
|
||||
... 'serving, while it actually contains 2 1/2 servings.'
|
||||
... )
|
||||
... ).nodes
|
||||
... )
|
||||
23
|
||||
|
||||
Phone numbers.
|
||||
|
||||
>>> len(
|
||||
... next(
|
||||
... dep_parser.raw_parse('This is not going to crash: 01 111 555.')
|
||||
... ).nodes
|
||||
... )
|
||||
10
|
||||
|
||||
>>> print(
|
||||
... next(
|
||||
... dep_parser.raw_parse('The underscore _ should not simply disappear.')
|
||||
... ).to_conll(4)
|
||||
... ) # doctest: +NORMALIZE_WHITESPACE
|
||||
The DT 2 det
|
||||
underscore NN 7 nsubj
|
||||
_ NFP 7 punct
|
||||
should MD 7 aux
|
||||
not RB 7 advmod
|
||||
simply RB 7 advmod
|
||||
disappear VB 0 ROOT
|
||||
. . 7 punct
|
||||
|
||||
>>> print(
|
||||
... next(
|
||||
... dep_parser.raw_parse(
|
||||
... 'for all of its insights into the dream world of teen life , and its electronic expression through '
|
||||
... 'cyber culture , the film gives no quarter to anyone seeking to pull a cohesive story out of its 2 '
|
||||
... '1/2-hour running time .'
|
||||
... )
|
||||
... ).to_conll(4)
|
||||
... ) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
|
||||
for IN 2 case
|
||||
all DT 24 obl
|
||||
of IN 5 case
|
||||
its PRP$ 5 nmod:poss
|
||||
insights NNS 2 nmod
|
||||
into IN 9 case
|
||||
the DT 9 det
|
||||
dream NN 9 compound
|
||||
world NN 5 nmod
|
||||
of IN 12 case
|
||||
teen NN 12 compound
|
||||
...
|
||||
|
||||
>>> server.stop()
|
||||
"""
|
||||
|
||||
_OUTPUT_FORMAT = "conll2007"
|
||||
parser_annotator = "depparse"
|
||||
|
||||
def make_tree(self, result):
|
||||
return DependencyGraph(
|
||||
(
|
||||
" ".join(n_items[1:]) # NLTK expects an iterable of strings...
|
||||
for n_items in sorted(transform(result))
|
||||
),
|
||||
cell_separator=" ", # To make sure that a non-breaking space is kept inside of a token.
|
||||
)
|
||||
|
||||
|
||||
def transform(sentence):
|
||||
for dependency in sentence["basicDependencies"]:
|
||||
dependent_index = dependency["dependent"]
|
||||
token = sentence["tokens"][dependent_index - 1]
|
||||
|
||||
# Return values that we don't know as '_'. Also, consider tag and ctag
|
||||
# to be equal.
|
||||
yield (
|
||||
dependent_index,
|
||||
"_",
|
||||
token["word"],
|
||||
token["lemma"],
|
||||
token["pos"],
|
||||
token["pos"],
|
||||
"_",
|
||||
str(dependency["governor"]),
|
||||
dependency["dep"],
|
||||
"_",
|
||||
"_",
|
||||
)
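``GenericCoreNLPParser.api_call()`` above is a thin wrapper around a single HTTP request to the CoreNLP server. A rough sketch of the same call made directly with ``requests`` (assumes a server is already listening on localhost:9000; the sentence is illustrative):

    import json
    import requests

    properties = {"annotators": "tokenize,ssplit,pos,depparse", "outputFormat": "json"}
    response = requests.post(
        "http://localhost:9000",
        params={"properties": json.dumps(properties)},
        data="The quick brown fox jumps over the lazy dog.".encode("utf8"),
        headers={"Content-Type": "text/plain; charset=utf8"},
        timeout=60,
    )
    response.raise_for_status()
    doc = response.json()

    # Each parsed sentence carries "tokens" and "basicDependencies"; transform()
    # above flattens these into CoNLL-like rows for DependencyGraph.
    for dep in doc["sentences"][0]["basicDependencies"]:
        print(dep["governor"], dep["dep"], dep["dependent"])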
799  Backend/venv/lib/python3.12/site-packages/nltk/parse/dependencygraph.py  Normal file
@@ -0,0 +1,799 @@
|
||||
# Natural Language Toolkit: Dependency Grammars
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Jason Narad <jason.narad@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com> (modifications)
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
|
||||
"""
|
||||
Tools for reading and writing dependency trees.
|
||||
The input is assumed to be in Malt-TAB format
|
||||
(https://stp.lingfil.uu.se/~nivre/research/MaltXML.html).
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import warnings
|
||||
from collections import defaultdict
|
||||
from itertools import chain
|
||||
from pprint import pformat
|
||||
|
||||
from nltk.internals import find_binary
|
||||
from nltk.tree import Tree
|
||||
|
||||
#################################################################
|
||||
# DependencyGraph Class
|
||||
#################################################################
|
||||
|
||||
|
||||
class DependencyGraph:
|
||||
"""
|
||||
A container for the nodes and labelled edges of a dependency structure.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
tree_str=None,
|
||||
cell_extractor=None,
|
||||
zero_based=False,
|
||||
cell_separator=None,
|
||||
top_relation_label="ROOT",
|
||||
):
|
||||
"""Dependency graph.
|
||||
|
||||
We place a dummy `TOP` node with the index 0, since the root node is
|
||||
often assigned 0 as its head. This also means that the indexing of the
|
||||
nodes corresponds directly to the Malt-TAB format, which starts at 1.
|
||||
|
||||
If zero-based is True, then Malt-TAB-like input with node numbers
|
||||
starting at 0 and the root node assigned -1 (as produced by, e.g.,
|
||||
zpar).
|
||||
|
||||
:param str cell_separator: the cell separator. If not provided, cells
|
||||
are split by whitespace.
|
||||
|
||||
:param str top_relation_label: the label by which the top relation is
|
||||
identified, for example, `ROOT`, `null` or `TOP`.
|
||||
"""
|
||||
self.nodes = defaultdict(
|
||||
lambda: {
|
||||
"address": None,
|
||||
"word": None,
|
||||
"lemma": None,
|
||||
"ctag": None,
|
||||
"tag": None,
|
||||
"feats": None,
|
||||
"head": None,
|
||||
"deps": defaultdict(list),
|
||||
"rel": None,
|
||||
}
|
||||
)
|
||||
|
||||
self.nodes[0].update({"ctag": "TOP", "tag": "TOP", "address": 0})
|
||||
|
||||
self.root = None
|
||||
|
||||
if tree_str:
|
||||
self._parse(
|
||||
tree_str,
|
||||
cell_extractor=cell_extractor,
|
||||
zero_based=zero_based,
|
||||
cell_separator=cell_separator,
|
||||
top_relation_label=top_relation_label,
|
||||
)
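# Illustrative sketch (not part of this file): the Malt-TAB/CoNLL input described
# in the docstring above can be passed straight to the constructor, e.g. a
# 4-column (word, tag, head, rel) string:
#
#     >>> dg = DependencyGraph('John NNP 2 nsubj\nloves VBZ 0 ROOT\nMary NNP 2 obj')
#     >>> print(dg.tree())
#     (loves John Mary)
#
# The dummy TOP node 0 picks up the ROOT arc, so dg.root is the 'loves' node.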
def remove_by_address(self, address):
|
||||
"""
|
||||
Removes the node with the given address. References
|
||||
to this node in others will still exist.
|
||||
"""
|
||||
del self.nodes[address]
|
||||
|
||||
def redirect_arcs(self, originals, redirect):
|
||||
"""
|
||||
Redirects arcs to any of the nodes in the originals list
|
||||
to the redirect node address.
|
||||
"""
|
||||
for node in self.nodes.values():
|
||||
new_deps = []
|
||||
for dep in node["deps"]:
|
||||
if dep in originals:
|
||||
new_deps.append(redirect)
|
||||
else:
|
||||
new_deps.append(dep)
|
||||
node["deps"] = new_deps
|
||||
|
||||
def add_arc(self, head_address, mod_address):
|
||||
"""
|
||||
Adds an arc from the node specified by head_address to the
|
||||
node specified by the mod address.
|
||||
"""
|
||||
relation = self.nodes[mod_address]["rel"]
|
||||
self.nodes[head_address]["deps"].setdefault(relation, [])
|
||||
self.nodes[head_address]["deps"][relation].append(mod_address)
|
||||
# self.nodes[head_address]['deps'].append(mod_address)
|
||||
|
||||
def connect_graph(self):
|
||||
"""
|
||||
Fully connects all non-root nodes. All nodes are set to be dependents
|
||||
of the root node.
|
||||
"""
|
||||
for node1 in self.nodes.values():
|
||||
for node2 in self.nodes.values():
|
||||
if node1["address"] != node2["address"] and node2["rel"] != "TOP":
|
||||
relation = node2["rel"]
|
||||
node1["deps"].setdefault(relation, [])
|
||||
node1["deps"][relation].append(node2["address"])
|
||||
# node1['deps'].append(node2['address'])
|
||||
|
||||
def get_by_address(self, node_address):
|
||||
"""Return the node with the given address."""
|
||||
return self.nodes[node_address]
|
||||
|
||||
def contains_address(self, node_address):
|
||||
"""
|
||||
Returns true if the graph contains a node with the given node
|
||||
address, false otherwise.
|
||||
"""
|
||||
return node_address in self.nodes
|
||||
|
||||
def to_dot(self):
|
||||
"""Return a dot representation suitable for using with Graphviz.
|
||||
|
||||
>>> dg = DependencyGraph(
|
||||
... 'John N 2\\n'
|
||||
... 'loves V 0\\n'
|
||||
... 'Mary N 2'
|
||||
... )
|
||||
>>> print(dg.to_dot())
|
||||
digraph G{
|
||||
edge [dir=forward]
|
||||
node [shape=plaintext]
|
||||
<BLANKLINE>
|
||||
0 [label="0 (None)"]
|
||||
0 -> 2 [label="ROOT"]
|
||||
1 [label="1 (John)"]
|
||||
2 [label="2 (loves)"]
|
||||
2 -> 1 [label=""]
|
||||
2 -> 3 [label=""]
|
||||
3 [label="3 (Mary)"]
|
||||
}
|
||||
|
||||
"""
|
||||
# Start the digraph specification
|
||||
s = "digraph G{\n"
|
||||
s += "edge [dir=forward]\n"
|
||||
s += "node [shape=plaintext]\n"
|
||||
|
||||
# Draw the remaining nodes
|
||||
for node in sorted(self.nodes.values(), key=lambda v: v["address"]):
|
||||
s += '\n{} [label="{} ({})"]'.format(
|
||||
node["address"],
|
||||
node["address"],
|
||||
node["word"],
|
||||
)
|
||||
for rel, deps in node["deps"].items():
|
||||
for dep in deps:
|
||||
if rel is not None:
|
||||
s += '\n{} -> {} [label="{}"]'.format(node["address"], dep, rel)
|
||||
else:
|
||||
s += "\n{} -> {} ".format(node["address"], dep)
|
||||
s += "\n}"
|
||||
|
||||
return s
|
||||
|
||||
def _repr_svg_(self):
|
||||
"""Show SVG representation of the transducer (IPython magic).
|
||||
>>> from nltk.test.setup_fixt import check_binary
|
||||
>>> check_binary('dot')
|
||||
>>> dg = DependencyGraph(
|
||||
... 'John N 2\\n'
|
||||
... 'loves V 0\\n'
|
||||
... 'Mary N 2'
|
||||
... )
|
||||
>>> dg._repr_svg_().split('\\n')[0]
|
||||
'<?xml version="1.0" encoding="UTF-8" standalone="no"?>'
|
||||
|
||||
"""
|
||||
dot_string = self.to_dot()
|
||||
return dot2img(dot_string)
|
||||
|
||||
def __str__(self):
|
||||
return pformat(self.nodes)
|
||||
|
||||
def __repr__(self):
|
||||
return f"<DependencyGraph with {len(self.nodes)} nodes>"
|
||||
|
||||
@staticmethod
|
||||
def load(
|
||||
filename, zero_based=False, cell_separator=None, top_relation_label="ROOT"
|
||||
):
|
||||
"""
|
||||
:param filename: a name of a file in Malt-TAB format
|
||||
:param zero_based: nodes in the input file are numbered starting from 0
|
||||
rather than 1 (as produced by, e.g., zpar)
|
||||
:param str cell_separator: the cell separator. If not provided, cells
|
||||
are split by whitespace.
|
||||
:param str top_relation_label: the label by which the top relation is
|
||||
identified, for example, `ROOT`, `null` or `TOP`.
|
||||
|
||||
:return: a list of DependencyGraphs
|
||||
|
||||
"""
|
||||
with open(filename) as infile:
|
||||
return [
|
||||
DependencyGraph(
|
||||
tree_str,
|
||||
zero_based=zero_based,
|
||||
cell_separator=cell_separator,
|
||||
top_relation_label=top_relation_label,
|
||||
)
|
||||
for tree_str in infile.read().split("\n\n")
|
||||
]
|
||||
|
||||
def left_children(self, node_index):
|
||||
"""
|
||||
Returns the number of left children under the node specified
|
||||
by the given address.
|
||||
"""
|
||||
children = chain.from_iterable(self.nodes[node_index]["deps"].values())
|
||||
index = self.nodes[node_index]["address"]
|
||||
return sum(1 for c in children if c < index)
|
||||
|
||||
def right_children(self, node_index):
|
||||
"""
|
||||
Returns the number of right children under the node specified
|
||||
by the given address.
|
||||
"""
|
||||
children = chain.from_iterable(self.nodes[node_index]["deps"].values())
|
||||
index = self.nodes[node_index]["address"]
|
||||
return sum(1 for c in children if c > index)
|
||||
|
||||
def add_node(self, node):
|
||||
if not self.contains_address(node["address"]):
|
||||
self.nodes[node["address"]].update(node)
|
||||
|
||||
def _parse(
|
||||
self,
|
||||
input_,
|
||||
cell_extractor=None,
|
||||
zero_based=False,
|
||||
cell_separator=None,
|
||||
top_relation_label="ROOT",
|
||||
):
|
||||
"""Parse a sentence.
|
||||
|
||||
:param extractor: a function that given a tuple of cells returns a
|
||||
7-tuple, where the values are ``word, lemma, ctag, tag, feats, head,
|
||||
rel``.
|
||||
|
||||
:param str cell_separator: the cell separator. If not provided, cells
|
||||
are split by whitespace.
|
||||
|
||||
:param str top_relation_label: the label by which the top relation is
|
||||
identified, for example, `ROOT`, `null` or `TOP`.
|
||||
|
||||
"""
|
||||
|
||||
def extract_3_cells(cells, index):
|
||||
word, tag, head = cells
|
||||
return index, word, word, tag, tag, "", head, ""
|
||||
|
||||
def extract_4_cells(cells, index):
|
||||
word, tag, head, rel = cells
|
||||
return index, word, word, tag, tag, "", head, rel
|
||||
|
||||
def extract_7_cells(cells, index):
|
||||
line_index, word, lemma, tag, _, head, rel = cells
|
||||
try:
|
||||
index = int(line_index)
|
||||
except ValueError:
|
||||
# index can't be parsed as an integer, use default
|
||||
pass
|
||||
return index, word, lemma, tag, tag, "", head, rel
|
||||
|
||||
def extract_10_cells(cells, index):
|
||||
line_index, word, lemma, ctag, tag, feats, head, rel, _, _ = cells
|
||||
try:
|
||||
index = int(line_index)
|
||||
except ValueError:
|
||||
# index can't be parsed as an integer, use default
|
||||
pass
|
||||
return index, word, lemma, ctag, tag, feats, head, rel
|
||||
|
||||
extractors = {
|
||||
3: extract_3_cells,
|
||||
4: extract_4_cells,
|
||||
7: extract_7_cells,
|
||||
10: extract_10_cells,
|
||||
}
|
||||
|
||||
if isinstance(input_, str):
|
||||
input_ = (line for line in input_.split("\n"))
|
||||
|
||||
lines = (l.rstrip() for l in input_)
|
||||
lines = (l for l in lines if l)
|
||||
|
||||
cell_number = None
|
||||
for index, line in enumerate(lines, start=1):
|
||||
cells = line.split(cell_separator)
|
||||
if cell_number is None:
|
||||
cell_number = len(cells)
|
||||
else:
|
||||
assert cell_number == len(cells)
|
||||
|
||||
if cell_extractor is None:
|
||||
try:
|
||||
cell_extractor = extractors[cell_number]
|
||||
except KeyError as e:
|
||||
raise ValueError(
|
||||
"Number of tab-delimited fields ({}) not supported by "
|
||||
"CoNLL(10) or Malt-Tab(4) format".format(cell_number)
|
||||
) from e
|
||||
|
||||
try:
|
||||
index, word, lemma, ctag, tag, feats, head, rel = cell_extractor(
|
||||
cells, index
|
||||
)
|
||||
except (TypeError, ValueError):
|
||||
# cell_extractor doesn't take 2 arguments or doesn't return 8
|
||||
# values; assume the cell_extractor is an older external
|
||||
# extractor and doesn't accept or return an index.
|
||||
word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells)
|
||||
|
||||
if head == "_":
|
||||
continue
|
||||
|
||||
head = int(head)
|
||||
if zero_based:
|
||||
head += 1
|
||||
|
||||
self.nodes[index].update(
|
||||
{
|
||||
"address": index,
|
||||
"word": word,
|
||||
"lemma": lemma,
|
||||
"ctag": ctag,
|
||||
"tag": tag,
|
||||
"feats": feats,
|
||||
"head": head,
|
||||
"rel": rel,
|
||||
}
|
||||
)
|
||||
|
||||
# Make sure that the fake root node has labeled dependencies.
|
||||
if (cell_number == 3) and (head == 0):
|
||||
rel = top_relation_label
|
||||
self.nodes[head]["deps"][rel].append(index)
|
||||
|
||||
if self.nodes[0]["deps"][top_relation_label]:
|
||||
root_address = self.nodes[0]["deps"][top_relation_label][0]
|
||||
self.root = self.nodes[root_address]
|
||||
self.top_relation_label = top_relation_label
|
||||
else:
|
||||
warnings.warn(
|
||||
"The graph doesn't contain a node " "that depends on the root element."
|
||||
)
|
||||
|
||||
def _word(self, node, filter=True):
|
||||
w = node["word"]
|
||||
if filter:
|
||||
if w != ",":
|
||||
return w
|
||||
return w
|
||||
|
||||
def _tree(self, i):
|
||||
"""Turn dependency graphs into NLTK trees.
|
||||
|
||||
:param int i: index of a node
|
||||
:return: either a word (if the indexed node is a leaf) or a ``Tree``.
|
||||
"""
|
||||
node = self.get_by_address(i)
|
||||
word = node["word"]
|
||||
deps = sorted(chain.from_iterable(node["deps"].values()))
|
||||
|
||||
if deps:
|
||||
return Tree(word, [self._tree(dep) for dep in deps])
|
||||
else:
|
||||
return word
|
||||
|
||||
def tree(self):
|
||||
"""
|
||||
Starting with the ``root`` node, build a dependency tree using the NLTK
|
||||
``Tree`` constructor. Dependency labels are omitted.
|
||||
"""
|
||||
node = self.root
|
||||
|
||||
word = node["word"]
|
||||
deps = sorted(chain.from_iterable(node["deps"].values()))
|
||||
return Tree(word, [self._tree(dep) for dep in deps])
|
||||
|
||||
def triples(self, node=None):
|
||||
"""
|
||||
Extract dependency triples of the form:
|
||||
((head word, head tag), rel, (dep word, dep tag))
|
||||
"""
|
||||
|
||||
if not node:
|
||||
node = self.root
|
||||
|
||||
head = (node["word"], node["ctag"])
|
||||
for i in sorted(chain.from_iterable(node["deps"].values())):
|
||||
dep = self.get_by_address(i)
|
||||
yield (head, dep["rel"], (dep["word"], dep["ctag"]))
|
||||
yield from self.triples(node=dep)
|
||||
|
||||
def _hd(self, i):
|
||||
try:
|
||||
return self.nodes[i]["head"]
|
||||
except IndexError:
|
||||
return None
|
||||
|
||||
def _rel(self, i):
|
||||
try:
|
||||
return self.nodes[i]["rel"]
|
||||
except IndexError:
|
||||
return None
|
||||
|
||||
    # Note: returns a list of node addresses forming a cycle, or False if the
    # graph is acyclic (the return type is inconsistent).
|
||||
def contains_cycle(self):
|
||||
"""Check whether there are cycles.
|
||||
|
||||
>>> dg = DependencyGraph(treebank_data)
|
||||
>>> dg.contains_cycle()
|
||||
False
|
||||
|
||||
>>> cyclic_dg = DependencyGraph()
|
||||
>>> top = {'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0}
|
||||
>>> child1 = {'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1}
|
||||
>>> child2 = {'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2}
|
||||
>>> child3 = {'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3}
|
||||
>>> child4 = {'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4}
|
||||
>>> cyclic_dg.nodes = {
|
||||
... 0: top,
|
||||
... 1: child1,
|
||||
... 2: child2,
|
||||
... 3: child3,
|
||||
... 4: child4,
|
||||
... }
|
||||
>>> cyclic_dg.root = top
|
||||
|
||||
>>> cyclic_dg.contains_cycle()
|
||||
[1, 2, 4, 3]
|
||||
|
||||
"""
|
||||
distances = {}
|
||||
|
||||
for node in self.nodes.values():
|
||||
for dep in node["deps"]:
|
||||
key = tuple([node["address"], dep])
|
||||
distances[key] = 1
|
||||
|
||||
for _ in self.nodes:
|
||||
new_entries = {}
|
||||
|
||||
for pair1 in distances:
|
||||
for pair2 in distances:
|
||||
if pair1[1] == pair2[0]:
|
||||
key = tuple([pair1[0], pair2[1]])
|
||||
new_entries[key] = distances[pair1] + distances[pair2]
|
||||
|
||||
for pair in new_entries:
|
||||
distances[pair] = new_entries[pair]
|
||||
if pair[0] == pair[1]:
|
||||
path = self.get_cycle_path(self.get_by_address(pair[0]), pair[0])
|
||||
return path
|
||||
|
||||
return False # return []?
|
||||
|
||||
def get_cycle_path(self, curr_node, goal_node_index):
|
||||
for dep in curr_node["deps"]:
|
||||
if dep == goal_node_index:
|
||||
return [curr_node["address"]]
|
||||
for dep in curr_node["deps"]:
|
||||
path = self.get_cycle_path(self.get_by_address(dep), goal_node_index)
|
||||
if len(path) > 0:
|
||||
path.insert(0, curr_node["address"])
|
||||
return path
|
||||
return []
|
||||
|
||||
def to_conll(self, style):
|
||||
"""
|
||||
The dependency graph in CoNLL format.
|
||||
|
||||
:param style: the style to use for the format (3, 4, 10 columns)
|
||||
:type style: int
|
||||
:rtype: str
|
||||
"""
|
||||
|
||||
if style == 3:
|
||||
template = "{word}\t{tag}\t{head}\n"
|
||||
elif style == 4:
|
||||
template = "{word}\t{tag}\t{head}\t{rel}\n"
|
||||
elif style == 10:
|
||||
template = (
|
||||
"{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n"
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Number of tab-delimited fields ({}) not supported by "
|
||||
"CoNLL(10) or Malt-Tab(4) format".format(style)
|
||||
)
|
||||
|
||||
return "".join(
|
||||
template.format(i=i, **node)
|
||||
for i, node in sorted(self.nodes.items())
|
||||
if node["tag"] != "TOP"
|
||||
)
|
||||
|
||||
def nx_graph(self):
|
||||
"""Convert the data in a ``nodelist`` into a networkx labeled directed graph."""
|
||||
import networkx
|
||||
|
||||
nx_nodelist = list(range(1, len(self.nodes)))
|
||||
nx_edgelist = [
|
||||
(n, self._hd(n), self._rel(n)) for n in nx_nodelist if self._hd(n)
|
||||
]
|
||||
self.nx_labels = {}
|
||||
for n in nx_nodelist:
|
||||
self.nx_labels[n] = self.nodes[n]["word"]
|
||||
|
||||
g = networkx.MultiDiGraph()
|
||||
g.add_nodes_from(nx_nodelist)
|
||||
g.add_edges_from(nx_edgelist)
|
||||
|
||||
return g
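

# Illustrative usage sketch (not part of the original module): parsing a
# Malt-TAB/CoNLL string into a DependencyGraph and inspecting it with the
# methods defined above. It relies only on the module-level sample data
# (treebank_data) defined further below.
def _dependency_graph_usage_example():
    dg = DependencyGraph(treebank_data)  # 4-column Malt-TAB input
    tree = dg.tree()                     # NLTK Tree, dependency labels omitted
    triples = list(dg.triples())         # ((head, tag), rel, (dep, tag)) triples
    conll = dg.to_conll(4)               # serialise back to 4 columns
    return tree, triples, conll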
|
||||
|
||||
|
||||
def dot2img(dot_string, t="svg"):
|
||||
"""
|
||||
    Create an image representation from dot_string, using the 'dot' program
|
||||
from the Graphviz package.
|
||||
|
||||
    Use the 't' argument to specify the image file format, e.g. 'jpeg', 'eps',
|
||||
'json', 'png' or 'webp' (Running 'dot -T:' lists all available formats).
|
||||
|
||||
Note that the "capture_output" option of subprocess.run() is only available
|
||||
with text formats (like svg), but not with binary image formats (like png).
|
||||
"""
|
||||
|
||||
try:
|
||||
find_binary("dot")
|
||||
try:
|
||||
if t in ["dot", "dot_json", "json", "svg"]:
|
||||
proc = subprocess.run(
|
||||
["dot", "-T%s" % t],
|
||||
capture_output=True,
|
||||
input=dot_string,
|
||||
text=True,
|
||||
)
|
||||
else:
|
||||
proc = subprocess.run(
|
||||
["dot", "-T%s" % t],
|
||||
input=bytes(dot_string, encoding="utf8"),
|
||||
)
|
||||
return proc.stdout
|
||||
        except Exception:
|
||||
raise Exception(
|
||||
"Cannot create image representation by running dot from string: {}"
|
||||
"".format(dot_string)
|
||||
)
|
||||
except OSError as e:
|
||||
raise Exception("Cannot find the dot binary from Graphviz package") from e
|
||||
|
||||
|
||||
class DependencyGraphError(Exception):
|
||||
"""Dependency graph exception."""
|
||||
|
||||
|
||||
def demo():
|
||||
malt_demo()
|
||||
conll_demo()
|
||||
conll_file_demo()
|
||||
cycle_finding_demo()
|
||||
|
||||
|
||||
def malt_demo(nx=False):
|
||||
"""
|
||||
A demonstration of the result of reading a dependency
|
||||
version of the first sentence of the Penn Treebank.
|
||||
"""
|
||||
dg = DependencyGraph(
|
||||
"""Pierre NNP 2 NMOD
|
||||
Vinken NNP 8 SUB
|
||||
, , 2 P
|
||||
61 CD 5 NMOD
|
||||
years NNS 6 AMOD
|
||||
old JJ 2 NMOD
|
||||
, , 2 P
|
||||
will MD 0 ROOT
|
||||
join VB 8 VC
|
||||
the DT 11 NMOD
|
||||
board NN 9 OBJ
|
||||
as IN 9 VMOD
|
||||
a DT 15 NMOD
|
||||
nonexecutive JJ 15 NMOD
|
||||
director NN 12 PMOD
|
||||
Nov. NNP 9 VMOD
|
||||
29 CD 16 NMOD
|
||||
. . 9 VMOD
|
||||
"""
|
||||
)
|
||||
tree = dg.tree()
|
||||
tree.pprint()
|
||||
if nx:
|
||||
# currently doesn't work
|
||||
import networkx
|
||||
from matplotlib import pylab
|
||||
|
||||
g = dg.nx_graph()
|
||||
g.info()
|
||||
pos = networkx.spring_layout(g, dim=1)
|
||||
networkx.draw_networkx_nodes(g, pos, node_size=50)
|
||||
# networkx.draw_networkx_edges(g, pos, edge_color='k', width=8)
|
||||
networkx.draw_networkx_labels(g, pos, dg.nx_labels)
|
||||
pylab.xticks([])
|
||||
pylab.yticks([])
|
||||
pylab.savefig("tree.png")
|
||||
pylab.show()
|
||||
|
||||
|
||||
def conll_demo():
|
||||
"""
|
||||
A demonstration of how to read a string representation of
|
||||
a CoNLL format dependency tree.
|
||||
"""
|
||||
dg = DependencyGraph(conll_data1)
|
||||
tree = dg.tree()
|
||||
tree.pprint()
|
||||
print(dg)
|
||||
print(dg.to_conll(4))
|
||||
|
||||
|
||||
def conll_file_demo():
|
||||
print("Mass conll_read demo...")
|
||||
graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
|
||||
for graph in graphs:
|
||||
tree = graph.tree()
|
||||
print("\n")
|
||||
tree.pprint()
|
||||
|
||||
|
||||
def cycle_finding_demo():
|
||||
dg = DependencyGraph(treebank_data)
|
||||
print(dg.contains_cycle())
|
||||
cyclic_dg = DependencyGraph()
|
||||
cyclic_dg.add_node({"word": None, "deps": [1], "rel": "TOP", "address": 0})
|
||||
cyclic_dg.add_node({"word": None, "deps": [2], "rel": "NTOP", "address": 1})
|
||||
cyclic_dg.add_node({"word": None, "deps": [4], "rel": "NTOP", "address": 2})
|
||||
cyclic_dg.add_node({"word": None, "deps": [1], "rel": "NTOP", "address": 3})
|
||||
cyclic_dg.add_node({"word": None, "deps": [3], "rel": "NTOP", "address": 4})
|
||||
print(cyclic_dg.contains_cycle())
|
||||
|
||||
|
||||
treebank_data = """Pierre NNP 2 NMOD
|
||||
Vinken NNP 8 SUB
|
||||
, , 2 P
|
||||
61 CD 5 NMOD
|
||||
years NNS 6 AMOD
|
||||
old JJ 2 NMOD
|
||||
, , 2 P
|
||||
will MD 0 ROOT
|
||||
join VB 8 VC
|
||||
the DT 11 NMOD
|
||||
board NN 9 OBJ
|
||||
as IN 9 VMOD
|
||||
a DT 15 NMOD
|
||||
nonexecutive JJ 15 NMOD
|
||||
director NN 12 PMOD
|
||||
Nov. NNP 9 VMOD
|
||||
29 CD 16 NMOD
|
||||
. . 9 VMOD
|
||||
"""
|
||||
|
||||
conll_data1 = """
|
||||
1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
|
||||
2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _
|
||||
3 met met Prep Prep voor 8 mod _ _
|
||||
4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _
|
||||
5 moeder moeder N N soort|ev|neut 3 obj1 _ _
|
||||
6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _
|
||||
7 gaan ga V V hulp|inf 6 vc _ _
|
||||
8 winkelen winkel V V intrans|inf 11 cnj _ _
|
||||
9 , , Punc Punc komma 8 punct _ _
|
||||
10 zwemmen zwem V V intrans|inf 11 cnj _ _
|
||||
11 of of Conj Conj neven 7 vc _ _
|
||||
12 terrassen terras N N soort|mv|neut 11 cnj _ _
|
||||
13 . . Punc Punc punt 12 punct _ _
|
||||
"""
|
||||
|
||||
conll_data2 = """1 Cathy Cathy N N eigen|ev|neut 2 su _ _
|
||||
2 zag zie V V trans|ovt|1of2of3|ev 0 ROOT _ _
|
||||
3 hen hen Pron Pron per|3|mv|datofacc 2 obj1 _ _
|
||||
4 wild wild Adj Adj attr|stell|onverv 5 mod _ _
|
||||
5 zwaaien zwaai N N soort|mv|neut 2 vc _ _
|
||||
6 . . Punc Punc punt 5 punct _ _
|
||||
|
||||
1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
|
||||
2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _
|
||||
3 met met Prep Prep voor 8 mod _ _
|
||||
4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _
|
||||
5 moeder moeder N N soort|ev|neut 3 obj1 _ _
|
||||
6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _
|
||||
7 gaan ga V V hulp|inf 6 vc _ _
|
||||
8 winkelen winkel V V intrans|inf 11 cnj _ _
|
||||
9 , , Punc Punc komma 8 punct _ _
|
||||
10 zwemmen zwem V V intrans|inf 11 cnj _ _
|
||||
11 of of Conj Conj neven 7 vc _ _
|
||||
12 terrassen terras N N soort|mv|neut 11 cnj _ _
|
||||
13 . . Punc Punc punt 12 punct _ _
|
||||
|
||||
1 Dat dat Pron Pron aanw|neut|attr 2 det _ _
|
||||
2 werkwoord werkwoord N N soort|ev|neut 6 obj1 _ _
|
||||
3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _
|
||||
4 ze ze Pron Pron per|3|evofmv|nom 6 su _ _
|
||||
5 zelf zelf Pron Pron aanw|neut|attr|wzelf 3 predm _ _
|
||||
6 uitgevonden vind V V trans|verldw|onverv 3 vc _ _
|
||||
7 . . Punc Punc punt 6 punct _ _
|
||||
|
||||
1 Het het Pron Pron onbep|neut|zelfst 2 su _ _
|
||||
2 hoorde hoor V V trans|ovt|1of2of3|ev 0 ROOT _ _
|
||||
3 bij bij Prep Prep voor 2 ld _ _
|
||||
4 de de Art Art bep|zijdofmv|neut 6 det _ _
|
||||
5 warme warm Adj Adj attr|stell|vervneut 6 mod _ _
|
||||
6 zomerdag zomerdag N N soort|ev|neut 3 obj1 _ _
|
||||
7 die die Pron Pron betr|neut|zelfst 6 mod _ _
|
||||
8 ze ze Pron Pron per|3|evofmv|nom 12 su _ _
|
||||
9 ginds ginds Adv Adv gew|aanw 12 mod _ _
|
||||
10 achter achter Adv Adv gew|geenfunc|stell|onverv 12 svp _ _
|
||||
11 had heb V V hulp|ovt|1of2of3|ev 7 body _ _
|
||||
12 gelaten laat V V trans|verldw|onverv 11 vc _ _
|
||||
13 . . Punc Punc punt 12 punct _ _
|
||||
|
||||
1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
|
||||
2 hadden heb V V trans|ovt|1of2of3|mv 0 ROOT _ _
|
||||
3 languit languit Adv Adv gew|geenfunc|stell|onverv 11 mod _ _
|
||||
4 naast naast Prep Prep voor 11 mod _ _
|
||||
5 elkaar elkaar Pron Pron rec|neut 4 obj1 _ _
|
||||
6 op op Prep Prep voor 11 ld _ _
|
||||
7 de de Art Art bep|zijdofmv|neut 8 det _ _
|
||||
8 strandstoelen strandstoel N N soort|mv|neut 6 obj1 _ _
|
||||
9 kunnen kan V V hulp|inf 2 vc _ _
|
||||
10 gaan ga V V hulp|inf 9 vc _ _
|
||||
11 liggen lig V V intrans|inf 10 vc _ _
|
||||
12 . . Punc Punc punt 11 punct _ _
|
||||
|
||||
1 Zij zij Pron Pron per|3|evofmv|nom 2 su _ _
|
||||
2 zou zal V V hulp|ovt|1of2of3|ev 7 cnj _ _
|
||||
3 mams mams N N soort|ev|neut 4 det _ _
|
||||
4 rug rug N N soort|ev|neut 5 obj1 _ _
|
||||
5 ingewreven wrijf V V trans|verldw|onverv 6 vc _ _
|
||||
6 hebben heb V V hulp|inf 2 vc _ _
|
||||
7 en en Conj Conj neven 0 ROOT _ _
|
||||
8 mam mam V V trans|ovt|1of2of3|ev 7 cnj _ _
|
||||
9 de de Art Art bep|zijdofmv|neut 10 det _ _
|
||||
10 hare hare Pron Pron bez|3|ev|neut|attr 8 obj1 _ _
|
||||
11 . . Punc Punc punt 10 punct _ _
|
||||
|
||||
1 Of of Conj Conj onder|metfin 0 ROOT _ _
|
||||
2 ze ze Pron Pron per|3|evofmv|nom 3 su _ _
|
||||
3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _
|
||||
4 gewoon gewoon Adj Adj adv|stell|onverv 10 mod _ _
|
||||
5 met met Prep Prep voor 10 mod _ _
|
||||
6 haar haar Pron Pron bez|3|ev|neut|attr 7 det _ _
|
||||
7 vriendinnen vriendin N N soort|mv|neut 5 obj1 _ _
|
||||
8 rond rond Adv Adv deelv 10 svp _ _
|
||||
9 kunnen kan V V hulp|inf 3 vc _ _
|
||||
10 slenteren slenter V V intrans|inf 9 vc _ _
|
||||
11 in in Prep Prep voor 10 mod _ _
|
||||
12 de de Art Art bep|zijdofmv|neut 13 det _ _
|
||||
13 buurt buurt N N soort|ev|neut 11 obj1 _ _
|
||||
14 van van Prep Prep voor 13 mod _ _
|
||||
15 Trafalgar_Square Trafalgar_Square MWU N_N eigen|ev|neut_eigen|ev|neut 14 obj1 _ _
|
||||
16 . . Punc Punc punt 15 punct _ _
|
||||
"""
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
@@ -0,0 +1,552 @@
|
||||
# Natural Language Toolkit: An Incremental Earley Chart Parser
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
|
||||
# Rob Speer <rspeer@mit.edu>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# Jean Mark Gawron <gawron@mail.sdsu.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Data classes and parser implementations for *incremental* chart
|
||||
parsers, which use dynamic programming to efficiently parse a text.
|
||||
A "chart parser" derives parse trees for a text by iteratively adding
|
||||
\"edges\" to a \"chart\". Each "edge" represents a hypothesis about the tree
|
||||
structure for a subsequence of the text. The "chart" is a
|
||||
\"blackboard\" for composing and combining these hypotheses.
|
||||
|
||||
A parser is "incremental", if it guarantees that for all i, j where i < j,
|
||||
all edges ending at i are built before any edges ending at j.
|
||||
This is appealing for, say, speech recognizer hypothesis filtering.
|
||||
|
||||
The main parser class is ``EarleyChartParser``, which is a top-down
|
||||
algorithm, originally formulated by Jay Earley (1970).
|
||||
"""
|
||||
|
||||
from time import perf_counter
|
||||
|
||||
from nltk.parse.chart import (
|
||||
BottomUpPredictCombineRule,
|
||||
BottomUpPredictRule,
|
||||
CachedTopDownPredictRule,
|
||||
Chart,
|
||||
ChartParser,
|
||||
EdgeI,
|
||||
EmptyPredictRule,
|
||||
FilteredBottomUpPredictCombineRule,
|
||||
FilteredSingleEdgeFundamentalRule,
|
||||
LeafEdge,
|
||||
LeafInitRule,
|
||||
SingleEdgeFundamentalRule,
|
||||
TopDownInitRule,
|
||||
)
|
||||
from nltk.parse.featurechart import (
|
||||
FeatureBottomUpPredictCombineRule,
|
||||
FeatureBottomUpPredictRule,
|
||||
FeatureChart,
|
||||
FeatureChartParser,
|
||||
FeatureEmptyPredictRule,
|
||||
FeatureSingleEdgeFundamentalRule,
|
||||
FeatureTopDownInitRule,
|
||||
FeatureTopDownPredictRule,
|
||||
)
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Incremental Chart
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class IncrementalChart(Chart):
|
||||
def initialize(self):
|
||||
# A sequence of edge lists contained in this chart.
|
||||
self._edgelists = tuple([] for x in self._positions())
|
||||
|
||||
# The set of child pointer lists associated with each edge.
|
||||
self._edge_to_cpls = {}
|
||||
|
||||
# Indexes mapping attribute values to lists of edges
|
||||
# (used by select()).
|
||||
self._indexes = {}
|
||||
|
||||
def edges(self):
|
||||
return list(self.iteredges())
|
||||
|
||||
def iteredges(self):
|
||||
return (edge for edgelist in self._edgelists for edge in edgelist)
|
||||
|
||||
def select(self, end, **restrictions):
|
||||
edgelist = self._edgelists[end]
|
||||
|
||||
# If there are no restrictions, then return all edges.
|
||||
if restrictions == {}:
|
||||
return iter(edgelist)
|
||||
|
||||
# Find the index corresponding to the given restrictions.
|
||||
restr_keys = sorted(restrictions.keys())
|
||||
restr_keys = tuple(restr_keys)
|
||||
|
||||
# If it doesn't exist, then create it.
|
||||
if restr_keys not in self._indexes:
|
||||
self._add_index(restr_keys)
|
||||
|
||||
vals = tuple(restrictions[key] for key in restr_keys)
|
||||
return iter(self._indexes[restr_keys][end].get(vals, []))
|
||||
|
||||
def _add_index(self, restr_keys):
|
||||
# Make sure it's a valid index.
|
||||
for key in restr_keys:
|
||||
if not hasattr(EdgeI, key):
|
||||
raise ValueError("Bad restriction: %s" % key)
|
||||
|
||||
# Create the index.
|
||||
index = self._indexes[restr_keys] = tuple({} for x in self._positions())
|
||||
|
||||
# Add all existing edges to the index.
|
||||
for end, edgelist in enumerate(self._edgelists):
|
||||
this_index = index[end]
|
||||
for edge in edgelist:
|
||||
vals = tuple(getattr(edge, key)() for key in restr_keys)
|
||||
this_index.setdefault(vals, []).append(edge)
|
||||
|
||||
def _register_with_indexes(self, edge):
|
||||
end = edge.end()
|
||||
for restr_keys, index in self._indexes.items():
|
||||
vals = tuple(getattr(edge, key)() for key in restr_keys)
|
||||
index[end].setdefault(vals, []).append(edge)
|
||||
|
||||
def _append_edge(self, edge):
|
||||
self._edgelists[edge.end()].append(edge)
|
||||
|
||||
def _positions(self):
|
||||
return range(self.num_leaves() + 1)
|
||||
|
||||
|
||||
class FeatureIncrementalChart(IncrementalChart, FeatureChart):
|
||||
def select(self, end, **restrictions):
|
||||
edgelist = self._edgelists[end]
|
||||
|
||||
# If there are no restrictions, then return all edges.
|
||||
if restrictions == {}:
|
||||
return iter(edgelist)
|
||||
|
||||
# Find the index corresponding to the given restrictions.
|
||||
restr_keys = sorted(restrictions.keys())
|
||||
restr_keys = tuple(restr_keys)
|
||||
|
||||
# If it doesn't exist, then create it.
|
||||
if restr_keys not in self._indexes:
|
||||
self._add_index(restr_keys)
|
||||
|
||||
vals = tuple(
|
||||
self._get_type_if_possible(restrictions[key]) for key in restr_keys
|
||||
)
|
||||
return iter(self._indexes[restr_keys][end].get(vals, []))
|
||||
|
||||
def _add_index(self, restr_keys):
|
||||
# Make sure it's a valid index.
|
||||
for key in restr_keys:
|
||||
if not hasattr(EdgeI, key):
|
||||
raise ValueError("Bad restriction: %s" % key)
|
||||
|
||||
# Create the index.
|
||||
index = self._indexes[restr_keys] = tuple({} for x in self._positions())
|
||||
|
||||
# Add all existing edges to the index.
|
||||
for end, edgelist in enumerate(self._edgelists):
|
||||
this_index = index[end]
|
||||
for edge in edgelist:
|
||||
vals = tuple(
|
||||
self._get_type_if_possible(getattr(edge, key)())
|
||||
for key in restr_keys
|
||||
)
|
||||
this_index.setdefault(vals, []).append(edge)
|
||||
|
||||
def _register_with_indexes(self, edge):
|
||||
end = edge.end()
|
||||
for restr_keys, index in self._indexes.items():
|
||||
vals = tuple(
|
||||
self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
|
||||
)
|
||||
index[end].setdefault(vals, []).append(edge)
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Incremental CFG Rules
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class CompleteFundamentalRule(SingleEdgeFundamentalRule):
|
||||
def _apply_incomplete(self, chart, grammar, left_edge):
|
||||
end = left_edge.end()
|
||||
# When the chart is incremental, we only have to look for
|
||||
# empty complete edges here.
|
||||
for right_edge in chart.select(
|
||||
start=end, end=end, is_complete=True, lhs=left_edge.nextsym()
|
||||
):
|
||||
new_edge = left_edge.move_dot_forward(right_edge.end())
|
||||
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
|
||||
yield new_edge
|
||||
|
||||
|
||||
class CompleterRule(CompleteFundamentalRule):
|
||||
_fundamental_rule = CompleteFundamentalRule()
|
||||
|
||||
def apply(self, chart, grammar, edge):
|
||||
if not isinstance(edge, LeafEdge):
|
||||
yield from self._fundamental_rule.apply(chart, grammar, edge)
|
||||
|
||||
|
||||
class ScannerRule(CompleteFundamentalRule):
|
||||
_fundamental_rule = CompleteFundamentalRule()
|
||||
|
||||
def apply(self, chart, grammar, edge):
|
||||
if isinstance(edge, LeafEdge):
|
||||
yield from self._fundamental_rule.apply(chart, grammar, edge)
|
||||
|
||||
|
||||
class PredictorRule(CachedTopDownPredictRule):
|
||||
pass
|
||||
|
||||
|
||||
class FilteredCompleteFundamentalRule(FilteredSingleEdgeFundamentalRule):
|
||||
def apply(self, chart, grammar, edge):
|
||||
# Since the Filtered rule only works for grammars without empty productions,
|
||||
# we only have to bother with complete edges here.
|
||||
if edge.is_complete():
|
||||
yield from self._apply_complete(chart, grammar, edge)
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Incremental FCFG Rules
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class FeatureCompleteFundamentalRule(FeatureSingleEdgeFundamentalRule):
|
||||
def _apply_incomplete(self, chart, grammar, left_edge):
|
||||
fr = self._fundamental_rule
|
||||
end = left_edge.end()
|
||||
# When the chart is incremental, we only have to look for
|
||||
# empty complete edges here.
|
||||
for right_edge in chart.select(
|
||||
start=end, end=end, is_complete=True, lhs=left_edge.nextsym()
|
||||
):
|
||||
yield from fr.apply(chart, grammar, left_edge, right_edge)
|
||||
|
||||
|
||||
class FeatureCompleterRule(CompleterRule):
|
||||
_fundamental_rule = FeatureCompleteFundamentalRule()
|
||||
|
||||
|
||||
class FeatureScannerRule(ScannerRule):
|
||||
_fundamental_rule = FeatureCompleteFundamentalRule()
|
||||
|
||||
|
||||
class FeaturePredictorRule(FeatureTopDownPredictRule):
|
||||
pass
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Incremental CFG Chart Parsers
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
EARLEY_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
TopDownInitRule(),
|
||||
CompleterRule(),
|
||||
ScannerRule(),
|
||||
PredictorRule(),
|
||||
]
|
||||
TD_INCREMENTAL_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
TopDownInitRule(),
|
||||
CachedTopDownPredictRule(),
|
||||
CompleteFundamentalRule(),
|
||||
]
|
||||
BU_INCREMENTAL_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
EmptyPredictRule(),
|
||||
BottomUpPredictRule(),
|
||||
CompleteFundamentalRule(),
|
||||
]
|
||||
BU_LC_INCREMENTAL_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
EmptyPredictRule(),
|
||||
BottomUpPredictCombineRule(),
|
||||
CompleteFundamentalRule(),
|
||||
]
|
||||
|
||||
LC_INCREMENTAL_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
FilteredBottomUpPredictCombineRule(),
|
||||
FilteredCompleteFundamentalRule(),
|
||||
]
|
||||
|
||||
|
||||
class IncrementalChartParser(ChartParser):
|
||||
"""
|
||||
An *incremental* chart parser implementing Jay Earley's
|
||||
parsing algorithm:
|
||||
|
||||
| For each index end in [0, 1, ..., N]:
|
||||
| For each edge such that edge.end = end:
|
||||
| If edge is incomplete and edge.next is not a part of speech:
|
||||
| Apply PredictorRule to edge
|
||||
| If edge is incomplete and edge.next is a part of speech:
|
||||
| Apply ScannerRule to edge
|
||||
| If edge is complete:
|
||||
| Apply CompleterRule to edge
|
||||
| Return any complete parses in the chart
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
grammar,
|
||||
strategy=BU_LC_INCREMENTAL_STRATEGY,
|
||||
trace=0,
|
||||
trace_chart_width=50,
|
||||
chart_class=IncrementalChart,
|
||||
):
|
||||
"""
|
||||
Create a new Earley chart parser, that uses ``grammar`` to
|
||||
parse texts.
|
||||
|
||||
:type grammar: CFG
|
||||
:param grammar: The grammar used to parse texts.
|
||||
:type trace: int
|
||||
:param trace: The level of tracing that should be used when
|
||||
parsing a text. ``0`` will generate no tracing output;
|
||||
and higher numbers will produce more verbose tracing
|
||||
output.
|
||||
:type trace_chart_width: int
|
||||
:param trace_chart_width: The default total width reserved for
|
||||
the chart in trace output. The remainder of each line will
|
||||
be used to display edges.
|
||||
:param chart_class: The class that should be used to create
|
||||
the charts used by this parser.
|
||||
"""
|
||||
self._grammar = grammar
|
||||
self._trace = trace
|
||||
self._trace_chart_width = trace_chart_width
|
||||
self._chart_class = chart_class
|
||||
|
||||
self._axioms = []
|
||||
self._inference_rules = []
|
||||
for rule in strategy:
|
||||
if rule.NUM_EDGES == 0:
|
||||
self._axioms.append(rule)
|
||||
elif rule.NUM_EDGES == 1:
|
||||
self._inference_rules.append(rule)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Incremental inference rules must have " "NUM_EDGES == 0 or 1"
|
||||
)
|
||||
|
||||
def chart_parse(self, tokens, trace=None):
|
||||
if trace is None:
|
||||
trace = self._trace
|
||||
trace_new_edges = self._trace_new_edges
|
||||
|
||||
tokens = list(tokens)
|
||||
self._grammar.check_coverage(tokens)
|
||||
chart = self._chart_class(tokens)
|
||||
grammar = self._grammar
|
||||
|
||||
# Width, for printing trace edges.
|
||||
trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1)
|
||||
if trace:
|
||||
print(chart.pretty_format_leaves(trace_edge_width))
|
||||
|
||||
for axiom in self._axioms:
|
||||
new_edges = list(axiom.apply(chart, grammar))
|
||||
trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width)
|
||||
|
||||
inference_rules = self._inference_rules
|
||||
for end in range(chart.num_leaves() + 1):
|
||||
if trace > 1:
|
||||
print("\n* Processing queue:", end, "\n")
|
||||
agenda = list(chart.select(end=end))
|
||||
while agenda:
|
||||
edge = agenda.pop()
|
||||
for rule in inference_rules:
|
||||
new_edges = list(rule.apply(chart, grammar, edge))
|
||||
trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
|
||||
for new_edge in new_edges:
|
||||
if new_edge.end() == end:
|
||||
agenda.append(new_edge)
|
||||
|
||||
return chart
|
||||
|
||||
|
||||
class EarleyChartParser(IncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
IncrementalChartParser.__init__(self, grammar, EARLEY_STRATEGY, **parser_args)
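

# Illustrative usage sketch (not part of the original module): running the
# Earley parser end-to-end on a toy CFG. The grammar and sentence are invented
# for the example; see demo() below for the module's own demonstration.
def _earley_usage_example():
    from nltk.grammar import CFG

    grammar = CFG.fromstring(
        """
        S -> NP VP
        NP -> 'John' | 'Mary'
        VP -> V NP
        V -> 'sees'
        """
    )
    parser = EarleyChartParser(grammar)
    chart = parser.chart_parse("John sees Mary".split())
    return list(chart.parses(grammar.start()))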
|
||||
|
||||
|
||||
class IncrementalTopDownChartParser(IncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
IncrementalChartParser.__init__(
|
||||
self, grammar, TD_INCREMENTAL_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
class IncrementalBottomUpChartParser(IncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
IncrementalChartParser.__init__(
|
||||
self, grammar, BU_INCREMENTAL_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
class IncrementalBottomUpLeftCornerChartParser(IncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
IncrementalChartParser.__init__(
|
||||
self, grammar, BU_LC_INCREMENTAL_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
class IncrementalLeftCornerChartParser(IncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
if not grammar.is_nonempty():
|
||||
raise ValueError(
|
||||
"IncrementalLeftCornerParser only works for grammars "
|
||||
"without empty productions."
|
||||
)
|
||||
IncrementalChartParser.__init__(
|
||||
self, grammar, LC_INCREMENTAL_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Incremental FCFG Chart Parsers
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
EARLEY_FEATURE_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
FeatureTopDownInitRule(),
|
||||
FeatureCompleterRule(),
|
||||
FeatureScannerRule(),
|
||||
FeaturePredictorRule(),
|
||||
]
|
||||
TD_INCREMENTAL_FEATURE_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
FeatureTopDownInitRule(),
|
||||
FeatureTopDownPredictRule(),
|
||||
FeatureCompleteFundamentalRule(),
|
||||
]
|
||||
BU_INCREMENTAL_FEATURE_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
FeatureEmptyPredictRule(),
|
||||
FeatureBottomUpPredictRule(),
|
||||
FeatureCompleteFundamentalRule(),
|
||||
]
|
||||
BU_LC_INCREMENTAL_FEATURE_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
FeatureEmptyPredictRule(),
|
||||
FeatureBottomUpPredictCombineRule(),
|
||||
FeatureCompleteFundamentalRule(),
|
||||
]
|
||||
|
||||
|
||||
class FeatureIncrementalChartParser(IncrementalChartParser, FeatureChartParser):
|
||||
def __init__(
|
||||
self,
|
||||
grammar,
|
||||
strategy=BU_LC_INCREMENTAL_FEATURE_STRATEGY,
|
||||
trace_chart_width=20,
|
||||
chart_class=FeatureIncrementalChart,
|
||||
**parser_args
|
||||
):
|
||||
IncrementalChartParser.__init__(
|
||||
self,
|
||||
grammar,
|
||||
strategy=strategy,
|
||||
trace_chart_width=trace_chart_width,
|
||||
chart_class=chart_class,
|
||||
**parser_args
|
||||
)
|
||||
|
||||
|
||||
class FeatureEarleyChartParser(FeatureIncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
FeatureIncrementalChartParser.__init__(
|
||||
self, grammar, EARLEY_FEATURE_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
class FeatureIncrementalTopDownChartParser(FeatureIncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
FeatureIncrementalChartParser.__init__(
|
||||
self, grammar, TD_INCREMENTAL_FEATURE_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
class FeatureIncrementalBottomUpChartParser(FeatureIncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
FeatureIncrementalChartParser.__init__(
|
||||
self, grammar, BU_INCREMENTAL_FEATURE_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
class FeatureIncrementalBottomUpLeftCornerChartParser(FeatureIncrementalChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
FeatureIncrementalChartParser.__init__(
|
||||
self, grammar, BU_LC_INCREMENTAL_FEATURE_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Demonstration
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo(
|
||||
print_times=True,
|
||||
print_grammar=False,
|
||||
print_trees=True,
|
||||
trace=2,
|
||||
sent="I saw John with a dog with my cookie",
|
||||
numparses=5,
|
||||
):
|
||||
"""
|
||||
A demonstration of the Earley parsers.
|
||||
"""
|
||||
import sys
|
||||
import time
|
||||
|
||||
from nltk.parse.chart import demo_grammar
|
||||
|
||||
# The grammar for ChartParser and SteppingChartParser:
|
||||
grammar = demo_grammar()
|
||||
if print_grammar:
|
||||
print("* Grammar")
|
||||
print(grammar)
|
||||
|
||||
# Tokenize the sample sentence.
|
||||
print("* Sentence:")
|
||||
print(sent)
|
||||
tokens = sent.split()
|
||||
print(tokens)
|
||||
print()
|
||||
|
||||
# Do the parsing.
|
||||
earley = EarleyChartParser(grammar, trace=trace)
|
||||
t = perf_counter()
|
||||
chart = earley.chart_parse(tokens)
|
||||
parses = list(chart.parses(grammar.start()))
|
||||
t = perf_counter() - t
|
||||
|
||||
# Print results.
|
||||
if numparses:
|
||||
assert len(parses) == numparses, "Not all parses found"
|
||||
if print_trees:
|
||||
for tree in parses:
|
||||
print(tree)
|
||||
else:
|
||||
print("Nr trees:", len(parses))
|
||||
if print_times:
|
||||
print("Time:", t)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
129
Backend/venv/lib/python3.12/site-packages/nltk/parse/evaluate.py
Normal file
@@ -0,0 +1,129 @@
|
||||
# Natural Language Toolkit: evaluation of dependency parser
|
||||
#
|
||||
# Author: Long Duong <longdt219@gmail.com>
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import unicodedata
|
||||
|
||||
|
||||
class DependencyEvaluator:
|
||||
"""
|
||||
Class for measuring labelled and unlabelled attachment score for
|
||||
dependency parsing. Note that the evaluation ignores punctuation.
|
||||
|
||||
>>> from nltk.parse import DependencyGraph, DependencyEvaluator
|
||||
|
||||
>>> gold_sent = DependencyGraph(\"""
|
||||
... Pierre NNP 2 NMOD
|
||||
... Vinken NNP 8 SUB
|
||||
... , , 2 P
|
||||
... 61 CD 5 NMOD
|
||||
... years NNS 6 AMOD
|
||||
... old JJ 2 NMOD
|
||||
... , , 2 P
|
||||
... will MD 0 ROOT
|
||||
... join VB 8 VC
|
||||
... the DT 11 NMOD
|
||||
... board NN 9 OBJ
|
||||
... as IN 9 VMOD
|
||||
... a DT 15 NMOD
|
||||
... nonexecutive JJ 15 NMOD
|
||||
... director NN 12 PMOD
|
||||
... Nov. NNP 9 VMOD
|
||||
... 29 CD 16 NMOD
|
||||
... . . 9 VMOD
|
||||
... \""")
|
||||
|
||||
>>> parsed_sent = DependencyGraph(\"""
|
||||
... Pierre NNP 8 NMOD
|
||||
... Vinken NNP 1 SUB
|
||||
... , , 3 P
|
||||
... 61 CD 6 NMOD
|
||||
... years NNS 6 AMOD
|
||||
... old JJ 2 NMOD
|
||||
... , , 3 AMOD
|
||||
... will MD 0 ROOT
|
||||
... join VB 8 VC
|
||||
... the DT 11 AMOD
|
||||
... board NN 9 OBJECT
|
||||
... as IN 9 NMOD
|
||||
... a DT 15 NMOD
|
||||
... nonexecutive JJ 15 NMOD
|
||||
... director NN 12 PMOD
|
||||
... Nov. NNP 9 VMOD
|
||||
... 29 CD 16 NMOD
|
||||
... . . 9 VMOD
|
||||
... \""")
|
||||
|
||||
>>> de = DependencyEvaluator([parsed_sent],[gold_sent])
|
||||
>>> las, uas = de.eval()
|
||||
>>> las
|
||||
0.6
|
||||
>>> uas
|
||||
0.8
|
||||
>>> abs(uas - 0.8) < 0.00001
|
||||
True
|
||||
"""
|
||||
|
||||
def __init__(self, parsed_sents, gold_sents):
|
||||
"""
|
||||
        :param parsed_sents: the list of parsed sentences as output by the parser
        :type parsed_sents: list(DependencyGraph)
        :param gold_sents: the list of gold-standard sentences
        :type gold_sents: list(DependencyGraph)
|
||||
"""
|
||||
self._parsed_sents = parsed_sents
|
||||
self._gold_sents = gold_sents
|
||||
|
||||
def _remove_punct(self, inStr):
|
||||
"""
|
||||
        Remove punctuation from a Unicode string.
|
||||
        :param inStr: the input string
|
||||
        :return: Unicode string with all punctuation removed
|
||||
"""
|
||||
punc_cat = {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"}
|
||||
return "".join(x for x in inStr if unicodedata.category(x) not in punc_cat)
|
||||
|
||||
def eval(self):
|
||||
"""
|
||||
Return the Labeled Attachment Score (LAS) and Unlabeled Attachment Score (UAS)
|
||||
|
||||
        :return: tuple(float, float)
|
||||
"""
|
||||
if len(self._parsed_sents) != len(self._gold_sents):
|
||||
raise ValueError(
|
||||
" Number of parsed sentence is different with number of gold sentence."
|
||||
)
|
||||
|
||||
corr = 0
|
||||
corrL = 0
|
||||
total = 0
|
||||
|
||||
for i in range(len(self._parsed_sents)):
|
||||
parsed_sent_nodes = self._parsed_sents[i].nodes
|
||||
gold_sent_nodes = self._gold_sents[i].nodes
|
||||
|
||||
if len(parsed_sent_nodes) != len(gold_sent_nodes):
|
||||
raise ValueError("Sentences must have equal length.")
|
||||
|
||||
for parsed_node_address, parsed_node in parsed_sent_nodes.items():
|
||||
gold_node = gold_sent_nodes[parsed_node_address]
|
||||
|
||||
if parsed_node["word"] is None:
|
||||
continue
|
||||
if parsed_node["word"] != gold_node["word"]:
|
||||
raise ValueError("Sentence sequence is not matched.")
|
||||
|
||||
# Ignore if word is punctuation by default
|
||||
# if (parsed_sent[j]["word"] in string.punctuation):
|
||||
if self._remove_punct(parsed_node["word"]) == "":
|
||||
continue
|
||||
|
||||
total += 1
|
||||
if parsed_node["head"] == gold_node["head"]:
|
||||
corr += 1
|
||||
if parsed_node["rel"] == gold_node["rel"]:
|
||||
corrL += 1
|
||||
|
||||
return corrL / total, corr / total
|
||||
@@ -0,0 +1,674 @@
|
||||
# Natural Language Toolkit: Chart Parser for Feature-Based Grammars
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Rob Speer <rspeer@mit.edu>
|
||||
# Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Extension of chart parsing implementation to handle grammars with
|
||||
feature structures as nodes.
|
||||
"""
|
||||
from time import perf_counter
|
||||
|
||||
from nltk.featstruct import TYPE, FeatStruct, find_variables, unify
|
||||
from nltk.grammar import (
|
||||
CFG,
|
||||
FeatStructNonterminal,
|
||||
Nonterminal,
|
||||
Production,
|
||||
is_nonterminal,
|
||||
is_terminal,
|
||||
)
|
||||
from nltk.parse.chart import (
|
||||
BottomUpPredictCombineRule,
|
||||
BottomUpPredictRule,
|
||||
CachedTopDownPredictRule,
|
||||
Chart,
|
||||
ChartParser,
|
||||
EdgeI,
|
||||
EmptyPredictRule,
|
||||
FundamentalRule,
|
||||
LeafInitRule,
|
||||
SingleEdgeFundamentalRule,
|
||||
TopDownInitRule,
|
||||
TreeEdge,
|
||||
)
|
||||
from nltk.sem import logic
|
||||
from nltk.tree import Tree
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Tree Edge
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class FeatureTreeEdge(TreeEdge):
|
||||
"""
|
||||
A specialized tree edge that allows shared variable bindings
|
||||
between nonterminals on the left-hand side and right-hand side.
|
||||
|
||||
Each ``FeatureTreeEdge`` contains a set of ``bindings``, i.e., a
|
||||
dictionary mapping from variables to values. If the edge is not
|
||||
complete, then these bindings are simply stored. However, if the
|
||||
edge is complete, then the constructor applies these bindings to
|
||||
every nonterminal in the edge whose symbol implements the
|
||||
interface ``SubstituteBindingsI``.
|
||||
"""
|
||||
|
||||
def __init__(self, span, lhs, rhs, dot=0, bindings=None):
|
||||
"""
|
||||
Construct a new edge. If the edge is incomplete (i.e., if
|
||||
``dot<len(rhs)``), then store the bindings as-is. If the edge
|
||||
is complete (i.e., if ``dot==len(rhs)``), then apply the
|
||||
bindings to all nonterminals in ``lhs`` and ``rhs``, and then
|
||||
clear the bindings. See ``TreeEdge`` for a description of
|
||||
the other arguments.
|
||||
"""
|
||||
if bindings is None:
|
||||
bindings = {}
|
||||
|
||||
# If the edge is complete, then substitute in the bindings,
|
||||
# and then throw them away. (If we didn't throw them away, we
|
||||
# might think that 2 complete edges are different just because
|
||||
# they have different bindings, even though all bindings have
|
||||
# already been applied.)
|
||||
if dot == len(rhs) and bindings:
|
||||
lhs = self._bind(lhs, bindings)
|
||||
rhs = [self._bind(elt, bindings) for elt in rhs]
|
||||
bindings = {}
|
||||
|
||||
# Initialize the edge.
|
||||
TreeEdge.__init__(self, span, lhs, rhs, dot)
|
||||
self._bindings = bindings
|
||||
self._comparison_key = (self._comparison_key, tuple(sorted(bindings.items())))
|
||||
|
||||
@staticmethod
|
||||
def from_production(production, index):
|
||||
"""
|
||||
:return: A new ``TreeEdge`` formed from the given production.
|
||||
The new edge's left-hand side and right-hand side will
|
||||
be taken from ``production``; its span will be
|
||||
``(index,index)``; and its dot position will be ``0``.
|
||||
:rtype: TreeEdge
|
||||
"""
|
||||
return FeatureTreeEdge(
|
||||
span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0
|
||||
)
|
||||
|
||||
def move_dot_forward(self, new_end, bindings=None):
|
||||
"""
|
||||
:return: A new ``FeatureTreeEdge`` formed from this edge.
|
||||
The new edge's dot position is increased by ``1``,
|
||||
and its end index will be replaced by ``new_end``.
|
||||
:rtype: FeatureTreeEdge
|
||||
:param new_end: The new end index.
|
||||
:type new_end: int
|
||||
:param bindings: Bindings for the new edge.
|
||||
:type bindings: dict
|
||||
"""
|
||||
return FeatureTreeEdge(
|
||||
span=(self._span[0], new_end),
|
||||
lhs=self._lhs,
|
||||
rhs=self._rhs,
|
||||
dot=self._dot + 1,
|
||||
bindings=bindings,
|
||||
)
|
||||
|
||||
def _bind(self, nt, bindings):
|
||||
if not isinstance(nt, FeatStructNonterminal):
|
||||
return nt
|
||||
return nt.substitute_bindings(bindings)
|
||||
|
||||
def next_with_bindings(self):
|
||||
return self._bind(self.nextsym(), self._bindings)
|
||||
|
||||
def bindings(self):
|
||||
"""
|
||||
Return a copy of this edge's bindings dictionary.
|
||||
"""
|
||||
return self._bindings.copy()
|
||||
|
||||
def variables(self):
|
||||
"""
|
||||
:return: The set of variables used by this edge.
|
||||
:rtype: set(Variable)
|
||||
"""
|
||||
return find_variables(
|
||||
[self._lhs]
|
||||
+ list(self._rhs)
|
||||
+ list(self._bindings.keys())
|
||||
+ list(self._bindings.values()),
|
||||
fs_class=FeatStruct,
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
if self.is_complete():
|
||||
return super().__str__()
|
||||
else:
|
||||
bindings = "{%s}" % ", ".join(
|
||||
"%s: %r" % item for item in sorted(self._bindings.items())
|
||||
)
|
||||
return f"{super().__str__()} {bindings}"
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# A specialized Chart for feature grammars
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
# TODO: subsumes check when adding new edges
|
||||
|
||||
|
||||
class FeatureChart(Chart):
|
||||
"""
|
||||
A Chart for feature grammars.
|
||||
:see: ``Chart`` for more information.
|
||||
"""
|
||||
|
||||
def select(self, **restrictions):
|
||||
"""
|
||||
Returns an iterator over the edges in this chart.
|
||||
See ``Chart.select`` for more information about the
|
||||
``restrictions`` on the edges.
|
||||
"""
|
||||
# If there are no restrictions, then return all edges.
|
||||
if restrictions == {}:
|
||||
return iter(self._edges)
|
||||
|
||||
# Find the index corresponding to the given restrictions.
|
||||
restr_keys = sorted(restrictions.keys())
|
||||
restr_keys = tuple(restr_keys)
|
||||
|
||||
# If it doesn't exist, then create it.
|
||||
if restr_keys not in self._indexes:
|
||||
self._add_index(restr_keys)
|
||||
|
||||
vals = tuple(
|
||||
self._get_type_if_possible(restrictions[key]) for key in restr_keys
|
||||
)
|
||||
return iter(self._indexes[restr_keys].get(vals, []))
|
||||
|
||||
def _add_index(self, restr_keys):
|
||||
"""
|
||||
A helper function for ``select``, which creates a new index for
|
||||
a given set of attributes (aka restriction keys).
|
||||
"""
|
||||
# Make sure it's a valid index.
|
||||
for key in restr_keys:
|
||||
if not hasattr(EdgeI, key):
|
||||
raise ValueError("Bad restriction: %s" % key)
|
||||
|
||||
# Create the index.
|
||||
index = self._indexes[restr_keys] = {}
|
||||
|
||||
# Add all existing edges to the index.
|
||||
for edge in self._edges:
|
||||
vals = tuple(
|
||||
self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
|
||||
)
|
||||
index.setdefault(vals, []).append(edge)
|
||||
|
||||
def _register_with_indexes(self, edge):
|
||||
"""
|
||||
A helper function for ``insert``, which registers the new
|
||||
edge with all existing indexes.
|
||||
"""
|
||||
for restr_keys, index in self._indexes.items():
|
||||
vals = tuple(
|
||||
self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
|
||||
)
|
||||
index.setdefault(vals, []).append(edge)
|
||||
|
||||
def _get_type_if_possible(self, item):
|
||||
"""
|
||||
Helper function which returns the ``TYPE`` feature of the ``item``,
|
||||
if it exists, otherwise it returns the ``item`` itself
|
||||
"""
|
||||
if isinstance(item, dict) and TYPE in item:
|
||||
return item[TYPE]
|
||||
else:
|
||||
return item
|
||||
|
||||
def parses(self, start, tree_class=Tree):
|
||||
for edge in self.select(start=0, end=self._num_leaves):
|
||||
if (
|
||||
(isinstance(edge, FeatureTreeEdge))
|
||||
and (edge.lhs()[TYPE] == start[TYPE])
|
||||
and (unify(edge.lhs(), start, rename_vars=True))
|
||||
):
|
||||
yield from self.trees(edge, complete=True, tree_class=tree_class)
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Fundamental Rule
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class FeatureFundamentalRule(FundamentalRule):
|
||||
r"""
|
||||
A specialized version of the fundamental rule that operates on
|
||||
nonterminals whose symbols are ``FeatStructNonterminal``s. Rather
|
||||
than simply comparing the nonterminals for equality, they are
|
||||
unified. Variable bindings from these unifications are collected
|
||||
and stored in the chart using a ``FeatureTreeEdge``. When a
|
||||
complete edge is generated, these bindings are applied to all
|
||||
nonterminals in the edge.
|
||||
|
||||
The fundamental rule states that:
|
||||
|
||||
- ``[A -> alpha \* B1 beta][i:j]``
|
||||
- ``[B2 -> gamma \*][j:k]``
|
||||
|
||||
licenses the edge:
|
||||
|
||||
    - ``[A -> alpha B3 \* beta][i:k]``
|
||||
|
||||
assuming that B1 and B2 can be unified to generate B3.
|
||||
"""
|
||||
|
||||
def apply(self, chart, grammar, left_edge, right_edge):
|
||||
# Make sure the rule is applicable.
|
||||
if not (
|
||||
left_edge.end() == right_edge.start()
|
||||
and left_edge.is_incomplete()
|
||||
and right_edge.is_complete()
|
||||
and isinstance(left_edge, FeatureTreeEdge)
|
||||
):
|
||||
return
|
||||
found = right_edge.lhs()
|
||||
nextsym = left_edge.nextsym()
|
||||
if isinstance(right_edge, FeatureTreeEdge):
|
||||
if not is_nonterminal(nextsym):
|
||||
return
|
||||
if left_edge.nextsym()[TYPE] != right_edge.lhs()[TYPE]:
|
||||
return
|
||||
# Create a copy of the bindings.
|
||||
bindings = left_edge.bindings()
|
||||
# We rename vars here, because we don't want variables
|
||||
# from the two different productions to match.
|
||||
found = found.rename_variables(used_vars=left_edge.variables())
|
||||
# Unify B1 (left_edge.nextsym) with B2 (right_edge.lhs) to
|
||||
# generate B3 (result).
|
||||
result = unify(nextsym, found, bindings, rename_vars=False)
|
||||
if result is None:
|
||||
return
|
||||
else:
|
||||
if nextsym != found:
|
||||
return
|
||||
# Create a copy of the bindings.
|
||||
bindings = left_edge.bindings()
|
||||
|
||||
# Construct the new edge.
|
||||
new_edge = left_edge.move_dot_forward(right_edge.end(), bindings)
|
||||
|
||||
# Add it to the chart, with appropriate child pointers.
|
||||
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
|
||||
yield new_edge
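

# Illustrative sketch (not part of the original module): the unification step
# that FeatureFundamentalRule performs on its nonterminals, shown directly on
# productions of a small invented feature grammar.
def _feature_unification_example():
    from nltk.grammar import FeatureGrammar

    g = FeatureGrammar.fromstring(
        """
        S -> NP[NUM=?n] VP[NUM=?n]
        NP[NUM=sg] -> 'John'
        VP[NUM=?n] -> V[NUM=?n]
        V[NUM=sg] -> 'walks'
        """
    )
    b1 = g.productions()[0].rhs()[0]  # NP[NUM=?n], the B1 of an incomplete edge
    b2 = g.productions()[1].lhs()     # NP[NUM=sg], the B2 of a complete edge
    bindings = {}
    # b2 is ground, so no variable renaming is needed here; ?n gets bound.
    b3 = unify(b1, b2, bindings, rename_vars=False)  # the resulting B3
    return b3, bindings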
|
||||
|
||||
|
||||
class FeatureSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
|
||||
"""
|
||||
A specialized version of the completer / single edge fundamental rule
|
||||
that operates on nonterminals whose symbols are ``FeatStructNonterminal``.
|
||||
Rather than simply comparing the nonterminals for equality, they are
|
||||
unified.
|
||||
"""
|
||||
|
||||
_fundamental_rule = FeatureFundamentalRule()
|
||||
|
||||
def _apply_complete(self, chart, grammar, right_edge):
|
||||
fr = self._fundamental_rule
|
||||
for left_edge in chart.select(
|
||||
end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
|
||||
):
|
||||
yield from fr.apply(chart, grammar, left_edge, right_edge)
|
||||
|
||||
def _apply_incomplete(self, chart, grammar, left_edge):
|
||||
fr = self._fundamental_rule
|
||||
for right_edge in chart.select(
|
||||
start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
|
||||
):
|
||||
yield from fr.apply(chart, grammar, left_edge, right_edge)
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Top-Down Prediction
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class FeatureTopDownInitRule(TopDownInitRule):
|
||||
def apply(self, chart, grammar):
|
||||
for prod in grammar.productions(lhs=grammar.start()):
|
||||
new_edge = FeatureTreeEdge.from_production(prod, 0)
|
||||
if chart.insert(new_edge, ()):
|
||||
yield new_edge
|
||||
|
||||
|
||||
class FeatureTopDownPredictRule(CachedTopDownPredictRule):
|
||||
r"""
|
||||
A specialized version of the (cached) top down predict rule that operates
|
||||
on nonterminals whose symbols are ``FeatStructNonterminal``. Rather
|
||||
than simply comparing the nonterminals for equality, they are
|
||||
unified.
|
||||
|
||||
The top down expand rule states that:
|
||||
|
||||
- ``[A -> alpha \* B1 beta][i:j]``
|
||||
|
||||
licenses the edge:
|
||||
|
||||
- ``[B2 -> \* gamma][j:j]``
|
||||
|
||||
for each grammar production ``B2 -> gamma``, assuming that B1
|
||||
and B2 can be unified.
|
||||
"""
|
||||
|
||||
def apply(self, chart, grammar, edge):
|
||||
if edge.is_complete():
|
||||
return
|
||||
nextsym, index = edge.nextsym(), edge.end()
|
||||
if not is_nonterminal(nextsym):
|
||||
return
|
||||
|
||||
# If we've already applied this rule to an edge with the same
|
||||
# next & end, and the chart & grammar have not changed, then
|
||||
# just return (no new edges to add).
|
||||
nextsym_with_bindings = edge.next_with_bindings()
|
||||
done = self._done.get((nextsym_with_bindings, index), (None, None))
|
||||
if done[0] is chart and done[1] is grammar:
|
||||
return
|
||||
|
||||
for prod in grammar.productions(lhs=nextsym):
|
||||
# If the left corner in the predicted production is
|
||||
            # a leaf, it must match the input.
|
||||
if prod.rhs():
|
||||
first = prod.rhs()[0]
|
||||
if is_terminal(first):
|
||||
if index >= chart.num_leaves():
|
||||
continue
|
||||
if first != chart.leaf(index):
|
||||
continue
|
||||
|
||||
# We rename vars here, because we don't want variables
|
||||
# from the two different productions to match.
|
||||
if unify(prod.lhs(), nextsym_with_bindings, rename_vars=True):
|
||||
new_edge = FeatureTreeEdge.from_production(prod, edge.end())
|
||||
if chart.insert(new_edge, ()):
|
||||
yield new_edge
|
||||
|
||||
# Record the fact that we've applied this rule.
|
||||
self._done[nextsym_with_bindings, index] = (chart, grammar)
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Bottom-Up Prediction
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class FeatureBottomUpPredictRule(BottomUpPredictRule):
|
||||
def apply(self, chart, grammar, edge):
|
||||
if edge.is_incomplete():
|
||||
return
|
||||
for prod in grammar.productions(rhs=edge.lhs()):
|
||||
if isinstance(edge, FeatureTreeEdge):
|
||||
_next = prod.rhs()[0]
|
||||
if not is_nonterminal(_next):
|
||||
continue
|
||||
|
||||
new_edge = FeatureTreeEdge.from_production(prod, edge.start())
|
||||
if chart.insert(new_edge, ()):
|
||||
yield new_edge
|
||||
|
||||
|
||||
class FeatureBottomUpPredictCombineRule(BottomUpPredictCombineRule):
|
||||
def apply(self, chart, grammar, edge):
|
||||
if edge.is_incomplete():
|
||||
return
|
||||
found = edge.lhs()
|
||||
for prod in grammar.productions(rhs=found):
|
||||
bindings = {}
|
||||
if isinstance(edge, FeatureTreeEdge):
|
||||
_next = prod.rhs()[0]
|
||||
if not is_nonterminal(_next):
|
||||
continue
|
||||
|
||||
# We rename vars here, because we don't want variables
|
||||
# from the two different productions to match.
|
||||
used_vars = find_variables(
|
||||
(prod.lhs(),) + prod.rhs(), fs_class=FeatStruct
|
||||
)
|
||||
found = found.rename_variables(used_vars=used_vars)
|
||||
|
||||
result = unify(_next, found, bindings, rename_vars=False)
|
||||
if result is None:
|
||||
continue
|
||||
|
||||
new_edge = FeatureTreeEdge.from_production(
|
||||
prod, edge.start()
|
||||
).move_dot_forward(edge.end(), bindings)
|
||||
if chart.insert(new_edge, (edge,)):
|
||||
yield new_edge
|
||||
|
||||
|
||||
class FeatureEmptyPredictRule(EmptyPredictRule):
|
||||
def apply(self, chart, grammar):
|
||||
for prod in grammar.productions(empty=True):
|
||||
for index in range(chart.num_leaves() + 1):
|
||||
new_edge = FeatureTreeEdge.from_production(prod, index)
|
||||
if chart.insert(new_edge, ()):
|
||||
yield new_edge
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Feature Chart Parser
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
TD_FEATURE_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
FeatureTopDownInitRule(),
|
||||
FeatureTopDownPredictRule(),
|
||||
FeatureSingleEdgeFundamentalRule(),
|
||||
]
|
||||
BU_FEATURE_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
FeatureEmptyPredictRule(),
|
||||
FeatureBottomUpPredictRule(),
|
||||
FeatureSingleEdgeFundamentalRule(),
|
||||
]
|
||||
BU_LC_FEATURE_STRATEGY = [
|
||||
LeafInitRule(),
|
||||
FeatureEmptyPredictRule(),
|
||||
FeatureBottomUpPredictCombineRule(),
|
||||
FeatureSingleEdgeFundamentalRule(),
|
||||
]
|
||||
|
||||
|
||||
class FeatureChartParser(ChartParser):
|
||||
def __init__(
|
||||
self,
|
||||
grammar,
|
||||
strategy=BU_LC_FEATURE_STRATEGY,
|
||||
trace_chart_width=20,
|
||||
chart_class=FeatureChart,
|
||||
**parser_args,
|
||||
):
|
||||
ChartParser.__init__(
|
||||
self,
|
||||
grammar,
|
||||
strategy=strategy,
|
||||
trace_chart_width=trace_chart_width,
|
||||
chart_class=chart_class,
|
||||
**parser_args,
|
||||
)
|
||||
|
||||
|
||||
class FeatureTopDownChartParser(FeatureChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
FeatureChartParser.__init__(self, grammar, TD_FEATURE_STRATEGY, **parser_args)
|
||||
|
||||
|
||||
class FeatureBottomUpChartParser(FeatureChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
FeatureChartParser.__init__(self, grammar, BU_FEATURE_STRATEGY, **parser_args)
|
||||
|
||||
|
||||
class FeatureBottomUpLeftCornerChartParser(FeatureChartParser):
|
||||
def __init__(self, grammar, **parser_args):
|
||||
FeatureChartParser.__init__(
|
||||
self, grammar, BU_LC_FEATURE_STRATEGY, **parser_args
|
||||
)
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Instantiate Variable Chart
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class InstantiateVarsChart(FeatureChart):
|
||||
"""
|
||||
A specialized chart that 'instantiates' variables whose names
|
||||
start with '@', by replacing them with unique new variables.
|
||||
In particular, whenever a complete edge is added to the chart, any
|
||||
variables in the edge's ``lhs`` whose names start with '@' will be
|
||||
replaced by unique new ``Variable``.
|
||||
"""
|
||||
|
||||
def __init__(self, tokens):
|
||||
FeatureChart.__init__(self, tokens)
|
||||
|
||||
def initialize(self):
|
||||
self._instantiated = set()
|
||||
FeatureChart.initialize(self)
|
||||
|
||||
def insert(self, edge, child_pointer_list):
|
||||
if edge in self._instantiated:
|
||||
return False
|
||||
self.instantiate_edge(edge)
|
||||
return FeatureChart.insert(self, edge, child_pointer_list)
|
||||
|
||||
def instantiate_edge(self, edge):
|
||||
"""
|
||||
If the edge is a ``FeatureTreeEdge``, and it is complete,
|
||||
then instantiate all variables whose names start with '@',
|
||||
by replacing them with unique new variables.
|
||||
|
||||
Note that instantiation is done in-place, since the
|
||||
parsing algorithms might already hold a reference to
|
||||
the edge for future use.
|
||||
"""
|
||||
# If the edge is a leaf, or is not complete, or is
|
||||
# already in the chart, then just return it as-is.
|
||||
if not isinstance(edge, FeatureTreeEdge):
|
||||
return
|
||||
if not edge.is_complete():
|
||||
return
|
||||
if edge in self._edge_to_cpls:
|
||||
return
|
||||
|
||||
# Get a list of variables that need to be instantiated.
|
||||
# If there are none, then return as-is.
|
||||
inst_vars = self.inst_vars(edge)
|
||||
if not inst_vars:
|
||||
return
|
||||
|
||||
# Instantiate the edge!
|
||||
self._instantiated.add(edge)
|
||||
edge._lhs = edge.lhs().substitute_bindings(inst_vars)
|
||||
|
||||
def inst_vars(self, edge):
|
||||
return {
|
||||
var: logic.unique_variable()
|
||||
for var in edge.lhs().variables()
|
||||
if var.name.startswith("@")
|
||||
}
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# Demo
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo_grammar():
|
||||
from nltk.grammar import FeatureGrammar
|
||||
|
||||
return FeatureGrammar.fromstring(
|
||||
"""
|
||||
S -> NP VP
|
||||
PP -> Prep NP
|
||||
NP -> NP PP
|
||||
VP -> VP PP
|
||||
VP -> Verb NP
|
||||
VP -> Verb
|
||||
NP -> Det[pl=?x] Noun[pl=?x]
|
||||
NP -> "John"
|
||||
NP -> "I"
|
||||
Det -> "the"
|
||||
Det -> "my"
|
||||
Det[-pl] -> "a"
|
||||
Noun[-pl] -> "dog"
|
||||
Noun[-pl] -> "cookie"
|
||||
Verb -> "ate"
|
||||
Verb -> "saw"
|
||||
Prep -> "with"
|
||||
Prep -> "under"
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def demo(
|
||||
print_times=True,
|
||||
print_grammar=True,
|
||||
print_trees=True,
|
||||
print_sentence=True,
|
||||
trace=1,
|
||||
parser=FeatureChartParser,
|
||||
sent="I saw John with a dog with my cookie",
|
||||
):
|
||||
import sys
|
||||
import time
|
||||
|
||||
print()
|
||||
grammar = demo_grammar()
|
||||
if print_grammar:
|
||||
print(grammar)
|
||||
print()
|
||||
print("*", parser.__name__)
|
||||
if print_sentence:
|
||||
print("Sentence:", sent)
|
||||
tokens = sent.split()
|
||||
t = time.perf_counter()
|
||||
cp = parser(grammar, trace=trace)
|
||||
chart = cp.chart_parse(tokens)
|
||||
trees = list(chart.parses(grammar.start()))
|
||||
if print_times:
|
||||
print("Time: %s" % (perf_counter() - t))
|
||||
if print_trees:
|
||||
for tree in trees:
|
||||
print(tree)
|
||||
else:
|
||||
print("Nr trees:", len(trees))
|
||||
|
||||
|
||||
def run_profile():
|
||||
import profile
|
||||
|
||||
profile.run("for i in range(1): demo()", "/tmp/profile.out")
|
||||
import pstats
|
||||
|
||||
p = pstats.Stats("/tmp/profile.out")
|
||||
p.strip_dirs().sort_stats("time", "cum").print_stats(60)
|
||||
p.strip_dirs().sort_stats("cum", "time").print_stats(60)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from nltk.data import load
|
||||
|
||||
demo()
|
||||
print()
|
||||
grammar = load("grammars/book_grammars/feat0.fcfg")
|
||||
cp = FeatureChartParser(grammar, trace=2)
|
||||
sent = "Kim likes children"
|
||||
tokens = sent.split()
|
||||
trees = cp.parse(tokens)
|
||||
for tree in trees:
|
||||
print(tree)
|
||||
@@ -0,0 +1,88 @@
|
||||
# Natural Language Toolkit: Generating from a CFG
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
|
||||
# Eric Kafe <kafe.eric@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
|
||||
import itertools
|
||||
import sys
|
||||
|
||||
from nltk.grammar import Nonterminal
|
||||
|
||||
|
||||
def generate(grammar, start=None, depth=None, n=None):
|
||||
"""
|
||||
Generates an iterator of all sentences from a CFG.
|
||||
|
||||
:param grammar: The Grammar used to generate sentences.
|
||||
:param start: The Nonterminal from which to start generate sentences.
|
||||
:param depth: The maximal depth of the generated tree.
|
||||
:param n: The maximum number of sentences to return.
|
||||
:return: An iterator of lists of terminal tokens.
|
||||
"""
|
||||
if not start:
|
||||
start = grammar.start()
|
||||
if depth is None:
|
||||
# Safe default, assuming the grammar may be recursive:
|
||||
depth = (sys.getrecursionlimit() // 3) - 3
|
||||
|
||||
iter = _generate_all(grammar, [start], depth)
|
||||
|
||||
if n:
|
||||
iter = itertools.islice(iter, n)
|
||||
|
||||
return iter
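A brief usage sketch (grammar and numbers are illustrative only) showing how the two limits interact: ``depth`` bounds the derivation trees, while ``n`` merely truncates the resulting iterator.

    from nltk.grammar import CFG
    from nltk.parse.generate import generate

    # Illustrative recursive grammar: without a depth bound it never terminates.
    toy = CFG.fromstring("S -> 'a' S | 'b'")
    # depth=5 bounds the derivation trees; n=4 then truncates the iterator.
    for sent in generate(toy, depth=5, n=4):
        print(" ".join(sent))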
|
||||
|
||||
|
||||
def _generate_all(grammar, items, depth):
|
||||
if items:
|
||||
try:
|
||||
for frag1 in _generate_one(grammar, items[0], depth):
|
||||
for frag2 in _generate_all(grammar, items[1:], depth):
|
||||
yield frag1 + frag2
|
||||
except RecursionError as error:
|
||||
# Helpful error message while still showing the recursion stack.
|
||||
raise RuntimeError(
|
||||
"The grammar has rule(s) that yield infinite recursion!\n\
|
||||
Consider using a lower 'depth', or a higher 'sys.setrecursionlimit()'."
|
||||
) from error
|
||||
else:
|
||||
yield []
|
||||
|
||||
|
||||
def _generate_one(grammar, item, depth):
|
||||
if depth > 0:
|
||||
if isinstance(item, Nonterminal):
|
||||
for prod in grammar.productions(lhs=item):
|
||||
yield from _generate_all(grammar, prod.rhs(), depth - 1)
|
||||
else:
|
||||
yield [item]
|
||||
|
||||
|
||||
demo_grammar = """
|
||||
S -> NP VP
|
||||
NP -> Det N
|
||||
PP -> P NP
|
||||
VP -> 'slept' | 'saw' NP | 'walked' PP
|
||||
Det -> 'the' | 'a'
|
||||
N -> 'man' | 'park' | 'dog'
|
||||
P -> 'in' | 'with'
|
||||
"""
|
||||
|
||||
|
||||
def demo(N=23):
|
||||
from nltk.grammar import CFG
|
||||
|
||||
print("Generating the first %d sentences for demo grammar:" % (N,))
|
||||
print(demo_grammar)
|
||||
grammar = CFG.fromstring(demo_grammar)
|
||||
for n, sent in enumerate(generate(grammar, n=N), 1):
|
||||
print("%3d. %s" % (n, " ".join(sent)))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
393
Backend/venv/lib/python3.12/site-packages/nltk/parse/malt.py
Normal file
@@ -0,0 +1,393 @@
|
||||
# Natural Language Toolkit: Interface to MaltParser
|
||||
#
|
||||
# Author: Dan Garrette <dhgarrette@gmail.com>
|
||||
# Contributor: Liling Tan, Mustufain, osamamukhtar11
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import inspect
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
from nltk.data import ZipFilePathPointer
|
||||
from nltk.internals import find_dir, find_file, find_jars_within_path
|
||||
from nltk.parse.api import ParserI
|
||||
from nltk.parse.dependencygraph import DependencyGraph
|
||||
from nltk.parse.util import taggedsents_to_conll
|
||||
|
||||
|
||||
def malt_regex_tagger():
|
||||
from nltk.tag import RegexpTagger
|
||||
|
||||
_tagger = RegexpTagger(
|
||||
[
|
||||
(r"\.$", "."),
|
||||
(r"\,$", ","),
|
||||
(r"\?$", "?"), # fullstop, comma, Qmark
|
||||
(r"\($", "("),
|
||||
(r"\)$", ")"), # round brackets
|
||||
(r"\[$", "["),
|
||||
(r"\]$", "]"), # square brackets
|
||||
(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
|
||||
(r"(The|the|A|a|An|an)$", "DT"), # articles
|
||||
(r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"), # pronouns
|
||||
(r"(His|his|Her|her|Its|its)$", "PRP$"), # possessive
|
||||
(r"(my|Your|your|Yours|yours)$", "PRP$"), # possessive
|
||||
(r"(on|On|in|In|at|At|since|Since)$", "IN"), # time prepopsitions
|
||||
(r"(for|For|ago|Ago|before|Before)$", "IN"), # time prepopsitions
|
||||
(r"(till|Till|until|Until)$", "IN"), # time prepopsitions
|
||||
(r"(by|By|beside|Beside)$", "IN"), # space prepopsitions
|
||||
(r"(under|Under|below|Below)$", "IN"), # space prepopsitions
|
||||
(r"(over|Over|above|Above)$", "IN"), # space prepopsitions
|
||||
(r"(across|Across|through|Through)$", "IN"), # space prepopsitions
|
||||
(r"(into|Into|towards|Towards)$", "IN"), # space prepopsitions
|
||||
(r"(onto|Onto|from|From)$", "IN"), # space prepopsitions
|
||||
(r".*able$", "JJ"), # adjectives
|
||||
(r".*ness$", "NN"), # nouns formed from adjectives
|
||||
(r".*ly$", "RB"), # adverbs
|
||||
(r".*s$", "NNS"), # plural nouns
|
||||
(r".*ing$", "VBG"), # gerunds
|
||||
(r".*ed$", "VBD"), # past tense verbs
|
||||
(r".*", "NN"), # nouns (default)
|
||||
]
|
||||
)
|
||||
return _tagger.tag
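A quick sketch of what this fallback tagger produces; the expected output shown in the comment follows directly from the regex rules above (first matching pattern wins).

    # Sketch: the returned callable is RegexpTagger.tag, so it maps a token
    # list to (token, tag) pairs using only the patterns above.
    tag = malt_regex_tagger()
    print(tag("The dog walked quickly .".split()))
    # Expected from the rules above:
    # [('The', 'DT'), ('dog', 'NN'), ('walked', 'VBD'), ('quickly', 'RB'), ('.', '.')]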
|
||||
|
||||
|
||||
def find_maltparser(parser_dirname):
|
||||
"""
|
||||
A function to find the MaltParser .jar file and its dependencies.
|
||||
"""
|
||||
if os.path.exists(parser_dirname): # If a full path is given.
|
||||
_malt_dir = parser_dirname
|
||||
else: # Try to find path to maltparser directory in environment variables.
|
||||
_malt_dir = find_dir(parser_dirname, env_vars=("MALT_PARSER",))
|
||||
# Checks that the found directory contains all the necessary .jar files.
|
||||
malt_dependencies = ["", "", ""]
|
||||
_malt_jars = set(find_jars_within_path(_malt_dir))
|
||||
_jars = {os.path.split(jar)[1] for jar in _malt_jars}
|
||||
malt_dependencies = {"log4j.jar", "libsvm.jar", "liblinear-1.8.jar"}
|
||||
|
||||
assert malt_dependencies.issubset(_jars)
|
||||
assert any(
|
||||
filter(lambda i: i.startswith("maltparser-") and i.endswith(".jar"), _jars)
|
||||
)
|
||||
return list(_malt_jars)
|
||||
|
||||
|
||||
def find_malt_model(model_filename):
|
||||
"""
|
||||
A function to find a pre-trained MaltParser model.
|
||||
"""
|
||||
if model_filename is None:
|
||||
return "malt_temp.mco"
|
||||
elif os.path.exists(model_filename): # If a full path is given.
|
||||
return model_filename
|
||||
else: # Try to find path to malt model in environment variables.
|
||||
return find_file(model_filename, env_vars=("MALT_MODEL",), verbose=False)
|
||||
|
||||
|
||||
class MaltParser(ParserI):
|
||||
"""
|
||||
A class for dependency parsing with MaltParser. The input is the paths to:
|
||||
- (optionally) a maltparser directory
|
||||
- (optionally) the path to a pre-trained MaltParser .mco model file
|
||||
- (optionally) the tagger to use for POS tagging before parsing
|
||||
- (optionally) additional Java arguments
|
||||
|
||||
Example:
|
||||
>>> from nltk.parse import malt
|
||||
>>> # With MALT_PARSER and MALT_MODEL environment set.
|
||||
>>> mp = malt.MaltParser(model_filename='engmalt.linear-1.7.mco') # doctest: +SKIP
|
||||
>>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
|
||||
(shot I (elephant an) (in (pajamas my)) .)
|
||||
>>> # Without MALT_PARSER and MALT_MODEL environment.
|
||||
>>> mp = malt.MaltParser('/home/user/maltparser-1.9.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP
|
||||
>>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
|
||||
(shot I (elephant an) (in (pajamas my)) .)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
parser_dirname="",
|
||||
model_filename=None,
|
||||
tagger=None,
|
||||
additional_java_args=None,
|
||||
):
|
||||
"""
|
||||
An interface for parsing with the Malt Parser.
|
||||
|
||||
:param parser_dirname: The path to the maltparser directory that
|
||||
contains the maltparser-1.x.jar
|
||||
:type parser_dirname: str
|
||||
:param model_filename: The name of the pre-trained model with .mco file
|
||||
extension. If provided, training will not be required.
|
||||
(see http://www.maltparser.org/mco/mco.html and
|
||||
see http://www.patful.com/chalk/node/185)
|
||||
:type model_filename: str
|
||||
:param tagger: The tagger used to POS tag the raw string before
|
||||
formatting to CONLL format. It should behave like `nltk.pos_tag`
|
||||
:type tagger: function
|
||||
:param additional_java_args: Additional Java arguments to pass when
|
||||
calling MaltParser, usually heap-size limits, e.g.
|
||||
`additional_java_args=['-Xmx1024m']`
|
||||
(see https://javarevisited.blogspot.com/2011/05/java-heap-space-memory-size-jvm.html)
|
||||
:type additional_java_args: list
|
||||
"""
|
||||
|
||||
# Find all the necessary jar files for MaltParser.
|
||||
self.malt_jars = find_maltparser(parser_dirname)
|
||||
# Initialize additional java arguments.
|
||||
self.additional_java_args = (
|
||||
additional_java_args if additional_java_args is not None else []
|
||||
)
|
||||
# Initialize model.
|
||||
self.model = find_malt_model(model_filename)
|
||||
self._trained = self.model != "malt_temp.mco"
|
||||
# Set the working_dir parameters i.e. `-w` from MaltParser's option.
|
||||
self.working_dir = tempfile.gettempdir()
|
||||
# Initialize POS tagger.
|
||||
self.tagger = tagger if tagger is not None else malt_regex_tagger()
|
||||
|
||||
def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null"):
|
||||
"""
|
||||
Use MaltParser to parse multiple POS tagged sentences. Takes multiple
|
||||
sentences where each sentence is a list of (word, tag) tuples.
|
||||
The sentences must have already been tokenized and tagged.
|
||||
|
||||
:param sentences: Input sentences to parse
|
||||
:type sentences: list(list(tuple(str, str)))
|
||||
:return: iter(iter(``DependencyGraph``)) the dependency graph
|
||||
representation of each sentence
|
||||
"""
|
||||
if not self._trained:
|
||||
raise Exception("Parser has not been trained. Call train() first.")
|
||||
|
||||
with tempfile.NamedTemporaryFile(
|
||||
prefix="malt_input.conll.", dir=self.working_dir, mode="w", delete=False
|
||||
) as input_file:
|
||||
with tempfile.NamedTemporaryFile(
|
||||
prefix="malt_output.conll.",
|
||||
dir=self.working_dir,
|
||||
mode="w",
|
||||
delete=False,
|
||||
) as output_file:
|
||||
# Convert list of sentences to CONLL format.
|
||||
for line in taggedsents_to_conll(sentences):
|
||||
input_file.write(str(line))
|
||||
input_file.close()
|
||||
|
||||
# Generate command to run maltparser.
|
||||
cmd = self.generate_malt_command(
|
||||
input_file.name, output_file.name, mode="parse"
|
||||
)
|
||||
|
||||
# This is a MaltParser quirk: it needs to be run from the directory
|
||||
# where the model file is, otherwise it runs into awkward
|
||||
# missing-.jar or strange -w working_dir problems.
|
||||
_current_path = os.getcwd() # Remembers the current path.
|
||||
try: # Change to modelfile path
|
||||
os.chdir(os.path.split(self.model)[0])
|
||||
except OSError:  # e.g. the model path has no directory component
|
||||
pass
|
||||
ret = self._execute(cmd, verbose) # Run command.
|
||||
os.chdir(_current_path) # Change back to current path.
|
||||
|
||||
if ret != 0:
|
||||
raise Exception(
|
||||
"MaltParser parsing (%s) failed with exit "
|
||||
"code %d" % (" ".join(cmd), ret)
|
||||
)
|
||||
|
||||
# Must return iter(iter(Tree))
|
||||
with open(output_file.name) as infile:
|
||||
for tree_str in infile.read().split("\n\n"):
|
||||
yield (
|
||||
iter(
|
||||
[
|
||||
DependencyGraph(
|
||||
tree_str, top_relation_label=top_relation_label
|
||||
)
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
os.remove(input_file.name)
|
||||
os.remove(output_file.name)
|
||||
|
||||
def parse_sents(self, sentences, verbose=False, top_relation_label="null"):
|
||||
"""
|
||||
Use MaltParser to parse multiple sentences.
|
||||
Takes a list of sentences, where each sentence is a list of words.
|
||||
Each sentence will be automatically tagged with this
|
||||
MaltParser instance's tagger.
|
||||
|
||||
:param sentences: Input sentences to parse
|
||||
:type sentences: list(list(str))
|
||||
:return: iter(DependencyGraph)
|
||||
"""
|
||||
tagged_sentences = (self.tagger(sentence) for sentence in sentences)
|
||||
return self.parse_tagged_sents(
|
||||
tagged_sentences, verbose, top_relation_label=top_relation_label
|
||||
)
|
||||
|
||||
def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
|
||||
"""
|
||||
This function generates the maltparser command to run at the terminal.
|
||||
|
||||
:param inputfilename: path to the input file
|
||||
:type inputfilename: str
|
||||
:param outputfilename: path to the output file
|
||||
:type outputfilename: str
|
||||
"""
|
||||
|
||||
cmd = ["java"]
|
||||
cmd += self.additional_java_args # Adds additional java arguments
|
||||
# Joins classpaths with ";" if on Windows and on Linux/Mac use ":"
|
||||
classpaths_separator = ";" if sys.platform.startswith("win") else ":"
|
||||
cmd += [
|
||||
"-cp",
|
||||
classpaths_separator.join(self.malt_jars),
|
||||
] # Adds classpaths for jars
|
||||
cmd += ["org.maltparser.Malt"] # Adds the main function.
|
||||
|
||||
# Adds the model file.
|
||||
if os.path.exists(self.model): # when parsing
|
||||
cmd += ["-c", os.path.split(self.model)[-1]]
|
||||
else: # when learning
|
||||
cmd += ["-c", self.model]
|
||||
|
||||
cmd += ["-i", inputfilename]
|
||||
if mode == "parse":
|
||||
cmd += ["-o", outputfilename]
|
||||
cmd += ["-m", mode] # mode use to generate parses.
|
||||
return cmd
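For orientation, a hedged illustration of the kind of command list this method builds in parse mode; every path and file name below is a hypothetical placeholder.

    # Hypothetical example of the resulting command (placeholder paths):
    # ['java', '-Xmx1024m',
    #  '-cp', '/opt/maltparser-1.9.2/maltparser-1.9.2.jar:/opt/maltparser-1.9.2/lib/liblinear-1.8.jar:...',
    #  'org.maltparser.Malt',
    #  '-c', 'engmalt.linear-1.7.mco',
    #  '-i', '/tmp/malt_input.conll.abc123',
    #  '-o', '/tmp/malt_output.conll.abc123',
    #  '-m', 'parse']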
|
||||
|
||||
@staticmethod
|
||||
def _execute(cmd, verbose=False):
|
||||
output = None if verbose else subprocess.PIPE
|
||||
p = subprocess.Popen(cmd, stdout=output, stderr=output)
|
||||
return p.wait()
|
||||
|
||||
def train(self, depgraphs, verbose=False):
|
||||
"""
|
||||
Train MaltParser from a list of ``DependencyGraph`` objects
|
||||
|
||||
:param depgraphs: list of ``DependencyGraph`` objects for training input data
|
||||
:type depgraphs: DependencyGraph
|
||||
"""
|
||||
|
||||
# Write the conll_str to malt_train.conll file in /tmp/
|
||||
with tempfile.NamedTemporaryFile(
|
||||
prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
|
||||
) as input_file:
|
||||
input_str = "\n".join(dg.to_conll(10) for dg in depgraphs)
|
||||
input_file.write(str(input_str))
|
||||
# Trains the model with the malt_train.conll
|
||||
self.train_from_file(input_file.name, verbose=verbose)
|
||||
# Removes the malt_train.conll once training finishes.
|
||||
os.remove(input_file.name)
|
||||
|
||||
def train_from_file(self, conll_file, verbose=False):
|
||||
"""
|
||||
Train MaltParser from a file
|
||||
:param conll_file: str for the filename of the training input data
|
||||
:type conll_file: str
|
||||
"""
|
||||
|
||||
# If conll_file is a ZipFilePathPointer,
|
||||
# then we need to do some extra massaging
|
||||
if isinstance(conll_file, ZipFilePathPointer):
|
||||
with tempfile.NamedTemporaryFile(
|
||||
prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
|
||||
) as input_file:
|
||||
with conll_file.open() as conll_input_file:
|
||||
conll_str = conll_input_file.read()
|
||||
input_file.write(str(conll_str))
|
||||
return self.train_from_file(input_file.name, verbose=verbose)
|
||||
|
||||
# Generate command to run maltparser.
|
||||
cmd = self.generate_malt_command(conll_file, mode="learn")
|
||||
ret = self._execute(cmd, verbose)
|
||||
if ret != 0:
|
||||
raise Exception(
|
||||
"MaltParser training (%s) failed with exit "
|
||||
"code %d" % (" ".join(cmd), ret)
|
||||
)
|
||||
self._trained = True
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""
|
||||
A demonstration function to show how NLTK users can use the malt parser API.
|
||||
|
||||
>>> from nltk import pos_tag
|
||||
>>> assert 'MALT_PARSER' in os.environ, str(
|
||||
... "Please set MALT_PARSER in your global environment, e.g.:\n"
|
||||
... "$ export MALT_PARSER='/home/user/maltparser-1.9.2/'")
|
||||
>>>
|
||||
>>> assert 'MALT_MODEL' in os.environ, str(
|
||||
... "Please set MALT_MODEL in your global environment, e.g.:\n"
|
||||
... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'")
|
||||
>>>
|
||||
>>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
|
||||
... "2 sees _ VB _ _ 0 ROOT _ _\n"
|
||||
... "3 a _ DT _ _ 4 SPEC _ _\n"
|
||||
... "4 dog _ NN _ _ 2 OBJ _ _\n"
|
||||
... "5 . _ . _ _ 2 PUNCT _ _\n")
|
||||
>>>
|
||||
>>>
|
||||
>>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
|
||||
... "2 walks _ VB _ _ 0 ROOT _ _\n"
|
||||
... "3 . _ . _ _ 2 PUNCT _ _\n")
|
||||
>>> dg1 = DependencyGraph(_dg1_str)
|
||||
>>> dg2 = DependencyGraph(_dg2_str)
|
||||
>>> # Initialize a MaltParser object
|
||||
>>> mp = MaltParser()
|
||||
>>>
|
||||
>>> # Trains a model.
|
||||
>>> mp.train([dg1,dg2], verbose=False)
|
||||
>>> sent1 = ['John','sees','Mary', '.']
|
||||
>>> sent2 = ['John', 'walks', 'a', 'dog', '.']
|
||||
>>>
|
||||
>>> # Parse a single sentence.
|
||||
>>> parsed_sent1 = mp.parse_one(sent1)
|
||||
>>> parsed_sent2 = mp.parse_one(sent2)
|
||||
>>> print(parsed_sent1.tree())
|
||||
(sees John Mary .)
|
||||
>>> print(parsed_sent2.tree())
|
||||
(walks John (dog a) .)
|
||||
>>>
|
||||
>>> # Parsing multiple sentences.
|
||||
>>> sentences = [sent1,sent2]
|
||||
>>> parsed_sents = mp.parse_sents(sentences)
|
||||
>>> print(next(next(parsed_sents)).tree())
|
||||
(sees John Mary .)
|
||||
>>> print(next(next(parsed_sents)).tree())
|
||||
(walks John (dog a) .)
|
||||
>>>
|
||||
>>> # Initialize a MaltParser object with an English pre-trained model.
|
||||
>>> parser_dirname = 'maltparser-1.9.2'
|
||||
>>> model_name = 'engmalt.linear-1.7.mco'
|
||||
>>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag)
|
||||
>>> sent1 = 'I shot an elephant in my pajamas .'.split()
|
||||
>>> sent2 = 'Time flies like banana .'.split()
|
||||
>>> # Parse a single sentence.
|
||||
>>> print(mp.parse_one(sent1).tree())
|
||||
(shot I (elephant an) (in (pajamas my)) .)
|
||||
# Parsing multiple sentences
|
||||
>>> sentences = [sent1,sent2]
|
||||
>>> parsed_sents = mp.parse_sents(sentences)
|
||||
>>> print(next(next(parsed_sents)).tree())
|
||||
(shot I (elephant an) (in (pajamas my)) .)
|
||||
>>> print(next(next(parsed_sents)).tree())
|
||||
(flies Time (like banana) .)
|
||||
"""
|
||||
|
||||
import doctest
|
||||
|
||||
doctest.testmod()
|
||||
@@ -0,0 +1,772 @@
|
||||
# Natural Language Toolkit: Dependency Grammars
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Jason Narad <jason.narad@gmail.com>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
|
||||
import logging
|
||||
import math
|
||||
|
||||
from nltk.parse.dependencygraph import DependencyGraph
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
#################################################################
|
||||
# DependencyScorerI - Interface for Graph-Edge Weight Calculation
|
||||
#################################################################
|
||||
|
||||
|
||||
class DependencyScorerI:
|
||||
"""
|
||||
A scorer for calculating the weights on the edges of a weighted
|
||||
dependency graph. This is used by a
|
||||
``ProbabilisticNonprojectiveParser`` to initialize the edge
|
||||
weights of a ``DependencyGraph``. While typically this would be done
|
||||
by training a binary classifier, any class that can return a
|
||||
multidimensional list representation of the edge weights can
|
||||
implement this interface. As such, it has no necessary
|
||||
fields.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
if self.__class__ == DependencyScorerI:
|
||||
raise TypeError("DependencyScorerI is an abstract interface")
|
||||
|
||||
def train(self, graphs):
|
||||
"""
|
||||
:type graphs: list(DependencyGraph)
|
||||
:param graphs: A list of dependency graphs to train the scorer.
|
||||
Typically the edges present in the graphs can be used as
|
||||
positive training examples, and the edges not present as negative
|
||||
examples.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def score(self, graph):
|
||||
"""
|
||||
:type graph: DependencyGraph
|
||||
:param graph: A dependency graph whose set of edges need to be
|
||||
scored.
|
||||
:rtype: A three-dimensional list of numbers.
|
||||
:return: The score is returned in a multidimensional(3) list, such
|
||||
that the outer-dimension refers to the head, and the
|
||||
inner-dimension refers to the dependencies. For instance,
|
||||
scores[0][1] would reference the list of scores corresponding to
|
||||
arcs from node 0 to node 1. The node's 'address' field can be used
|
||||
to determine its number identification.
|
||||
|
||||
For further illustration, a score list corresponding to Fig.2 of
|
||||
Keith Hall's 'K-best Spanning Tree Parsing' paper::
|
||||
|
||||
scores = [[[], [5], [1], [1]],
|
||||
[[], [], [11], [4]],
|
||||
[[], [10], [], [5]],
|
||||
[[], [8], [8], []]]
|
||||
|
||||
When used in conjunction with a MaxEntClassifier, each score would
|
||||
correspond to the confidence of a particular edge being classified
|
||||
with the positive training examples.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
#################################################################
|
||||
# NaiveBayesDependencyScorer
|
||||
#################################################################
|
||||
|
||||
|
||||
class NaiveBayesDependencyScorer(DependencyScorerI):
|
||||
"""
|
||||
A dependency scorer built around a probabilistic classifier. In this
|
||||
particular class that classifier is a ``NaiveBayesClassifier``.
|
||||
It uses head-word, head-tag, child-word, and child-tag features
|
||||
for classification.
|
||||
|
||||
>>> from nltk.parse.dependencygraph import DependencyGraph, conll_data2
|
||||
|
||||
>>> graphs = [DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry]
|
||||
>>> npp = ProbabilisticNonprojectiveParser()
|
||||
>>> npp.train(graphs, NaiveBayesDependencyScorer())
|
||||
>>> parses = npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc'])
|
||||
>>> len(list(parses))
|
||||
1
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass # Do nothing without throwing error
|
||||
|
||||
def train(self, graphs):
|
||||
"""
|
||||
Trains a ``NaiveBayesClassifier`` using the edges present in
|
||||
graphs list as positive examples, the edges not present as
|
||||
negative examples. Uses a feature vector of head-word,
|
||||
head-tag, child-word, and child-tag.
|
||||
|
||||
:type graphs: list(DependencyGraph)
|
||||
:param graphs: A list of dependency graphs to train the scorer.
|
||||
"""
|
||||
|
||||
from nltk.classify import NaiveBayesClassifier
|
||||
|
||||
# Create labeled training examples
|
||||
labeled_examples = []
|
||||
for graph in graphs:
|
||||
for head_node in graph.nodes.values():
|
||||
for child_index, child_node in graph.nodes.items():
|
||||
if child_index in head_node["deps"]:
|
||||
label = "T"
|
||||
else:
|
||||
label = "F"
|
||||
labeled_examples.append(
|
||||
(
|
||||
dict(
|
||||
a=head_node["word"],
|
||||
b=head_node["tag"],
|
||||
c=child_node["word"],
|
||||
d=child_node["tag"],
|
||||
),
|
||||
label,
|
||||
)
|
||||
)
|
||||
|
||||
self.classifier = NaiveBayesClassifier.train(labeled_examples)
|
||||
|
||||
def score(self, graph):
|
||||
"""
|
||||
Converts the graph into a feature-based representation of
|
||||
each edge, and then assigns a score to each based on the
|
||||
confidence of the classifier in assigning it to the
|
||||
positive label. Scores are returned in a multidimensional list.
|
||||
|
||||
:type graph: DependencyGraph
|
||||
:param graph: A dependency graph to score.
|
||||
:rtype: 3 dimensional list
|
||||
:return: Edge scores for the graph parameter.
|
||||
"""
|
||||
# Convert graph to feature representation
|
||||
edges = []
|
||||
for head_node in graph.nodes.values():
|
||||
for child_node in graph.nodes.values():
|
||||
edges.append(
|
||||
dict(
|
||||
a=head_node["word"],
|
||||
b=head_node["tag"],
|
||||
c=child_node["word"],
|
||||
d=child_node["tag"],
|
||||
)
|
||||
)
|
||||
|
||||
# Score edges
|
||||
edge_scores = []
|
||||
row = []
|
||||
count = 0
|
||||
for pdist in self.classifier.prob_classify_many(edges):
|
||||
logger.debug("%.4f %.4f", pdist.prob("T"), pdist.prob("F"))
|
||||
# smoothing in case the probability = 0
|
||||
row.append([math.log(pdist.prob("T") + 0.00000000001)])
|
||||
count += 1
|
||||
if count == len(graph.nodes):
|
||||
edge_scores.append(row)
|
||||
row = []
|
||||
count = 0
|
||||
return edge_scores
|
||||
|
||||
|
||||
#################################################################
|
||||
# A Scorer for Demo Purposes
|
||||
#################################################################
|
||||
# A short class necessary to show parsing example from paper
|
||||
class DemoScorer(DependencyScorerI):
|
||||
def train(self, graphs):
|
||||
print("Training...")
|
||||
|
||||
def score(self, graph):
|
||||
# scores for Keith Hall 'K-best Spanning Tree Parsing' paper
|
||||
return [
|
||||
[[], [5], [1], [1]],
|
||||
[[], [], [11], [4]],
|
||||
[[], [10], [], [5]],
|
||||
[[], [8], [8], []],
|
||||
]
|
||||
|
||||
|
||||
#################################################################
|
||||
# Non-Projective Probabilistic Parsing
|
||||
#################################################################
|
||||
|
||||
|
||||
class ProbabilisticNonprojectiveParser:
|
||||
"""A probabilistic non-projective dependency parser.
|
||||
|
||||
Nonprojective dependencies allow for "crossing branches" in the parse tree,
|
||||
which is necessary for representing particular linguistic phenomena, or even
|
||||
typical parses in some languages. This parser follows the MST parsing
|
||||
algorithm, outlined in McDonald (2005), which likens the search for the best
|
||||
non-projective parse to finding the maximum spanning tree in a weighted
|
||||
directed graph.
|
||||
|
||||
>>> class Scorer(DependencyScorerI):
|
||||
... def train(self, graphs):
|
||||
... pass
|
||||
...
|
||||
... def score(self, graph):
|
||||
... return [
|
||||
... [[], [5], [1], [1]],
|
||||
... [[], [], [11], [4]],
|
||||
... [[], [10], [], [5]],
|
||||
... [[], [8], [8], []],
|
||||
... ]
|
||||
|
||||
|
||||
>>> npp = ProbabilisticNonprojectiveParser()
|
||||
>>> npp.train([], Scorer())
|
||||
|
||||
>>> parses = npp.parse(['v1', 'v2', 'v3'], [None, None, None])
|
||||
>>> len(list(parses))
|
||||
1
|
||||
|
||||
Rule based example
|
||||
|
||||
>>> from nltk.grammar import DependencyGrammar
|
||||
|
||||
>>> grammar = DependencyGrammar.fromstring('''
|
||||
... 'taught' -> 'play' | 'man'
|
||||
... 'man' -> 'the' | 'in'
|
||||
... 'in' -> 'corner'
|
||||
... 'corner' -> 'the'
|
||||
... 'play' -> 'golf' | 'dachshund' | 'to'
|
||||
... 'dachshund' -> 'his'
|
||||
... ''')
|
||||
|
||||
>>> ndp = NonprojectiveDependencyParser(grammar)
|
||||
>>> parses = ndp.parse(['the', 'man', 'in', 'the', 'corner', 'taught', 'his', 'dachshund', 'to', 'play', 'golf'])
|
||||
>>> len(list(parses))
|
||||
4
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""
|
||||
Creates a new non-projective parser.
|
||||
"""
|
||||
logging.debug("initializing prob. nonprojective...")
|
||||
|
||||
def train(self, graphs, dependency_scorer):
|
||||
"""
|
||||
Trains a ``DependencyScorerI`` from a set of ``DependencyGraph`` objects,
|
||||
and establishes this as the parser's scorer. This is used to
|
||||
initialize the scores on a ``DependencyGraph`` during the parsing
|
||||
procedure.
|
||||
|
||||
:type graphs: list(DependencyGraph)
|
||||
:param graphs: A list of dependency graphs to train the scorer.
|
||||
:type dependency_scorer: DependencyScorerI
|
||||
:param dependency_scorer: A scorer which implements the
|
||||
``DependencyScorerI`` interface.
|
||||
"""
|
||||
self._scorer = dependency_scorer
|
||||
self._scorer.train(graphs)
|
||||
|
||||
def initialize_edge_scores(self, graph):
|
||||
"""
|
||||
Assigns a score to every edge in the ``DependencyGraph`` graph.
|
||||
These scores are generated via the parser's scorer which
|
||||
was assigned during the training process.
|
||||
|
||||
:type graph: DependencyGraph
|
||||
:param graph: A dependency graph to assign scores to.
|
||||
"""
|
||||
self.scores = self._scorer.score(graph)
|
||||
|
||||
def collapse_nodes(self, new_node, cycle_path, g_graph, b_graph, c_graph):
|
||||
"""
|
||||
Takes a list of nodes that have been identified to belong to a cycle,
|
||||
and collapses them into one larger node. The arcs of all nodes in
|
||||
the graph must be updated to account for this.
|
||||
|
||||
:type new_node: Node.
|
||||
:param new_node: A Node (Dictionary) to collapse the cycle nodes into.
|
||||
:type cycle_path: A list of integers.
|
||||
:param cycle_path: A list of node addresses, each of which is in the cycle.
|
||||
:type g_graph, b_graph, c_graph: DependencyGraph
|
||||
:param g_graph, b_graph, c_graph: Graphs which need to be updated.
|
||||
"""
|
||||
logger.debug("Collapsing nodes...")
|
||||
# Collapse all cycle nodes into v_n+1 in G_Graph
|
||||
for cycle_node_index in cycle_path:
|
||||
g_graph.remove_by_address(cycle_node_index)
|
||||
g_graph.add_node(new_node)
|
||||
g_graph.redirect_arcs(cycle_path, new_node["address"])
|
||||
|
||||
def update_edge_scores(self, new_node, cycle_path):
|
||||
"""
|
||||
Updates the edge scores to reflect a collapse operation into
|
||||
new_node.
|
||||
|
||||
:type new_node: A Node.
|
||||
:param new_node: The node which cycle nodes are collapsed into.
|
||||
:type cycle_path: A list of integers.
|
||||
:param cycle_path: A list of node addresses that belong to the cycle.
|
||||
"""
|
||||
logger.debug("cycle %s", cycle_path)
|
||||
|
||||
cycle_path = self.compute_original_indexes(cycle_path)
|
||||
|
||||
logger.debug("old cycle %s", cycle_path)
|
||||
logger.debug("Prior to update: %s", self.scores)
|
||||
|
||||
for i, row in enumerate(self.scores):
|
||||
for j, column in enumerate(self.scores[i]):
|
||||
logger.debug(self.scores[i][j])
|
||||
if j in cycle_path and i not in cycle_path and self.scores[i][j]:
|
||||
subtract_val = self.compute_max_subtract_score(j, cycle_path)
|
||||
|
||||
logger.debug("%s - %s", self.scores[i][j], subtract_val)
|
||||
|
||||
new_vals = []
|
||||
for cur_val in self.scores[i][j]:
|
||||
new_vals.append(cur_val - subtract_val)
|
||||
|
||||
self.scores[i][j] = new_vals
|
||||
|
||||
for i, row in enumerate(self.scores):
|
||||
for j, cell in enumerate(self.scores[i]):
|
||||
if i in cycle_path and j in cycle_path:
|
||||
self.scores[i][j] = []
|
||||
|
||||
logger.debug("After update: %s", self.scores)
|
||||
|
||||
def compute_original_indexes(self, new_indexes):
|
||||
"""
|
||||
As nodes are collapsed into others, they are replaced
|
||||
by the new node in the graph, but it's still necessary
|
||||
to keep track of what these original nodes were. This
|
||||
takes a list of node addresses and replaces any collapsed
|
||||
node addresses with their original addresses.
|
||||
|
||||
:type new_indexes: A list of integers.
|
||||
:param new_indexes: A list of node addresses to check for
|
||||
subsumed nodes.
|
||||
"""
|
||||
swapped = True
|
||||
while swapped:
|
||||
originals = []
|
||||
swapped = False
|
||||
for new_index in new_indexes:
|
||||
if new_index in self.inner_nodes:
|
||||
for old_val in self.inner_nodes[new_index]:
|
||||
if old_val not in originals:
|
||||
originals.append(old_val)
|
||||
swapped = True
|
||||
else:
|
||||
originals.append(new_index)
|
||||
new_indexes = originals
|
||||
return new_indexes
|
||||
|
||||
def compute_max_subtract_score(self, column_index, cycle_indexes):
|
||||
"""
|
||||
When updating scores the score of the highest-weighted incoming
|
||||
arc is subtracted upon collapse. This returns the correct
|
||||
amount to subtract from that edge.
|
||||
|
||||
:type column_index: integer.
|
||||
:param column_index: An index representing the column of incoming arcs
|
||||
to a particular node being updated
|
||||
:type cycle_indexes: A list of integers.
|
||||
:param cycle_indexes: Only arcs from cycle nodes are considered. This
|
||||
is a list of such nodes addresses.
|
||||
"""
|
||||
max_score = -100000
|
||||
for row_index in cycle_indexes:
|
||||
for subtract_val in self.scores[row_index][column_index]:
|
||||
if subtract_val > max_score:
|
||||
max_score = subtract_val
|
||||
return max_score
|
||||
|
||||
def best_incoming_arc(self, node_index):
|
||||
"""
|
||||
Returns the source of the best incoming arc to the
|
||||
node with address: node_index
|
||||
|
||||
:type node_index: integer.
|
||||
:param node_index: The address of the 'destination' node,
|
||||
the node that is arced to.
|
||||
"""
|
||||
originals = self.compute_original_indexes([node_index])
|
||||
logger.debug("originals: %s", originals)
|
||||
|
||||
max_arc = None
|
||||
max_score = None
|
||||
for row_index in range(len(self.scores)):
|
||||
for col_index in range(len(self.scores[row_index])):
|
||||
if col_index in originals and (
|
||||
max_score is None or self.scores[row_index][col_index] > max_score
|
||||
):
|
||||
max_score = self.scores[row_index][col_index]
|
||||
max_arc = row_index
|
||||
logger.debug("%s, %s", row_index, col_index)
|
||||
|
||||
logger.debug(max_score)
|
||||
|
||||
for key in self.inner_nodes:
|
||||
replaced_nodes = self.inner_nodes[key]
|
||||
if max_arc in replaced_nodes:
|
||||
return key
|
||||
|
||||
return max_arc
|
||||
|
||||
def original_best_arc(self, node_index):
|
||||
originals = self.compute_original_indexes([node_index])
|
||||
max_arc = None
|
||||
max_score = None
|
||||
max_orig = None
|
||||
for row_index in range(len(self.scores)):
|
||||
for col_index in range(len(self.scores[row_index])):
|
||||
if col_index in originals and (
|
||||
max_score is None or self.scores[row_index][col_index] > max_score
|
||||
):
|
||||
max_score = self.scores[row_index][col_index]
|
||||
max_arc = row_index
|
||||
max_orig = col_index
|
||||
return [max_arc, max_orig]
|
||||
|
||||
def parse(self, tokens, tags):
|
||||
"""
|
||||
Parses a list of tokens in accordance to the MST parsing algorithm
|
||||
for non-projective dependency parses. Assumes that the tokens to
|
||||
be parsed have already been tagged and those tags are provided. Various
|
||||
scoring methods can be used by implementing the ``DependencyScorerI``
|
||||
interface and passing it to the training algorithm.
|
||||
|
||||
:type tokens: list(str)
|
||||
:param tokens: A list of words or punctuation to be parsed.
|
||||
:type tags: list(str)
|
||||
:param tags: A list of tags corresponding by index to the words in the tokens list.
|
||||
:return: An iterator of non-projective parses.
|
||||
:rtype: iter(DependencyGraph)
|
||||
"""
|
||||
self.inner_nodes = {}
|
||||
|
||||
# Initialize g_graph
|
||||
g_graph = DependencyGraph()
|
||||
for index, token in enumerate(tokens):
|
||||
g_graph.nodes[index + 1].update(
|
||||
{"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
|
||||
)
|
||||
|
||||
# Fully connect non-root nodes in g_graph
|
||||
g_graph.connect_graph()
|
||||
original_graph = DependencyGraph()
|
||||
for index, token in enumerate(tokens):
|
||||
original_graph.nodes[index + 1].update(
|
||||
{"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
|
||||
)
|
||||
|
||||
b_graph = DependencyGraph()
|
||||
c_graph = DependencyGraph()
|
||||
|
||||
for index, token in enumerate(tokens):
|
||||
c_graph.nodes[index + 1].update(
|
||||
{"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
|
||||
)
|
||||
|
||||
# Assign initial scores to g_graph edges
|
||||
self.initialize_edge_scores(g_graph)
|
||||
logger.debug(self.scores)
|
||||
# Initialize a list of unvisited vertices (by node address)
|
||||
unvisited_vertices = [vertex["address"] for vertex in c_graph.nodes.values()]
|
||||
# Iterate over unvisited vertices
|
||||
nr_vertices = len(tokens)
|
||||
betas = {}
|
||||
while unvisited_vertices:
|
||||
# Mark current node as visited
|
||||
current_vertex = unvisited_vertices.pop(0)
|
||||
logger.debug("current_vertex: %s", current_vertex)
|
||||
# Get corresponding node n_i to vertex v_i
|
||||
current_node = g_graph.get_by_address(current_vertex)
|
||||
logger.debug("current_node: %s", current_node)
|
||||
# Get best in-edge node b for current node
|
||||
best_in_edge = self.best_incoming_arc(current_vertex)
|
||||
betas[current_vertex] = self.original_best_arc(current_vertex)
|
||||
logger.debug("best in arc: %s --> %s", best_in_edge, current_vertex)
|
||||
# b_graph = Union(b_graph, b)
|
||||
for new_vertex in [current_vertex, best_in_edge]:
|
||||
b_graph.nodes[new_vertex].update(
|
||||
{"word": "TEMP", "rel": "NTOP", "address": new_vertex}
|
||||
)
|
||||
b_graph.add_arc(best_in_edge, current_vertex)
|
||||
# Beta(current node) = b - stored for parse recovery
|
||||
# If b_graph contains a cycle, collapse it
|
||||
cycle_path = b_graph.contains_cycle()
|
||||
if cycle_path:
|
||||
# Create a new node v_n+1 with address = len(nodes) + 1
|
||||
new_node = {"word": "NONE", "rel": "NTOP", "address": nr_vertices + 1}
|
||||
# c_graph = Union(c_graph, v_n+1)
|
||||
c_graph.add_node(new_node)
|
||||
# Collapse all nodes in cycle C into v_n+1
|
||||
self.update_edge_scores(new_node, cycle_path)
|
||||
self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph)
|
||||
for cycle_index in cycle_path:
|
||||
c_graph.add_arc(new_node["address"], cycle_index)
|
||||
# self.replaced_by[cycle_index] = new_node['address']
|
||||
|
||||
self.inner_nodes[new_node["address"]] = cycle_path
|
||||
|
||||
# Add v_n+1 to list of unvisited vertices
|
||||
unvisited_vertices.insert(0, nr_vertices + 1)
|
||||
|
||||
# increment # of nodes counter
|
||||
nr_vertices += 1
|
||||
|
||||
# Remove cycle nodes from b_graph; B = B - cycle c
|
||||
for cycle_node_address in cycle_path:
|
||||
b_graph.remove_by_address(cycle_node_address)
|
||||
|
||||
logger.debug("g_graph: %s", g_graph)
|
||||
logger.debug("b_graph: %s", b_graph)
|
||||
logger.debug("c_graph: %s", c_graph)
|
||||
logger.debug("Betas: %s", betas)
|
||||
logger.debug("replaced nodes %s", self.inner_nodes)
|
||||
|
||||
# Recover parse tree
|
||||
logger.debug("Final scores: %s", self.scores)
|
||||
|
||||
logger.debug("Recovering parse...")
|
||||
for i in range(len(tokens) + 1, nr_vertices + 1):
|
||||
betas[betas[i][1]] = betas[i]
|
||||
|
||||
logger.debug("Betas: %s", betas)
|
||||
for node in original_graph.nodes.values():
|
||||
# TODO: It's dangerous to assume that deps is a dictionary
|
||||
# because it's a default dictionary. Ideally, here we should not
|
||||
# be concerned how dependencies are stored inside of a dependency
|
||||
# graph.
|
||||
node["deps"] = {}
|
||||
for i in range(1, len(tokens) + 1):
|
||||
original_graph.add_arc(betas[i][0], betas[i][1])
|
||||
|
||||
logger.debug("Done.")
|
||||
yield original_graph
|
||||
|
||||
|
||||
#################################################################
|
||||
# Rule-based Non-Projective Parser
|
||||
#################################################################
|
||||
|
||||
|
||||
class NonprojectiveDependencyParser:
|
||||
"""
|
||||
A non-projective, rule-based, dependency parser. This parser
|
||||
will return the set of all possible non-projective parses based on
|
||||
the word-to-word relations defined in the parser's dependency
|
||||
grammar, and will allow the branches of the parse tree to cross
|
||||
in order to capture a variety of linguistic phenomena that a
|
||||
projective parser will not.
|
||||
"""
|
||||
|
||||
def __init__(self, dependency_grammar):
|
||||
"""
|
||||
Creates a new ``NonprojectiveDependencyParser``.
|
||||
|
||||
:param dependency_grammar: a grammar of word-to-word relations.
|
||||
:type dependency_grammar: DependencyGrammar
|
||||
"""
|
||||
self._grammar = dependency_grammar
|
||||
|
||||
def parse(self, tokens):
|
||||
"""
|
||||
Parses the input tokens with respect to the parser's grammar. Parsing
|
||||
is accomplished by representing the search-space of possible parses as
|
||||
a fully-connected directed graph. Arcs that would lead to ungrammatical
|
||||
parses are removed and a lattice is constructed of length n, where n is
|
||||
the number of input tokens, to represent all possible grammatical
|
||||
traversals. All possible paths through the lattice are then enumerated
|
||||
to produce the set of non-projective parses.
|
||||
|
||||
:param tokens: A list of tokens to parse.
|
||||
:type tokens: list(str)
|
||||
:return: An iterator of non-projective parses.
|
||||
:rtype: iter(DependencyGraph)
|
||||
"""
|
||||
# Create graph representation of tokens
|
||||
self._graph = DependencyGraph()
|
||||
|
||||
for index, token in enumerate(tokens):
|
||||
self._graph.nodes[index] = {
|
||||
"word": token,
|
||||
"deps": [],
|
||||
"rel": "NTOP",
|
||||
"address": index,
|
||||
}
|
||||
|
||||
for head_node in self._graph.nodes.values():
|
||||
deps = []
|
||||
for dep_node in self._graph.nodes.values():
|
||||
if (
|
||||
self._grammar.contains(head_node["word"], dep_node["word"])
|
||||
and head_node["word"] != dep_node["word"]
|
||||
):
|
||||
deps.append(dep_node["address"])
|
||||
head_node["deps"] = deps
|
||||
|
||||
# Create lattice of possible heads
|
||||
roots = []
|
||||
possible_heads = []
|
||||
for i, word in enumerate(tokens):
|
||||
heads = []
|
||||
for j, head in enumerate(tokens):
|
||||
if (i != j) and self._grammar.contains(head, word):
|
||||
heads.append(j)
|
||||
if len(heads) == 0:
|
||||
roots.append(i)
|
||||
possible_heads.append(heads)
|
||||
|
||||
# Set roots to attempt
|
||||
if len(roots) < 2:
|
||||
if len(roots) == 0:
|
||||
for i in range(len(tokens)):
|
||||
roots.append(i)
|
||||
|
||||
# Traverse lattice
|
||||
analyses = []
|
||||
for _ in roots:
|
||||
stack = []
|
||||
analysis = [[] for i in range(len(possible_heads))]
|
||||
i = 0
|
||||
forward = True
|
||||
while i >= 0:
|
||||
if forward:
|
||||
if len(possible_heads[i]) == 1:
|
||||
analysis[i] = possible_heads[i][0]
|
||||
elif len(possible_heads[i]) == 0:
|
||||
analysis[i] = -1
|
||||
else:
|
||||
head = possible_heads[i].pop()
|
||||
analysis[i] = head
|
||||
stack.append([i, head])
|
||||
if not forward:
|
||||
index_on_stack = False
|
||||
for stack_item in stack:
|
||||
if stack_item[0] == i:
|
||||
index_on_stack = True
|
||||
orig_length = len(possible_heads[i])
|
||||
|
||||
if index_on_stack and orig_length == 0:
|
||||
for j in range(len(stack) - 1, -1, -1):
|
||||
stack_item = stack[j]
|
||||
if stack_item[0] == i:
|
||||
possible_heads[i].append(stack.pop(j)[1])
|
||||
|
||||
elif index_on_stack and orig_length > 0:
|
||||
head = possible_heads[i].pop()
|
||||
analysis[i] = head
|
||||
stack.append([i, head])
|
||||
forward = True
|
||||
|
||||
if i + 1 == len(possible_heads):
|
||||
analyses.append(analysis[:])
|
||||
forward = False
|
||||
if forward:
|
||||
i += 1
|
||||
else:
|
||||
i -= 1
|
||||
|
||||
# Filter parses
|
||||
# ensure 1 root, everything has 1 head
|
||||
for analysis in analyses:
|
||||
if analysis.count(-1) > 1:
|
||||
# there are several root elements!
|
||||
continue
|
||||
|
||||
graph = DependencyGraph()
|
||||
graph.root = graph.nodes[analysis.index(-1) + 1]
|
||||
|
||||
for address, (token, head_index) in enumerate(
|
||||
zip(tokens, analysis), start=1
|
||||
):
|
||||
head_address = head_index + 1
|
||||
|
||||
node = graph.nodes[address]
|
||||
node.update({"word": token, "address": address})
|
||||
|
||||
if head_address == 0:
|
||||
rel = "ROOT"
|
||||
else:
|
||||
rel = ""
|
||||
graph.nodes[head_index + 1]["deps"][rel].append(address)
|
||||
|
||||
# TODO: check for cycles
|
||||
yield graph
|
||||
|
||||
|
||||
#################################################################
|
||||
# Demos
|
||||
#################################################################
|
||||
|
||||
|
||||
def demo():
|
||||
# hall_demo()
|
||||
nonprojective_conll_parse_demo()
|
||||
rule_based_demo()
|
||||
|
||||
|
||||
def hall_demo():
|
||||
npp = ProbabilisticNonprojectiveParser()
|
||||
npp.train([], DemoScorer())
|
||||
for parse_graph in npp.parse(["v1", "v2", "v3"], [None, None, None]):
|
||||
print(parse_graph)
|
||||
|
||||
|
||||
def nonprojective_conll_parse_demo():
|
||||
from nltk.parse.dependencygraph import conll_data2
|
||||
|
||||
graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
|
||||
npp = ProbabilisticNonprojectiveParser()
|
||||
npp.train(graphs, NaiveBayesDependencyScorer())
|
||||
for parse_graph in npp.parse(
|
||||
["Cathy", "zag", "hen", "zwaaien", "."], ["N", "V", "Pron", "Adj", "N", "Punc"]
|
||||
):
|
||||
print(parse_graph)
|
||||
|
||||
|
||||
def rule_based_demo():
|
||||
from nltk.grammar import DependencyGrammar
|
||||
|
||||
grammar = DependencyGrammar.fromstring(
|
||||
"""
|
||||
'taught' -> 'play' | 'man'
|
||||
'man' -> 'the' | 'in'
|
||||
'in' -> 'corner'
|
||||
'corner' -> 'the'
|
||||
'play' -> 'golf' | 'dachshund' | 'to'
|
||||
'dachshund' -> 'his'
|
||||
"""
|
||||
)
|
||||
print(grammar)
|
||||
ndp = NonprojectiveDependencyParser(grammar)
|
||||
graphs = ndp.parse(
|
||||
[
|
||||
"the",
|
||||
"man",
|
||||
"in",
|
||||
"the",
|
||||
"corner",
|
||||
"taught",
|
||||
"his",
|
||||
"dachshund",
|
||||
"to",
|
||||
"play",
|
||||
"golf",
|
||||
]
|
||||
)
|
||||
print("Graphs:")
|
||||
for graph in graphs:
|
||||
print(graph)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
579
Backend/venv/lib/python3.12/site-packages/nltk/parse/pchart.py
Normal file
@@ -0,0 +1,579 @@
|
||||
# Natural Language Toolkit: Probabilistic Chart Parsers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Classes and interfaces for associating probabilities with tree
|
||||
structures that represent the internal organization of a text. The
|
||||
probabilistic parser module defines ``BottomUpProbabilisticChartParser``.
|
||||
|
||||
``BottomUpProbabilisticChartParser`` is an abstract class that implements
|
||||
a bottom-up chart parser for ``PCFG`` grammars. It maintains a queue of edges,
|
||||
and adds them to the chart one at a time. The ordering of this queue
|
||||
is based on the probabilities associated with the edges, allowing the
|
||||
parser to expand more likely edges before less likely ones. Each
|
||||
subclass implements a different queue ordering, producing different
|
||||
search strategies. Currently the following subclasses are defined:
|
||||
|
||||
- ``InsideChartParser`` searches edges in decreasing order of
|
||||
their trees' inside probabilities.
|
||||
- ``RandomChartParser`` searches edges in random order.
|
||||
- ``LongestChartParser`` searches edges in decreasing order of their
|
||||
location's length.
|
||||
|
||||
The ``BottomUpProbabilisticChartParser`` constructor has an optional
|
||||
argument beam_size. If non-zero, this controls the size of the beam
|
||||
(aka the edge queue). This option is most useful with InsideChartParser.
|
||||
"""
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Bottom-Up PCFG Chart Parser
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
# [XX] This might not be implemented quite right -- it would be better
|
||||
# to associate probabilities with child pointer lists.
|
||||
|
||||
import random
|
||||
from functools import reduce
|
||||
|
||||
from nltk.grammar import PCFG, Nonterminal
|
||||
from nltk.parse.api import ParserI
|
||||
from nltk.parse.chart import AbstractChartRule, Chart, LeafEdge, TreeEdge
|
||||
from nltk.tree import ProbabilisticTree, Tree
|
||||
|
||||
|
||||
# Probabilistic edges
|
||||
class ProbabilisticLeafEdge(LeafEdge):
|
||||
def prob(self):
|
||||
return 1.0
|
||||
|
||||
|
||||
class ProbabilisticTreeEdge(TreeEdge):
|
||||
def __init__(self, prob, *args, **kwargs):
|
||||
TreeEdge.__init__(self, *args, **kwargs)
|
||||
self._prob = prob
|
||||
# two edges with different probabilities are not equal.
|
||||
self._comparison_key = (self._comparison_key, prob)
|
||||
|
||||
def prob(self):
|
||||
return self._prob
|
||||
|
||||
@staticmethod
|
||||
def from_production(production, index, p):
|
||||
return ProbabilisticTreeEdge(
|
||||
p, (index, index), production.lhs(), production.rhs(), 0
|
||||
)
|
||||
|
||||
|
||||
# Rules using probabilistic edges
|
||||
class ProbabilisticBottomUpInitRule(AbstractChartRule):
|
||||
NUM_EDGES = 0
|
||||
|
||||
def apply(self, chart, grammar):
|
||||
for index in range(chart.num_leaves()):
|
||||
new_edge = ProbabilisticLeafEdge(chart.leaf(index), index)
|
||||
if chart.insert(new_edge, ()):
|
||||
yield new_edge
|
||||
|
||||
|
||||
class ProbabilisticBottomUpPredictRule(AbstractChartRule):
|
||||
NUM_EDGES = 1
|
||||
|
||||
def apply(self, chart, grammar, edge):
|
||||
if edge.is_incomplete():
|
||||
return
|
||||
for prod in grammar.productions():
|
||||
if edge.lhs() == prod.rhs()[0]:
|
||||
new_edge = ProbabilisticTreeEdge.from_production(
|
||||
prod, edge.start(), prod.prob()
|
||||
)
|
||||
if chart.insert(new_edge, ()):
|
||||
yield new_edge
|
||||
|
||||
|
||||
class ProbabilisticFundamentalRule(AbstractChartRule):
|
||||
NUM_EDGES = 2
|
||||
|
||||
def apply(self, chart, grammar, left_edge, right_edge):
|
||||
# Make sure the rule is applicable.
|
||||
if not (
|
||||
left_edge.end() == right_edge.start()
|
||||
and left_edge.nextsym() == right_edge.lhs()
|
||||
and left_edge.is_incomplete()
|
||||
and right_edge.is_complete()
|
||||
):
|
||||
return
|
||||
|
||||
# Construct the new edge.
|
||||
p = left_edge.prob() * right_edge.prob()
|
||||
new_edge = ProbabilisticTreeEdge(
|
||||
p,
|
||||
span=(left_edge.start(), right_edge.end()),
|
||||
lhs=left_edge.lhs(),
|
||||
rhs=left_edge.rhs(),
|
||||
dot=left_edge.dot() + 1,
|
||||
)
|
||||
|
||||
# Add it to the chart, with appropriate child pointers.
|
||||
changed_chart = False
|
||||
for cpl1 in chart.child_pointer_lists(left_edge):
|
||||
if chart.insert(new_edge, cpl1 + (right_edge,)):
|
||||
changed_chart = True
|
||||
|
||||
# If we changed the chart, then generate the edge.
|
||||
if changed_chart:
|
||||
yield new_edge
|
||||
|
||||
|
||||
class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule):
|
||||
NUM_EDGES = 1
|
||||
|
||||
_fundamental_rule = ProbabilisticFundamentalRule()
|
||||
|
||||
def apply(self, chart, grammar, edge1):
|
||||
fr = self._fundamental_rule
|
||||
if edge1.is_incomplete():
|
||||
# edge1 = left_edge; edge2 = right_edge
|
||||
for edge2 in chart.select(
|
||||
start=edge1.end(), is_complete=True, lhs=edge1.nextsym()
|
||||
):
|
||||
yield from fr.apply(chart, grammar, edge1, edge2)
|
||||
else:
|
||||
# edge2 = left_edge; edge1 = right_edge
|
||||
for edge2 in chart.select(
|
||||
end=edge1.start(), is_complete=False, nextsym=edge1.lhs()
|
||||
):
|
||||
yield from fr.apply(chart, grammar, edge2, edge1)
|
||||
|
||||
def __str__(self):
|
||||
return "Fundamental Rule"
|
||||
|
||||
|
||||
class BottomUpProbabilisticChartParser(ParserI):
|
||||
"""
|
||||
An abstract bottom-up parser for ``PCFG`` grammars that uses a ``Chart`` to
|
||||
record partial results. ``BottomUpProbabilisticChartParser`` maintains
|
||||
a queue of edges that can be added to the chart. This queue is
|
||||
initialized with edges for each token in the text that is being
|
||||
parsed. ``BottomUpProbabilisticChartParser`` inserts these edges into
|
||||
the chart one at a time, starting with the most likely edges, and
|
||||
proceeding to less likely edges. For each edge that is added to
|
||||
the chart, it may become possible to insert additional edges into
|
||||
the chart; these are added to the queue. This process continues
|
||||
until enough complete parses have been generated, or until the
|
||||
queue is empty.
|
||||
|
||||
The sorting order for the queue is not specified by
|
||||
``BottomUpProbabilisticChartParser``. Different sorting orders will
|
||||
result in different search strategies. The sorting order for the
|
||||
queue is defined by the method ``sort_queue``; subclasses are required
|
||||
to provide a definition for this method.
|
||||
|
||||
:type _grammar: PCFG
|
||||
:ivar _grammar: The grammar used to parse sentences.
|
||||
:type _trace: int
|
||||
:ivar _trace: The level of tracing output that should be generated
|
||||
when parsing a text.
|
||||
"""
|
||||
|
||||
def __init__(self, grammar, beam_size=0, trace=0):
|
||||
"""
|
||||
Create a new ``BottomUpProbabilisticChartParser``, that uses
|
||||
``grammar`` to parse texts.
|
||||
|
||||
:type grammar: PCFG
|
||||
:param grammar: The grammar used to parse texts.
|
||||
:type beam_size: int
|
||||
:param beam_size: The maximum length for the parser's edge queue.
|
||||
:type trace: int
|
||||
:param trace: The level of tracing that should be used when
|
||||
parsing a text. ``0`` will generate no tracing output;
|
||||
and higher numbers will produce more verbose tracing
|
||||
output.
|
||||
"""
|
||||
if not isinstance(grammar, PCFG):
|
||||
raise ValueError("The grammar must be probabilistic PCFG")
|
||||
self._grammar = grammar
|
||||
self.beam_size = beam_size
|
||||
self._trace = trace
|
||||
|
||||
def grammar(self):
|
||||
return self._grammar
|
||||
|
||||
def trace(self, trace=2):
|
||||
"""
|
||||
Set the level of tracing output that should be generated when
|
||||
parsing a text.
|
||||
|
||||
:type trace: int
|
||||
:param trace: The trace level. A trace level of ``0`` will
|
||||
generate no tracing output; and higher trace levels will
|
||||
produce more verbose tracing output.
|
||||
:rtype: None
|
||||
"""
|
||||
self._trace = trace
|
||||
|
||||
# TODO: change this to conform more with the standard ChartParser
|
||||
def parse(self, tokens):
|
||||
self._grammar.check_coverage(tokens)
|
||||
chart = Chart(list(tokens))
|
||||
grammar = self._grammar
|
||||
|
||||
# Chart parser rules.
|
||||
bu_init = ProbabilisticBottomUpInitRule()
|
||||
bu = ProbabilisticBottomUpPredictRule()
|
||||
fr = SingleEdgeProbabilisticFundamentalRule()
|
||||
|
||||
# Our queue
|
||||
queue = []
|
||||
|
||||
# Initialize the chart.
|
||||
for edge in bu_init.apply(chart, grammar):
|
||||
if self._trace > 1:
|
||||
print(
|
||||
" %-50s [%s]"
|
||||
% (chart.pretty_format_edge(edge, width=2), edge.prob())
|
||||
)
|
||||
queue.append(edge)
|
||||
|
||||
while len(queue) > 0:
|
||||
# Re-sort the queue.
|
||||
self.sort_queue(queue, chart)
|
||||
|
||||
# Prune the queue to the correct size if a beam was defined
|
||||
if self.beam_size:
|
||||
self._prune(queue, chart)
|
||||
|
||||
# Get the best edge.
|
||||
edge = queue.pop()
|
||||
if self._trace > 0:
|
||||
print(
|
||||
" %-50s [%s]"
|
||||
% (chart.pretty_format_edge(edge, width=2), edge.prob())
|
||||
)
|
||||
|
||||
# Apply BU & FR to it.
|
||||
queue.extend(bu.apply(chart, grammar, edge))
|
||||
queue.extend(fr.apply(chart, grammar, edge))
|
||||
|
||||
# Get a list of complete parses.
|
||||
parses = list(chart.parses(grammar.start(), ProbabilisticTree))
|
||||
|
||||
# Assign probabilities to the trees.
|
||||
prod_probs = {}
|
||||
for prod in grammar.productions():
|
||||
prod_probs[prod.lhs(), prod.rhs()] = prod.prob()
|
||||
for parse in parses:
|
||||
self._setprob(parse, prod_probs)
|
||||
|
||||
# Sort by probability
|
||||
parses.sort(reverse=True, key=lambda tree: tree.prob())
|
||||
|
||||
return iter(parses)
|
||||
|
||||
def _setprob(self, tree, prod_probs):
|
||||
if tree.prob() is not None:
|
||||
return
|
||||
|
||||
# Get the prob of the CFG production.
|
||||
lhs = Nonterminal(tree.label())
|
||||
rhs = []
|
||||
for child in tree:
|
||||
if isinstance(child, Tree):
|
||||
rhs.append(Nonterminal(child.label()))
|
||||
else:
|
||||
rhs.append(child)
|
||||
prob = prod_probs[lhs, tuple(rhs)]
|
||||
|
||||
# Get the probs of children.
|
||||
for child in tree:
|
||||
if isinstance(child, Tree):
|
||||
self._setprob(child, prod_probs)
|
||||
prob *= child.prob()
|
||||
|
||||
tree.set_prob(prob)
|
||||
|
||||
def sort_queue(self, queue, chart):
|
||||
"""
|
||||
Sort the given queue of ``Edge`` objects, placing the edge that should
|
||||
be tried first at the beginning of the queue. This method
|
||||
will be called after each ``Edge`` is added to the queue.
|
||||
|
||||
:param queue: The queue of ``Edge`` objects to sort. Each edge in
|
||||
this queue is an edge that could be added to the chart by
|
||||
the fundamental rule; but that has not yet been added.
|
||||
:type queue: list(Edge)
|
||||
:param chart: The chart being used to parse the text. This
|
||||
chart can be used to provide extra information for sorting
|
||||
the queue.
|
||||
:type chart: Chart
|
||||
:rtype: None
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def _prune(self, queue, chart):
|
||||
"""Discard items in the queue if the queue is longer than the beam."""
|
||||
if len(queue) > self.beam_size:
|
||||
split = len(queue) - self.beam_size
|
||||
if self._trace > 2:
|
||||
for edge in queue[:split]:
|
||||
print(" %-50s [DISCARDED]" % chart.pretty_format_edge(edge, 2))
|
||||
del queue[:split]
|
||||
|
||||
|
||||
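# Illustrative sketch (added commentary, not part of the original NLTK source):
# a ``BottomUpProbabilisticChartParser`` subclass only has to supply
# ``sort_queue``.  The edge popped from the *end* of the queue is tried next,
# so placing the most promising edge last yields a best-first search.  The
# class below is hypothetical and exists purely for illustration.
class _ShortestEdgeFirstChartParser(BottomUpProbabilisticChartParser):
    """A toy strategy that tries shorter edges before longer ones."""

    def sort_queue(self, queue, chart):
        # Longest edges end up at the front, so the shortest is popped next.
        queue.sort(key=lambda edge: -edge.length())

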
class InsideChartParser(BottomUpProbabilisticChartParser):
|
||||
"""
|
||||
A bottom-up parser for ``PCFG`` grammars that tries edges in descending
|
||||
order of the inside probabilities of their trees. The "inside
|
||||
probability" of a tree is simply the
|
||||
probability of the entire tree, ignoring its context. In
|
||||
particular, the inside probability of a tree generated by
|
||||
production *p* with children *c[1], c[2], ..., c[n]* is
|
||||
*P(p)P(c[1])P(c[2])...P(c[n])*; and the inside
|
||||
probability of a token is 1 if it is present in the text, and 0 if
|
||||
it is absent.
|
||||
|
||||
This sorting order results in a type of lowest-cost-first search
|
||||
strategy.
|
||||
"""
|
||||
|
||||
# Inherit constructor.
|
||||
def sort_queue(self, queue, chart):
|
||||
"""
|
||||
Sort the given queue of edges, in descending order of the
|
||||
inside probabilities of the edges' trees.
|
||||
|
||||
:param queue: The queue of ``Edge`` objects to sort. Each edge in
|
||||
this queue is an edge that could be added to the chart by
|
||||
the fundamental rule; but that has not yet been added.
|
||||
:type queue: list(Edge)
|
||||
:param chart: The chart being used to parse the text. This
|
||||
chart can be used to provide extra information for sorting
|
||||
the queue.
|
||||
:type chart: Chart
|
||||
:rtype: None
|
||||
"""
|
||||
queue.sort(key=lambda edge: edge.prob())
|
||||
|
||||
|
||||
# Eventually, this will become some sort of inside-outside parser:
|
||||
# class InsideOutsideParser(BottomUpProbabilisticChartParser):
|
||||
# def __init__(self, grammar, trace=0):
|
||||
# # Inherit docs.
|
||||
# BottomUpProbabilisticChartParser.__init__(self, grammar, trace)
|
||||
#
|
||||
# # Find the best path from S to each nonterminal
|
||||
# bestp = {}
|
||||
# for production in grammar.productions(): bestp[production.lhs()]=0
|
||||
# bestp[grammar.start()] = 1.0
|
||||
#
|
||||
# for i in range(len(grammar.productions())):
|
||||
# for production in grammar.productions():
|
||||
# lhs = production.lhs()
|
||||
# for elt in production.rhs():
|
||||
# bestp[elt] = max(bestp[lhs]*production.prob(),
|
||||
# bestp.get(elt,0))
|
||||
#
|
||||
# self._bestp = bestp
|
||||
# for (k,v) in self._bestp.items(): print(k,v)
|
||||
#
|
||||
# def _sortkey(self, edge):
|
||||
# return edge.structure()[PROB] * self._bestp[edge.lhs()]
|
||||
#
|
||||
# def sort_queue(self, queue, chart):
|
||||
# queue.sort(key=self._sortkey)
|
||||
|
||||
|
||||
class RandomChartParser(BottomUpProbabilisticChartParser):
|
||||
"""
|
||||
A bottom-up parser for ``PCFG`` grammars that tries edges in random order.
|
||||
This sorting order results in a random search strategy.
|
||||
"""
|
||||
|
||||
# Inherit constructor
|
||||
def sort_queue(self, queue, chart):
|
||||
i = random.randint(0, len(queue) - 1)
|
||||
(queue[-1], queue[i]) = (queue[i], queue[-1])
|
||||
|
||||
|
||||
class UnsortedChartParser(BottomUpProbabilisticChartParser):
|
||||
"""
|
||||
A bottom-up parser for ``PCFG`` grammars that tries edges in whatever order they happen to be in the queue (no sorting is performed).
|
||||
"""
|
||||
|
||||
# Inherit constructor
|
||||
def sort_queue(self, queue, chart):
|
||||
return
|
||||
|
||||
|
||||
class LongestChartParser(BottomUpProbabilisticChartParser):
|
||||
"""
|
||||
A bottom-up parser for ``PCFG`` grammars that tries longer edges before
|
||||
shorter ones. This sorting order results in a type of best-first
|
||||
search strategy.
|
||||
"""
|
||||
|
||||
# Inherit constructor
|
||||
def sort_queue(self, queue, chart):
|
||||
queue.sort(key=lambda edge: edge.length())
|
||||
|
||||
|
||||
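# Illustrative sketch (added; not part of the original module): how a beam
# size bounds the edge queue of any ``BottomUpProbabilisticChartParser``
# subclass.  The grammar below is a hypothetical toy PCFG used only here.
def _example_beam_parse():
    from nltk.grammar import PCFG

    toy = PCFG.fromstring(
        """
        S -> 'a' S [0.4] | 'a' [0.6]
        """
    )
    # Keep at most 20 queued edges; lower-probability edges are pruned first.
    parser = InsideChartParser(toy, beam_size=20)
    return list(parser.parse(["a", "a", "a"]))

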
##//////////////////////////////////////////////////////
|
||||
## Test Code
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo(choice=None, draw_parses=None, print_parses=None):
|
||||
"""
|
||||
A demonstration of the probabilistic parsers. The user is
|
||||
prompted to select which demo to run, and how many parses should
|
||||
be found; and then each parser is run on the same demo, and a
|
||||
summary of the results are displayed.
|
||||
"""
|
||||
import sys
|
||||
import time
|
||||
|
||||
from nltk import tokenize
|
||||
from nltk.parse import pchart
|
||||
|
||||
# Define two demos. Each demo has a sentence and a grammar.
|
||||
toy_pcfg1 = PCFG.fromstring(
|
||||
"""
|
||||
S -> NP VP [1.0]
|
||||
NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
|
||||
Det -> 'the' [0.8] | 'my' [0.2]
|
||||
N -> 'man' [0.5] | 'telescope' [0.5]
|
||||
VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
|
||||
V -> 'ate' [0.35] | 'saw' [0.65]
|
||||
PP -> P NP [1.0]
|
||||
P -> 'with' [0.61] | 'under' [0.39]
|
||||
"""
|
||||
)
|
||||
|
||||
toy_pcfg2 = PCFG.fromstring(
|
||||
"""
|
||||
S -> NP VP [1.0]
|
||||
VP -> V NP [.59]
|
||||
VP -> V [.40]
|
||||
VP -> VP PP [.01]
|
||||
NP -> Det N [.41]
|
||||
NP -> Name [.28]
|
||||
NP -> NP PP [.31]
|
||||
PP -> P NP [1.0]
|
||||
V -> 'saw' [.21]
|
||||
V -> 'ate' [.51]
|
||||
V -> 'ran' [.28]
|
||||
N -> 'boy' [.11]
|
||||
N -> 'cookie' [.12]
|
||||
N -> 'table' [.13]
|
||||
N -> 'telescope' [.14]
|
||||
N -> 'hill' [.5]
|
||||
Name -> 'Jack' [.52]
|
||||
Name -> 'Bob' [.48]
|
||||
P -> 'with' [.61]
|
||||
P -> 'under' [.39]
|
||||
Det -> 'the' [.41]
|
||||
Det -> 'a' [.31]
|
||||
Det -> 'my' [.28]
|
||||
"""
|
||||
)
|
||||
|
||||
demos = [
|
||||
("I saw John with my telescope", toy_pcfg1),
|
||||
("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
|
||||
]
|
||||
|
||||
if choice is None:
|
||||
# Ask the user which demo they want to use.
|
||||
print()
|
||||
for i in range(len(demos)):
|
||||
print(f"{i + 1:>3}: {demos[i][0]}")
|
||||
print(" %r" % demos[i][1])
|
||||
print()
|
||||
print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
|
||||
choice = int(sys.stdin.readline().strip()) - 1
|
||||
try:
|
||||
sent, grammar = demos[choice]
|
||||
except (IndexError, TypeError):
|
||||
print("Bad sentence number")
|
||||
return
|
||||
|
||||
# Tokenize the sentence.
|
||||
tokens = sent.split()
|
||||
|
||||
# Define a list of parsers. We'll use all parsers.
|
||||
parsers = [
|
||||
pchart.InsideChartParser(grammar),
|
||||
pchart.RandomChartParser(grammar),
|
||||
pchart.UnsortedChartParser(grammar),
|
||||
pchart.LongestChartParser(grammar),
|
||||
pchart.InsideChartParser(grammar, beam_size=len(tokens) + 1), # was BeamParser
|
||||
]
|
||||
|
||||
# Run the parsers on the tokenized sentence.
|
||||
times = []
|
||||
average_p = []
|
||||
num_parses = []
|
||||
all_parses = {}
|
||||
for parser in parsers:
|
||||
print(f"\ns: {sent}\nparser: {parser}\ngrammar: {grammar}")
|
||||
parser.trace(3)
|
||||
t = time.time()
|
||||
parses = list(parser.parse(tokens))
|
||||
times.append(time.time() - t)
|
||||
p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
|
||||
average_p.append(p)
|
||||
num_parses.append(len(parses))
|
||||
for p in parses:
|
||||
all_parses[p.freeze()] = 1
|
||||
|
||||
# Print some summary statistics
|
||||
print()
|
||||
print(" Parser Beam | Time (secs) # Parses Average P(parse)")
|
||||
print("------------------------+------------------------------------------")
|
||||
for i in range(len(parsers)):
|
||||
print(
|
||||
"%18s %4d |%11.4f%11d%19.14f"
|
||||
% (
|
||||
parsers[i].__class__.__name__,
|
||||
parsers[i].beam_size,
|
||||
times[i],
|
||||
num_parses[i],
|
||||
average_p[i],
|
||||
)
|
||||
)
|
||||
parses = all_parses.keys()
|
||||
if parses:
|
||||
p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
|
||||
else:
|
||||
p = 0
|
||||
print("------------------------+------------------------------------------")
|
||||
print("%18s |%11s%11d%19.14f" % ("(All Parses)", "n/a", len(parses), p))
|
||||
|
||||
if draw_parses is None:
|
||||
# Ask the user if we should draw the parses.
|
||||
print()
|
||||
print("Draw parses (y/n)? ", end=" ")
|
||||
draw_parses = sys.stdin.readline().strip().lower().startswith("y")
|
||||
if draw_parses:
|
||||
from nltk.draw.tree import draw_trees
|
||||
|
||||
print(" please wait...")
|
||||
draw_trees(*parses)
|
||||
|
||||
if print_parses is None:
|
||||
# Ask the user if we should print the parses.
|
||||
print()
|
||||
print("Print parses (y/n)? ", end=" ")
|
||||
print_parses = sys.stdin.readline().strip().lower().startswith("y")
|
||||
if print_parses:
|
||||
for parse in parses:
|
||||
print(parse)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
@@ -0,0 +1,716 @@
|
||||
# Natural Language Toolkit: Dependency Grammars
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Jason Narad <jason.narad@gmail.com>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
|
||||
from collections import defaultdict
|
||||
from functools import total_ordering
|
||||
from itertools import chain
|
||||
|
||||
from nltk.grammar import (
|
||||
DependencyGrammar,
|
||||
DependencyProduction,
|
||||
ProbabilisticDependencyGrammar,
|
||||
)
|
||||
from nltk.internals import raise_unorderable_types
|
||||
from nltk.parse.dependencygraph import DependencyGraph
|
||||
|
||||
#################################################################
|
||||
# Dependency Span
|
||||
#################################################################
|
||||
|
||||
|
||||
@total_ordering
|
||||
class DependencySpan:
|
||||
"""
|
||||
A contiguous span over some part of the input string representing
|
||||
dependency (head -> modifier) relationships amongst words. An atomic
|
||||
span corresponds to only one word so it isn't a 'span' in the conventional
|
||||
sense, as its _start_index = _end_index = _head_index for concatenation
|
||||
purposes. All other spans are assumed to have arcs between all nodes
|
||||
within the start and end indexes of the span, and one head index corresponding
|
||||
to the head word for the entire span. This is the same as the root node if
|
||||
the dependency structure were depicted as a graph.
|
||||
"""
|
||||
|
||||
def __init__(self, start_index, end_index, head_index, arcs, tags):
|
||||
self._start_index = start_index
|
||||
self._end_index = end_index
|
||||
self._head_index = head_index
|
||||
self._arcs = arcs
|
||||
self._tags = tags
|
||||
self._comparison_key = (start_index, end_index, head_index, tuple(arcs))
|
||||
self._hash = hash(self._comparison_key)
|
||||
|
||||
def head_index(self):
|
||||
"""
|
||||
:return: A value indexing the head of the entire ``DependencySpan``.
|
||||
:rtype: int
|
||||
"""
|
||||
return self._head_index
|
||||
|
||||
def __repr__(self):
|
||||
"""
|
||||
:return: A concise string representation of the ``DependencySpan``.
|
||||
:rtype: str.
|
||||
"""
|
||||
return "Span %d-%d; Head Index: %d" % (
|
||||
self._start_index,
|
||||
self._end_index,
|
||||
self._head_index,
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
"""
|
||||
:return: A verbose string representation of the ``DependencySpan``.
|
||||
:rtype: str
|
||||
"""
|
||||
str = "Span %d-%d; Head Index: %d" % (
|
||||
self._start_index,
|
||||
self._end_index,
|
||||
self._head_index,
|
||||
)
|
||||
for i in range(len(self._arcs)):
|
||||
str += "\n%d <- %d, %s" % (i, self._arcs[i], self._tags[i])
|
||||
return str
|
||||
|
||||
def __eq__(self, other):
|
||||
return (
|
||||
type(self) == type(other) and self._comparison_key == other._comparison_key
|
||||
)
|
||||
|
||||
def __ne__(self, other):
|
||||
return not self == other
|
||||
|
||||
def __lt__(self, other):
|
||||
if not isinstance(other, DependencySpan):
|
||||
raise_unorderable_types("<", self, other)
|
||||
return self._comparison_key < other._comparison_key
|
||||
|
||||
def __hash__(self):
|
||||
"""
|
||||
:return: The hash value of this ``DependencySpan``.
|
||||
"""
|
||||
return self._hash
|
||||
|
||||
|
||||
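# Illustrative sketch (added; not part of the original module): an "atomic"
# span, as built by the parsers below, covers a single word.  The arc value
# -1 marks a word whose head has not been assigned yet; index 2 is arbitrary.
def _example_atomic_span():
    span = DependencySpan(2, 3, 2, [-1], ["null"])
    assert span.head_index() == 2
    return span

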
#################################################################
|
||||
# Chart Cell
|
||||
#################################################################
|
||||
|
||||
|
||||
class ChartCell:
|
||||
"""
|
||||
A cell from the parse chart formed when performing the CYK algorithm.
|
||||
Each cell keeps track of its x and y coordinates (though this will probably
|
||||
be discarded), and a list of spans serving as the cell's entries.
|
||||
"""
|
||||
|
||||
def __init__(self, x, y):
|
||||
"""
|
||||
:param x: This cell's x coordinate.
|
||||
:type x: int.
|
||||
:param y: This cell's y coordinate.
|
||||
:type y: int.
|
||||
"""
|
||||
self._x = x
|
||||
self._y = y
|
||||
self._entries = set()
|
||||
|
||||
def add(self, span):
|
||||
"""
|
||||
Appends the given span to the list of spans
|
||||
representing the chart cell's entries.
|
||||
|
||||
:param span: The span to add.
|
||||
:type span: DependencySpan
|
||||
"""
|
||||
self._entries.add(span)
|
||||
|
||||
def __str__(self):
|
||||
"""
|
||||
:return: A verbose string representation of this ``ChartCell``.
|
||||
:rtype: str.
|
||||
"""
|
||||
return "CC[%d,%d]: %s" % (self._x, self._y, self._entries)
|
||||
|
||||
def __repr__(self):
|
||||
"""
|
||||
:return: A concise string representation of this ``ChartCell``.
|
||||
:rtype: str.
|
||||
"""
|
||||
return "%s" % self
|
||||
|
||||
|
||||
#################################################################
|
||||
# Parsing with Dependency Grammars
|
||||
#################################################################
|
||||
|
||||
|
||||
class ProjectiveDependencyParser:
|
||||
"""
|
||||
A projective, rule-based, dependency parser. A ProjectiveDependencyParser
|
||||
is created with a DependencyGrammar, a set of productions specifying
|
||||
word-to-word dependency relations. The parse() method will then
|
||||
return the set of all parses, in tree representation, for a given input
|
||||
sequence of tokens. Each parse must meet the requirements of both
the grammar and the projectivity constraint, which specifies that the
|
||||
branches of the dependency tree are not allowed to cross. Alternatively,
|
||||
this can be understood as stating that each parent node and its children
|
||||
in the parse tree form a continuous substring of the input sequence.
|
||||
"""
|
||||
|
||||
def __init__(self, dependency_grammar):
|
||||
"""
|
||||
Create a new ProjectiveDependencyParser, from a word-to-word
|
||||
dependency grammar ``DependencyGrammar``.
|
||||
|
||||
:param dependency_grammar: A word-to-word relation dependencygrammar.
|
||||
:type dependency_grammar: DependencyGrammar
|
||||
"""
|
||||
self._grammar = dependency_grammar
|
||||
|
||||
def parse(self, tokens):
|
||||
"""
|
||||
Performs a projective dependency parse on the list of tokens using
|
||||
a chart-based, span-concatenation algorithm similar to Eisner (1996).
|
||||
|
||||
:param tokens: The list of input tokens.
|
||||
:type tokens: list(str)
|
||||
:return: An iterator over parse trees.
|
||||
:rtype: iter(Tree)
|
||||
"""
|
||||
self._tokens = list(tokens)
|
||||
chart = []
|
||||
for i in range(0, len(self._tokens) + 1):
|
||||
chart.append([])
|
||||
for j in range(0, len(self._tokens) + 1):
|
||||
chart[i].append(ChartCell(i, j))
|
||||
if i == j + 1:
|
||||
chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], ["null"]))
|
||||
|
||||
for i in range(1, len(self._tokens) + 1):
|
||||
for j in range(i - 2, -1, -1):
|
||||
for k in range(i - 1, j, -1):
|
||||
for span1 in chart[k][j]._entries:
|
||||
for span2 in chart[i][k]._entries:
|
||||
for newspan in self.concatenate(span1, span2):
|
||||
chart[i][j].add(newspan)
|
||||
|
||||
for parse in chart[len(self._tokens)][0]._entries:
|
||||
conll_format = ""
|
||||
# malt_format = ""
|
||||
for i in range(len(tokens)):
|
||||
# malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null')
|
||||
# conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], 'null', 'null', 'null', parse._arcs[i] + 1, 'null', '-', '-')
|
||||
# Modified to comply with the new DependencyGraph requirement (there must be at least a ROOT element)
|
||||
conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
|
||||
i + 1,
|
||||
tokens[i],
|
||||
tokens[i],
|
||||
"null",
|
||||
"null",
|
||||
"null",
|
||||
parse._arcs[i] + 1,
|
||||
"ROOT",
|
||||
"-",
|
||||
"-",
|
||||
)
|
||||
dg = DependencyGraph(conll_format)
|
||||
# if self.meets_arity(dg):
|
||||
yield dg.tree()
|
||||
|
||||
def concatenate(self, span1, span2):
|
||||
"""
|
||||
Concatenates the two spans in whichever way possible. This
|
||||
includes rightward concatenation (from the leftmost word of the
|
||||
leftmost span to the rightmost word of the rightmost span) and
|
||||
leftward concatenation (vice-versa) between adjacent spans. Unlike
|
||||
Eisner's presentation of span concatenation, these spans do not
|
||||
share or pivot on a particular word/word-index.
|
||||
|
||||
:return: A list of new spans formed through concatenation.
|
||||
:rtype: list(DependencySpan)
|
||||
"""
|
||||
spans = []
|
||||
if span1._start_index == span2._start_index:
|
||||
print("Error: Mismatched spans - replace this with thrown error")
|
||||
if span1._start_index > span2._start_index:
|
||||
temp_span = span1
|
||||
span1 = span2
|
||||
span2 = temp_span
|
||||
# adjacent rightward covered concatenation
|
||||
new_arcs = span1._arcs + span2._arcs
|
||||
new_tags = span1._tags + span2._tags
|
||||
if self._grammar.contains(
|
||||
self._tokens[span1._head_index], self._tokens[span2._head_index]
|
||||
):
|
||||
# print('Performing rightward cover %d to %d' % (span1._head_index, span2._head_index))
|
||||
new_arcs[span2._head_index - span1._start_index] = span1._head_index
|
||||
spans.append(
|
||||
DependencySpan(
|
||||
span1._start_index,
|
||||
span2._end_index,
|
||||
span1._head_index,
|
||||
new_arcs,
|
||||
new_tags,
|
||||
)
|
||||
)
|
||||
# adjacent leftward covered concatenation
|
||||
new_arcs = span1._arcs + span2._arcs
|
||||
if self._grammar.contains(
|
||||
self._tokens[span2._head_index], self._tokens[span1._head_index]
|
||||
):
|
||||
# print('performing leftward cover %d to %d' % (span2._head_index, span1._head_index))
|
||||
new_arcs[span1._head_index - span1._start_index] = span2._head_index
|
||||
spans.append(
|
||||
DependencySpan(
|
||||
span1._start_index,
|
||||
span2._end_index,
|
||||
span2._head_index,
|
||||
new_arcs,
|
||||
new_tags,
|
||||
)
|
||||
)
|
||||
return spans
|
||||
|
||||
|
||||
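# Illustrative sketch (added; not part of the original module): a compact
# driver for the rule-based projective parser above.  The tiny grammar is
# hypothetical; see also ``projective_rule_parse_demo`` further below.
def _example_projective_parse():
    grammar = DependencyGrammar.fromstring(
        """
        'sleeps' -> 'dog' | 'soundly'
        'dog' -> 'the'
        """
    )
    parser = ProjectiveDependencyParser(grammar)
    return [tree for tree in parser.parse(["the", "dog", "sleeps", "soundly"])]

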
#################################################################
|
||||
# Parsing with Probabilistic Dependency Grammars
|
||||
#################################################################
|
||||
|
||||
|
||||
class ProbabilisticProjectiveDependencyParser:
|
||||
"""A probabilistic, projective dependency parser.
|
||||
|
||||
This parser returns the most probable projective parse derived from the
|
||||
probabilistic dependency grammar derived from the train() method. The
|
||||
probabilistic model is an implementation of Eisner's (1996) Model C, which
|
||||
conditions on head-word, head-tag, child-word, and child-tag. The decoding
|
||||
uses a bottom-up chart-based span concatenation algorithm that's identical
|
||||
to the one utilized by the rule-based projective parser.
|
||||
|
||||
Usage example
|
||||
|
||||
>>> from nltk.parse.dependencygraph import conll_data2
|
||||
|
||||
>>> graphs = [
|
||||
... DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry
|
||||
... ]
|
||||
|
||||
>>> ppdp = ProbabilisticProjectiveDependencyParser()
|
||||
>>> ppdp.train(graphs)
|
||||
|
||||
>>> sent = ['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.']
|
||||
>>> list(ppdp.parse(sent))
|
||||
[Tree('zag', ['Cathy', 'hen', Tree('zwaaien', ['wild', '.'])])]
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""
|
||||
Create a new probabilistic dependency parser. No additional
|
||||
operations are necessary.
|
||||
"""
|
||||
|
||||
def parse(self, tokens):
|
||||
"""
|
||||
Parses the list of tokens subject to the projectivity constraint
|
||||
and the productions in the parser's grammar. This uses a method
|
||||
similar to the span-concatenation algorithm defined in Eisner (1996).
|
||||
It returns the most probable parse derived from the parser's
|
||||
probabilistic dependency grammar.
|
||||
"""
|
||||
self._tokens = list(tokens)
|
||||
chart = []
|
||||
for i in range(0, len(self._tokens) + 1):
|
||||
chart.append([])
|
||||
for j in range(0, len(self._tokens) + 1):
|
||||
chart[i].append(ChartCell(i, j))
|
||||
if i == j + 1:
|
||||
if tokens[i - 1] in self._grammar._tags:
|
||||
for tag in self._grammar._tags[tokens[i - 1]]:
|
||||
chart[i][j].add(
|
||||
DependencySpan(i - 1, i, i - 1, [-1], [tag])
|
||||
)
|
||||
else:
|
||||
print(
|
||||
"No tag found for input token '%s', parse is impossible."
|
||||
% tokens[i - 1]
|
||||
)
|
||||
return []
|
||||
for i in range(1, len(self._tokens) + 1):
|
||||
for j in range(i - 2, -1, -1):
|
||||
for k in range(i - 1, j, -1):
|
||||
for span1 in chart[k][j]._entries:
|
||||
for span2 in chart[i][k]._entries:
|
||||
for newspan in self.concatenate(span1, span2):
|
||||
chart[i][j].add(newspan)
|
||||
trees = []
|
||||
max_parse = None
|
||||
max_score = 0
|
||||
for parse in chart[len(self._tokens)][0]._entries:
|
||||
conll_format = ""
|
||||
malt_format = ""
|
||||
for i in range(len(tokens)):
|
||||
malt_format += "%s\t%s\t%d\t%s\n" % (
|
||||
tokens[i],
|
||||
"null",
|
||||
parse._arcs[i] + 1,
|
||||
"null",
|
||||
)
|
||||
# conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'null', '-', '-')
|
||||
# Modified to comply with a recent change in DependencyGraph that requires a ROOT element.
|
||||
conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
|
||||
i + 1,
|
||||
tokens[i],
|
||||
tokens[i],
|
||||
parse._tags[i],
|
||||
parse._tags[i],
|
||||
"null",
|
||||
parse._arcs[i] + 1,
|
||||
"ROOT",
|
||||
"-",
|
||||
"-",
|
||||
)
|
||||
dg = DependencyGraph(conll_format)
|
||||
score = self.compute_prob(dg)
|
||||
trees.append((score, dg.tree()))
|
||||
trees.sort(key=lambda e: e[0], reverse=True)  # yield the most probable parse first
|
||||
return (tree for (score, tree) in trees)
|
||||
|
||||
def concatenate(self, span1, span2):
|
||||
"""
|
||||
Concatenates the two spans in whichever way possible. This
|
||||
includes rightward concatenation (from the leftmost word of the
|
||||
leftmost span to the rightmost word of the rightmost span) and
|
||||
leftward concatenation (vice-versa) between adjacent spans. Unlike
|
||||
Eisner's presentation of span concatenation, these spans do not
|
||||
share or pivot on a particular word/word-index.
|
||||
|
||||
:return: A list of new spans formed through concatenation.
|
||||
:rtype: list(DependencySpan)
|
||||
"""
|
||||
spans = []
|
||||
if span1._start_index == span2._start_index:
|
||||
print("Error: Mismatched spans - replace this with thrown error")
|
||||
if span1._start_index > span2._start_index:
|
||||
temp_span = span1
|
||||
span1 = span2
|
||||
span2 = temp_span
|
||||
# adjacent rightward covered concatenation
|
||||
new_arcs = span1._arcs + span2._arcs
|
||||
new_tags = span1._tags + span2._tags
|
||||
if self._grammar.contains(
|
||||
self._tokens[span1._head_index], self._tokens[span2._head_index]
|
||||
):
|
||||
new_arcs[span2._head_index - span1._start_index] = span1._head_index
|
||||
spans.append(
|
||||
DependencySpan(
|
||||
span1._start_index,
|
||||
span2._end_index,
|
||||
span1._head_index,
|
||||
new_arcs,
|
||||
new_tags,
|
||||
)
|
||||
)
|
||||
# adjacent leftward covered concatenation
|
||||
new_arcs = span1._arcs + span2._arcs
|
||||
new_tags = span1._tags + span2._tags
|
||||
if self._grammar.contains(
|
||||
self._tokens[span2._head_index], self._tokens[span1._head_index]
|
||||
):
|
||||
new_arcs[span1._head_index - span1._start_index] = span2._head_index
|
||||
spans.append(
|
||||
DependencySpan(
|
||||
span1._start_index,
|
||||
span2._end_index,
|
||||
span2._head_index,
|
||||
new_arcs,
|
||||
new_tags,
|
||||
)
|
||||
)
|
||||
return spans
|
||||
|
||||
def train(self, graphs):
|
||||
"""
|
||||
Trains a ProbabilisticDependencyGrammar based on the list of input
|
||||
DependencyGraphs. This model is an implementation of Eisner's (1996)
|
||||
Model C, which derives its statistics from head-word, head-tag,
|
||||
child-word, and child-tag relationships.
|
||||
|
||||
:param graphs: A list of dependency graphs to train from.
|
||||
:type graphs: list(DependencyGraph)
|
||||
"""
|
||||
productions = []
|
||||
events = defaultdict(int)
|
||||
tags = {}
|
||||
for dg in graphs:
|
||||
for node_index in range(1, len(dg.nodes)):
|
||||
# children = dg.nodes[node_index]['deps']
|
||||
children = list(
|
||||
chain.from_iterable(dg.nodes[node_index]["deps"].values())
|
||||
)
|
||||
|
||||
nr_left_children = dg.left_children(node_index)
|
||||
nr_right_children = dg.right_children(node_index)
|
||||
nr_children = nr_left_children + nr_right_children
|
||||
for child_index in range(
|
||||
0 - (nr_left_children + 1), nr_right_children + 2
|
||||
):
|
||||
head_word = dg.nodes[node_index]["word"]
|
||||
head_tag = dg.nodes[node_index]["tag"]
|
||||
if head_word in tags:
|
||||
tags[head_word].add(head_tag)
|
||||
else:
|
||||
tags[head_word] = {head_tag}
|
||||
child = "STOP"
|
||||
child_tag = "STOP"
|
||||
prev_word = "START"
|
||||
prev_tag = "START"
|
||||
if child_index < 0:
|
||||
array_index = child_index + nr_left_children
|
||||
if array_index >= 0:
|
||||
child = dg.nodes[children[array_index]]["word"]
|
||||
child_tag = dg.nodes[children[array_index]]["tag"]
|
||||
if child_index != -1:
|
||||
prev_word = dg.nodes[children[array_index + 1]]["word"]
|
||||
prev_tag = dg.nodes[children[array_index + 1]]["tag"]
|
||||
if child != "STOP":
|
||||
productions.append(DependencyProduction(head_word, [child]))
|
||||
head_event = "(head ({} {}) (mods ({}, {}, {}) left))".format(
|
||||
child,
|
||||
child_tag,
|
||||
prev_tag,
|
||||
head_word,
|
||||
head_tag,
|
||||
)
|
||||
mod_event = "(mods ({}, {}, {}) left))".format(
|
||||
prev_tag,
|
||||
head_word,
|
||||
head_tag,
|
||||
)
|
||||
events[head_event] += 1
|
||||
events[mod_event] += 1
|
||||
elif child_index > 0:
|
||||
array_index = child_index + nr_left_children - 1
|
||||
if array_index < nr_children:
|
||||
child = dg.nodes[children[array_index]]["word"]
|
||||
child_tag = dg.nodes[children[array_index]]["tag"]
|
||||
if child_index != 1:
|
||||
prev_word = dg.nodes[children[array_index - 1]]["word"]
|
||||
prev_tag = dg.nodes[children[array_index - 1]]["tag"]
|
||||
if child != "STOP":
|
||||
productions.append(DependencyProduction(head_word, [child]))
|
||||
head_event = "(head ({} {}) (mods ({}, {}, {}) right))".format(
|
||||
child,
|
||||
child_tag,
|
||||
prev_tag,
|
||||
head_word,
|
||||
head_tag,
|
||||
)
|
||||
mod_event = "(mods ({}, {}, {}) right))".format(
|
||||
prev_tag,
|
||||
head_word,
|
||||
head_tag,
|
||||
)
|
||||
events[head_event] += 1
|
||||
events[mod_event] += 1
|
||||
self._grammar = ProbabilisticDependencyGrammar(productions, events, tags)
|
||||
|
||||
def compute_prob(self, dg):
|
||||
"""
|
||||
Computes the probability of a dependency graph based
|
||||
on the parser's probability model (defined by the parser's
|
||||
statistical dependency grammar).
|
||||
|
||||
:param dg: A dependency graph to score.
|
||||
:type dg: DependencyGraph
|
||||
:return: The probability of the dependency graph.
|
||||
:rtype: float
|
||||
"""
|
||||
prob = 1.0
|
||||
for node_index in range(1, len(dg.nodes)):
|
||||
# children = dg.nodes[node_index]['deps']
|
||||
children = list(chain.from_iterable(dg.nodes[node_index]["deps"].values()))
|
||||
|
||||
nr_left_children = dg.left_children(node_index)
|
||||
nr_right_children = dg.right_children(node_index)
|
||||
nr_children = nr_left_children + nr_right_children
|
||||
for child_index in range(0 - (nr_left_children + 1), nr_right_children + 2):
|
||||
head_word = dg.nodes[node_index]["word"]
|
||||
head_tag = dg.nodes[node_index]["tag"]
|
||||
child = "STOP"
|
||||
child_tag = "STOP"
|
||||
prev_word = "START"
|
||||
prev_tag = "START"
|
||||
if child_index < 0:
|
||||
array_index = child_index + nr_left_children
|
||||
if array_index >= 0:
|
||||
child = dg.nodes[children[array_index]]["word"]
|
||||
child_tag = dg.nodes[children[array_index]]["tag"]
|
||||
if child_index != -1:
|
||||
prev_word = dg.nodes[children[array_index + 1]]["word"]
|
||||
prev_tag = dg.nodes[children[array_index + 1]]["tag"]
|
||||
head_event = "(head ({} {}) (mods ({}, {}, {}) left))".format(
|
||||
child,
|
||||
child_tag,
|
||||
prev_tag,
|
||||
head_word,
|
||||
head_tag,
|
||||
)
|
||||
mod_event = "(mods ({}, {}, {}) left))".format(
|
||||
prev_tag,
|
||||
head_word,
|
||||
head_tag,
|
||||
)
|
||||
h_count = self._grammar._events[head_event]
|
||||
m_count = self._grammar._events[mod_event]
|
||||
|
||||
# If the grammar does not cover this event, back off to a very small probability
|
||||
if m_count != 0:
|
||||
prob *= h_count / m_count
|
||||
else:
|
||||
prob = 0.00000001 # Very small number
|
||||
|
||||
elif child_index > 0:
|
||||
array_index = child_index + nr_left_children - 1
|
||||
if array_index < nr_children:
|
||||
child = dg.nodes[children[array_index]]["word"]
|
||||
child_tag = dg.nodes[children[array_index]]["tag"]
|
||||
if child_index != 1:
|
||||
prev_word = dg.nodes[children[array_index - 1]]["word"]
|
||||
prev_tag = dg.nodes[children[array_index - 1]]["tag"]
|
||||
head_event = "(head ({} {}) (mods ({}, {}, {}) right))".format(
|
||||
child,
|
||||
child_tag,
|
||||
prev_tag,
|
||||
head_word,
|
||||
head_tag,
|
||||
)
|
||||
mod_event = "(mods ({}, {}, {}) right))".format(
|
||||
prev_tag,
|
||||
head_word,
|
||||
head_tag,
|
||||
)
|
||||
h_count = self._grammar._events[head_event]
|
||||
m_count = self._grammar._events[mod_event]
|
||||
|
||||
if m_count != 0:
|
||||
prob *= h_count / m_count
|
||||
else:
|
||||
prob = 0.00000001 # Very small number
|
||||
|
||||
return prob
|
||||
|
||||
|
||||
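# Illustrative sketch (added; not part of the original module): after
# ``train``, ``compute_prob`` can score any compatible ``DependencyGraph``;
# here one of the training graphs is simply re-scored.
def _example_score_parse():
    from nltk.parse.dependencygraph import conll_data2

    graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
    parser = ProbabilisticProjectiveDependencyParser()
    parser.train(graphs)
    # compute_prob expects a DependencyGraph, so score a training graph here.
    return parser.compute_prob(graphs[0])

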
#################################################################
|
||||
# Demos
|
||||
#################################################################
|
||||
|
||||
|
||||
def demo():
|
||||
projective_rule_parse_demo()
|
||||
# arity_parse_demo()
|
||||
projective_prob_parse_demo()
|
||||
|
||||
|
||||
def projective_rule_parse_demo():
|
||||
"""
|
||||
A demonstration showing the creation and use of a
|
||||
``DependencyGrammar`` to perform a projective dependency
|
||||
parse.
|
||||
"""
|
||||
grammar = DependencyGrammar.fromstring(
|
||||
"""
|
||||
'scratch' -> 'cats' | 'walls'
|
||||
'walls' -> 'the'
|
||||
'cats' -> 'the'
|
||||
"""
|
||||
)
|
||||
print(grammar)
|
||||
pdp = ProjectiveDependencyParser(grammar)
|
||||
trees = pdp.parse(["the", "cats", "scratch", "the", "walls"])
|
||||
for tree in trees:
|
||||
print(tree)
|
||||
|
||||
|
||||
def arity_parse_demo():
|
||||
"""
|
||||
A demonstration showing the creation of a ``DependencyGrammar``
|
||||
in which a specific number of modifiers is listed for a given
|
||||
head. This can further constrain the number of possible parses
|
||||
created by a ``ProjectiveDependencyParser``.
|
||||
"""
|
||||
print()
|
||||
print("A grammar with no arity constraints. Each DependencyProduction")
|
||||
print("specifies a relationship between one head word and only one")
|
||||
print("modifier word.")
|
||||
grammar = DependencyGrammar.fromstring(
|
||||
"""
|
||||
'fell' -> 'price' | 'stock'
|
||||
'price' -> 'of' | 'the'
|
||||
'of' -> 'stock'
|
||||
'stock' -> 'the'
|
||||
"""
|
||||
)
|
||||
print(grammar)
|
||||
|
||||
print()
|
||||
print("For the sentence 'The price of the stock fell', this grammar")
|
||||
print("will produce the following three parses:")
|
||||
pdp = ProjectiveDependencyParser(grammar)
|
||||
trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"])
|
||||
for tree in trees:
|
||||
print(tree)
|
||||
|
||||
print()
|
||||
print("By contrast, the following grammar contains a ")
|
||||
print("DependencyProduction that specifies a relationship")
|
||||
print("between a single head word, 'price', and two modifier")
|
||||
print("words, 'of' and 'the'.")
|
||||
grammar = DependencyGrammar.fromstring(
|
||||
"""
|
||||
'fell' -> 'price' | 'stock'
|
||||
'price' -> 'of' 'the'
|
||||
'of' -> 'stock'
|
||||
'stock' -> 'the'
|
||||
"""
|
||||
)
|
||||
print(grammar)
|
||||
|
||||
print()
|
||||
print(
|
||||
"This constrains the number of possible parses to just one:"
|
||||
) # unimplemented, soon to replace
|
||||
pdp = ProjectiveDependencyParser(grammar)
|
||||
trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"])
|
||||
for tree in trees:
|
||||
print(tree)
|
||||
|
||||
|
||||
def projective_prob_parse_demo():
|
||||
"""
|
||||
A demo showing the training and use of a projective
|
||||
dependency parser.
|
||||
"""
|
||||
from nltk.parse.dependencygraph import conll_data2
|
||||
|
||||
graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
|
||||
ppdp = ProbabilisticProjectiveDependencyParser()
|
||||
print("Training Probabilistic Projective Dependency Parser...")
|
||||
ppdp.train(graphs)
|
||||
|
||||
sent = ["Cathy", "zag", "hen", "wild", "zwaaien", "."]
|
||||
print("Parsing '", " ".join(sent), "'...")
|
||||
print("Parse:")
|
||||
for tree in ppdp.parse(sent):
|
||||
print(tree)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
@@ -0,0 +1,684 @@
|
||||
# Natural Language Toolkit: Recursive Descent Parser
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from nltk.grammar import Nonterminal
|
||||
from nltk.parse.api import ParserI
|
||||
from nltk.tree import ImmutableTree, Tree
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Recursive Descent Parser
|
||||
##//////////////////////////////////////////////////////
|
||||
class RecursiveDescentParser(ParserI):
|
||||
"""
|
||||
A simple top-down CFG parser that parses texts by recursively
|
||||
expanding the fringe of a Tree, and matching it against a
|
||||
text.
|
||||
|
||||
``RecursiveDescentParser`` uses a list of tree locations called a
|
||||
"frontier" to remember which subtrees have not yet been expanded
|
||||
and which leaves have not yet been matched against the text. Each
|
||||
tree location consists of a list of child indices specifying the
|
||||
path from the root of the tree to a subtree or a leaf; see the
|
||||
reference documentation for Tree for more information
|
||||
about tree locations.
|
||||
|
||||
When the parser begins parsing a text, it constructs a tree
|
||||
containing only the start symbol, and a frontier containing the
|
||||
location of the tree's root node. It then extends the tree to
|
||||
cover the text, using the following recursive procedure:
|
||||
|
||||
- If the frontier is empty, and the text is covered by the tree,
|
||||
then return the tree as a possible parse.
|
||||
- If the frontier is empty, and the text is not covered by the
|
||||
tree, then return no parses.
|
||||
- If the first element of the frontier is a subtree, then
|
||||
use CFG productions to "expand" it. For each applicable
|
||||
production, add the expanded subtree's children to the
|
||||
frontier, and recursively find all parses that can be
|
||||
generated by the new tree and frontier.
|
||||
- If the first element of the frontier is a token, then "match"
|
||||
it against the next token from the text. Remove the token
|
||||
from the frontier, and recursively find all parses that can be
|
||||
generated by the new tree and frontier.
|
||||
|
||||
:see: ``nltk.grammar``
|
||||
"""
|
||||
|
||||
def __init__(self, grammar, trace=0):
|
||||
"""
|
||||
Create a new ``RecursiveDescentParser``, that uses ``grammar``
|
||||
to parse texts.
|
||||
|
||||
:type grammar: CFG
|
||||
:param grammar: The grammar used to parse texts.
|
||||
:type trace: int
|
||||
:param trace: The level of tracing that should be used when
|
||||
parsing a text. ``0`` will generate no tracing output;
|
||||
and higher numbers will produce more verbose tracing
|
||||
output.
|
||||
"""
|
||||
self._grammar = grammar
|
||||
self._trace = trace
|
||||
|
||||
def grammar(self):
|
||||
return self._grammar
|
||||
|
||||
def parse(self, tokens):
|
||||
# Inherit docs from ParserI
|
||||
|
||||
tokens = list(tokens)
|
||||
self._grammar.check_coverage(tokens)
|
||||
|
||||
# Start a recursive descent parse, with an initial tree
|
||||
# containing just the start symbol.
|
||||
start = self._grammar.start().symbol()
|
||||
initial_tree = Tree(start, [])
|
||||
frontier = [()]
|
||||
if self._trace:
|
||||
self._trace_start(initial_tree, frontier, tokens)
|
||||
return self._parse(tokens, initial_tree, frontier)
|
||||
|
||||
def _parse(self, remaining_text, tree, frontier):
|
||||
"""
|
||||
Recursively expand and match each elements of ``tree``
|
||||
specified by ``frontier``, to cover ``remaining_text``. Return
|
||||
a list of all parses found.
|
||||
|
||||
:return: An iterator of all parses that can be generated by
|
||||
matching and expanding the elements of ``tree``
|
||||
specified by ``frontier``.
|
||||
:rtype: iter(Tree)
|
||||
:type tree: Tree
|
||||
:param tree: A partial structure for the text that is
|
||||
currently being parsed. The elements of ``tree``
|
||||
that are specified by ``frontier`` have not yet been
|
||||
expanded or matched.
|
||||
:type remaining_text: list(str)
|
||||
:param remaining_text: The portion of the text that is not yet
|
||||
covered by ``tree``.
|
||||
:type frontier: list(tuple(int))
|
||||
:param frontier: A list of the locations within ``tree`` of
|
||||
all subtrees that have not yet been expanded, and all
|
||||
leaves that have not yet been matched. This list is sorted
|
||||
in left-to-right order of location within the tree.
|
||||
"""
|
||||
|
||||
# If the tree covers the text, and there's nothing left to
|
||||
# expand, then we've found a complete parse; return it.
|
||||
if len(remaining_text) == 0 and len(frontier) == 0:
|
||||
if self._trace:
|
||||
self._trace_succeed(tree, frontier)
|
||||
yield tree
|
||||
|
||||
# If there's still text, but nothing left to expand, we failed.
|
||||
elif len(frontier) == 0:
|
||||
if self._trace:
|
||||
self._trace_backtrack(tree, frontier)
|
||||
|
||||
# If the next element on the frontier is a tree, expand it.
|
||||
elif isinstance(tree[frontier[0]], Tree):
|
||||
yield from self._expand(remaining_text, tree, frontier)
|
||||
|
||||
# If the next element on the frontier is a token, match it.
|
||||
else:
|
||||
yield from self._match(remaining_text, tree, frontier)
|
||||
|
||||
def _match(self, rtext, tree, frontier):
|
||||
"""
|
||||
:rtype: iter(Tree)
|
||||
:return: an iterator of all parses that can be generated by
|
||||
matching the first element of ``frontier`` against the
|
||||
first token in ``rtext``. In particular, if the first
|
||||
element of ``frontier`` has the same type as the first
|
||||
token in ``rtext``, then substitute the token into
|
||||
``tree``; and return all parses that can be generated by
|
||||
matching and expanding the remaining elements of
|
||||
``frontier``. If the first element of ``frontier`` does not
|
||||
have the same type as the first token in ``rtext``, then
|
||||
return an empty iterator.
|
||||
|
||||
:type tree: Tree
|
||||
:param tree: A partial structure for the text that is
|
||||
currently being parsed. The elements of ``tree``
|
||||
that are specified by ``frontier`` have not yet been
|
||||
expanded or matched.
|
||||
:type rtext: list(str)
|
||||
:param rtext: The portion of the text that is not yet
|
||||
covered by ``tree``.
|
||||
:type frontier: list of tuple of int
|
||||
:param frontier: A list of the locations within ``tree`` of
|
||||
all subtrees that have not yet been expanded, and all
|
||||
leaves that have not yet been matched.
|
||||
"""
|
||||
|
||||
tree_leaf = tree[frontier[0]]
|
||||
if len(rtext) > 0 and tree_leaf == rtext[0]:
|
||||
# If it's a terminal that matches rtext[0], then substitute
|
||||
# in the token, and continue parsing.
|
||||
newtree = tree.copy(deep=True)
|
||||
newtree[frontier[0]] = rtext[0]
|
||||
if self._trace:
|
||||
self._trace_match(newtree, frontier[1:], rtext[0])
|
||||
yield from self._parse(rtext[1:], newtree, frontier[1:])
|
||||
else:
|
||||
# If it's a non-matching terminal, fail.
|
||||
if self._trace:
|
||||
self._trace_backtrack(tree, frontier, rtext[:1])
|
||||
|
||||
def _expand(self, remaining_text, tree, frontier, production=None):
|
||||
"""
|
||||
:rtype: iter(Tree)
|
||||
:return: An iterator of all parses that can be generated by
|
||||
expanding the first element of ``frontier`` with
|
||||
``production``. In particular, if the first element of
|
||||
``frontier`` is a subtree whose node type is equal to
|
||||
``production``'s left hand side, then add a child to that
|
||||
subtree for each element of ``production``'s right hand
|
||||
side; and return all parses that can be generated by
|
||||
matching and expanding the remaining elements of
|
||||
``frontier``. If the first element of ``frontier`` is not a
|
||||
subtree whose node type is equal to ``production``'s left
|
||||
hand side, then return an empty list. If ``production`` is
|
||||
not specified, then return a list of all parses that can
|
||||
be generated by expanding the first element of ``frontier``
|
||||
with *any* CFG production.
|
||||
|
||||
:type tree: Tree
|
||||
:param tree: A partial structure for the text that is
|
||||
currently being parsed. The elements of ``tree``
|
||||
that are specified by ``frontier`` have not yet been
|
||||
expanded or matched.
|
||||
:type remaining_text: list(str)
|
||||
:param remaining_text: The portion of the text that is not yet
|
||||
covered by ``tree``.
|
||||
:type frontier: list(tuple(int))
|
||||
:param frontier: A list of the locations within ``tree`` of
|
||||
all subtrees that have not yet been expanded, and all
|
||||
leaves that have not yet been matched.
|
||||
"""
|
||||
|
||||
if production is None:
|
||||
productions = self._grammar.productions()
|
||||
else:
|
||||
productions = [production]
|
||||
|
||||
for production in productions:
|
||||
lhs = production.lhs().symbol()
|
||||
if lhs == tree[frontier[0]].label():
|
||||
subtree = self._production_to_tree(production)
|
||||
if frontier[0] == ():
|
||||
newtree = subtree
|
||||
else:
|
||||
newtree = tree.copy(deep=True)
|
||||
newtree[frontier[0]] = subtree
|
||||
new_frontier = [
|
||||
frontier[0] + (i,) for i in range(len(production.rhs()))
|
||||
]
|
||||
if self._trace:
|
||||
self._trace_expand(newtree, new_frontier, production)
|
||||
yield from self._parse(
|
||||
remaining_text, newtree, new_frontier + frontier[1:]
|
||||
)
|
||||
|
||||
def _production_to_tree(self, production):
|
||||
"""
|
||||
:rtype: Tree
|
||||
:return: The Tree that is licensed by ``production``.
|
||||
In particular, given the production ``[lhs -> elt[1] ... elt[n]]``
|
||||
return a tree that has a node ``lhs.symbol``, and
|
||||
``n`` children. For each nonterminal element
|
||||
``elt[i]`` in the production, the tree token has a
|
||||
childless subtree with node value ``elt[i].symbol``; and
|
||||
for each terminal element ``elt[j]``, the tree token has
|
||||
a leaf token with type ``elt[j]``.
|
||||
|
||||
:param production: The CFG production that licenses the tree
|
||||
token that should be returned.
|
||||
:type production: Production
|
||||
"""
|
||||
children = []
|
||||
for elt in production.rhs():
|
||||
if isinstance(elt, Nonterminal):
|
||||
children.append(Tree(elt.symbol(), []))
|
||||
else:
|
||||
# This will be matched.
|
||||
children.append(elt)
|
||||
return Tree(production.lhs().symbol(), children)
|
||||
|
||||
def trace(self, trace=2):
|
||||
"""
|
||||
Set the level of tracing output that should be generated when
|
||||
parsing a text.
|
||||
|
||||
:type trace: int
|
||||
:param trace: The trace level. A trace level of ``0`` will
|
||||
generate no tracing output; and higher trace levels will
|
||||
produce more verbose tracing output.
|
||||
:rtype: None
|
||||
"""
|
||||
self._trace = trace
|
||||
|
||||
def _trace_fringe(self, tree, treeloc=None):
|
||||
"""
|
||||
Print trace output displaying the fringe of ``tree``. The
|
||||
fringe of ``tree`` consists of all of its leaves and all of
|
||||
its childless subtrees.
|
||||
|
||||
:rtype: None
|
||||
"""
|
||||
|
||||
if treeloc == ():
|
||||
print("*", end=" ")
|
||||
if isinstance(tree, Tree):
|
||||
if len(tree) == 0:
|
||||
print(repr(Nonterminal(tree.label())), end=" ")
|
||||
for i in range(len(tree)):
|
||||
if treeloc is not None and i == treeloc[0]:
|
||||
self._trace_fringe(tree[i], treeloc[1:])
|
||||
else:
|
||||
self._trace_fringe(tree[i])
|
||||
else:
|
||||
print(repr(tree), end=" ")
|
||||
|
||||
def _trace_tree(self, tree, frontier, operation):
|
||||
"""
|
||||
Print trace output displaying the parser's current state.
|
||||
|
||||
:param operation: A character identifying the operation that
|
||||
generated the current state.
|
||||
:rtype: None
|
||||
"""
|
||||
if self._trace == 2:
|
||||
print(" %c [" % operation, end=" ")
|
||||
else:
|
||||
print(" [", end=" ")
|
||||
if len(frontier) > 0:
|
||||
self._trace_fringe(tree, frontier[0])
|
||||
else:
|
||||
self._trace_fringe(tree)
|
||||
print("]")
|
||||
|
||||
def _trace_start(self, tree, frontier, text):
|
||||
print("Parsing %r" % " ".join(text))
|
||||
if self._trace > 2:
|
||||
print("Start:")
|
||||
if self._trace > 1:
|
||||
self._trace_tree(tree, frontier, " ")
|
||||
|
||||
def _trace_expand(self, tree, frontier, production):
|
||||
if self._trace > 2:
|
||||
print("Expand: %s" % production)
|
||||
if self._trace > 1:
|
||||
self._trace_tree(tree, frontier, "E")
|
||||
|
||||
def _trace_match(self, tree, frontier, tok):
|
||||
if self._trace > 2:
|
||||
print("Match: %r" % tok)
|
||||
if self._trace > 1:
|
||||
self._trace_tree(tree, frontier, "M")
|
||||
|
||||
def _trace_succeed(self, tree, frontier):
|
||||
if self._trace > 2:
|
||||
print("GOOD PARSE:")
|
||||
if self._trace == 1:
|
||||
print("Found a parse:\n%s" % tree)
|
||||
if self._trace > 1:
|
||||
self._trace_tree(tree, frontier, "+")
|
||||
|
||||
def _trace_backtrack(self, tree, frontier, toks=None):
|
||||
if self._trace > 2:
|
||||
if toks:
|
||||
print("Backtrack: %r match failed" % toks[0])
|
||||
else:
|
||||
print("Backtrack")
|
||||
|
||||
|
||||
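# Illustrative sketch (added; not part of the original module): the recursive
# descent parser driven by a hypothetical toy CFG.  Left-recursive grammars
# would make this parser loop forever, so the toy grammar avoids them.
def _example_recursive_descent():
    from nltk import CFG

    toy = CFG.fromstring(
        """
        S -> NP VP
        NP -> 'the' N
        N -> 'dog'
        VP -> 'barks'
        """
    )
    parser = RecursiveDescentParser(toy)
    return list(parser.parse("the dog barks".split()))

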
##//////////////////////////////////////////////////////
|
||||
## Stepping Recursive Descent Parser
|
||||
##//////////////////////////////////////////////////////
|
||||
class SteppingRecursiveDescentParser(RecursiveDescentParser):
|
||||
"""
|
||||
A ``RecursiveDescentParser`` that allows you to step through the
|
||||
parsing process, performing a single operation at a time.
|
||||
|
||||
The ``initialize`` method is used to start parsing a text.
|
||||
``expand`` expands the first element on the frontier using a single
|
||||
CFG production, and ``match`` matches the first element on the
|
||||
frontier against the next text token. ``backtrack`` undoes the most
|
||||
recent expand or match operation. ``step`` performs a single
|
||||
expand, match, or backtrack operation. ``parses`` returns the set
|
||||
of parses that have been found by the parser.
|
||||
|
||||
:ivar _history: A list of ``(rtext, tree, frontier)`` triples,
|
||||
containing the previous states of the parser. This history is
|
||||
used to implement the ``backtrack`` operation.
|
||||
:ivar _tried_e: A record of all productions that have been tried
|
||||
for a given tree. This record is used by ``expand`` to perform
|
||||
the next untried production.
|
||||
:ivar _tried_m: A record of what tokens have been matched for a
|
||||
given tree. This record is used by ``step`` to decide whether
|
||||
or not to match a token.
|
||||
:see: ``nltk.grammar``
|
||||
"""
|
||||
|
||||
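    # Illustrative step-through (added commentary; the session below is a
    # hypothetical REPL transcript, kept as comments so the class body is
    # unchanged):
    #
    #     >>> from nltk import CFG
    #     >>> toy = CFG.fromstring("S -> 'a' S | 'a'")
    #     >>> sr = SteppingRecursiveDescentParser(toy)
    #     >>> sr.initialize(['a', 'a'])
    #     >>> while sr.step() is not None:
    #     ...     pass
    #     >>> print(list(sr.parses()))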
def __init__(self, grammar, trace=0):
|
||||
super().__init__(grammar, trace)
|
||||
self._rtext = None
|
||||
self._tree = None
|
||||
self._frontier = [()]
|
||||
self._tried_e = {}
|
||||
self._tried_m = {}
|
||||
self._history = []
|
||||
self._parses = []
|
||||
|
||||
# [XX] TEMPORARY HACK WARNING! This should be replaced with
|
||||
# something nicer when we get the chance.
|
||||
def _freeze(self, tree):
|
||||
c = tree.copy()
|
||||
# for pos in c.treepositions('leaves'):
|
||||
# c[pos] = c[pos].freeze()
|
||||
return ImmutableTree.convert(c)
|
||||
|
||||
def parse(self, tokens):
|
||||
tokens = list(tokens)
|
||||
self.initialize(tokens)
|
||||
while self.step() is not None:
|
||||
pass
|
||||
return self.parses()
|
||||
|
||||
def initialize(self, tokens):
|
||||
"""
|
||||
Start parsing a given text. This sets the parser's tree to
|
||||
the start symbol, its frontier to the root node, and its
|
||||
remaining text to the given list of tokens.
|
||||
"""
|
||||
|
||||
self._rtext = tokens
|
||||
start = self._grammar.start().symbol()
|
||||
self._tree = Tree(start, [])
|
||||
self._frontier = [()]
|
||||
self._tried_e = {}
|
||||
self._tried_m = {}
|
||||
self._history = []
|
||||
self._parses = []
|
||||
if self._trace:
|
||||
self._trace_start(self._tree, self._frontier, self._rtext)
|
||||
|
||||
def remaining_text(self):
|
||||
"""
|
||||
:return: The portion of the text that is not yet covered by the
|
||||
tree.
|
||||
:rtype: list(str)
|
||||
"""
|
||||
return self._rtext
|
||||
|
||||
def frontier(self):
|
||||
"""
|
||||
:return: A list of the tree locations of all subtrees that
|
||||
have not yet been expanded, and all leaves that have not
|
||||
yet been matched.
|
||||
:rtype: list(tuple(int))
|
||||
"""
|
||||
return self._frontier
|
||||
|
||||
def tree(self):
|
||||
"""
|
||||
:return: A partial structure for the text that is
|
||||
currently being parsed. The elements specified by the
|
||||
frontier have not yet been expanded or matched.
|
||||
:rtype: Tree
|
||||
"""
|
||||
return self._tree
|
||||
|
||||
def step(self):
|
||||
"""
|
||||
Perform a single parsing operation. If an untried match is
|
||||
possible, then perform the match, and return the matched
|
||||
token. If an untried expansion is possible, then perform the
|
||||
expansion, and return the production that it is based on. If
|
||||
backtracking is possible, then backtrack, and return True.
|
||||
Otherwise, return None.
|
||||
|
||||
:return: None if no operation was performed; a token if a match
|
||||
was performed; a production if an expansion was performed;
|
||||
and True if a backtrack operation was performed.
|
||||
:rtype: Production or String or bool
|
||||
"""
|
||||
# Try matching (if we haven't already)
|
||||
if self.untried_match():
|
||||
token = self.match()
|
||||
if token is not None:
|
||||
return token
|
||||
|
||||
# Try expanding.
|
||||
production = self.expand()
|
||||
if production is not None:
|
||||
return production
|
||||
|
||||
# Try backtracking
|
||||
if self.backtrack():
|
||||
self._trace_backtrack(self._tree, self._frontier)
|
||||
return True
|
||||
|
||||
# Nothing left to do.
|
||||
return None
|
||||
|
||||
def expand(self, production=None):
|
||||
"""
|
||||
Expand the first element of the frontier. In particular, if
|
||||
the first element of the frontier is a subtree whose node type
|
||||
is equal to ``production``'s left hand side, then add a child
|
||||
to that subtree for each element of ``production``'s right hand
|
||||
side. If ``production`` is not specified, then use the first
|
||||
untried expandable production. If all expandable productions
|
||||
have been tried, do nothing.
|
||||
|
||||
:return: The production used to expand the frontier, if an
|
||||
expansion was performed. If no expansion was performed,
|
||||
return None.
|
||||
:rtype: Production or None
|
||||
"""
|
||||
|
||||
# Make sure we *can* expand.
|
||||
if len(self._frontier) == 0:
|
||||
return None
|
||||
if not isinstance(self._tree[self._frontier[0]], Tree):
|
||||
return None
|
||||
|
||||
# If they didn't specify a production, check all untried ones.
|
||||
if production is None:
|
||||
productions = self.untried_expandable_productions()
|
||||
else:
|
||||
productions = [production]
|
||||
|
||||
parses = []
|
||||
for prod in productions:
|
||||
# Record that we've tried this production now.
|
||||
self._tried_e.setdefault(self._freeze(self._tree), []).append(prod)
|
||||
|
||||
# Try expanding.
|
||||
for _result in self._expand(self._rtext, self._tree, self._frontier, prod):
|
||||
return prod
|
||||
|
||||
# We didn't expand anything.
|
||||
return None
|
||||
|
||||
def match(self):
|
||||
"""
|
||||
Match the first element of the frontier. In particular, if
|
||||
the first element of the frontier has the same type as the
|
||||
next text token, then substitute the text token into the tree.
|
||||
|
||||
:return: The token matched, if a match operation was
|
||||
performed. If no match was performed, return None
|
||||
:rtype: str or None
|
||||
"""
|
||||
|
||||
# Record that we've tried matching this token.
|
||||
tok = self._rtext[0]
|
||||
self._tried_m.setdefault(self._freeze(self._tree), []).append(tok)
|
||||
|
||||
# Make sure we *can* match.
|
||||
if len(self._frontier) == 0:
|
||||
return None
|
||||
if isinstance(self._tree[self._frontier[0]], Tree):
|
||||
return None
|
||||
|
||||
for _result in self._match(self._rtext, self._tree, self._frontier):
|
||||
# Return the token we just matched.
|
||||
return self._history[-1][0][0]
|
||||
return None
|
||||
|
||||
def backtrack(self):
|
||||
"""
|
||||
Return the parser to its state before the most recent
|
||||
match or expand operation. Calling ``backtrack`` repeatedly returns
|
||||
the parser to successively earlier states. If no match or
|
||||
expand operations have been performed, ``backtrack`` will make no
|
||||
changes.
|
||||
|
||||
:return: true if an operation was successfully undone.
|
||||
:rtype: bool
|
||||
"""
|
||||
if len(self._history) == 0:
|
||||
return False
|
||||
(self._rtext, self._tree, self._frontier) = self._history.pop()
|
||||
return True
|
||||
|
||||
def expandable_productions(self):
|
||||
"""
|
||||
:return: A list of all the productions for which expansions
|
||||
are available for the current parser state.
|
||||
:rtype: list(Production)
|
||||
"""
|
||||
# Make sure we *can* expand.
|
||||
if len(self._frontier) == 0:
|
||||
return []
|
||||
frontier_child = self._tree[self._frontier[0]]
|
||||
if len(self._frontier) == 0 or not isinstance(frontier_child, Tree):
|
||||
return []
|
||||
|
||||
return [
|
||||
p
|
||||
for p in self._grammar.productions()
|
||||
if p.lhs().symbol() == frontier_child.label()
|
||||
]
|
||||
|
||||
def untried_expandable_productions(self):
|
||||
"""
|
||||
:return: A list of all the untried productions for which
|
||||
expansions are available for the current parser state.
|
||||
:rtype: list(Production)
|
||||
"""
|
||||
|
||||
tried_expansions = self._tried_e.get(self._freeze(self._tree), [])
|
||||
return [p for p in self.expandable_productions() if p not in tried_expansions]
|
||||
|
||||
def untried_match(self):
|
||||
"""
|
||||
:return: Whether the first element of the frontier is a token
|
||||
that has not yet been matched.
|
||||
:rtype: bool
|
||||
"""
|
||||
|
||||
if len(self._rtext) == 0:
|
||||
return False
|
||||
tried_matches = self._tried_m.get(self._freeze(self._tree), [])
|
||||
return self._rtext[0] not in tried_matches
|
||||
|
||||
def currently_complete(self):
|
||||
"""
|
||||
:return: Whether the parser's current state represents a
|
||||
complete parse.
|
||||
:rtype: bool
|
||||
"""
|
||||
return len(self._frontier) == 0 and len(self._rtext) == 0
|
||||
|
||||
def _parse(self, remaining_text, tree, frontier):
|
||||
"""
|
||||
A stub version of ``_parse`` that sets the parser's current
|
||||
state to the given arguments. In ``RecursiveDescentParser``,
|
||||
the ``_parse`` method is used to recursively continue parsing a
|
||||
text. ``SteppingRecursiveDescentParser`` overrides it to
|
||||
capture these recursive calls. It records the parser's old
|
||||
state in the history (to allow for backtracking), and updates
|
||||
the parser's new state using the given arguments. Finally, it
|
||||
returns ``[1]``, which is used by ``match`` and ``expand`` to
|
||||
detect whether their operations were successful.
|
||||
|
||||
:return: ``[1]``
|
||||
:rtype: list of int
|
||||
"""
|
||||
self._history.append((self._rtext, self._tree, self._frontier))
|
||||
self._rtext = remaining_text
|
||||
self._tree = tree
|
||||
self._frontier = frontier
|
||||
|
||||
# Is it a good parse? If so, record it.
|
||||
if len(frontier) == 0 and len(remaining_text) == 0:
|
||||
self._parses.append(tree)
|
||||
self._trace_succeed(self._tree, self._frontier)
|
||||
|
||||
return [1]
|
||||
|
||||
def parses(self):
|
||||
"""
|
||||
:return: An iterator of the parses that have been found by this
|
||||
parser so far.
|
||||
:rtype: iter(Tree)
|
||||
"""
|
||||
return iter(self._parses)
|
||||
|
||||
def set_grammar(self, grammar):
|
||||
"""
|
||||
Change the grammar used to parse texts.
|
||||
|
||||
:param grammar: The new grammar.
|
||||
:type grammar: CFG
|
||||
"""
|
||||
self._grammar = grammar
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Demonstration Code
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo():
|
||||
"""
|
||||
A demonstration of the recursive descent parser.
|
||||
"""
|
||||
|
||||
from nltk import CFG, parse
|
||||
|
||||
grammar = CFG.fromstring(
|
||||
"""
|
||||
S -> NP VP
|
||||
NP -> Det N | Det N PP
|
||||
VP -> V NP | V NP PP
|
||||
PP -> P NP
|
||||
NP -> 'I'
|
||||
N -> 'man' | 'park' | 'telescope' | 'dog'
|
||||
Det -> 'the' | 'a'
|
||||
P -> 'in' | 'with'
|
||||
V -> 'saw'
|
||||
"""
|
||||
)
|
||||
|
||||
for prod in grammar.productions():
|
||||
print(prod)
|
||||
|
||||
sent = "I saw a man in the park".split()
|
||||
parser = parse.RecursiveDescentParser(grammar, trace=2)
|
||||
for p in parser.parse(sent):
|
||||
print(p)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
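
# A minimal illustrative sketch (not part of the original module) of driving the
# stepping parser by hand with the single-operation API documented above
# (``step``, ``backtrack``, ``parses``); it assumes the ``initialize`` method
# defined earlier in ``SteppingRecursiveDescentParser``.
def stepping_demo():
    from nltk import CFG

    grammar = CFG.fromstring(
        """
        S -> NP VP
        NP -> 'I' | Det N
        VP -> V NP
        Det -> 'a'
        N -> 'dog'
        V -> 'saw'
        """
    )
    parser = SteppingRecursiveDescentParser(grammar)
    parser.initialize("I saw a dog".split())

    # Apply one match/expand/backtrack operation at a time; step() returns
    # None once no operation is left to perform.
    while parser.step() is not None:
        pass

    for tree in parser.parses():
        print(tree)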
|
||||
@@ -0,0 +1,478 @@
|
||||
# Natural Language Toolkit: Shift-Reduce Parser
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from nltk.grammar import Nonterminal
|
||||
from nltk.parse.api import ParserI
|
||||
from nltk.tree import Tree
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Shift/Reduce Parser
|
||||
##//////////////////////////////////////////////////////
|
||||
class ShiftReduceParser(ParserI):
|
||||
"""
|
||||
A simple bottom-up CFG parser that uses two operations, "shift"
|
||||
and "reduce", to find a single parse for a text.
|
||||
|
||||
``ShiftReduceParser`` maintains a stack, which records the
|
||||
structure of a portion of the text. This stack is a list of
|
||||
strings and Trees that collectively cover a portion of
|
||||
the text. For example, while parsing the sentence "the dog saw
|
||||
the man" with a typical grammar, ``ShiftReduceParser`` will produce
|
||||
the following stack, which covers "the dog saw"::
|
||||
|
||||
[(NP: (Det: 'the') (N: 'dog')), (V: 'saw')]
|
||||
|
||||
``ShiftReduceParser`` attempts to extend the stack to cover the
|
||||
entire text, and to combine the stack elements into a single tree,
|
||||
producing a complete parse for the sentence.
|
||||
|
||||
Initially, the stack is empty. It is extended to cover the text,
|
||||
from left to right, by repeatedly applying two operations:
|
||||
|
||||
- "shift" moves a token from the beginning of the text to the
|
||||
end of the stack.
|
||||
- "reduce" uses a CFG production to combine the rightmost stack
|
||||
elements into a single Tree.
|
||||
|
||||
Often, more than one operation can be performed on a given stack.
|
||||
In this case, ``ShiftReduceParser`` uses the following heuristics
|
||||
to decide which operation to perform:
|
||||
|
||||
- Only shift if no reductions are available.
|
||||
- If multiple reductions are available, then apply the reduction
|
||||
whose CFG production is listed earliest in the grammar.
|
||||
|
||||
Note that these heuristics are not guaranteed to choose an
|
||||
operation that leads to a parse of the text. Also, if multiple
|
||||
parses exist, ``ShiftReduceParser`` will return at most one of
|
||||
them.
|
||||
|
||||
:see: ``nltk.grammar``
|
||||
"""
|
||||
|
||||
def __init__(self, grammar, trace=0):
|
||||
"""
|
||||
Create a new ``ShiftReduceParser``, that uses ``grammar`` to
|
||||
parse texts.
|
||||
|
||||
:type grammar: Grammar
|
||||
:param grammar: The grammar used to parse texts.
|
||||
:type trace: int
|
||||
:param trace: The level of tracing that should be used when
|
||||
parsing a text. ``0`` will generate no tracing output;
|
||||
and higher numbers will produce more verbose tracing
|
||||
output.
|
||||
"""
|
||||
self._grammar = grammar
|
||||
self._trace = trace
|
||||
self._check_grammar()
|
||||
|
||||
def grammar(self):
|
||||
return self._grammar
|
||||
|
||||
def parse(self, tokens):
|
||||
tokens = list(tokens)
|
||||
self._grammar.check_coverage(tokens)
|
||||
|
||||
# initialize the stack.
|
||||
stack = []
|
||||
remaining_text = tokens
|
||||
|
||||
# Trace output.
|
||||
if self._trace:
|
||||
print("Parsing %r" % " ".join(tokens))
|
||||
self._trace_stack(stack, remaining_text)
|
||||
|
||||
# iterate through the text, pushing the token onto
|
||||
# the stack, then reducing the stack.
|
||||
while len(remaining_text) > 0:
|
||||
self._shift(stack, remaining_text)
|
||||
while self._reduce(stack, remaining_text):
|
||||
pass
|
||||
|
||||
# Did we reduce everything?
|
||||
if len(stack) == 1:
|
||||
# Did we end up with the right category?
|
||||
if stack[0].label() == self._grammar.start().symbol():
|
||||
yield stack[0]
|
||||
|
||||
def _shift(self, stack, remaining_text):
|
||||
"""
|
||||
Move a token from the beginning of ``remaining_text`` to the
|
||||
end of ``stack``.
|
||||
|
||||
:type stack: list(str and Tree)
|
||||
:param stack: A list of strings and Trees, encoding
|
||||
the structure of the text that has been parsed so far.
|
||||
:type remaining_text: list(str)
|
||||
:param remaining_text: The portion of the text that is not yet
|
||||
covered by ``stack``.
|
||||
:rtype: None
|
||||
"""
|
||||
stack.append(remaining_text[0])
|
||||
remaining_text.remove(remaining_text[0])
|
||||
if self._trace:
|
||||
self._trace_shift(stack, remaining_text)
|
||||
|
||||
def _match_rhs(self, rhs, rightmost_stack):
|
||||
"""
|
||||
:rtype: bool
|
||||
:return: true if the right hand side of a CFG production
|
||||
matches the rightmost elements of the stack. ``rhs``
|
||||
matches ``rightmost_stack`` if they are the same length,
|
||||
and each element of ``rhs`` matches the corresponding
|
||||
element of ``rightmost_stack``. A nonterminal element of
|
||||
``rhs`` matches any Tree whose node value is equal
|
||||
to the nonterminal's symbol. A terminal element of ``rhs``
|
||||
matches any string whose type is equal to the terminal.
|
||||
:type rhs: list(terminal and Nonterminal)
|
||||
:param rhs: The right hand side of a CFG production.
|
||||
:type rightmost_stack: list(string and Tree)
|
||||
:param rightmost_stack: The rightmost elements of the parser's
|
||||
stack.
|
||||
"""
|
||||
|
||||
if len(rightmost_stack) != len(rhs):
|
||||
return False
|
||||
for i in range(len(rightmost_stack)):
|
||||
if isinstance(rightmost_stack[i], Tree):
|
||||
if not isinstance(rhs[i], Nonterminal):
|
||||
return False
|
||||
if rightmost_stack[i].label() != rhs[i].symbol():
|
||||
return False
|
||||
else:
|
||||
if isinstance(rhs[i], Nonterminal):
|
||||
return False
|
||||
if rightmost_stack[i] != rhs[i]:
|
||||
return False
|
||||
return True
|
||||
|
||||
def _reduce(self, stack, remaining_text, production=None):
|
||||
"""
|
||||
Find a CFG production whose right hand side matches the
|
||||
rightmost stack elements; and combine those stack elements
|
||||
into a single Tree, with the node specified by the
|
||||
production's left-hand side. If more than one CFG production
|
||||
matches the stack, then use the production that is listed
|
||||
earliest in the grammar. The new Tree replaces the
|
||||
elements in the stack.
|
||||
|
||||
:rtype: Production or None
|
||||
:return: If a reduction is performed, then return the CFG
|
||||
production that the reduction is based on; otherwise,
|
||||
return None.
|
||||
:type stack: list(string and Tree)
|
||||
:param stack: A list of strings and Trees, encoding
|
||||
the structure of the text that has been parsed so far.
|
||||
:type remaining_text: list(str)
|
||||
:param remaining_text: The portion of the text that is not yet
|
||||
covered by ``stack``.
|
||||
"""
|
||||
if production is None:
|
||||
productions = self._grammar.productions()
|
||||
else:
|
||||
productions = [production]
|
||||
|
||||
# Try each production, in order.
|
||||
for production in productions:
|
||||
rhslen = len(production.rhs())
|
||||
|
||||
# check if the RHS of a production matches the top of the stack
|
||||
if self._match_rhs(production.rhs(), stack[-rhslen:]):
|
||||
# combine the tree to reflect the reduction
|
||||
tree = Tree(production.lhs().symbol(), stack[-rhslen:])
|
||||
stack[-rhslen:] = [tree]
|
||||
|
||||
# We reduced something
|
||||
if self._trace:
|
||||
self._trace_reduce(stack, production, remaining_text)
|
||||
return production
|
||||
|
||||
# We didn't reduce anything
|
||||
return None
|
||||
|
||||
def trace(self, trace=2):
|
||||
"""
|
||||
Set the level of tracing output that should be generated when
|
||||
parsing a text.
|
||||
|
||||
:type trace: int
|
||||
:param trace: The trace level. A trace level of ``0`` will
|
||||
generate no tracing output; and higher trace levels will
|
||||
produce more verbose tracing output.
|
||||
:rtype: None
|
||||
"""
|
||||
# 1: just show shifts.
|
||||
# 2: show shifts & reduces
|
||||
# 3: display which tokens & productions are shifted/reduced
|
||||
self._trace = trace
|
||||
|
||||
def _trace_stack(self, stack, remaining_text, marker=" "):
|
||||
"""
|
||||
Print trace output displaying the given stack and text.
|
||||
|
||||
:rtype: None
|
||||
:param marker: A character that is printed to the left of the
|
||||
stack. This is used with trace level 2 to print 'S'
|
||||
before shifted stacks and 'R' before reduced stacks.
|
||||
"""
|
||||
s = " " + marker + " [ "
|
||||
for elt in stack:
|
||||
if isinstance(elt, Tree):
|
||||
s += repr(Nonterminal(elt.label())) + " "
|
||||
else:
|
||||
s += repr(elt) + " "
|
||||
s += "* " + " ".join(remaining_text) + "]"
|
||||
print(s)
|
||||
|
||||
def _trace_shift(self, stack, remaining_text):
|
||||
"""
|
||||
Print trace output displaying that a token has been shifted.
|
||||
|
||||
:rtype: None
|
||||
"""
|
||||
if self._trace > 2:
|
||||
print("Shift %r:" % stack[-1])
|
||||
if self._trace == 2:
|
||||
self._trace_stack(stack, remaining_text, "S")
|
||||
elif self._trace > 0:
|
||||
self._trace_stack(stack, remaining_text)
|
||||
|
||||
def _trace_reduce(self, stack, production, remaining_text):
|
||||
"""
|
||||
Print trace output displaying that ``production`` was used to
|
||||
reduce ``stack``.
|
||||
|
||||
:rtype: None
|
||||
"""
|
||||
if self._trace > 2:
|
||||
rhs = " ".join(production.rhs())
|
||||
print(f"Reduce {production.lhs()!r} <- {rhs}")
|
||||
if self._trace == 2:
|
||||
self._trace_stack(stack, remaining_text, "R")
|
||||
elif self._trace > 1:
|
||||
self._trace_stack(stack, remaining_text)
|
||||
|
||||
def _check_grammar(self):
|
||||
"""
|
||||
Check to make sure that all of the CFG productions are
|
||||
potentially useful. If any productions can never be used,
|
||||
then print a warning.
|
||||
|
||||
:rtype: None
|
||||
"""
|
||||
productions = self._grammar.productions()
|
||||
|
||||
# Any production whose RHS is an extension of another production's RHS
|
||||
# will never be used.
|
||||
for i in range(len(productions)):
|
||||
for j in range(i + 1, len(productions)):
|
||||
rhs1 = productions[i].rhs()
|
||||
rhs2 = productions[j].rhs()
|
||||
if rhs1[: len(rhs2)] == rhs2:
|
||||
print("Warning: %r will never be used" % productions[i])
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Stepping Shift/Reduce Parser
|
||||
##//////////////////////////////////////////////////////
|
||||
class SteppingShiftReduceParser(ShiftReduceParser):
|
||||
"""
|
||||
A ``ShiftReduceParser`` that allows you to step through the parsing
|
||||
process, performing a single operation at a time. It also allows
|
||||
you to change the parser's grammar midway through parsing a text.
|
||||
|
||||
The ``initialize`` method is used to start parsing a text.
|
||||
``shift`` performs a single shift operation, and ``reduce`` performs
|
||||
a single reduce operation. ``step`` will perform a single reduce
|
||||
operation if possible; otherwise, it will perform a single shift
|
||||
operation. ``parses`` returns the set of parses that have been
|
||||
found by the parser.
|
||||
|
||||
:ivar _history: A list of ``(stack, remaining_text)`` pairs,
|
||||
containing all of the previous states of the parser. This
|
||||
history is used to implement the ``undo`` operation.
|
||||
:see: ``nltk.grammar``
|
||||
"""
|
||||
|
||||
def __init__(self, grammar, trace=0):
|
||||
super().__init__(grammar, trace)
|
||||
self._stack = None
|
||||
self._remaining_text = None
|
||||
self._history = []
|
||||
|
||||
def parse(self, tokens):
|
||||
tokens = list(tokens)
|
||||
self.initialize(tokens)
|
||||
while self.step():
|
||||
pass
|
||||
return self.parses()
|
||||
|
||||
def stack(self):
|
||||
"""
|
||||
:return: The parser's stack.
|
||||
:rtype: list(str and Tree)
|
||||
"""
|
||||
return self._stack
|
||||
|
||||
def remaining_text(self):
|
||||
"""
|
||||
:return: The portion of the text that is not yet covered by the
|
||||
stack.
|
||||
:rtype: list(str)
|
||||
"""
|
||||
return self._remaining_text
|
||||
|
||||
def initialize(self, tokens):
|
||||
"""
|
||||
Start parsing a given text. This sets the parser's stack to
|
||||
``[]`` and sets its remaining text to ``tokens``.
|
||||
"""
|
||||
self._stack = []
|
||||
self._remaining_text = tokens
|
||||
self._history = []
|
||||
|
||||
def step(self):
|
||||
"""
|
||||
Perform a single parsing operation. If a reduction is
|
||||
possible, then perform that reduction, and return the
|
||||
production that it is based on. Otherwise, if a shift is
|
||||
possible, then perform it, and return True. Otherwise,
|
||||
return False.
|
||||
|
||||
:return: False if no operation was performed; True if a shift was
|
||||
performed; and the CFG production used to reduce if a
|
||||
reduction was performed.
|
||||
:rtype: Production or bool
|
||||
"""
|
||||
return self.reduce() or self.shift()
|
||||
|
||||
def shift(self):
|
||||
"""
|
||||
Move a token from the beginning of the remaining text to the
|
||||
end of the stack. If there are no more tokens in the
|
||||
remaining text, then do nothing.
|
||||
|
||||
:return: True if the shift operation was successful.
|
||||
:rtype: bool
|
||||
"""
|
||||
if len(self._remaining_text) == 0:
|
||||
return False
|
||||
self._history.append((self._stack[:], self._remaining_text[:]))
|
||||
self._shift(self._stack, self._remaining_text)
|
||||
return True
|
||||
|
||||
def reduce(self, production=None):
|
||||
"""
|
||||
Use ``production`` to combine the rightmost stack elements into
|
||||
a single Tree. If ``production`` does not match the
|
||||
rightmost stack elements, then do nothing.
|
||||
|
||||
:return: The production used to reduce the stack, if a
|
||||
reduction was performed. If no reduction was performed,
|
||||
return None.
|
||||
|
||||
:rtype: Production or None
|
||||
"""
|
||||
self._history.append((self._stack[:], self._remaining_text[:]))
|
||||
return_val = self._reduce(self._stack, self._remaining_text, production)
|
||||
|
||||
if not return_val:
|
||||
self._history.pop()
|
||||
return return_val
|
||||
|
||||
def undo(self):
|
||||
"""
|
||||
Return the parser to its state before the most recent
|
||||
shift or reduce operation. Calling ``undo`` repeatedly returns
|
||||
the parser to successively earlier states. If no shift or
|
||||
reduce operations have been performed, ``undo`` will make no
|
||||
changes.
|
||||
|
||||
:return: true if an operation was successfully undone.
|
||||
:rtype: bool
|
||||
"""
|
||||
if len(self._history) == 0:
|
||||
return False
|
||||
(self._stack, self._remaining_text) = self._history.pop()
|
||||
return True
|
||||
|
||||
def reducible_productions(self):
|
||||
"""
|
||||
:return: A list of the productions for which reductions are
|
||||
available for the current parser state.
|
||||
:rtype: list(Production)
|
||||
"""
|
||||
productions = []
|
||||
for production in self._grammar.productions():
|
||||
rhslen = len(production.rhs())
|
||||
if self._match_rhs(production.rhs(), self._stack[-rhslen:]):
|
||||
productions.append(production)
|
||||
return productions
|
||||
|
||||
def parses(self):
|
||||
"""
|
||||
:return: An iterator of the parses that have been found by this
|
||||
parser so far.
|
||||
:rtype: iter(Tree)
|
||||
"""
|
||||
if (
|
||||
len(self._remaining_text) == 0
|
||||
and len(self._stack) == 1
|
||||
and self._stack[0].label() == self._grammar.start().symbol()
|
||||
):
|
||||
yield self._stack[0]
|
||||
|
||||
# copied from nltk.parser
|
||||
|
||||
def set_grammar(self, grammar):
|
||||
"""
|
||||
Change the grammar used to parse texts.
|
||||
|
||||
:param grammar: The new grammar.
|
||||
:type grammar: CFG
|
||||
"""
|
||||
self._grammar = grammar
|
||||
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Demonstration Code
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo():
|
||||
"""
|
||||
A demonstration of the shift-reduce parser.
|
||||
"""
|
||||
|
||||
from nltk import CFG, parse
|
||||
|
||||
grammar = CFG.fromstring(
|
||||
"""
|
||||
S -> NP VP
|
||||
NP -> Det N | Det N PP
|
||||
VP -> V NP | V NP PP
|
||||
PP -> P NP
|
||||
NP -> 'I'
|
||||
N -> 'man' | 'park' | 'telescope' | 'dog'
|
||||
Det -> 'the' | 'a'
|
||||
P -> 'in' | 'with'
|
||||
V -> 'saw'
|
||||
"""
|
||||
)
|
||||
|
||||
sent = "I saw a man in the park".split()
|
||||
|
||||
parser = parse.ShiftReduceParser(grammar, trace=2)
|
||||
for p in parser.parse(sent):
|
||||
print(p)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
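
# A minimal illustrative sketch (not part of the original module) of stepping
# through a shift-reduce parse by hand with ``SteppingShiftReduceParser``,
# using only the methods defined above (``initialize``, ``step``, ``stack``,
# ``remaining_text``, ``parses``).
def stepping_demo():
    from nltk import CFG

    grammar = CFG.fromstring(
        """
        S -> NP VP
        NP -> 'I' | Det N
        VP -> V NP
        Det -> 'a'
        N -> 'dog'
        V -> 'saw'
        """
    )
    parser = SteppingShiftReduceParser(grammar)
    parser.initialize("I saw a dog".split())

    # step() prefers a reduction over a shift and returns a false value once
    # neither operation applies.
    while parser.step():
        print(parser.stack(), parser.remaining_text())

    for tree in parser.parses():
        print(tree)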
|
||||
468
Backend/venv/lib/python3.12/site-packages/nltk/parse/stanford.py
Normal file
@@ -0,0 +1,468 @@
|
||||
# Natural Language Toolkit: Interface to the Stanford Parser
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Xu <xxu@student.unimelb.edu.au>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
import warnings
|
||||
from subprocess import PIPE
|
||||
|
||||
from nltk.internals import (
|
||||
_java_options,
|
||||
config_java,
|
||||
find_jar_iter,
|
||||
find_jars_within_path,
|
||||
java,
|
||||
)
|
||||
from nltk.parse.api import ParserI
|
||||
from nltk.parse.dependencygraph import DependencyGraph
|
||||
from nltk.tree import Tree
|
||||
|
||||
_stanford_url = "https://nlp.stanford.edu/software/lex-parser.shtml"
|
||||
|
||||
|
||||
class GenericStanfordParser(ParserI):
|
||||
"""Interface to the Stanford Parser"""
|
||||
|
||||
_MODEL_JAR_PATTERN = r"stanford-parser-(\d+)(\.(\d+))+-models\.jar"
|
||||
_JAR = r"stanford-parser\.jar"
|
||||
_MAIN_CLASS = "edu.stanford.nlp.parser.lexparser.LexicalizedParser"
|
||||
|
||||
_USE_STDIN = False
|
||||
_DOUBLE_SPACED_OUTPUT = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path_to_jar=None,
|
||||
path_to_models_jar=None,
|
||||
model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
|
||||
encoding="utf8",
|
||||
verbose=False,
|
||||
java_options="-mx4g",
|
||||
corenlp_options="",
|
||||
):
|
||||
# find the most recent code and model jar
|
||||
stanford_jar = max(
|
||||
find_jar_iter(
|
||||
self._JAR,
|
||||
path_to_jar,
|
||||
env_vars=("STANFORD_PARSER", "STANFORD_CORENLP"),
|
||||
searchpath=(),
|
||||
url=_stanford_url,
|
||||
verbose=verbose,
|
||||
is_regex=True,
|
||||
),
|
||||
key=lambda model_path: os.path.dirname(model_path),
|
||||
)
|
||||
|
||||
model_jar = max(
|
||||
find_jar_iter(
|
||||
self._MODEL_JAR_PATTERN,
|
||||
path_to_models_jar,
|
||||
env_vars=("STANFORD_MODELS", "STANFORD_CORENLP"),
|
||||
searchpath=(),
|
||||
url=_stanford_url,
|
||||
verbose=verbose,
|
||||
is_regex=True,
|
||||
),
|
||||
key=lambda model_path: os.path.dirname(model_path),
|
||||
)
|
||||
|
||||
# self._classpath = (stanford_jar, model_jar)
|
||||
|
||||
# Adding logging jar files to classpath
|
||||
stanford_dir = os.path.split(stanford_jar)[0]
|
||||
self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir))
|
||||
|
||||
self.model_path = model_path
|
||||
self._encoding = encoding
|
||||
self.corenlp_options = corenlp_options
|
||||
self.java_options = java_options
|
||||
|
||||
def _parse_trees_output(self, output_):
|
||||
res = []
|
||||
cur_lines = []
|
||||
cur_trees = []
|
||||
blank = False
|
||||
for line in output_.splitlines(False):
|
||||
if line == "":
|
||||
if blank:
|
||||
res.append(iter(cur_trees))
|
||||
cur_trees = []
|
||||
blank = False
|
||||
elif self._DOUBLE_SPACED_OUTPUT:
|
||||
cur_trees.append(self._make_tree("\n".join(cur_lines)))
|
||||
cur_lines = []
|
||||
blank = True
|
||||
else:
|
||||
res.append(iter([self._make_tree("\n".join(cur_lines))]))
|
||||
cur_lines = []
|
||||
else:
|
||||
cur_lines.append(line)
|
||||
blank = False
|
||||
return iter(res)
|
||||
|
||||
def parse_sents(self, sentences, verbose=False):
|
||||
"""
|
||||
Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
|
||||
list where each sentence is a list of words.
|
||||
Each sentence will be automatically tagged with this StanfordParser instance's
|
||||
tagger.
|
||||
If whitespace exists inside a token, then the token will be treated as
|
||||
separate tokens.
|
||||
|
||||
:param sentences: Input sentences to parse
|
||||
:type sentences: list(list(str))
|
||||
:rtype: iter(iter(Tree))
|
||||
"""
|
||||
cmd = [
|
||||
self._MAIN_CLASS,
|
||||
"-model",
|
||||
self.model_path,
|
||||
"-sentences",
|
||||
"newline",
|
||||
"-outputFormat",
|
||||
self._OUTPUT_FORMAT,
|
||||
"-tokenized",
|
||||
"-escaper",
|
||||
"edu.stanford.nlp.process.PTBEscapingProcessor",
|
||||
]
|
||||
return self._parse_trees_output(
|
||||
self._execute(
|
||||
cmd, "\n".join(" ".join(sentence) for sentence in sentences), verbose
|
||||
)
|
||||
)
|
||||
|
||||
def raw_parse(self, sentence, verbose=False):
|
||||
"""
|
||||
Use StanfordParser to parse a sentence. Takes a sentence as a string;
|
||||
before parsing, it will be automatically tokenized and tagged by
|
||||
the Stanford Parser.
|
||||
|
||||
:param sentence: Input sentence to parse
|
||||
:type sentence: str
|
||||
:rtype: iter(Tree)
|
||||
"""
|
||||
return next(self.raw_parse_sents([sentence], verbose))
|
||||
|
||||
def raw_parse_sents(self, sentences, verbose=False):
|
||||
"""
|
||||
Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
|
||||
list of strings.
|
||||
Each sentence will be automatically tokenized and tagged by the Stanford Parser.
|
||||
|
||||
:param sentences: Input sentences to parse
|
||||
:type sentences: list(str)
|
||||
:rtype: iter(iter(Tree))
|
||||
"""
|
||||
cmd = [
|
||||
self._MAIN_CLASS,
|
||||
"-model",
|
||||
self.model_path,
|
||||
"-sentences",
|
||||
"newline",
|
||||
"-outputFormat",
|
||||
self._OUTPUT_FORMAT,
|
||||
]
|
||||
return self._parse_trees_output(
|
||||
self._execute(cmd, "\n".join(sentences), verbose)
|
||||
)
|
||||
|
||||
def tagged_parse(self, sentence, verbose=False):
|
||||
"""
|
||||
Use StanfordParser to parse a sentence. Takes a sentence as a list of
|
||||
(word, tag) tuples; the sentence must have already been tokenized and
|
||||
tagged.
|
||||
|
||||
:param sentence: Input sentence to parse
|
||||
:type sentence: list(tuple(str, str))
|
||||
:rtype: iter(Tree)
|
||||
"""
|
||||
return next(self.tagged_parse_sents([sentence], verbose))
|
||||
|
||||
def tagged_parse_sents(self, sentences, verbose=False):
|
||||
"""
|
||||
Use StanfordParser to parse multiple sentences. Takes multiple sentences
|
||||
where each sentence is a list of (word, tag) tuples.
|
||||
The sentences must have already been tokenized and tagged.
|
||||
|
||||
:param sentences: Input sentences to parse
|
||||
:type sentences: list(list(tuple(str, str)))
|
||||
:rtype: iter(iter(Tree))
|
||||
"""
|
||||
tag_separator = "/"
|
||||
cmd = [
|
||||
self._MAIN_CLASS,
|
||||
"-model",
|
||||
self.model_path,
|
||||
"-sentences",
|
||||
"newline",
|
||||
"-outputFormat",
|
||||
self._OUTPUT_FORMAT,
|
||||
"-tokenized",
|
||||
"-tagSeparator",
|
||||
tag_separator,
|
||||
"-tokenizerFactory",
|
||||
"edu.stanford.nlp.process.WhitespaceTokenizer",
|
||||
"-tokenizerMethod",
|
||||
"newCoreLabelTokenizerFactory",
|
||||
]
|
||||
# We don't need to escape slashes as "splitting is done on the last instance of the character in the token"
|
||||
return self._parse_trees_output(
|
||||
self._execute(
|
||||
cmd,
|
||||
"\n".join(
|
||||
" ".join(tag_separator.join(tagged) for tagged in sentence)
|
||||
for sentence in sentences
|
||||
),
|
||||
verbose,
|
||||
)
|
||||
)
|
||||
|
||||
def _execute(self, cmd, input_, verbose=False):
|
||||
encoding = self._encoding
|
||||
cmd.extend(["-encoding", encoding])
|
||||
if self.corenlp_options:
|
||||
cmd.extend(self.corenlp_options.split())
|
||||
|
||||
default_options = " ".join(_java_options)
|
||||
|
||||
# Configure java.
|
||||
config_java(options=self.java_options, verbose=verbose)
|
||||
|
||||
# Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
|
||||
with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
|
||||
# Write the actual sentences to the temporary input file
|
||||
if isinstance(input_, str) and encoding:
|
||||
input_ = input_.encode(encoding)
|
||||
input_file.write(input_)
|
||||
input_file.flush()
|
||||
|
||||
# Run the tagger and get the output.
|
||||
if self._USE_STDIN:
|
||||
input_file.seek(0)
|
||||
stdout, stderr = java(
|
||||
cmd,
|
||||
classpath=self._classpath,
|
||||
stdin=input_file,
|
||||
stdout=PIPE,
|
||||
stderr=PIPE,
|
||||
)
|
||||
else:
|
||||
cmd.append(input_file.name)
|
||||
stdout, stderr = java(
|
||||
cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE
|
||||
)
|
||||
|
||||
stdout = stdout.replace(b"\xc2\xa0", b" ")
|
||||
stdout = stdout.replace(b"\x00\xa0", b" ")
|
||||
stdout = stdout.decode(encoding)
|
||||
|
||||
os.unlink(input_file.name)
|
||||
|
||||
# Return java configurations to their default values.
|
||||
config_java(options=default_options, verbose=False)
|
||||
|
||||
return stdout
|
||||
|
||||
|
||||
class StanfordParser(GenericStanfordParser):
|
||||
"""
|
||||
>>> parser=StanfordParser(
|
||||
... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
|
||||
... ) # doctest: +SKIP
|
||||
|
||||
>>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
|
||||
Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
|
||||
Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]
|
||||
|
||||
>>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((
|
||||
... "the quick brown fox jumps over the lazy dog",
|
||||
... "the quick grey wolf jumps over the lazy fox"
|
||||
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
|
||||
Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
|
||||
Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',
|
||||
[Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP',
|
||||
[Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']),
|
||||
Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]
|
||||
|
||||
>>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((
|
||||
... "I 'm a dog".split(),
|
||||
... "This is my friends ' cat ( the tabby )".split(),
|
||||
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
|
||||
Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
|
||||
[Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),
|
||||
Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', [Tree('', []),
|
||||
Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', [])])])])])])])]
|
||||
|
||||
>>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
|
||||
... (
|
||||
... ("The", "DT"),
|
||||
... ("quick", "JJ"),
|
||||
... ("brown", "JJ"),
|
||||
... ("fox", "NN"),
|
||||
... ("jumped", "VBD"),
|
||||
... ("over", "IN"),
|
||||
... ("the", "DT"),
|
||||
... ("lazy", "JJ"),
|
||||
... ("dog", "NN"),
|
||||
... (".", "."),
|
||||
... ),
|
||||
... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
|
||||
Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',
|
||||
[Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
|
||||
"""
|
||||
|
||||
_OUTPUT_FORMAT = "penn"
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
warnings.warn(
|
||||
"The StanfordParser will be deprecated\n"
|
||||
"Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def _make_tree(self, result):
|
||||
return Tree.fromstring(result)
|
||||
|
||||
|
||||
class StanfordDependencyParser(GenericStanfordParser):
|
||||
"""
|
||||
>>> dep_parser=StanfordDependencyParser(
|
||||
... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
|
||||
... ) # doctest: +SKIP
|
||||
|
||||
>>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]
|
||||
|
||||
>>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
|
||||
((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
|
||||
((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
|
||||
((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
|
||||
|
||||
>>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
|
||||
... "The quick brown fox jumps over the lazy dog.",
|
||||
... "The quick grey wolf jumps over the lazy fox."
|
||||
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]),
|
||||
Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])]
|
||||
|
||||
>>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
|
||||
... "I 'm a dog".split(),
|
||||
... "This is my friends ' cat ( the tabby )".split(),
|
||||
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])]
|
||||
|
||||
>>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents((
|
||||
... (
|
||||
... ("The", "DT"),
|
||||
... ("quick", "JJ"),
|
||||
... ("brown", "JJ"),
|
||||
... ("fox", "NN"),
|
||||
... ("jumped", "VBD"),
|
||||
... ("over", "IN"),
|
||||
... ("the", "DT"),
|
||||
... ("lazy", "JJ"),
|
||||
... ("dog", "NN"),
|
||||
... (".", "."),
|
||||
... ),
|
||||
... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
|
||||
((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
|
||||
((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
|
||||
((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
|
||||
|
||||
"""
|
||||
|
||||
_OUTPUT_FORMAT = "conll2007"
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
warnings.warn(
|
||||
"The StanfordDependencyParser will be deprecated\n"
|
||||
"Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def _make_tree(self, result):
|
||||
return DependencyGraph(result, top_relation_label="root")
|
||||
|
||||
|
||||
class StanfordNeuralDependencyParser(GenericStanfordParser):
|
||||
"""
|
||||
>>> from nltk.parse.stanford import StanfordNeuralDependencyParser # doctest: +SKIP
|
||||
>>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')# doctest: +SKIP
|
||||
|
||||
>>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])]
|
||||
|
||||
>>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det',
|
||||
(u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'),
|
||||
u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')),
|
||||
((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det',
|
||||
(u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')), ((u'jumps', u'VBZ'),
|
||||
u'punct', (u'.', u'.'))]]
|
||||
|
||||
>>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
|
||||
... "The quick brown fox jumps over the lazy dog.",
|
||||
... "The quick grey wolf jumps over the lazy fox."
|
||||
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over',
|
||||
'the', 'lazy']), '.']), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']),
|
||||
Tree('fox', ['over', 'the', 'lazy']), '.'])]
|
||||
|
||||
>>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
|
||||
... "I 'm a dog".split(),
|
||||
... "This is my friends ' cat ( the tabby )".split(),
|
||||
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
|
||||
[Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends',
|
||||
['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]
|
||||
"""
|
||||
|
||||
_OUTPUT_FORMAT = "conll"
|
||||
_MAIN_CLASS = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
|
||||
_JAR = r"stanford-corenlp-(\d+)(\.(\d+))+\.jar"
|
||||
_MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)(\.(\d+))+-models\.jar"
|
||||
_USE_STDIN = True
|
||||
_DOUBLE_SPACED_OUTPUT = True
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
warnings.warn(
|
||||
"The StanfordNeuralDependencyParser will be deprecated\n"
|
||||
"Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
super().__init__(*args, **kwargs)
|
||||
self.corenlp_options += "-annotators tokenize,ssplit,pos,depparse"
|
||||
|
||||
def tagged_parse_sents(self, sentences, verbose=False):
|
||||
"""
|
||||
Currently unimplemented because the neural dependency parser (and
|
||||
the StanfordCoreNLP pipeline class) doesn't support passing in pre-
|
||||
tagged tokens.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"tagged_parse[_sents] is not supported by "
|
||||
"StanfordNeuralDependencyParser; use "
|
||||
"parse[_sents] or raw_parse[_sents] instead."
|
||||
)
|
||||
|
||||
def _make_tree(self, result):
|
||||
return DependencyGraph(result, top_relation_label="ROOT")
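
# The deprecation warnings above point to ``nltk.parse.corenlp``. A minimal
# illustrative sketch (not part of the original module) of the suggested
# replacement, assuming a CoreNLP server is already running on
# http://localhost:9000:
def corenlp_example():
    from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser

    parser = CoreNLPParser(url="http://localhost:9000")
    print(next(parser.raw_parse("The quick brown fox jumps over the lazy dog.")))

    dep_parser = CoreNLPDependencyParser(url="http://localhost:9000")
    (parse,) = dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")
    print(list(parse.triples()))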
|
||||
@@ -0,0 +1,793 @@
|
||||
# Natural Language Toolkit: Arc-Standard and Arc-eager Transition Based Parsers
|
||||
#
|
||||
# Author: Long Duong <longdt219@gmail.com>
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import pickle
|
||||
import tempfile
|
||||
from copy import deepcopy
|
||||
from operator import itemgetter
|
||||
from os import remove
|
||||
|
||||
try:
|
||||
from numpy import array
|
||||
from scipy import sparse
|
||||
from sklearn import svm
|
||||
from sklearn.datasets import load_svmlight_file
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
from nltk.parse import DependencyEvaluator, DependencyGraph, ParserI
|
||||
|
||||
|
||||
class Configuration:
|
||||
"""
|
||||
Class for holding a configuration, which is a partial analysis of the input sentence.
|
||||
The transition-based parser aims at finding a set of operators that transfers the initial
|
||||
configuration to the terminal configuration.
|
||||
|
||||
The configuration includes:
|
||||
- Stack: for storing partially processed words
|
||||
- Buffer: for storing remaining input words
|
||||
- Set of arcs: for storing partially built dependency tree
|
||||
|
||||
This class also provides a method to represent a configuration as a list of features.
|
||||
"""
|
||||
|
||||
def __init__(self, dep_graph):
|
||||
"""
|
||||
:param dep_graph: the representation of an input in the form of a dependency graph.
|
||||
:type dep_graph: DependencyGraph where the dependencies are not specified.
|
||||
"""
|
||||
# dep_graph.nodes contains the list of tokens for a sentence
|
||||
self.stack = [0] # The root element
|
||||
self.buffer = list(range(1, len(dep_graph.nodes))) # The rest is in the buffer
|
||||
self.arcs = []  # empty set of arcs
|
||||
self._tokens = dep_graph.nodes
|
||||
self._max_address = len(self.buffer)
|
||||
|
||||
def __str__(self):
|
||||
return (
|
||||
"Stack : "
|
||||
+ str(self.stack)
|
||||
+ " Buffer : "
|
||||
+ str(self.buffer)
|
||||
+ " Arcs : "
|
||||
+ str(self.arcs)
|
||||
)
|
||||
|
||||
def _check_informative(self, feat, flag=False):
|
||||
"""
|
||||
Check whether a feature is informative
|
||||
The flag controls whether "_" is informative or not
|
||||
"""
|
||||
if feat is None:
|
||||
return False
|
||||
if feat == "":
|
||||
return False
|
||||
if flag is False:
|
||||
if feat == "_":
|
||||
return False
|
||||
return True
|
||||
|
||||
def extract_features(self):
|
||||
"""
|
||||
Extract the set of features for the current configuration. Implements standard features as described in
|
||||
Table 3.2 (page 31) of the Dependency Parsing book by Sandra Kubler, Ryan McDonald, Joakim Nivre.
|
||||
Please note that these features are very basic.
|
||||
:return: list(str)
|
||||
"""
|
||||
result = []
|
||||
# Todo : can come up with a more complicated feature set for better
|
||||
# performance.
|
||||
if len(self.stack) > 0:
|
||||
# Stack 0
|
||||
stack_idx0 = self.stack[len(self.stack) - 1]
|
||||
token = self._tokens[stack_idx0]
|
||||
if self._check_informative(token["word"], True):
|
||||
result.append("STK_0_FORM_" + token["word"])
|
||||
if "lemma" in token and self._check_informative(token["lemma"]):
|
||||
result.append("STK_0_LEMMA_" + token["lemma"])
|
||||
if self._check_informative(token["tag"]):
|
||||
result.append("STK_0_POS_" + token["tag"])
|
||||
if "feats" in token and self._check_informative(token["feats"]):
|
||||
feats = token["feats"].split("|")
|
||||
for feat in feats:
|
||||
result.append("STK_0_FEATS_" + feat)
|
||||
# Stack 1
|
||||
if len(self.stack) > 1:
|
||||
stack_idx1 = self.stack[len(self.stack) - 2]
|
||||
token = self._tokens[stack_idx1]
|
||||
if self._check_informative(token["tag"]):
|
||||
result.append("STK_1_POS_" + token["tag"])
|
||||
|
||||
# Left most, right most dependency of stack[0]
|
||||
left_most = 1000000
|
||||
right_most = -1
|
||||
dep_left_most = ""
|
||||
dep_right_most = ""
|
||||
for wi, r, wj in self.arcs:
|
||||
if wi == stack_idx0:
|
||||
if (wj > wi) and (wj > right_most):
|
||||
right_most = wj
|
||||
dep_right_most = r
|
||||
if (wj < wi) and (wj < left_most):
|
||||
left_most = wj
|
||||
dep_left_most = r
|
||||
if self._check_informative(dep_left_most):
|
||||
result.append("STK_0_LDEP_" + dep_left_most)
|
||||
if self._check_informative(dep_right_most):
|
||||
result.append("STK_0_RDEP_" + dep_right_most)
|
||||
|
||||
# Check Buffered 0
|
||||
if len(self.buffer) > 0:
|
||||
# Buffer 0
|
||||
buffer_idx0 = self.buffer[0]
|
||||
token = self._tokens[buffer_idx0]
|
||||
if self._check_informative(token["word"], True):
|
||||
result.append("BUF_0_FORM_" + token["word"])
|
||||
if "lemma" in token and self._check_informative(token["lemma"]):
|
||||
result.append("BUF_0_LEMMA_" + token["lemma"])
|
||||
if self._check_informative(token["tag"]):
|
||||
result.append("BUF_0_POS_" + token["tag"])
|
||||
if "feats" in token and self._check_informative(token["feats"]):
|
||||
feats = token["feats"].split("|")
|
||||
for feat in feats:
|
||||
result.append("BUF_0_FEATS_" + feat)
|
||||
# Buffer 1
|
||||
if len(self.buffer) > 1:
|
||||
buffer_idx1 = self.buffer[1]
|
||||
token = self._tokens[buffer_idx1]
|
||||
if self._check_informative(token["word"], True):
|
||||
result.append("BUF_1_FORM_" + token["word"])
|
||||
if self._check_informative(token["tag"]):
|
||||
result.append("BUF_1_POS_" + token["tag"])
|
||||
if len(self.buffer) > 2:
|
||||
buffer_idx2 = self.buffer[2]
|
||||
token = self._tokens[buffer_idx2]
|
||||
if self._check_informative(token["tag"]):
|
||||
result.append("BUF_2_POS_" + token["tag"])
|
||||
if len(self.buffer) > 3:
|
||||
buffer_idx3 = self.buffer[3]
|
||||
token = self._tokens[buffer_idx3]
|
||||
if self._check_informative(token["tag"]):
|
||||
result.append("BUF_3_POS_" + token["tag"])
|
||||
# Left most, right most dependency of stack[0]
|
||||
left_most = 1000000
|
||||
right_most = -1
|
||||
dep_left_most = ""
|
||||
dep_right_most = ""
|
||||
for wi, r, wj in self.arcs:
|
||||
if wi == buffer_idx0:
|
||||
if (wj > wi) and (wj > right_most):
|
||||
right_most = wj
|
||||
dep_right_most = r
|
||||
if (wj < wi) and (wj < left_most):
|
||||
left_most = wj
|
||||
dep_left_most = r
|
||||
if self._check_informative(dep_left_most):
|
||||
result.append("BUF_0_LDEP_" + dep_left_most)
|
||||
if self._check_informative(dep_right_most):
|
||||
result.append("BUF_0_RDEP_" + dep_right_most)
|
||||
|
||||
return result
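
# A minimal illustrative sketch (not part of the original module) of how a
# ``Configuration`` is built from a ``DependencyGraph``; it assumes the
# tab-separated word/tag/head/rel input format accepted by ``DependencyGraph``.
def configuration_example():
    gold = DependencyGraph(
        """Economic\tJJ\t2\tATT
news\tNN\t3\tSBJ
had\tVBD\t0\tROOT
little\tJJ\t5\tATT
effect\tNN\t3\tOBJ
.\t.\t3\tPU
"""
    )
    conf = Configuration(gold)
    print(conf)                     # Stack : [0] Buffer : [1, 2, 3, 4, 5, 6] Arcs : []
    print(conf.extract_features())  # e.g. ['STK_0_POS_TOP', 'BUF_0_FORM_Economic', ...]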
|
||||
|
||||
|
||||
class Transition:
|
||||
"""
|
||||
This class defines a set of transitions which are applied to a configuration to get another configuration.
|
||||
Note that for different parsing algorithms, the transitions are different.
|
||||
"""
|
||||
|
||||
# Define set of transitions
|
||||
LEFT_ARC = "LEFTARC"
|
||||
RIGHT_ARC = "RIGHTARC"
|
||||
SHIFT = "SHIFT"
|
||||
REDUCE = "REDUCE"
|
||||
|
||||
def __init__(self, alg_option):
|
||||
"""
|
||||
:param alg_option: the algorithm option of this parser. Currently supports the `arc-standard` and `arc-eager` algorithms
|
||||
:type alg_option: str
|
||||
"""
|
||||
self._algo = alg_option
|
||||
if alg_option not in [
|
||||
TransitionParser.ARC_STANDARD,
|
||||
TransitionParser.ARC_EAGER,
|
||||
]:
|
||||
raise ValueError(
|
||||
" Currently we only support %s and %s "
|
||||
% (TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER)
|
||||
)
|
||||
|
||||
def left_arc(self, conf, relation):
|
||||
"""
|
||||
Note that the left-arc algorithm is quite similar for arc-standard and arc-eager, except for its precondition
|
||||
|
||||
:param conf: the current configuration
|
||||
:return: A new configuration or -1 if the pre-condition is not satisfied
|
||||
"""
|
||||
if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0):
|
||||
return -1
|
||||
if conf.buffer[0] == 0:
|
||||
# here is the Root element
|
||||
return -1
|
||||
|
||||
idx_wi = conf.stack[len(conf.stack) - 1]
|
||||
|
||||
flag = True
|
||||
if self._algo == TransitionParser.ARC_EAGER:
|
||||
for idx_parent, r, idx_child in conf.arcs:
|
||||
if idx_child == idx_wi:
|
||||
flag = False
|
||||
|
||||
if flag:
|
||||
conf.stack.pop()
|
||||
idx_wj = conf.buffer[0]
|
||||
conf.arcs.append((idx_wj, relation, idx_wi))
|
||||
else:
|
||||
return -1
|
||||
|
||||
def right_arc(self, conf, relation):
|
||||
"""
|
||||
Note that the algorithm for right-arc is DIFFERENT for arc-standard and arc-eager
|
||||
|
||||
:param conf: the current configuration
|
||||
:return: A new configuration or -1 if the pre-condition is not satisfied
|
||||
"""
|
||||
if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0):
|
||||
return -1
|
||||
if self._algo == TransitionParser.ARC_STANDARD:
|
||||
idx_wi = conf.stack.pop()
|
||||
idx_wj = conf.buffer[0]
|
||||
conf.buffer[0] = idx_wi
|
||||
conf.arcs.append((idx_wi, relation, idx_wj))
|
||||
else: # arc-eager
|
||||
idx_wi = conf.stack[len(conf.stack) - 1]
|
||||
idx_wj = conf.buffer.pop(0)
|
||||
conf.stack.append(idx_wj)
|
||||
conf.arcs.append((idx_wi, relation, idx_wj))
|
||||
|
||||
def reduce(self, conf):
|
||||
"""
|
||||
Note that the algorithm for reduce is only available for arc-eager
|
||||
|
||||
:param conf: the current configuration
|
||||
:return: A new configuration or -1 if the pre-condition is not satisfied
|
||||
"""
|
||||
|
||||
if self._algo != TransitionParser.ARC_EAGER:
|
||||
return -1
|
||||
if len(conf.stack) <= 0:
|
||||
return -1
|
||||
|
||||
idx_wi = conf.stack[len(conf.stack) - 1]
|
||||
flag = False
|
||||
for idx_parent, r, idx_child in conf.arcs:
|
||||
if idx_child == idx_wi:
|
||||
flag = True
|
||||
if flag:
|
||||
conf.stack.pop() # reduce it
|
||||
else:
|
||||
return -1
|
||||
|
||||
def shift(self, conf):
|
||||
"""
|
||||
Note that the algorithm for shift is the SAME for arc-standard and arc-eager
|
||||
|
||||
:param conf: the current configuration
|
||||
:return: A new configuration or -1 if the pre-condition is not satisfied
|
||||
"""
|
||||
if len(conf.buffer) <= 0:
|
||||
return -1
|
||||
idx_wi = conf.buffer.pop(0)
|
||||
conf.stack.append(idx_wi)
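
# A minimal illustrative sketch (not part of the original module) applying an
# arc-eager transition sequence by hand to the toy sentence "He works"
# (hypothetical head/rel annotations), using the operations defined above.
def transition_example():
    depgraph = DependencyGraph(
        """He\tPRP\t2\tSBJ
works\tVBZ\t0\tROOT
"""
    )
    conf = Configuration(depgraph)
    op = Transition(TransitionParser.ARC_EAGER)

    op.shift(conf)              # stack: [0, 1]   buffer: [2]
    op.left_arc(conf, "SBJ")    # works -> He     stack: [0]     buffer: [2]
    op.right_arc(conf, "ROOT")  # ROOT -> works   stack: [0, 2]  buffer: []
    print(conf.arcs)            # [(2, 'SBJ', 1), (0, 'ROOT', 2)]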
|
||||
|
||||
|
||||
class TransitionParser(ParserI):
|
||||
"""
|
||||
Class for a transition-based parser. Implements two algorithms: "arc-standard" and "arc-eager"
|
||||
"""
|
||||
|
||||
ARC_STANDARD = "arc-standard"
|
||||
ARC_EAGER = "arc-eager"
|
||||
|
||||
def __init__(self, algorithm):
|
||||
"""
|
||||
:param algorithm: the algorithm option of this parser. Currently supports the `arc-standard` and `arc-eager` algorithms
|
||||
:type algorithm: str
|
||||
"""
|
||||
if not (algorithm in [self.ARC_STANDARD, self.ARC_EAGER]):
|
||||
raise ValueError(
|
||||
" Currently we only support %s and %s "
|
||||
% (self.ARC_STANDARD, self.ARC_EAGER)
|
||||
)
|
||||
self._algorithm = algorithm
|
||||
|
||||
self._dictionary = {}
|
||||
self._transition = {}
|
||||
self._match_transition = {}
|
||||
|
||||
def _get_dep_relation(self, idx_parent, idx_child, depgraph):
|
||||
p_node = depgraph.nodes[idx_parent]
|
||||
c_node = depgraph.nodes[idx_child]
|
||||
|
||||
if c_node["word"] is None:
|
||||
return None # Root word
|
||||
|
||||
if c_node["head"] == p_node["address"]:
|
||||
return c_node["rel"]
|
||||
else:
|
||||
return None
|
||||
|
||||
def _convert_to_binary_features(self, features):
|
||||
"""
|
||||
:param features: list of feature strings which need to be converted to binary features
|
||||
:type features: list(str)
|
||||
:return: string of binary features in libsvm format, i.e. 'featureID:value' pairs
|
||||
"""
|
||||
unsorted_result = []
|
||||
for feature in features:
|
||||
self._dictionary.setdefault(feature, len(self._dictionary))
|
||||
unsorted_result.append(self._dictionary[feature])
|
||||
|
||||
# Default value of each feature is 1.0
|
||||
return " ".join(
|
||||
str(featureID) + ":1.0" for featureID in sorted(unsorted_result)
|
||||
)
|
||||
|
||||
def _is_projective(self, depgraph):
|
||||
arc_list = []
|
||||
for key in depgraph.nodes:
|
||||
node = depgraph.nodes[key]
|
||||
|
||||
if "head" in node:
|
||||
childIdx = node["address"]
|
||||
parentIdx = node["head"]
|
||||
if parentIdx is not None:
|
||||
arc_list.append((parentIdx, childIdx))
|
||||
|
||||
for parentIdx, childIdx in arc_list:
|
||||
# Ensure that childIdx < parentIdx
|
||||
if childIdx > parentIdx:
|
||||
temp = childIdx
|
||||
childIdx = parentIdx
|
||||
parentIdx = temp
|
||||
for k in range(childIdx + 1, parentIdx):
|
||||
for m in range(len(depgraph.nodes)):
|
||||
if (m < childIdx) or (m > parentIdx):
|
||||
if (k, m) in arc_list:
|
||||
return False
|
||||
if (m, k) in arc_list:
|
||||
return False
|
||||
return True
|
||||
|
||||
def _write_to_file(self, key, binary_features, input_file):
|
||||
"""
|
||||
Write the binary features to the input file and update the transition dictionary.
|
||||
"""
|
||||
self._transition.setdefault(key, len(self._transition) + 1)
|
||||
self._match_transition[self._transition[key]] = key
|
||||
|
||||
input_str = str(self._transition[key]) + " " + binary_features + "\n"
|
||||
input_file.write(input_str.encode("utf-8"))
|
||||
|
||||
def _create_training_examples_arc_std(self, depgraphs, input_file):
|
||||
"""
|
||||
Create the training examples in the libsvm format and write them to the input_file.
|
||||
Reference: Page 32, Chapter 3. Dependency Parsing by Sandra Kubler, Ryan McDonald and Joakim Nivre (2009)
|
||||
"""
|
||||
operation = Transition(self.ARC_STANDARD)
|
||||
count_proj = 0
|
||||
training_seq = []
|
||||
|
||||
for depgraph in depgraphs:
|
||||
if not self._is_projective(depgraph):
|
||||
continue
|
||||
|
||||
count_proj += 1
|
||||
conf = Configuration(depgraph)
|
||||
while len(conf.buffer) > 0:
|
||||
b0 = conf.buffer[0]
|
||||
features = conf.extract_features()
|
||||
binary_features = self._convert_to_binary_features(features)
|
||||
|
||||
if len(conf.stack) > 0:
|
||||
s0 = conf.stack[len(conf.stack) - 1]
|
||||
# Left-arc operation
|
||||
rel = self._get_dep_relation(b0, s0, depgraph)
|
||||
if rel is not None:
|
||||
key = Transition.LEFT_ARC + ":" + rel
|
||||
self._write_to_file(key, binary_features, input_file)
|
||||
operation.left_arc(conf, rel)
|
||||
training_seq.append(key)
|
||||
continue
|
||||
|
||||
# Right-arc operation
|
||||
rel = self._get_dep_relation(s0, b0, depgraph)
|
||||
if rel is not None:
|
||||
precondition = True
|
||||
# Get the max-index of buffer
|
||||
maxID = conf._max_address
|
||||
|
||||
for w in range(maxID + 1):
|
||||
if w != b0:
|
||||
relw = self._get_dep_relation(b0, w, depgraph)
|
||||
if relw is not None:
|
||||
if (b0, relw, w) not in conf.arcs:
|
||||
precondition = False
|
||||
|
||||
if precondition:
|
||||
key = Transition.RIGHT_ARC + ":" + rel
|
||||
self._write_to_file(key, binary_features, input_file)
|
||||
operation.right_arc(conf, rel)
|
||||
training_seq.append(key)
|
||||
continue
|
||||
|
||||
# Shift operation as the default
|
||||
key = Transition.SHIFT
|
||||
self._write_to_file(key, binary_features, input_file)
|
||||
operation.shift(conf)
|
||||
training_seq.append(key)
|
||||
|
||||
print(" Number of training examples : " + str(len(depgraphs)))
|
||||
print(" Number of valid (projective) examples : " + str(count_proj))
|
||||
return training_seq
|
||||
|
||||
def _create_training_examples_arc_eager(self, depgraphs, input_file):
|
||||
"""
|
||||
Create the training examples in the libsvm format and write them to the input_file.
|
||||
Reference: 'A Dynamic Oracle for Arc-Eager Dependency Parsing' by Yoav Goldberg and Joakim Nivre
|
||||
"""
|
||||
operation = Transition(self.ARC_EAGER)
|
||||
countProj = 0
|
||||
training_seq = []
|
||||
|
||||
for depgraph in depgraphs:
|
||||
if not self._is_projective(depgraph):
|
||||
continue
|
||||
|
||||
countProj += 1
|
||||
conf = Configuration(depgraph)
|
||||
while len(conf.buffer) > 0:
|
||||
b0 = conf.buffer[0]
|
||||
features = conf.extract_features()
|
||||
binary_features = self._convert_to_binary_features(features)
|
||||
|
||||
if len(conf.stack) > 0:
|
||||
s0 = conf.stack[len(conf.stack) - 1]
|
||||
# Left-arc operation
|
||||
rel = self._get_dep_relation(b0, s0, depgraph)
|
||||
if rel is not None:
|
||||
key = Transition.LEFT_ARC + ":" + rel
|
||||
self._write_to_file(key, binary_features, input_file)
|
||||
operation.left_arc(conf, rel)
|
||||
training_seq.append(key)
|
||||
continue
|
||||
|
||||
# Right-arc operation
|
||||
rel = self._get_dep_relation(s0, b0, depgraph)
|
||||
if rel is not None:
|
||||
key = Transition.RIGHT_ARC + ":" + rel
|
||||
self._write_to_file(key, binary_features, input_file)
|
||||
operation.right_arc(conf, rel)
|
||||
training_seq.append(key)
|
||||
continue
|
||||
|
||||
# reduce operation
|
||||
flag = False
|
||||
for k in range(s0):
|
||||
if self._get_dep_relation(k, b0, depgraph) is not None:
|
||||
flag = True
|
||||
if self._get_dep_relation(b0, k, depgraph) is not None:
|
||||
flag = True
|
||||
if flag:
|
||||
key = Transition.REDUCE
|
||||
self._write_to_file(key, binary_features, input_file)
|
||||
operation.reduce(conf)
|
||||
training_seq.append(key)
|
||||
continue
|
||||
|
||||
# Shift operation as the default
|
||||
key = Transition.SHIFT
|
||||
self._write_to_file(key, binary_features, input_file)
|
||||
operation.shift(conf)
|
||||
training_seq.append(key)
|
||||
|
||||
print(" Number of training examples : " + str(len(depgraphs)))
|
||||
print(" Number of valid (projective) examples : " + str(countProj))
|
||||
return training_seq
|
||||
|
||||
def train(self, depgraphs, modelfile, verbose=True):
|
||||
"""
|
||||
:param depgraphs : list of DependencyGraph as the training data
|
||||
:type depgraphs : list(DependencyGraph)
|
||||
:param modelfile : file name to save the trained model
|
||||
:type modelfile : str
|
||||
"""
|
||||
|
||||
try:
|
||||
input_file = tempfile.NamedTemporaryFile(
|
||||
prefix="transition_parse.train", dir=tempfile.gettempdir(), delete=False
|
||||
)
|
||||
|
||||
if self._algorithm == self.ARC_STANDARD:
|
||||
self._create_training_examples_arc_std(depgraphs, input_file)
|
||||
else:
|
||||
self._create_training_examples_arc_eager(depgraphs, input_file)
|
||||
|
||||
input_file.close()
|
||||
# Using the temporary file to train the libsvm classifier
|
||||
x_train, y_train = load_svmlight_file(input_file.name)
|
||||
# The parameters are set according to the paper:
|
||||
# Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
|
||||
# TODO: probability=True makes training very slow because it triggers
# internal cross-validation; the speed here needs to be improved.
|
||||
model = svm.SVC(
|
||||
kernel="poly",
|
||||
degree=2,
|
||||
coef0=0,
|
||||
gamma=0.2,
|
||||
C=0.5,
|
||||
verbose=verbose,
|
||||
probability=True,
|
||||
)
|
||||
|
||||
model.fit(x_train, y_train)
|
||||
# Save the model to file name (as pickle)
|
||||
pickle.dump(model, open(modelfile, "wb"))
|
||||
finally:
|
||||
remove(input_file.name)
|
||||
|
||||
def parse(self, depgraphs, modelFile):
|
||||
"""
|
||||
:param depgraphs: the list of test sentences, each represented as a dependency graph where the 'head' information is dummy
|
||||
:type depgraphs: list(DependencyGraph)
|
||||
:param modelFile: the model file
|
||||
:type modelFile: str
|
||||
:return: list (DependencyGraph) with the 'head' and 'rel' information
|
||||
"""
|
||||
result = []
|
||||
# First load the model
|
||||
model = pickle.load(open(modelFile, "rb"))
|
||||
operation = Transition(self._algorithm)
|
||||
|
||||
for depgraph in depgraphs:
|
||||
conf = Configuration(depgraph)
|
||||
while len(conf.buffer) > 0:
|
||||
features = conf.extract_features()
|
||||
col = []
|
||||
row = []
|
||||
data = []
|
||||
for feature in features:
|
||||
if feature in self._dictionary:
|
||||
col.append(self._dictionary[feature])
|
||||
row.append(0)
|
||||
data.append(1.0)
|
||||
np_col = array(sorted(col)) # NB : index must be sorted
|
||||
np_row = array(row)
|
||||
np_data = array(data)
|
||||
|
||||
x_test = sparse.csr_matrix(
|
||||
(np_data, (np_row, np_col)), shape=(1, len(self._dictionary))
|
||||
)
|
||||
|
||||
# It would be best to use the decision function as follows, BUT it is not yet supported for sparse SVM
|
||||
# Using decision function to build the votes array
|
||||
# dec_func = model.decision_function(x_test)[0]
|
||||
# votes = {}
|
||||
# k = 0
|
||||
# for i in range(len(model.classes_)):
|
||||
# for j in range(i+1, len(model.classes_)):
|
||||
# #if dec_func[k] > 0:
|
||||
# votes.setdefault(i,0)
|
||||
# votes[i] +=1
|
||||
# else:
|
||||
# votes.setdefault(j,0)
|
||||
# votes[j] +=1
|
||||
# k +=1
|
||||
# Sort votes according to the values
|
||||
# sorted_votes = sorted(votes.items(), key=itemgetter(1), reverse=True)
|
||||
|
||||
# We will use predict_proba instead of decision_function
|
||||
prob_dict = {}
|
||||
pred_prob = model.predict_proba(x_test)[0]
|
||||
for i in range(len(pred_prob)):
|
||||
prob_dict[i] = pred_prob[i]
|
||||
sorted_Prob = sorted(prob_dict.items(), key=itemgetter(1), reverse=True)
|
||||
|
||||
# Note that SHIFT is always a valid operation
|
||||
for y_pred_idx, confidence in sorted_Prob:
|
||||
# y_pred = model.predict(x_test)[0]
|
||||
# Map the predicted class back to the corresponding transition
|
||||
y_pred = model.classes_[y_pred_idx]
|
||||
|
||||
if y_pred in self._match_transition:
|
||||
strTransition = self._match_transition[y_pred]
|
||||
baseTransition = strTransition.split(":")[0]
|
||||
|
||||
if baseTransition == Transition.LEFT_ARC:
|
||||
if (
|
||||
operation.left_arc(conf, strTransition.split(":")[1])
|
||||
!= -1
|
||||
):
|
||||
break
|
||||
elif baseTransition == Transition.RIGHT_ARC:
|
||||
if (
|
||||
operation.right_arc(conf, strTransition.split(":")[1])
|
||||
!= -1
|
||||
):
|
||||
break
|
||||
elif baseTransition == Transition.REDUCE:
|
||||
if operation.reduce(conf) != -1:
|
||||
break
|
||||
elif baseTransition == Transition.SHIFT:
|
||||
if operation.shift(conf) != -1:
|
||||
break
|
||||
else:
|
||||
raise ValueError(
|
||||
"The predicted transition is not recognized, expected errors"
|
||||
)
|
||||
|
||||
# Finished with the transitions; build the dependency graph from conf.arcs
|
||||
|
||||
new_depgraph = deepcopy(depgraph)
|
||||
for key in new_depgraph.nodes:
|
||||
node = new_depgraph.nodes[key]
|
||||
node["rel"] = ""
|
||||
# By default, every token depends on the Root
|
||||
node["head"] = 0
|
||||
for head, rel, child in conf.arcs:
|
||||
c_node = new_depgraph.nodes[child]
|
||||
c_node["head"] = head
|
||||
c_node["rel"] = rel
|
||||
result.append(new_depgraph)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
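# Illustrative sketch (not part of the original module): ``parse`` above encodes
# the active features of a configuration as a single one-hot row in CSR format
# before handing it to the classifier.  The helper below repeats that
# construction in isolation; ``feature_index`` and ``active`` are made-up
# stand-ins for the parser's ``self._dictionary`` and the extracted features.
def _sparse_feature_row_sketch():
    from numpy import array
    from scipy import sparse

    feature_index = {"STK_0_POS_TOP": 0, "BUF_0_POS_JJ": 1, "BUF_1_POS_NN": 2}
    active = ["BUF_1_POS_NN", "STK_0_POS_TOP"]

    col = sorted(feature_index[f] for f in active)  # sorted, matching the NB in parse()
    row = [0] * len(col)  # a single example, so every entry sits in row 0
    data = [1.0] * len(col)  # binary (present/absent) features
    # Shape is 1 x dictionary size; here x.toarray() -> [[1., 0., 1.]]
    return sparse.csr_matrix(
        (array(data), (array(row), array(col))), shape=(1, len(feature_index))
    )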
def demo():
|
||||
"""
|
||||
>>> from nltk.parse import DependencyGraph, DependencyEvaluator
|
||||
>>> from nltk.parse.transitionparser import TransitionParser, Configuration, Transition
|
||||
>>> gold_sent = DependencyGraph(\"""
|
||||
... Economic JJ 2 ATT
|
||||
... news NN 3 SBJ
|
||||
... has VBD 0 ROOT
|
||||
... little JJ 5 ATT
|
||||
... effect NN 3 OBJ
|
||||
... on IN 5 ATT
|
||||
... financial JJ 8 ATT
|
||||
... markets NNS 6 PC
|
||||
... . . 3 PU
|
||||
... \""")
|
||||
|
||||
>>> conf = Configuration(gold_sent)
|
||||
|
||||
###################### Check the Initial Feature ########################
|
||||
|
||||
>>> print(', '.join(conf.extract_features()))
|
||||
STK_0_POS_TOP, BUF_0_FORM_Economic, BUF_0_LEMMA_Economic, BUF_0_POS_JJ, BUF_1_FORM_news, BUF_1_POS_NN, BUF_2_POS_VBD, BUF_3_POS_JJ
|
||||
|
||||
###################### Check The Transition #######################
|
||||
Check the Initialized Configuration
|
||||
>>> print(conf)
|
||||
Stack : [0] Buffer : [1, 2, 3, 4, 5, 6, 7, 8, 9] Arcs : []
|
||||
|
||||
A. Do some transition checks for ARC-STANDARD
|
||||
|
||||
>>> operation = Transition('arc-standard')
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.left_arc(conf, "ATT")
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.left_arc(conf,"SBJ")
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.left_arc(conf, "ATT")
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.left_arc(conf, "ATT")
|
||||
|
||||
Middle Configuration and Features Check
|
||||
>>> print(conf)
|
||||
Stack : [0, 3, 5, 6] Buffer : [8, 9] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7)]
|
||||
|
||||
>>> print(', '.join(conf.extract_features()))
|
||||
STK_0_FORM_on, STK_0_LEMMA_on, STK_0_POS_IN, STK_1_POS_NN, BUF_0_FORM_markets, BUF_0_LEMMA_markets, BUF_0_POS_NNS, BUF_1_FORM_., BUF_1_POS_., BUF_0_LDEP_ATT
|
||||
|
||||
>>> operation.right_arc(conf, "PC")
|
||||
>>> operation.right_arc(conf, "ATT")
|
||||
>>> operation.right_arc(conf, "OBJ")
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.right_arc(conf, "PU")
|
||||
>>> operation.right_arc(conf, "ROOT")
|
||||
>>> operation.shift(conf)
|
||||
|
||||
Terminated Configuration Check
|
||||
>>> print(conf)
|
||||
Stack : [0] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7), (6, 'PC', 8), (5, 'ATT', 6), (3, 'OBJ', 5), (3, 'PU', 9), (0, 'ROOT', 3)]
|
||||
|
||||
|
||||
B. Do some transition checks for ARC-EAGER
|
||||
|
||||
>>> conf = Configuration(gold_sent)
|
||||
>>> operation = Transition('arc-eager')
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.left_arc(conf,'ATT')
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.left_arc(conf,'SBJ')
|
||||
>>> operation.right_arc(conf,'ROOT')
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.left_arc(conf,'ATT')
|
||||
>>> operation.right_arc(conf,'OBJ')
|
||||
>>> operation.right_arc(conf,'ATT')
|
||||
>>> operation.shift(conf)
|
||||
>>> operation.left_arc(conf,'ATT')
|
||||
>>> operation.right_arc(conf,'PC')
|
||||
>>> operation.reduce(conf)
|
||||
>>> operation.reduce(conf)
|
||||
>>> operation.reduce(conf)
|
||||
>>> operation.right_arc(conf,'PU')
|
||||
>>> print(conf)
|
||||
Stack : [0, 3, 9] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (0, 'ROOT', 3), (5, 'ATT', 4), (3, 'OBJ', 5), (5, 'ATT', 6), (8, 'ATT', 7), (6, 'PC', 8), (3, 'PU', 9)]
|
||||
|
||||
###################### Check The Training Function #######################
|
||||
|
||||
A. Check the ARC-STANDARD training
|
||||
>>> import tempfile
|
||||
>>> import os
|
||||
>>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False)
|
||||
|
||||
>>> parser_std = TransitionParser('arc-standard')
|
||||
>>> print(', '.join(parser_std._create_training_examples_arc_std([gold_sent], input_file)))
|
||||
Number of training examples : 1
|
||||
Number of valid (projective) examples : 1
|
||||
SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, SHIFT, SHIFT, LEFTARC:ATT, SHIFT, SHIFT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, RIGHTARC:ATT, RIGHTARC:OBJ, SHIFT, RIGHTARC:PU, RIGHTARC:ROOT, SHIFT
|
||||
|
||||
>>> parser_std.train([gold_sent],'temp.arcstd.model', verbose=False)
|
||||
Number of training examples : 1
|
||||
Number of valid (projective) examples : 1
|
||||
>>> input_file.close()
|
||||
>>> remove(input_file.name)
|
||||
|
||||
B. Check the ARC-EAGER training
|
||||
|
||||
>>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(),delete=False)
|
||||
>>> parser_eager = TransitionParser('arc-eager')
|
||||
>>> print(', '.join(parser_eager._create_training_examples_arc_eager([gold_sent], input_file)))
|
||||
Number of training examples : 1
|
||||
Number of valid (projective) examples : 1
|
||||
SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, RIGHTARC:ROOT, SHIFT, LEFTARC:ATT, RIGHTARC:OBJ, RIGHTARC:ATT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, REDUCE, REDUCE, REDUCE, RIGHTARC:PU
|
||||
|
||||
>>> parser_eager.train([gold_sent],'temp.arceager.model', verbose=False)
|
||||
Number of training examples : 1
|
||||
Number of valid (projective) examples : 1
|
||||
|
||||
>>> input_file.close()
|
||||
>>> remove(input_file.name)
|
||||
|
||||
###################### Check The Parsing Function ########################
|
||||
|
||||
A. Check the ARC-STANDARD parser
|
||||
|
||||
>>> result = parser_std.parse([gold_sent], 'temp.arcstd.model')
|
||||
>>> de = DependencyEvaluator(result, [gold_sent])
|
||||
>>> de.eval() >= (0, 0)
|
||||
True
|
||||
|
||||
B. Check the ARC-EAGER parser
|
||||
>>> result = parser_eager.parse([gold_sent], 'temp.arceager.model')
|
||||
>>> de = DependencyEvaluator(result, [gold_sent])
|
||||
>>> de.eval() >= (0, 0)
|
||||
True
|
||||
|
||||
Remove test temporary files
|
||||
>>> remove('temp.arceager.model')
|
||||
>>> remove('temp.arcstd.model')
|
||||
|
||||
Note that the result is very poor because there is only one training example.
|
||||
"""
|
||||
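# Illustrative note (not part of the original module): the temporary training
# files produced by ``_create_training_examples_arc_std`` /
# ``_create_training_examples_arc_eager`` and read back via
# ``load_svmlight_file`` in ``train`` use the plain-text svmlight/libsvm
# format, one example per line:
#
#     <class-id> <feature-index>:<value> <feature-index>:<value> ...
#
# For instance, a SHIFT decision with three active binary features might be
# written as the line below (the class id and feature indices are made up
# here; the real ids come from the parser's internal dictionaries):
#
#     4 12:1.0 57:1.0 103:1.0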
234
Backend/venv/lib/python3.12/site-packages/nltk/parse/util.py
Normal file
@@ -0,0 +1,234 @@
|
||||
# Natural Language Toolkit: Parser Utility Functions
|
||||
#
|
||||
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
||||
# Tom Aarsen <>
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
|
||||
"""
|
||||
Utility functions for parsers.
|
||||
"""
|
||||
|
||||
from nltk.data import load
|
||||
from nltk.grammar import CFG, PCFG, FeatureGrammar
|
||||
from nltk.parse.chart import Chart, ChartParser
|
||||
from nltk.parse.featurechart import FeatureChart, FeatureChartParser
|
||||
from nltk.parse.pchart import InsideChartParser
|
||||
|
||||
|
||||
def load_parser(
|
||||
grammar_url, trace=0, parser=None, chart_class=None, beam_size=0, **load_args
|
||||
):
|
||||
"""
|
||||
Load a grammar from a file, and build a parser based on that grammar.
|
||||
The parser depends on the grammar format, and might also depend
|
||||
on properties of the grammar itself.
|
||||
|
||||
The following grammar formats are currently supported:
|
||||
- ``'cfg'`` (CFGs: ``CFG``)
|
||||
- ``'pcfg'`` (probabilistic CFGs: ``PCFG``)
|
||||
- ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``)
|
||||
|
||||
:type grammar_url: str
|
||||
:param grammar_url: A URL specifying where the grammar is located.
|
||||
The default protocol is ``"nltk:"``, which searches for the file
|
||||
in the NLTK data package.
|
||||
:type trace: int
|
||||
:param trace: The level of tracing that should be used when
|
||||
parsing a text. ``0`` will generate no tracing output;
|
||||
and higher numbers will produce more verbose tracing output.
|
||||
:param parser: The class used for parsing; should be ``ChartParser``
|
||||
or a subclass.
|
||||
If None, the class depends on the grammar format.
|
||||
:param chart_class: The class used for storing the chart;
|
||||
should be ``Chart`` or a subclass.
|
||||
Only used for CFGs and feature CFGs.
|
||||
If None, the chart class depends on the grammar format.
|
||||
:type beam_size: int
|
||||
:param beam_size: The maximum length for the parser's edge queue.
|
||||
Only used for probabilistic CFGs.
|
||||
:param load_args: Keyword parameters used when loading the grammar.
|
||||
See ``data.load`` for more information.
|
||||
"""
|
||||
grammar = load(grammar_url, **load_args)
|
||||
if not isinstance(grammar, CFG):
|
||||
raise ValueError("The grammar must be a CFG, " "or a subclass thereof.")
|
||||
if isinstance(grammar, PCFG):
|
||||
if parser is None:
|
||||
parser = InsideChartParser
|
||||
return parser(grammar, trace=trace, beam_size=beam_size)
|
||||
|
||||
elif isinstance(grammar, FeatureGrammar):
|
||||
if parser is None:
|
||||
parser = FeatureChartParser
|
||||
if chart_class is None:
|
||||
chart_class = FeatureChart
|
||||
return parser(grammar, trace=trace, chart_class=chart_class)
|
||||
|
||||
else: # Plain CFG.
|
||||
if parser is None:
|
||||
parser = ChartParser
|
||||
if chart_class is None:
|
||||
chart_class = Chart
|
||||
return parser(grammar, trace=trace, chart_class=chart_class)
|
||||
|
||||
|
||||
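# Illustrative sketch (not part of the original module): a typical use of
# ``load_parser``, assuming the NLTK "book_grammars" data package has been
# downloaded so the grammar URL below resolves via the default "nltk:"
# protocol.  With an .fcfg grammar this returns a ``FeatureChartParser``.
#
#     cp = load_parser("grammars/book_grammars/feat0.fcfg", trace=0)
#     for tree in cp.parse("Kim likes children".split()):
#         print(tree)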
def taggedsent_to_conll(sentence):
|
||||
"""
|
||||
Convert a single POS-tagged sentence into CONLL format.
|
||||
|
||||
>>> from nltk import word_tokenize, pos_tag
|
||||
>>> text = "This is a foobar sentence."
|
||||
>>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))): # doctest: +NORMALIZE_WHITESPACE
|
||||
... print(line, end="")
|
||||
1 This _ DT DT _ 0 a _ _
|
||||
2 is _ VBZ VBZ _ 0 a _ _
|
||||
3 a _ DT DT _ 0 a _ _
|
||||
4 foobar _ JJ JJ _ 0 a _ _
|
||||
5 sentence _ NN NN _ 0 a _ _
|
||||
6 . _ . . _ 0 a _ _
|
||||
|
||||
:param sentence: A single input sentence to parse
|
||||
:type sentence: list(tuple(str, str))
|
||||
:rtype: iter(str)
|
||||
:return: a generator yielding a single sentence in CONLL format.
|
||||
"""
|
||||
for i, (word, tag) in enumerate(sentence, start=1):
|
||||
input_str = [str(i), word, "_", tag, tag, "_", "0", "a", "_", "_"]
|
||||
input_str = "\t".join(input_str) + "\n"
|
||||
yield input_str
|
||||
|
||||
|
||||
def taggedsents_to_conll(sentences):
|
||||
"""
|
||||
Convert a POS-tagged document stream (i.e. a list of sentences, each a list
of (word, tag) tuples) into CONLL format, yielding one line per word and
two newlines at the end of each sentence.
|
||||
|
||||
>>> from nltk import word_tokenize, sent_tokenize, pos_tag
|
||||
>>> text = "This is a foobar sentence. Is that right?"
|
||||
>>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)]
|
||||
>>> for line in taggedsents_to_conll(sentences): # doctest: +NORMALIZE_WHITESPACE
|
||||
... if line:
|
||||
... print(line, end="")
|
||||
1 This _ DT DT _ 0 a _ _
|
||||
2 is _ VBZ VBZ _ 0 a _ _
|
||||
3 a _ DT DT _ 0 a _ _
|
||||
4 foobar _ JJ JJ _ 0 a _ _
|
||||
5 sentence _ NN NN _ 0 a _ _
|
||||
6 . _ . . _ 0 a _ _
|
||||
<BLANKLINE>
|
||||
<BLANKLINE>
|
||||
1 Is _ VBZ VBZ _ 0 a _ _
|
||||
2 that _ IN IN _ 0 a _ _
|
||||
3 right _ NN NN _ 0 a _ _
|
||||
4 ? _ . . _ 0 a _ _
|
||||
<BLANKLINE>
|
||||
<BLANKLINE>
|
||||
|
||||
:param sentences: Input sentences to parse
|
||||
:type sentences: list(list(tuple(str, str)))
|
||||
:rtype: iter(str)
|
||||
:return: a generator yielding sentences in CONLL format.
|
||||
"""
|
||||
for sentence in sentences:
|
||||
yield from taggedsent_to_conll(sentence)
|
||||
yield "\n\n"
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Test Suites
|
||||
######################################################################
|
||||
|
||||
|
||||
class TestGrammar:
|
||||
"""
|
||||
Unit tests for CFG.
|
||||
"""
|
||||
|
||||
def __init__(self, grammar, suite, accept=None, reject=None):
|
||||
self.test_grammar = grammar
|
||||
|
||||
self.cp = load_parser(grammar, trace=0)
|
||||
self.suite = suite
|
||||
self._accept = accept
|
||||
self._reject = reject
|
||||
|
||||
def run(self, show_trees=False):
|
||||
"""
|
||||
Sentences in the test suite are divided into two classes:
|
||||
|
||||
- grammatical (``accept``) and
|
||||
- ungrammatical (``reject``).
|
||||
|
||||
If a sentence should parse according to the grammar, the value of
|
||||
``trees`` will be a non-empty list. If a sentence should be rejected
|
||||
according to the grammar, then the value of ``trees`` will be an empty list.
|
||||
"""
|
||||
for test in self.suite:
|
||||
print(test["doc"] + ":", end=" ")
|
||||
for key in ["accept", "reject"]:
|
||||
for sent in test[key]:
|
||||
tokens = sent.split()
|
||||
trees = list(self.cp.parse(tokens))
|
||||
if show_trees and trees:
|
||||
print()
|
||||
print(sent)
|
||||
for tree in trees:
|
||||
print(tree)
|
||||
if key == "accept":
|
||||
if trees == []:
|
||||
raise ValueError("Sentence '%s' failed to parse'" % sent)
|
||||
else:
|
||||
accepted = True
|
||||
else:
|
||||
if trees:
|
||||
raise ValueError("Sentence '%s' received a parse'" % sent)
|
||||
else:
|
||||
rejected = True
|
||||
if accepted and rejected:
|
||||
print("All tests passed!")
|
||||
|
||||
|
||||
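# Illustrative sketch (not part of the original module): ``TestGrammar`` takes
# anything ``load_parser`` can resolve plus a suite of dicts with "doc",
# "accept" and "reject" keys.  The grammar path and sentences below are
# made-up placeholders.
#
#     suite = [
#         {
#             "doc": "simple transitive clause",
#             "accept": ["the dog sees a cat"],
#             "reject": ["dog the sees cat a"],
#         }
#     ]
#     TestGrammar("grammars/sample_grammars/my_toy.cfg", suite).run()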
def extract_test_sentences(string, comment_chars="#%;", encoding=None):
|
||||
"""
|
||||
Parses a string with one test sentence per line.
|
||||
Lines can optionally begin with:
|
||||
|
||||
- a bool, saying if the sentence is grammatical or not, or
|
||||
- an int, giving the number of parse trees it should have.
|
||||
|
||||
The result information is followed by a colon, and then the sentence.
|
||||
Empty lines and lines beginning with a comment char are ignored.
|
||||
|
||||
:return: a list of tuple of sentences and expected results,
|
||||
where a sentence is a list of str,
|
||||
and a result is None, or bool, or int
|
||||
|
||||
:param comment_chars: ``str`` of possible comment characters.
|
||||
:param encoding: the encoding of the string, if it is binary
|
||||
"""
|
||||
if encoding is not None:
|
||||
string = string.decode(encoding)
|
||||
sentences = []
|
||||
for sentence in string.split("\n"):
|
||||
if sentence == "" or sentence[0] in comment_chars:
|
||||
continue
|
||||
split_info = sentence.split(":", 1)
|
||||
result = None
|
||||
if len(split_info) == 2:
|
||||
if split_info[0] in ["True", "true", "False", "false"]:
|
||||
result = split_info[0] in ["True", "true"]
|
||||
sentence = split_info[1]
|
||||
else:
|
||||
result = int(split_info[0])
|
||||
sentence = split_info[1]
|
||||
tokens = sentence.split()
|
||||
if tokens == []:
|
||||
continue
|
||||
sentences += [(tokens, result)]
|
||||
return sentences
|
||||
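# Illustrative sketch (not part of the original module): how a small test-suite
# string is interpreted by ``extract_test_sentences``.
#
#     s = "\n".join(
#         [
#             "# a comment line",
#             "True: the dog barks",
#             "2: John saw Mary",
#             "the cat sleeps",
#         ]
#     )
#     extract_test_sentences(s)
#     # -> [(['the', 'dog', 'barks'], True),
#     #     (['John', 'saw', 'Mary'], 2),
#     #     (['the', 'cat', 'sleeps'], None)]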
453
Backend/venv/lib/python3.12/site-packages/nltk/parse/viterbi.py
Normal file
@@ -0,0 +1,453 @@
|
||||
# Natural Language Toolkit: Viterbi Probabilistic Parser
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from functools import reduce
|
||||
|
||||
from nltk.parse.api import ParserI
|
||||
from nltk.tree import ProbabilisticTree, Tree
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Viterbi PCFG Parser
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
class ViterbiParser(ParserI):
|
||||
"""
|
||||
A bottom-up ``PCFG`` parser that uses dynamic programming to find
|
||||
the single most likely parse for a text. The ``ViterbiParser`` parser
|
||||
parses texts by filling in a "most likely constituent table".
|
||||
This table records the most probable tree representation for any
|
||||
given span and node value. In particular, it has an entry for
|
||||
every start index, end index, and node value, recording the most
|
||||
likely subtree that spans from the start index to the end index,
|
||||
and has the given node value.
|
||||
|
||||
The ``ViterbiParser`` parser fills in this table incrementally. It starts
|
||||
by filling in all entries for constituents that span one element
|
||||
of text (i.e., entries where the end index is one greater than the
|
||||
start index). After it has filled in all table entries for
|
||||
constituents that span one element of text, it fills in the
|
||||
entries for constituents that span two elements of text. It
|
||||
continues filling in the entries for constituents spanning larger
|
||||
and larger portions of the text, until the entire table has been
|
||||
filled. Finally, it returns the table entry for a constituent
|
||||
spanning the entire text, whose node value is the grammar's start
|
||||
symbol.
|
||||
|
||||
In order to find the most likely constituent with a given span and
|
||||
node value, the ``ViterbiParser`` parser considers all productions that
|
||||
could produce that node value. For each production, it finds all
|
||||
children that collectively cover the span and have the node values
|
||||
specified by the production's right hand side. If the probability
|
||||
of the tree formed by applying the production to the children is
|
||||
greater than the probability of the current entry in the table,
|
||||
then the table is updated with this new tree.
|
||||
|
||||
A pseudo-code description of the algorithm used by
|
||||
``ViterbiParser`` is:
|
||||
|
||||
| Create an empty most likely constituent table, *MLC*.
|
||||
| For width in 1...len(text):
|
||||
| For start in 1...len(text)-width:
|
||||
| For prod in grammar.productions:
|
||||
| For each sequence of subtrees [t[1], t[2], ..., t[n]] in MLC,
|
||||
| where t[i].label()==prod.rhs[i],
|
||||
| and the sequence covers [start:start+width]:
|
||||
| old_p = MLC[start, start+width, prod.lhs]
|
||||
| new_p = P(t[1])P(t[2])...P(t[n])P(prod)
|
||||
| if new_p > old_p:
|
||||
| new_tree = Tree(prod.lhs, t[1], t[2], ..., t[n])
|
||||
| MLC[start, start+width, prod.lhs] = new_tree
|
||||
| Return MLC[0, len(text), start_symbol]
|
||||
|
||||
:type _grammar: PCFG
|
||||
:ivar _grammar: The grammar used to parse sentences.
|
||||
:type _trace: int
|
||||
:ivar _trace: The level of tracing output that should be generated
|
||||
when parsing a text.
|
||||
"""
|
||||
|
||||
def __init__(self, grammar, trace=0):
|
||||
"""
|
||||
Create a new ``ViterbiParser`` parser, that uses ``grammar`` to
|
||||
parse texts.
|
||||
|
||||
:type grammar: PCFG
|
||||
:param grammar: The grammar used to parse texts.
|
||||
:type trace: int
|
||||
:param trace: The level of tracing that should be used when
|
||||
parsing a text. ``0`` will generate no tracing output;
|
||||
and higher numbers will produce more verbose tracing
|
||||
output.
|
||||
"""
|
||||
self._grammar = grammar
|
||||
self._trace = trace
|
||||
|
||||
def grammar(self):
|
||||
return self._grammar
|
||||
|
||||
def trace(self, trace=2):
|
||||
"""
|
||||
Set the level of tracing output that should be generated when
|
||||
parsing a text.
|
||||
|
||||
:type trace: int
|
||||
:param trace: The trace level. A trace level of ``0`` will
|
||||
generate no tracing output; and higher trace levels will
|
||||
produce more verbose tracing output.
|
||||
:rtype: None
|
||||
"""
|
||||
self._trace = trace
|
||||
|
||||
def parse(self, tokens):
|
||||
# Inherit docs from ParserI
|
||||
|
||||
tokens = list(tokens)
|
||||
self._grammar.check_coverage(tokens)
|
||||
|
||||
# The most likely constituent table. This table specifies the
|
||||
# most likely constituent for a given span and type.
|
||||
# Constituents can be either Trees or tokens. For Trees,
|
||||
# the "type" is the Nonterminal for the tree's root node
|
||||
# value. For Tokens, the "type" is the token's type.
|
||||
# The table is stored as a dictionary, since it is sparse.
|
||||
constituents = {}
|
||||
|
||||
# Initialize the constituents dictionary with the words from
|
||||
# the text.
|
||||
if self._trace:
|
||||
print("Inserting tokens into the most likely" + " constituents table...")
|
||||
for index in range(len(tokens)):
|
||||
token = tokens[index]
|
||||
constituents[index, index + 1, token] = token
|
||||
if self._trace > 1:
|
||||
self._trace_lexical_insertion(token, index, len(tokens))
|
||||
|
||||
# Consider each span of length 1, 2, ..., n; and add any trees
|
||||
# that might cover that span to the constituents dictionary.
|
||||
for length in range(1, len(tokens) + 1):
|
||||
if self._trace:
|
||||
print(
|
||||
"Finding the most likely constituents"
|
||||
+ " spanning %d text elements..." % length
|
||||
)
|
||||
for start in range(len(tokens) - length + 1):
|
||||
span = (start, start + length)
|
||||
self._add_constituents_spanning(span, constituents, tokens)
|
||||
|
||||
# Return the tree that spans the entire text & have the right cat
|
||||
tree = constituents.get((0, len(tokens), self._grammar.start()))
|
||||
if tree is not None:
|
||||
yield tree
|
||||
|
||||
def _add_constituents_spanning(self, span, constituents, tokens):
|
||||
"""
|
||||
Find any constituents that might cover ``span``, and add them
|
||||
to the most likely constituents table.
|
||||
|
||||
:rtype: None
|
||||
:type span: tuple(int, int)
|
||||
:param span: The section of the text for which we are
|
||||
trying to find possible constituents. The span is
|
||||
specified as a pair of integers, where the first integer
|
||||
is the index of the first token that should be included in
|
||||
the constituent; and the second integer is the index of
|
||||
the first token that should not be included in the
|
||||
constituent. I.e., the constituent should cover
|
||||
``text[span[0]:span[1]]``, where ``text`` is the text
|
||||
that we are parsing.
|
||||
|
||||
:type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
|
||||
:param constituents: The most likely constituents table. This
|
||||
table records the most probable tree representation for
|
||||
any given span and node value. In particular,
|
||||
``constituents(s,e,nv)`` is the most likely
|
||||
``ProbabilisticTree`` that covers ``text[s:e]``
|
||||
and has a node value ``nv.symbol()``, where ``text``
|
||||
is the text that we are parsing. When
|
||||
``_add_constituents_spanning`` is called, ``constituents``
|
||||
should contain all possible constituents that are shorter
|
||||
than ``span``.
|
||||
|
||||
:type tokens: list of tokens
|
||||
:param tokens: The text we are parsing. This is only used for
|
||||
trace output.
|
||||
"""
|
||||
# Since some of the grammar productions may be unary, we need to
|
||||
# repeatedly try all of the productions until none of them add any
|
||||
# new constituents.
|
||||
changed = True
|
||||
while changed:
|
||||
changed = False
|
||||
|
||||
# Find all instantiations of the grammar productions that
|
||||
# cover the span.
|
||||
instantiations = self._find_instantiations(span, constituents)
|
||||
|
||||
# For each production instantiation, add a new
|
||||
# ProbabilisticTree whose probability is the product
|
||||
# of the children's probabilities and the production's
|
||||
# probability.
|
||||
for production, children in instantiations:
|
||||
subtrees = [c for c in children if isinstance(c, Tree)]
|
||||
p = reduce(lambda pr, t: pr * t.prob(), subtrees, production.prob())
|
||||
node = production.lhs().symbol()
|
||||
tree = ProbabilisticTree(node, children, prob=p)
|
||||
|
||||
# If it's a new constituent, then add it to the
|
||||
# constituents dictionary.
|
||||
c = constituents.get((span[0], span[1], production.lhs()))
|
||||
if self._trace > 1:
|
||||
if c is None or c != tree:
|
||||
if c is None or c.prob() < tree.prob():
|
||||
print(" Insert:", end=" ")
|
||||
else:
|
||||
print(" Discard:", end=" ")
|
||||
self._trace_production(production, p, span, len(tokens))
|
||||
if c is None or c.prob() < tree.prob():
|
||||
constituents[span[0], span[1], production.lhs()] = tree
|
||||
changed = True
|
||||
|
||||
def _find_instantiations(self, span, constituents):
|
||||
"""
|
||||
:return: a list of the production instantiations that cover a
|
||||
given span of the text. A "production instantiation" is
|
||||
a tuple containing a production and a list of children,
|
||||
where the production's right hand side matches the list of
|
||||
children; and the children cover ``span``.
:rtype: list(tuple(Production, list(ProbabilisticTree or token)))
|
||||
|
||||
:type span: tuple(int, int)
|
||||
:param span: The section of the text for which we are
|
||||
trying to find production instantiations. The span is
|
||||
specified as a pair of integers, where the first integer
|
||||
is the index of the first token that should be covered by
|
||||
the production instantiation; and the second integer is
|
||||
the index of the first token that should not be covered by
|
||||
the production instantiation.
|
||||
:type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
|
||||
:param constituents: The most likely constituents table. This
|
||||
table records the most probable tree representation for
|
||||
any given span and node value. See the module
|
||||
documentation for more information.
|
||||
"""
|
||||
rv = []
|
||||
for production in self._grammar.productions():
|
||||
childlists = self._match_rhs(production.rhs(), span, constituents)
|
||||
|
||||
for childlist in childlists:
|
||||
rv.append((production, childlist))
|
||||
return rv
|
||||
|
||||
def _match_rhs(self, rhs, span, constituents):
|
||||
"""
|
||||
:return: a set of all the lists of children that cover ``span``
|
||||
and that match ``rhs``.
|
||||
:rtype: list(list(ProbabilisticTree or token))
|
||||
|
||||
:type rhs: list(Nonterminal or any)
|
||||
:param rhs: The list specifying what kinds of children need to
|
||||
cover ``span``. Each nonterminal in ``rhs`` specifies
|
||||
that the corresponding child should be a tree whose node
|
||||
value is that nonterminal's symbol. Each terminal in ``rhs``
|
||||
specifies that the corresponding child should be a token
|
||||
whose type is that terminal.
|
||||
:type span: tuple(int, int)
|
||||
:param span: The section of the text for which we are
|
||||
trying to find child lists. The span is specified as a
|
||||
pair of integers, where the first integer is the index of
|
||||
the first token that should be covered by the child list;
|
||||
and the second integer is the index of the first token
|
||||
that should not be covered by the child list.
|
||||
:type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
|
||||
:param constituents: The most likely constituents table. This
|
||||
table records the most probable tree representation for
|
||||
any given span and node value. See the module
|
||||
documentation for more information.
|
||||
"""
|
||||
(start, end) = span
|
||||
|
||||
# Base case
|
||||
if start >= end and rhs == ():
|
||||
return [[]]
|
||||
if start >= end or rhs == ():
|
||||
return []
|
||||
|
||||
# Find everything that matches the 1st symbol of the RHS
|
||||
childlists = []
|
||||
for split in range(start, end + 1):
|
||||
l = constituents.get((start, split, rhs[0]))
|
||||
if l is not None:
|
||||
rights = self._match_rhs(rhs[1:], (split, end), constituents)
|
||||
childlists += [[l] + r for r in rights]
|
||||
|
||||
return childlists
|
||||
|
||||
def _trace_production(self, production, p, span, width):
|
||||
"""
|
||||
Print trace output indicating that a given production has been
|
||||
applied at a given location.
|
||||
|
||||
:param production: The production that has been applied
|
||||
:type production: Production
|
||||
:param p: The probability of the tree produced by the production.
|
||||
:type p: float
|
||||
:param span: The span of the production
|
||||
:type span: tuple
|
||||
:rtype: None
|
||||
"""
|
||||
|
||||
str = "|" + "." * span[0]
|
||||
str += "=" * (span[1] - span[0])
|
||||
str += "." * (width - span[1]) + "| "
|
||||
str += "%s" % production
|
||||
if self._trace > 2:
|
||||
str = f"{str:<40} {p:12.10f} "
|
||||
|
||||
print(str)
|
||||
|
||||
def _trace_lexical_insertion(self, token, index, width):
|
||||
str = " Insert: |" + "." * index + "=" + "." * (width - index - 1) + "| "
|
||||
str += f"{token}"
|
||||
print(str)
|
||||
|
||||
def __repr__(self):
|
||||
return "<ViterbiParser for %r>" % self._grammar
|
||||
|
||||
|
||||
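# Illustrative sketch (not part of the original module): parsing with a tiny
# hand-written PCFG.  The grammar and sentence below are minimal stand-ins,
# not part of NLTK's shipped data.
def _viterbi_sketch():
    from nltk.grammar import PCFG

    grammar = PCFG.fromstring(
        """
        S -> NP VP [1.0]
        NP -> 'John' [0.5] | 'Mary' [0.5]
        VP -> V NP [1.0]
        V -> 'sees' [1.0]
        """
    )
    parser = ViterbiParser(grammar)
    # parse() yields at most one tree: the single most likely parse
    # (here (S (NP John) (VP (V sees) (NP Mary))), probability 0.25).
    for tree in parser.parse("John sees Mary".split()):
        print(tree)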
##//////////////////////////////////////////////////////
|
||||
## Test Code
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo():
|
||||
"""
|
||||
A demonstration of the Viterbi probabilistic parser. The user is
prompted to select which demo (sentence and grammar) to run; the parser
is then run on that demo, and a summary of the results is displayed.
|
||||
"""
|
||||
import sys
|
||||
import time
|
||||
|
||||
from nltk import tokenize
|
||||
from nltk.grammar import PCFG
|
||||
from nltk.parse import ViterbiParser
|
||||
|
||||
toy_pcfg1 = PCFG.fromstring(
|
||||
"""
|
||||
S -> NP VP [1.0]
|
||||
NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
|
||||
Det -> 'the' [0.8] | 'my' [0.2]
|
||||
N -> 'man' [0.5] | 'telescope' [0.5]
|
||||
VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
|
||||
V -> 'ate' [0.35] | 'saw' [0.65]
|
||||
PP -> P NP [1.0]
|
||||
P -> 'with' [0.61] | 'under' [0.39]
|
||||
"""
|
||||
)
|
||||
|
||||
toy_pcfg2 = PCFG.fromstring(
|
||||
"""
|
||||
S -> NP VP [1.0]
|
||||
VP -> V NP [.59]
|
||||
VP -> V [.40]
|
||||
VP -> VP PP [.01]
|
||||
NP -> Det N [.41]
|
||||
NP -> Name [.28]
|
||||
NP -> NP PP [.31]
|
||||
PP -> P NP [1.0]
|
||||
V -> 'saw' [.21]
|
||||
V -> 'ate' [.51]
|
||||
V -> 'ran' [.28]
|
||||
N -> 'boy' [.11]
|
||||
N -> 'cookie' [.12]
|
||||
N -> 'table' [.13]
|
||||
N -> 'telescope' [.14]
|
||||
N -> 'hill' [.5]
|
||||
Name -> 'Jack' [.52]
|
||||
Name -> 'Bob' [.48]
|
||||
P -> 'with' [.61]
|
||||
P -> 'under' [.39]
|
||||
Det -> 'the' [.41]
|
||||
Det -> 'a' [.31]
|
||||
Det -> 'my' [.28]
|
||||
"""
|
||||
)
|
||||
|
||||
# Define two demos. Each demo has a sentence and a grammar.
|
||||
demos = [
|
||||
("I saw the man with my telescope", toy_pcfg1),
|
||||
("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
|
||||
]
|
||||
|
||||
# Ask the user which demo they want to use.
|
||||
print()
|
||||
for i in range(len(demos)):
|
||||
print(f"{i + 1:>3}: {demos[i][0]}")
|
||||
print(" %r" % demos[i][1])
|
||||
print()
|
||||
print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
|
||||
try:
|
||||
snum = int(sys.stdin.readline().strip()) - 1
|
||||
sent, grammar = demos[snum]
|
||||
except:
|
||||
print("Bad sentence number")
|
||||
return
|
||||
|
||||
# Tokenize the sentence.
|
||||
tokens = sent.split()
|
||||
|
||||
parser = ViterbiParser(grammar)
|
||||
all_parses = {}
|
||||
|
||||
print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
|
||||
parser.trace(3)
|
||||
t = time.time()
|
||||
parses = parser.parse_all(tokens)
|
||||
time = time.time() - t
|
||||
average = (
|
||||
reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
|
||||
)
|
||||
num_parses = len(parses)
|
||||
for p in parses:
|
||||
all_parses[p.freeze()] = 1
|
||||
|
||||
# Print some summary statistics
|
||||
print()
|
||||
print("Time (secs) # Parses Average P(parse)")
|
||||
print("-----------------------------------------")
|
||||
print("%11.4f%11d%19.14f" % (time, num_parses, average))
|
||||
parses = all_parses.keys()
|
||||
if parses:
|
||||
p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
|
||||
else:
|
||||
p = 0
|
||||
print("------------------------------------------")
|
||||
print("%11s%11d%19.14f" % ("n/a", len(parses), p))
|
||||
|
||||
# Ask the user if we should draw the parses.
|
||||
print()
|
||||
print("Draw parses (y/n)? ", end=" ")
|
||||
if sys.stdin.readline().strip().lower().startswith("y"):
|
||||
from nltk.draw.tree import draw_trees
|
||||
|
||||
print(" please wait...")
|
||||
draw_trees(*parses)
|
||||
|
||||
# Ask the user if we should print the parses.
|
||||
print()
|
||||
print("Print parses (y/n)? ", end=" ")
|
||||
if sys.stdin.readline().strip().lower().startswith("y"):
|
||||
for parse in parses:
|
||||
print(parse)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||