Iliyan Angelov
2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions

@@ -0,0 +1,102 @@
# Natural Language Toolkit: Parsers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
"""
NLTK Parsers
Classes and interfaces for producing tree structures that represent
the internal organization of a text. This task is known as "parsing"
the text, and the resulting tree structures are called the text's
"parses". Typically, the text is a single sentence, and the tree
structure represents the syntactic structure of the sentence.
However, parsers can also be used in other domains. For example,
parsers can be used to derive the morphological structure of the
morphemes that make up a word, or to derive the discourse structure
for a set of utterances.
Sometimes, a single piece of text can be represented by more than one
tree structure. Texts represented by more than one tree structure are
called "ambiguous" texts. Note that there are actually two ways in
which a text can be ambiguous:
- The text has multiple correct parses.
- There is not enough information to decide which of several
candidate parses is correct.
However, the parser module does *not* distinguish these two types of
ambiguity.
The parser module defines ``ParserI``, a standard interface for parsing
texts; and two simple implementations of that interface,
``ShiftReduceParser`` and ``RecursiveDescentParser``. It also contains
several sub-modules for specialized kinds of parsing; for example:

- ``nltk.parse.chart`` defines chart parsing, which uses dynamic
  programming to efficiently parse texts.
- ``nltk.parse.pchart`` defines probabilistic chart parsing, which
  associates a probability with each parse.
"""
from nltk.parse.api import ParserI
from nltk.parse.bllip import BllipParser
from nltk.parse.chart import (
BottomUpChartParser,
BottomUpLeftCornerChartParser,
ChartParser,
LeftCornerChartParser,
SteppingChartParser,
TopDownChartParser,
)
from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser
from nltk.parse.dependencygraph import DependencyGraph
from nltk.parse.earleychart import (
EarleyChartParser,
FeatureEarleyChartParser,
FeatureIncrementalBottomUpChartParser,
FeatureIncrementalBottomUpLeftCornerChartParser,
FeatureIncrementalChartParser,
FeatureIncrementalTopDownChartParser,
IncrementalBottomUpChartParser,
IncrementalBottomUpLeftCornerChartParser,
IncrementalChartParser,
IncrementalLeftCornerChartParser,
IncrementalTopDownChartParser,
)
from nltk.parse.evaluate import DependencyEvaluator
from nltk.parse.featurechart import (
FeatureBottomUpChartParser,
FeatureBottomUpLeftCornerChartParser,
FeatureChartParser,
FeatureTopDownChartParser,
)
from nltk.parse.malt import MaltParser
from nltk.parse.nonprojectivedependencyparser import (
NaiveBayesDependencyScorer,
NonprojectiveDependencyParser,
ProbabilisticNonprojectiveParser,
)
from nltk.parse.pchart import (
BottomUpProbabilisticChartParser,
InsideChartParser,
LongestChartParser,
RandomChartParser,
UnsortedChartParser,
)
from nltk.parse.projectivedependencyparser import (
ProbabilisticProjectiveDependencyParser,
ProjectiveDependencyParser,
)
from nltk.parse.recursivedescent import (
RecursiveDescentParser,
SteppingRecursiveDescentParser,
)
from nltk.parse.shiftreduce import ShiftReduceParser, SteppingShiftReduceParser
from nltk.parse.transitionparser import TransitionParser
from nltk.parse.util import TestGrammar, extract_test_sentences, load_parser
from nltk.parse.viterbi import ViterbiParser
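# Editorial sketch (not part of the original module): a minimal end-to-end
# example of the interface described in the docstring above, using one of the
# simple parsers re-exported here. The toy grammar is an assumption made for
# illustration only.
def _toy_parse_sketch():
    from nltk.grammar import CFG

    grammar = CFG.fromstring(
        """
        S -> NP VP
        NP -> Det N
        VP -> V NP
        Det -> 'the'
        N -> 'dog' | 'cat'
        V -> 'chased'
        """
    )
    parser = RecursiveDescentParser(grammar)
    # Prints the single parse:
    # (S (NP (Det the) (N dog)) (VP (V chased) (NP (Det the) (N cat))))
    for tree in parser.parse("the dog chased the cat".split()):
        print(tree)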

@@ -0,0 +1,72 @@
# Natural Language Toolkit: Parser API
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
import itertools
from nltk.internals import overridden
class ParserI:
"""
A processing class for deriving trees that represent possible
structures for a sequence of tokens. These tree structures are
known as "parses". Typically, parsers are used to derive syntax
trees for sentences. But parsers can also be used to derive other
kinds of tree structure, such as morphological trees and discourse
structures.
Subclasses must define:
- at least one of: ``parse()``, ``parse_sents()``.
Subclasses may define:
- ``grammar()``
"""
def grammar(self):
"""
:return: The grammar used by this parser.
"""
raise NotImplementedError()
def parse(self, sent, *args, **kwargs):
"""
:return: An iterator that generates parse trees for the sentence.
When possible, the trees are generated from most likely to least likely.
:param sent: The sentence to be parsed
:type sent: list(str)
:rtype: iter(Tree)
"""
if overridden(self.parse_sents):
return next(self.parse_sents([sent], *args, **kwargs))
elif overridden(self.parse_one):
return (
tree
for tree in [self.parse_one(sent, *args, **kwargs)]
if tree is not None
)
elif overridden(self.parse_all):
return iter(self.parse_all(sent, *args, **kwargs))
else:
raise NotImplementedError()
def parse_sents(self, sents, *args, **kwargs):
"""
Apply ``self.parse()`` to each element of ``sents``.
:rtype: iter(iter(Tree))
"""
return (self.parse(sent, *args, **kwargs) for sent in sents)
def parse_all(self, sent, *args, **kwargs):
""":rtype: list(Tree)"""
return list(self.parse(sent, *args, **kwargs))
def parse_one(self, sent, *args, **kwargs):
""":rtype: Tree or None"""
return next(self.parse(sent, *args, **kwargs), None)
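# Editorial sketch (not part of the original file): the smallest possible
# ParserI subclass. Only parse() is overridden; parse_sents(), parse_all()
# and parse_one() are then inherited from the defaults above. The flat,
# one-level "parse" it produces is purely illustrative.
class _FlatParser(ParserI):
    def parse(self, sent, *args, **kwargs):
        from nltk.tree import Tree

        # Yield a single tree with every token attached directly to S.
        yield Tree("S", list(sent))


# Example:
#     _FlatParser().parse_one("a toy sentence".split())
#     -> Tree('S', ['a', 'toy', 'sentence'])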

@@ -0,0 +1,299 @@
# Natural Language Toolkit: Interface to BLLIP Parser
#
# Author: David McClosky <dmcc@bigasterisk.com>
#
# Copyright (C) 2001-2025 NLTK Project
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.parse.api import ParserI
from nltk.tree import Tree
"""
Interface for parsing with BLLIP Parser. Requires the Python
bllipparser module. BllipParser objects can be constructed with the
``BllipParser.from_unified_model_dir`` class method or manually using the
``BllipParser`` constructor. The former is generally easier if you have
a BLLIP Parser unified model directory -- a basic model can be obtained
from NLTK's downloader. More unified parsing models can be obtained with
BLLIP Parser's ModelFetcher (run ``python -m bllipparser.ModelFetcher``
or see docs for ``bllipparser.ModelFetcher.download_and_install_model``).
Basic usage::
# download and install a basic unified parsing model (Wall Street Journal)
# sudo python -m nltk.downloader bllip_wsj_no_aux
>>> from nltk.data import find
>>> model_dir = find('models/bllip_wsj_no_aux').path
>>> bllip = BllipParser.from_unified_model_dir(model_dir)
# 1-best parsing
>>> sentence1 = 'British left waffles on Falklands .'.split()
>>> top_parse = bllip.parse_one(sentence1)
>>> print(top_parse)
(S1
(S
(NP (JJ British) (NN left))
(VP (VBZ waffles) (PP (IN on) (NP (NNP Falklands))))
(. .)))
# n-best parsing
>>> sentence2 = 'Time flies'.split()
>>> all_parses = bllip.parse_all(sentence2)
>>> print(len(all_parses))
50
>>> print(all_parses[0])
(S1 (S (NP (NNP Time)) (VP (VBZ flies))))
# incorporating external tagging constraints (None means unconstrained tag)
>>> constrained1 = bllip.tagged_parse([('Time', 'VB'), ('flies', 'NNS')])
>>> print(next(constrained1))
(S1 (NP (VB Time) (NNS flies)))
>>> constrained2 = bllip.tagged_parse([('Time', 'NN'), ('flies', None)])
>>> print(next(constrained2))
(S1 (NP (NN Time) (VBZ flies)))
References
----------
- Charniak, Eugene. "A maximum-entropy-inspired parser." Proceedings of
the 1st North American chapter of the Association for Computational
Linguistics conference. Association for Computational Linguistics,
2000.
- Charniak, Eugene, and Mark Johnson. "Coarse-to-fine n-best parsing
and MaxEnt discriminative reranking." Proceedings of the 43rd Annual
Meeting on Association for Computational Linguistics. Association
for Computational Linguistics, 2005.
Known issues
------------
Note that BLLIP Parser is not currently threadsafe. Since this module
uses a SWIG interface, it is potentially unsafe to create multiple
``BllipParser`` objects in the same process. BLLIP Parser currently
has issues with non-ASCII text and will raise an error if given any.
See https://pypi.python.org/pypi/bllipparser/ for more information
on BLLIP Parser's Python interface.
"""
__all__ = ["BllipParser"]
# this block allows this module to be imported even if bllipparser isn't
# available
try:
from bllipparser import RerankingParser
from bllipparser.RerankingParser import get_unified_model_parameters
def _ensure_bllip_import_or_error():
pass
except ImportError as ie:
def _ensure_bllip_import_or_error(ie=ie):
raise ImportError("Couldn't import bllipparser module: %s" % ie)
def _ensure_ascii(words):
try:
for i, word in enumerate(words):
word.encode("ascii")
except UnicodeEncodeError as e:
raise ValueError(
f"Token {i} ({word!r}) is non-ASCII. BLLIP Parser "
"currently doesn't support non-ASCII inputs."
) from e
def _scored_parse_to_nltk_tree(scored_parse):
return Tree.fromstring(str(scored_parse.ptb_parse))
class BllipParser(ParserI):
"""
Interface for parsing with BLLIP Parser. BllipParser objects can be
constructed with the ``BllipParser.from_unified_model_dir`` class
method or manually using the ``BllipParser`` constructor.
"""
def __init__(
self,
parser_model=None,
reranker_features=None,
reranker_weights=None,
parser_options=None,
reranker_options=None,
):
"""
Load a BLLIP Parser model from scratch. You'll typically want to
use the ``from_unified_model_dir()`` class method to construct
this object.
:param parser_model: Path to parser model directory
:type parser_model: str
:param reranker_features: Path to the reranker model's features file
:type reranker_features: str
:param reranker_weights: Path to the reranker model's weights file
:type reranker_weights: str
:param parser_options: optional dictionary of parser options, see
``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
for more information.
:type parser_options: dict(str)
:param reranker_options: optional
dictionary of reranker options, see
``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
for more information.
:type reranker_options: dict(str)
"""
_ensure_bllip_import_or_error()
parser_options = parser_options or {}
reranker_options = reranker_options or {}
self.rrp = RerankingParser()
self.rrp.load_parser_model(parser_model, **parser_options)
if reranker_features and reranker_weights:
self.rrp.load_reranker_model(
features_filename=reranker_features,
weights_filename=reranker_weights,
**reranker_options,
)
def parse(self, sentence):
"""
Use BLLIP Parser to parse a sentence. Takes a sentence as a list
of words; it will be automatically tagged with this BLLIP Parser
instance's tagger.
:return: An iterator that generates parse trees for the sentence
from most likely to least likely.
:param sentence: The sentence to be parsed
:type sentence: list(str)
:rtype: iter(Tree)
"""
_ensure_ascii(sentence)
nbest_list = self.rrp.parse(sentence)
for scored_parse in nbest_list:
yield _scored_parse_to_nltk_tree(scored_parse)
def tagged_parse(self, word_and_tag_pairs):
"""
Use BLLIP to parse a sentence. Takes a sentence as a list of
(word, tag) tuples; the sentence must have already been tokenized
and tagged. BLLIP will attempt to use the tags provided but may
use others if it can't come up with a complete parse subject
to those constraints. You may also specify a tag as ``None``
to leave a token's tag unconstrained.
:return: An iterator that generates parse trees for the sentence
from most likely to least likely.
:param word_and_tag_pairs: Input sentence to parse as (word, tag) pairs
:type word_and_tag_pairs: list(tuple(str, str))
:rtype: iter(Tree)
"""
words = []
tag_map = {}
for i, (word, tag) in enumerate(word_and_tag_pairs):
words.append(word)
if tag is not None:
tag_map[i] = tag
_ensure_ascii(words)
nbest_list = self.rrp.parse_tagged(words, tag_map)
for scored_parse in nbest_list:
yield _scored_parse_to_nltk_tree(scored_parse)
@classmethod
def from_unified_model_dir(
cls, model_dir, parser_options=None, reranker_options=None
):
"""
Create a ``BllipParser`` object from a unified parsing model
directory. Unified parsing model directories are a standardized
way of storing BLLIP parser and reranker models together on disk.
See ``bllipparser.RerankingParser.get_unified_model_parameters()``
for more information about unified model directories.
:return: A ``BllipParser`` object using the parser and reranker
models in the model directory.
:param model_dir: Path to the unified model directory.
:type model_dir: str
:param parser_options: optional dictionary of parser options, see
``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
for more information.
:type parser_options: dict(str)
:param reranker_options: optional dictionary of reranker options, see
``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
for more information.
:type reranker_options: dict(str)
:rtype: BllipParser
"""
(
parser_model_dir,
reranker_features_filename,
reranker_weights_filename,
) = get_unified_model_parameters(model_dir)
return cls(
parser_model_dir,
reranker_features_filename,
reranker_weights_filename,
parser_options,
reranker_options,
)
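# Editorial sketch (not part of the original file): constructing a
# BllipParser manually from explicit model paths instead of using
# from_unified_model_dir(). The paths below are placeholders, not real
# NLTK resources:
#
#     parser = BllipParser(
#         parser_model="/path/to/parser-model-dir",
#         reranker_features="/path/to/reranker/features.gz",
#         reranker_weights="/path/to/reranker/weights.gz",
#     )
#     print(parser.parse_one("British left waffles on Falklands .".split()))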
def demo():
"""This assumes the Python module bllipparser is installed."""
# download and install a basic unified parsing model (Wall Street Journal)
# sudo python -m nltk.downloader bllip_wsj_no_aux
from nltk.data import find
model_dir = find("models/bllip_wsj_no_aux").path
print("Loading BLLIP Parsing models...")
# the easiest way to get started is to use a unified model
bllip = BllipParser.from_unified_model_dir(model_dir)
print("Done.")
sentence1 = "British left waffles on Falklands .".split()
sentence2 = "I saw the man with the telescope .".split()
# this sentence is known to fail under the WSJ parsing model
fail1 = "# ! ? : -".split()
for sentence in (sentence1, sentence2, fail1):
print("Sentence: %r" % " ".join(sentence))
try:
tree = next(bllip.parse(sentence))
print(tree)
except StopIteration:
print("(parse failed)")
# n-best parsing demo
for i, parse in enumerate(bllip.parse(sentence1)):
print("parse %d:\n%s" % (i, parse))
# using external POS tag constraints
print(
"forcing 'tree' to be 'NN':",
next(bllip.tagged_parse([("A", None), ("tree", "NN")])),
)
print(
"forcing 'A' to be 'DT' and 'tree' to be 'NNP':",
next(bllip.tagged_parse([("A", "DT"), ("tree", "NNP")])),
)
# constraints don't have to make sense... (though on more complicated
# sentences, they may cause the parse to fail)
print(
"forcing 'A' to be 'NNP':",
next(bllip.tagged_parse([("A", "NNP"), ("tree", None)])),
)

File diff suppressed because it is too large.

@@ -0,0 +1,805 @@
# Natural Language Toolkit: Interface to the CoreNLP REST API.
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Dmitrijs Milajevs <dimazest@gmail.com>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import json
import os
import re
import socket
import time
from typing import List, Tuple
from nltk.internals import _java_options, config_java, find_jar_iter, java
from nltk.parse.api import ParserI
from nltk.parse.dependencygraph import DependencyGraph
from nltk.tag.api import TaggerI
from nltk.tokenize.api import TokenizerI
from nltk.tree import Tree
_stanford_url = "https://stanfordnlp.github.io/CoreNLP/"
class CoreNLPServerError(EnvironmentError):
"""Exceptions associated with the Core NLP server."""
def try_port(port=0):
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.bind(("", port))
p = sock.getsockname()[1]
sock.close()
return p
class CoreNLPServer:
_MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar"
_JAR = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar"
def __init__(
self,
path_to_jar=None,
path_to_models_jar=None,
verbose=False,
java_options=None,
corenlp_options=None,
port=None,
):
if corenlp_options is None:
corenlp_options = ["-preload", "tokenize,ssplit,pos,lemma,parse,depparse"]
jars = list(
find_jar_iter(
self._JAR,
path_to_jar,
env_vars=("CORENLP",),
searchpath=(),
url=_stanford_url,
verbose=verbose,
is_regex=True,
)
)
# find the most recent code and model jar
stanford_jar = max(jars, key=lambda model_name: re.match(self._JAR, model_name))
if port is None:
try:
port = try_port(9000)
except OSError:
port = try_port()
corenlp_options.extend(["-port", str(port)])
else:
try_port(port)
corenlp_options.extend(["-port", str(port)])
self.url = f"http://localhost:{port}"
model_jar = max(
find_jar_iter(
self._MODEL_JAR_PATTERN,
path_to_models_jar,
env_vars=("CORENLP_MODELS",),
searchpath=(),
url=_stanford_url,
verbose=verbose,
is_regex=True,
),
key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name),
)
self.verbose = verbose
self._classpath = stanford_jar, model_jar
self.corenlp_options = corenlp_options
self.java_options = java_options or ["-mx2g"]
def start(self, stdout="devnull", stderr="devnull"):
"""Starts the CoreNLP server
:param stdout, stderr: Specifies where CoreNLP output is redirected. Valid values are 'devnull', 'stdout', 'pipe'
"""
import requests
cmd = ["edu.stanford.nlp.pipeline.StanfordCoreNLPServer"]
if self.corenlp_options:
cmd.extend(self.corenlp_options)
# Configure java.
default_options = " ".join(_java_options)
config_java(options=self.java_options, verbose=self.verbose)
try:
self.popen = java(
cmd,
classpath=self._classpath,
blocking=False,
stdout=stdout,
stderr=stderr,
)
finally:
# Return java configurations to their default values.
config_java(options=default_options, verbose=self.verbose)
# Check that the server is still running.
returncode = self.popen.poll()
if returncode is not None:
_, stderrdata = self.popen.communicate()
raise CoreNLPServerError(
returncode,
"Could not start the server. "
"The error was: {}".format(stderrdata.decode("ascii")),
)
for i in range(30):
try:
response = requests.get(requests.compat.urljoin(self.url, "live"))
except requests.exceptions.ConnectionError:
time.sleep(1)
else:
if response.ok:
break
else:
raise CoreNLPServerError("Could not connect to the server.")
for i in range(60):
try:
response = requests.get(requests.compat.urljoin(self.url, "ready"))
except requests.exceptions.ConnectionError:
time.sleep(1)
else:
if response.ok:
break
else:
raise CoreNLPServerError("The server is not ready.")
def stop(self):
self.popen.terminate()
self.popen.wait()
def __enter__(self):
self.start()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.stop()
return False
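# Editorial sketch (not part of the original file): starting the server with
# an explicit port and heap size and parsing one sentence. The port and
# memory settings are arbitrary example values; the jars are assumed to be
# discoverable through the CORENLP and CORENLP_MODELS environment variables.
def _server_usage_sketch():
    with CoreNLPServer(port=9010, java_options=["-mx4g"]) as server:
        parser = CoreNLPParser(url=server.url)
        tree = next(parser.raw_parse("The context manager stops the server on exit."))
        tree.pretty_print()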
class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
"""Interface to the CoreNLP Parser."""
def __init__(
self,
url="http://localhost:9000",
encoding="utf8",
tagtype=None,
strict_json=True,
):
import requests
self.url = url
self.encoding = encoding
if tagtype not in ["pos", "ner", None]:
raise ValueError("tagtype must be either 'pos', 'ner' or None")
self.tagtype = tagtype
self.strict_json = strict_json
self.session = requests.Session()
def parse_sents(self, sentences, *args, **kwargs):
"""Parse multiple sentences.
Takes multiple sentences as a list where each sentence is a list of
words. Each sentence will be automatically tagged with this
CoreNLPParser instance's tagger.
If whitespace occurs inside a token, the token will be split into
several tokens.
:param sentences: Input sentences to parse
:type sentences: list(list(str))
:rtype: iter(iter(Tree))
"""
# Converting list(list(str)) -> list(str)
sentences = (" ".join(words) for words in sentences)
return self.raw_parse_sents(sentences, *args, **kwargs)
def raw_parse(self, sentence, properties=None, *args, **kwargs):
"""Parse a sentence.
Takes a sentence as a string; before parsing, it will be automatically
tokenized and tagged by the CoreNLP Parser.
:param sentence: Input sentence to parse
:type sentence: str
:rtype: iter(Tree)
"""
default_properties = {"tokenize.whitespace": "false"}
default_properties.update(properties or {})
return next(
self.raw_parse_sents(
[sentence], properties=default_properties, *args, **kwargs
)
)
def api_call(self, data, properties=None, timeout=60):
default_properties = {
"outputFormat": "json",
"annotators": "tokenize,pos,lemma,ssplit,{parser_annotator}".format(
parser_annotator=self.parser_annotator
),
}
default_properties.update(properties or {})
response = self.session.post(
self.url,
params={"properties": json.dumps(default_properties)},
data=data.encode(self.encoding),
headers={"Content-Type": f"text/plain; charset={self.encoding}"},
timeout=timeout,
)
response.raise_for_status()
return response.json(strict=self.strict_json)
def raw_parse_sents(
self, sentences, verbose=False, properties=None, *args, **kwargs
):
"""Parse multiple sentences.
Takes multiple sentences as a list of strings. Each sentence will be
automatically tokenized and tagged.
:param sentences: Input sentences to parse.
:type sentences: list(str)
:rtype: iter(iter(Tree))
"""
default_properties = {
# Only splits on '\n', never inside the sentence.
"ssplit.eolonly": "true"
}
default_properties.update(properties or {})
"""
for sentence in sentences:
parsed_data = self.api_call(sentence, properties=default_properties)
assert len(parsed_data['sentences']) == 1
for parse in parsed_data['sentences']:
tree = self.make_tree(parse)
yield iter([tree])
"""
parsed_data = self.api_call("\n".join(sentences), properties=default_properties)
for parsed_sent in parsed_data["sentences"]:
tree = self.make_tree(parsed_sent)
yield iter([tree])
def parse_text(self, text, *args, **kwargs):
"""Parse a piece of text.
The text might contain several sentences which will be split by CoreNLP.
:param str text: text to be split.
:returns: an iterable of syntactic structures. # TODO: should it be an iterable of iterables?
"""
parsed_data = self.api_call(text, *args, **kwargs)
for parse in parsed_data["sentences"]:
yield self.make_tree(parse)
def tokenize(self, text, properties=None):
"""Tokenize a string of text.
Skip these tests if CoreNLP is likely not ready.
>>> from nltk.test.setup_fixt import check_jar
>>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True)
The CoreNLP server can be started using the following notation, although
we recommend the `with CoreNLPServer() as server:` context manager notation
to ensure that the server is always stopped.
>>> server = CoreNLPServer()
>>> server.start()
>>> parser = CoreNLPParser(url=server.url)
>>> text = 'Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.'
>>> list(parser.tokenize(text))
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
>>> s = "The colour of the wall is blue."
>>> list(
... parser.tokenize(
... 'The colour of the wall is blue.',
... properties={'tokenize.options': 'americanize=true'},
... )
... )
['The', 'colour', 'of', 'the', 'wall', 'is', 'blue', '.']
>>> server.stop()
"""
default_properties = {"annotators": "tokenize,ssplit"}
default_properties.update(properties or {})
result = self.api_call(text, properties=default_properties)
for sentence in result["sentences"]:
for token in sentence["tokens"]:
yield token["originalText"] or token["word"]
def tag_sents(self, sentences, properties=None):
"""
Tag multiple sentences.
Takes multiple sentences as a list where each sentence is a list of
tokens.
:param sentences: Input sentences to tag
:type sentences: list(list(str))
:rtype: list(list(tuple(str, str)))
"""
# Converting list(list(str)) -> list(str)
sentences = (" ".join(words) for words in sentences)
if properties is None:
properties = {"tokenize.whitespace": "true", "ner.useSUTime": "false"}
return [tagged_sents[0] for tagged_sents in self.raw_tag_sents(sentences, properties)]
def tag(self, sentence: List[str], properties=None) -> List[Tuple[str, str]]:
"""
Tag a list of tokens.
:rtype: list(tuple(str, str))
Skip these tests if CoreNLP is likely not ready.
>>> from nltk.test.setup_fixt import check_jar
>>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True)
The CoreNLP server can be started using the following notation, although
we recommend the `with CoreNLPServer() as server:` context manager notation
to ensure that the server is always stopped.
>>> server = CoreNLPServer()
>>> server.start()
>>> parser = CoreNLPParser(url=server.url, tagtype='ner')
>>> tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
>>> parser.tag(tokens) # doctest: +NORMALIZE_WHITESPACE
[('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'),
('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'STATE_OR_PROVINCE')]
>>> parser = CoreNLPParser(url=server.url, tagtype='pos')
>>> tokens = "What is the airspeed of an unladen swallow ?".split()
>>> parser.tag(tokens) # doctest: +NORMALIZE_WHITESPACE
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'),
('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
>>> server.stop()
"""
return self.tag_sents([sentence], properties)[0]
def raw_tag_sents(self, sentences, properties=None):
"""
Tag multiple sentences.
Takes multiple sentences as a list where each sentence is a string.
:param sentences: Input sentences to tag
:type sentences: list(str)
:rtype: list(list(list(tuple(str, str))))
"""
default_properties = {
"ssplit.isOneSentence": "true",
"annotators": "tokenize,ssplit,",
}
default_properties.update(properties or {})
# Supports only 'pos' or 'ner' tags.
assert self.tagtype in [
"pos",
"ner",
], "CoreNLP tagger supports only 'pos' or 'ner' tags."
default_properties["annotators"] += self.tagtype
for sentence in sentences:
tagged_data = self.api_call(sentence, properties=default_properties)
yield [
[
(token["word"], token[self.tagtype])
for token in tagged_sentence["tokens"]
]
for tagged_sentence in tagged_data["sentences"]
]
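# Editorial sketch (not part of the original file): POS-tagging several
# pre-tokenized sentences at once with tag_sents(). Assumes a CoreNLP server
# is already listening on the default URL used below.
def _tag_sents_sketch():
    tagger = CoreNLPParser(url="http://localhost:9000", tagtype="pos")
    sentences = [
        "What is the airspeed of an unladen swallow ?".split(),
        "Colourless green ideas sleep furiously .".split(),
    ]
    for tagged_sentence in tagger.tag_sents(sentences):
        print(tagged_sentence)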
class CoreNLPParser(GenericCoreNLPParser):
"""
Skip these tests if CoreNLP is likely not ready.
>>> from nltk.test.setup_fixt import check_jar
>>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True)
The recommended usage of `CoreNLPParser` is using the context manager notation:
>>> with CoreNLPServer() as server:
... parser = CoreNLPParser(url=server.url)
... next(
... parser.raw_parse('The quick brown fox jumps over the lazy dog.')
... ).pretty_print() # doctest: +NORMALIZE_WHITESPACE
ROOT
|
S
_______________|__________________________
| VP |
| _________|___ |
| | PP |
| | ________|___ |
NP | | NP |
____|__________ | | _______|____ |
DT JJ JJ NN VBZ IN DT JJ NN .
| | | | | | | | | |
The quick brown fox jumps over the lazy dog .
Alternatively, the server can be started using the following notation.
Note that `CoreNLPServer` does not need to be used if the CoreNLP server is started
outside of Python.
>>> server = CoreNLPServer()
>>> server.start()
>>> parser = CoreNLPParser(url=server.url)
>>> (parse_fox, ), (parse_wolf, ) = parser.raw_parse_sents(
... [
... 'The quick brown fox jumps over the lazy dog.',
... 'The quick grey wolf jumps over the lazy fox.',
... ]
... )
>>> parse_fox.pretty_print() # doctest: +NORMALIZE_WHITESPACE
ROOT
|
S
_______________|__________________________
| VP |
| _________|___ |
| | PP |
| | ________|___ |
NP | | NP |
____|__________ | | _______|____ |
DT JJ JJ NN VBZ IN DT JJ NN .
| | | | | | | | | |
The quick brown fox jumps over the lazy dog .
>>> parse_wolf.pretty_print() # doctest: +NORMALIZE_WHITESPACE
ROOT
|
S
_______________|__________________________
| VP |
| _________|___ |
| | PP |
| | ________|___ |
NP | | NP |
____|_________ | | _______|____ |
DT JJ JJ NN VBZ IN DT JJ NN .
| | | | | | | | | |
The quick grey wolf jumps over the lazy fox .
>>> (parse_dog, ), (parse_friends, ) = parser.parse_sents(
... [
... "I 'm a dog".split(),
... "This is my friends ' cat ( the tabby )".split(),
... ]
... )
>>> parse_dog.pretty_print() # doctest: +NORMALIZE_WHITESPACE
ROOT
|
S
_______|____
| VP
| ________|___
NP | NP
| | ___|___
PRP VBP DT NN
| | | |
I 'm a dog
>>> parse_friends.pretty_print() # doctest: +NORMALIZE_WHITESPACE
ROOT
|
S
____|___________
| VP
| ___________|_____________
| | NP
| | _______|________________________
| | NP | | |
| | _____|_______ | | |
NP | NP | | NP |
| | ______|_________ | | ___|____ |
DT VBZ PRP$ NNS POS NN -LRB- DT NN -RRB-
| | | | | | | | | |
This is my friends ' cat -LRB- the tabby -RRB-
>>> parse_john, parse_mary, = parser.parse_text(
... 'John loves Mary. Mary walks.'
... )
>>> parse_john.pretty_print() # doctest: +NORMALIZE_WHITESPACE
ROOT
|
S
_____|_____________
| VP |
| ____|___ |
NP | NP |
| | | |
NNP VBZ NNP .
| | | |
John loves Mary .
>>> parse_mary.pretty_print() # doctest: +NORMALIZE_WHITESPACE
ROOT
|
S
_____|____
NP VP |
| | |
NNP VBZ .
| | |
Mary walks .
Special cases
>>> next(
... parser.raw_parse(
... 'NASIRIYA, Iraq—Iraqi doctors who treated former prisoner of war '
... 'Jessica Lynch have angrily dismissed claims made in her biography '
... 'that she was raped by her Iraqi captors.'
... )
... ).height()
14
>>> next(
... parser.raw_parse(
... "The broader Standard & Poor's 500 Index <.SPX> was 0.46 points lower, or "
... '0.05 percent, at 997.02.'
... )
... ).height()
11
>>> server.stop()
"""
_OUTPUT_FORMAT = "penn"
parser_annotator = "parse"
def make_tree(self, result):
return Tree.fromstring(result["parse"])
class CoreNLPDependencyParser(GenericCoreNLPParser):
"""Dependency parser.
Skip these tests if CoreNLP is likely not ready.
>>> from nltk.test.setup_fixt import check_jar
>>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True)
The recommended usage of `CoreNLPParser` is using the context manager notation:
>>> with CoreNLPServer() as server:
... dep_parser = CoreNLPDependencyParser(url=server.url)
... parse, = dep_parser.raw_parse(
... 'The quick brown fox jumps over the lazy dog.'
... )
... print(parse.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
The DT 4 det
quick JJ 4 amod
brown JJ 4 amod
fox NN 5 nsubj
jumps VBZ 0 ROOT
over IN 9 case
the DT 9 det
lazy JJ 9 amod
dog NN 5 obl
. . 5 punct
Alternatively, the server can be started using the following notation.
Note that `CoreNLPServer` does not need to be used if the CoreNLP server is started
outside of Python.
>>> server = CoreNLPServer()
>>> server.start()
>>> dep_parser = CoreNLPDependencyParser(url=server.url)
>>> parse, = dep_parser.raw_parse('The quick brown fox jumps over the lazy dog.')
>>> print(parse.tree()) # doctest: +NORMALIZE_WHITESPACE
(jumps (fox The quick brown) (dog over the lazy) .)
>>> for governor, dep, dependent in parse.triples():
... print(governor, dep, dependent) # doctest: +NORMALIZE_WHITESPACE
('jumps', 'VBZ') nsubj ('fox', 'NN')
('fox', 'NN') det ('The', 'DT')
('fox', 'NN') amod ('quick', 'JJ')
('fox', 'NN') amod ('brown', 'JJ')
('jumps', 'VBZ') obl ('dog', 'NN')
('dog', 'NN') case ('over', 'IN')
('dog', 'NN') det ('the', 'DT')
('dog', 'NN') amod ('lazy', 'JJ')
('jumps', 'VBZ') punct ('.', '.')
>>> (parse_fox, ), (parse_dog, ) = dep_parser.raw_parse_sents(
... [
... 'The quick brown fox jumps over the lazy dog.',
... 'The quick grey wolf jumps over the lazy fox.',
... ]
... )
>>> print(parse_fox.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
The DT 4 det
quick JJ 4 amod
brown JJ 4 amod
fox NN 5 nsubj
jumps VBZ 0 ROOT
over IN 9 case
the DT 9 det
lazy JJ 9 amod
dog NN 5 obl
. . 5 punct
>>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
The DT 4 det
quick JJ 4 amod
grey JJ 4 amod
wolf NN 5 nsubj
jumps VBZ 0 ROOT
over IN 9 case
the DT 9 det
lazy JJ 9 amod
fox NN 5 obl
. . 5 punct
>>> (parse_dog, ), (parse_friends, ) = dep_parser.parse_sents(
... [
... "I 'm a dog".split(),
... "This is my friends ' cat ( the tabby )".split(),
... ]
... )
>>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
I PRP 4 nsubj
'm VBP 4 cop
a DT 4 det
dog NN 0 ROOT
>>> print(parse_friends.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
This DT 6 nsubj
is VBZ 6 cop
my PRP$ 4 nmod:poss
friends NNS 6 nmod:poss
' POS 4 case
cat NN 0 ROOT
( -LRB- 9 punct
the DT 9 det
tabby NN 6 dep
) -RRB- 9 punct
>>> parse_john, parse_mary, = dep_parser.parse_text(
... 'John loves Mary. Mary walks.'
... )
>>> print(parse_john.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
John NNP 2 nsubj
loves VBZ 0 ROOT
Mary NNP 2 obj
. . 2 punct
>>> print(parse_mary.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
Mary NNP 2 nsubj
walks VBZ 0 ROOT
. . 2 punct
Special cases
Non-breaking space inside of a token.
>>> len(
... next(
... dep_parser.raw_parse(
... 'Anhalt said children typically treat a 20-ounce soda bottle as one '
... 'serving, while it actually contains 2 1/2 servings.'
... )
... ).nodes
... )
23
Phone numbers.
>>> len(
... next(
... dep_parser.raw_parse('This is not going to crash: 01 111 555.')
... ).nodes
... )
10
>>> print(
... next(
... dep_parser.raw_parse('The underscore _ should not simply disappear.')
... ).to_conll(4)
... ) # doctest: +NORMALIZE_WHITESPACE
The DT 2 det
underscore NN 7 nsubj
_ NFP 7 punct
should MD 7 aux
not RB 7 advmod
simply RB 7 advmod
disappear VB 0 ROOT
. . 7 punct
>>> print(
... next(
... dep_parser.raw_parse(
... 'for all of its insights into the dream world of teen life , and its electronic expression through '
... 'cyber culture , the film gives no quarter to anyone seeking to pull a cohesive story out of its 2 '
... '1/2-hour running time .'
... )
... ).to_conll(4)
... ) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
for IN 2 case
all DT 24 obl
of IN 5 case
its PRP$ 5 nmod:poss
insights NNS 2 nmod
into IN 9 case
the DT 9 det
dream NN 9 compound
world NN 5 nmod
of IN 12 case
teen NN 12 compound
...
>>> server.stop()
"""
_OUTPUT_FORMAT = "conll2007"
parser_annotator = "depparse"
def make_tree(self, result):
return DependencyGraph(
(
" ".join(n_items[1:]) # NLTK expects an iterable of strings...
for n_items in sorted(transform(result))
),
cell_separator=" ", # To make sure that a non-breaking space is kept inside of a token.
)
def transform(sentence):
for dependency in sentence["basicDependencies"]:
dependent_index = dependency["dependent"]
token = sentence["tokens"][dependent_index - 1]
# Return values that we don't know as '_'. Also, consider tag and ctag
# to be equal.
yield (
dependent_index,
"_",
token["word"],
token["lemma"],
token["pos"],
token["pos"],
"_",
str(dependency["governor"]),
dependency["dep"],
"_",
"_",
)
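# Editorial sketch (not part of the original file): the dictionary below
# mimics the shape of one entry of a CoreNLP "sentences" response for the
# two-token sentence "Mary walks"; it is hand-written for illustration, not
# actual server output.
def _transform_sketch():
    sentence = {
        "basicDependencies": [
            {"dependent": 2, "governor": 0, "dep": "ROOT"},
            {"dependent": 1, "governor": 2, "dep": "nsubj"},
        ],
        "tokens": [
            {"word": "Mary", "lemma": "Mary", "pos": "NNP"},
            {"word": "walks", "lemma": "walk", "pos": "VBZ"},
        ],
    }
    # Each row is a CoNLL-like tuple keyed by the dependent's index.
    for row in sorted(transform(sentence)):
        print(row)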

@@ -0,0 +1,799 @@
# Natural Language Toolkit: Dependency Grammars
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Jason Narad <jason.narad@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (modifications)
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
"""
Tools for reading and writing dependency trees.
The input is assumed to be in Malt-TAB format
(https://stp.lingfil.uu.se/~nivre/research/MaltXML.html).
"""
import subprocess
import warnings
from collections import defaultdict
from itertools import chain
from pprint import pformat
from nltk.internals import find_binary
from nltk.tree import Tree
#################################################################
# DependencyGraph Class
#################################################################
class DependencyGraph:
"""
A container for the nodes and labelled edges of a dependency structure.
"""
def __init__(
self,
tree_str=None,
cell_extractor=None,
zero_based=False,
cell_separator=None,
top_relation_label="ROOT",
):
"""Dependency graph.
We place a dummy `TOP` node with the index 0, since the root node is
often assigned 0 as its head. This also means that the indexing of the
nodes corresponds directly to the Malt-TAB format, which starts at 1.
If ``zero_based`` is True, the input is assumed to be Malt-TAB-like with
node numbers starting at 0 and the root node assigned -1 (as produced by,
e.g., zpar).
:param str cell_separator: the cell separator. If not provided, cells
are split by whitespace.
:param str top_relation_label: the label by which the top relation is
identified, for example, `ROOT`, `null` or `TOP`.
"""
self.nodes = defaultdict(
lambda: {
"address": None,
"word": None,
"lemma": None,
"ctag": None,
"tag": None,
"feats": None,
"head": None,
"deps": defaultdict(list),
"rel": None,
}
)
self.nodes[0].update({"ctag": "TOP", "tag": "TOP", "address": 0})
self.root = None
if tree_str:
self._parse(
tree_str,
cell_extractor=cell_extractor,
zero_based=zero_based,
cell_separator=cell_separator,
top_relation_label=top_relation_label,
)
def remove_by_address(self, address):
"""
Removes the node with the given address. References
to this node in others will still exist.
"""
del self.nodes[address]
def redirect_arcs(self, originals, redirect):
"""
Redirects arcs to any of the nodes in the originals list
to the redirect node address.
"""
for node in self.nodes.values():
new_deps = []
for dep in node["deps"]:
if dep in originals:
new_deps.append(redirect)
else:
new_deps.append(dep)
node["deps"] = new_deps
def add_arc(self, head_address, mod_address):
"""
Adds an arc from the node specified by head_address to the
node specified by the mod address.
"""
relation = self.nodes[mod_address]["rel"]
self.nodes[head_address]["deps"].setdefault(relation, [])
self.nodes[head_address]["deps"][relation].append(mod_address)
# self.nodes[head_address]['deps'].append(mod_address)
def connect_graph(self):
"""
Fully connects all non-root nodes. All nodes are set to be dependents
of the root node.
"""
for node1 in self.nodes.values():
for node2 in self.nodes.values():
if node1["address"] != node2["address"] and node2["rel"] != "TOP":
relation = node2["rel"]
node1["deps"].setdefault(relation, [])
node1["deps"][relation].append(node2["address"])
# node1['deps'].append(node2['address'])
def get_by_address(self, node_address):
"""Return the node with the given address."""
return self.nodes[node_address]
def contains_address(self, node_address):
"""
Returns true if the graph contains a node with the given node
address, false otherwise.
"""
return node_address in self.nodes
def to_dot(self):
"""Return a dot representation suitable for using with Graphviz.
>>> dg = DependencyGraph(
... 'John N 2\\n'
... 'loves V 0\\n'
... 'Mary N 2'
... )
>>> print(dg.to_dot())
digraph G{
edge [dir=forward]
node [shape=plaintext]
<BLANKLINE>
0 [label="0 (None)"]
0 -> 2 [label="ROOT"]
1 [label="1 (John)"]
2 [label="2 (loves)"]
2 -> 1 [label=""]
2 -> 3 [label=""]
3 [label="3 (Mary)"]
}
"""
# Start the digraph specification
s = "digraph G{\n"
s += "edge [dir=forward]\n"
s += "node [shape=plaintext]\n"
# Draw the remaining nodes
for node in sorted(self.nodes.values(), key=lambda v: v["address"]):
s += '\n{} [label="{} ({})"]'.format(
node["address"],
node["address"],
node["word"],
)
for rel, deps in node["deps"].items():
for dep in deps:
if rel is not None:
s += '\n{} -> {} [label="{}"]'.format(node["address"], dep, rel)
else:
s += "\n{} -> {} ".format(node["address"], dep)
s += "\n}"
return s
def _repr_svg_(self):
"""Show SVG representation of the transducer (IPython magic).
>>> from nltk.test.setup_fixt import check_binary
>>> check_binary('dot')
>>> dg = DependencyGraph(
... 'John N 2\\n'
... 'loves V 0\\n'
... 'Mary N 2'
... )
>>> dg._repr_svg_().split('\\n')[0]
'<?xml version="1.0" encoding="UTF-8" standalone="no"?>'
"""
dot_string = self.to_dot()
return dot2img(dot_string)
def __str__(self):
return pformat(self.nodes)
def __repr__(self):
return f"<DependencyGraph with {len(self.nodes)} nodes>"
@staticmethod
def load(
filename, zero_based=False, cell_separator=None, top_relation_label="ROOT"
):
"""
:param filename: a name of a file in Malt-TAB format
:param zero_based: nodes in the input file are numbered starting from 0
rather than 1 (as produced by, e.g., zpar)
:param str cell_separator: the cell separator. If not provided, cells
are split by whitespace.
:param str top_relation_label: the label by which the top relation is
identified, for example, `ROOT`, `null` or `TOP`.
:return: a list of DependencyGraphs
"""
with open(filename) as infile:
return [
DependencyGraph(
tree_str,
zero_based=zero_based,
cell_separator=cell_separator,
top_relation_label=top_relation_label,
)
for tree_str in infile.read().split("\n\n")
]
def left_children(self, node_index):
"""
Returns the number of left children under the node specified
by the given address.
"""
children = chain.from_iterable(self.nodes[node_index]["deps"].values())
index = self.nodes[node_index]["address"]
return sum(1 for c in children if c < index)
def right_children(self, node_index):
"""
Returns the number of right children under the node specified
by the given address.
"""
children = chain.from_iterable(self.nodes[node_index]["deps"].values())
index = self.nodes[node_index]["address"]
return sum(1 for c in children if c > index)
def add_node(self, node):
if not self.contains_address(node["address"]):
self.nodes[node["address"]].update(node)
def _parse(
self,
input_,
cell_extractor=None,
zero_based=False,
cell_separator=None,
top_relation_label="ROOT",
):
"""Parse a sentence.
:param cell_extractor: a function that given a tuple of cells returns a
7-tuple, where the values are ``word, lemma, ctag, tag, feats, head,
rel``.
:param str cell_separator: the cell separator. If not provided, cells
are split by whitespace.
:param str top_relation_label: the label by which the top relation is
identified, for example, `ROOT`, `null` or `TOP`.
"""
def extract_3_cells(cells, index):
word, tag, head = cells
return index, word, word, tag, tag, "", head, ""
def extract_4_cells(cells, index):
word, tag, head, rel = cells
return index, word, word, tag, tag, "", head, rel
def extract_7_cells(cells, index):
line_index, word, lemma, tag, _, head, rel = cells
try:
index = int(line_index)
except ValueError:
# index can't be parsed as an integer, use default
pass
return index, word, lemma, tag, tag, "", head, rel
def extract_10_cells(cells, index):
line_index, word, lemma, ctag, tag, feats, head, rel, _, _ = cells
try:
index = int(line_index)
except ValueError:
# index can't be parsed as an integer, use default
pass
return index, word, lemma, ctag, tag, feats, head, rel
extractors = {
3: extract_3_cells,
4: extract_4_cells,
7: extract_7_cells,
10: extract_10_cells,
}
if isinstance(input_, str):
input_ = (line for line in input_.split("\n"))
lines = (l.rstrip() for l in input_)
lines = (l for l in lines if l)
cell_number = None
for index, line in enumerate(lines, start=1):
cells = line.split(cell_separator)
if cell_number is None:
cell_number = len(cells)
else:
assert cell_number == len(cells)
if cell_extractor is None:
try:
cell_extractor = extractors[cell_number]
except KeyError as e:
raise ValueError(
"Number of tab-delimited fields ({}) not supported by "
"CoNLL(10) or Malt-Tab(4) format".format(cell_number)
) from e
try:
index, word, lemma, ctag, tag, feats, head, rel = cell_extractor(
cells, index
)
except (TypeError, ValueError):
# cell_extractor doesn't take 2 arguments or doesn't return 8
# values; assume the cell_extractor is an older external
# extractor and doesn't accept or return an index.
word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells)
if head == "_":
continue
head = int(head)
if zero_based:
head += 1
self.nodes[index].update(
{
"address": index,
"word": word,
"lemma": lemma,
"ctag": ctag,
"tag": tag,
"feats": feats,
"head": head,
"rel": rel,
}
)
# Make sure that the fake root node has labeled dependencies.
if (cell_number == 3) and (head == 0):
rel = top_relation_label
self.nodes[head]["deps"][rel].append(index)
if self.nodes[0]["deps"][top_relation_label]:
root_address = self.nodes[0]["deps"][top_relation_label][0]
self.root = self.nodes[root_address]
self.top_relation_label = top_relation_label
else:
warnings.warn(
"The graph doesn't contain a node " "that depends on the root element."
)
def _word(self, node, filter=True):
w = node["word"]
if filter:
if w != ",":
return w
return w
def _tree(self, i):
"""Turn dependency graphs into NLTK trees.
:param int i: index of a node
:return: either a word (if the indexed node is a leaf) or a ``Tree``.
"""
node = self.get_by_address(i)
word = node["word"]
deps = sorted(chain.from_iterable(node["deps"].values()))
if deps:
return Tree(word, [self._tree(dep) for dep in deps])
else:
return word
def tree(self):
"""
Starting with the ``root`` node, build a dependency tree using the NLTK
``Tree`` constructor. Dependency labels are omitted.
"""
node = self.root
word = node["word"]
deps = sorted(chain.from_iterable(node["deps"].values()))
return Tree(word, [self._tree(dep) for dep in deps])
def triples(self, node=None):
"""
Extract dependency triples of the form:
((head word, head tag), rel, (dep word, dep tag))
"""
if not node:
node = self.root
head = (node["word"], node["ctag"])
for i in sorted(chain.from_iterable(node["deps"].values())):
dep = self.get_by_address(i)
yield (head, dep["rel"], (dep["word"], dep["ctag"]))
yield from self.triples(node=dep)
def _hd(self, i):
try:
return self.nodes[i]["head"]
except IndexError:
return None
def _rel(self, i):
try:
return self.nodes[i]["rel"]
except IndexError:
return None
# what's the return type? Boolean or list?
def contains_cycle(self):
"""Check whether there are cycles.
>>> dg = DependencyGraph(treebank_data)
>>> dg.contains_cycle()
False
>>> cyclic_dg = DependencyGraph()
>>> top = {'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0}
>>> child1 = {'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1}
>>> child2 = {'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2}
>>> child3 = {'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3}
>>> child4 = {'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4}
>>> cyclic_dg.nodes = {
... 0: top,
... 1: child1,
... 2: child2,
... 3: child3,
... 4: child4,
... }
>>> cyclic_dg.root = top
>>> cyclic_dg.contains_cycle()
[1, 2, 4, 3]
"""
distances = {}
for node in self.nodes.values():
for dep in node["deps"]:
key = tuple([node["address"], dep])
distances[key] = 1
for _ in self.nodes:
new_entries = {}
for pair1 in distances:
for pair2 in distances:
if pair1[1] == pair2[0]:
key = tuple([pair1[0], pair2[1]])
new_entries[key] = distances[pair1] + distances[pair2]
for pair in new_entries:
distances[pair] = new_entries[pair]
if pair[0] == pair[1]:
path = self.get_cycle_path(self.get_by_address(pair[0]), pair[0])
return path
return False # return []?
def get_cycle_path(self, curr_node, goal_node_index):
for dep in curr_node["deps"]:
if dep == goal_node_index:
return [curr_node["address"]]
for dep in curr_node["deps"]:
path = self.get_cycle_path(self.get_by_address(dep), goal_node_index)
if len(path) > 0:
path.insert(0, curr_node["address"])
return path
return []
def to_conll(self, style):
"""
The dependency graph in CoNLL format.
:param style: the style to use for the format (3, 4, 10 columns)
:type style: int
:rtype: str
"""
if style == 3:
template = "{word}\t{tag}\t{head}\n"
elif style == 4:
template = "{word}\t{tag}\t{head}\t{rel}\n"
elif style == 10:
template = (
"{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n"
)
else:
raise ValueError(
"Number of tab-delimited fields ({}) not supported by "
"CoNLL(10) or Malt-Tab(4) format".format(style)
)
return "".join(
template.format(i=i, **node)
for i, node in sorted(self.nodes.items())
if node["tag"] != "TOP"
)
def nx_graph(self):
"""Convert the data in a ``nodelist`` into a networkx labeled directed graph."""
import networkx
nx_nodelist = list(range(1, len(self.nodes)))
nx_edgelist = [
(n, self._hd(n), self._rel(n)) for n in nx_nodelist if self._hd(n)
]
self.nx_labels = {}
for n in nx_nodelist:
self.nx_labels[n] = self.nodes[n]["word"]
g = networkx.MultiDiGraph()
g.add_nodes_from(nx_nodelist)
g.add_edges_from(nx_edgelist)
return g
def dot2img(dot_string, t="svg"):
"""
Create an image representation from dot_string, using the 'dot' program
from the Graphviz package.
Use the 't' argument to specify the image file format, e.g. 'jpeg', 'eps',
'json', 'png' or 'webp' (running 'dot -T:' lists all available formats).
Note that the output is captured and returned only for text formats such as
'svg'; for binary image formats such as 'png' it is not captured, and this
function returns None.
"""
try:
find_binary("dot")
try:
if t in ["dot", "dot_json", "json", "svg"]:
proc = subprocess.run(
["dot", "-T%s" % t],
capture_output=True,
input=dot_string,
text=True,
)
else:
proc = subprocess.run(
["dot", "-T%s" % t],
input=bytes(dot_string, encoding="utf8"),
)
return proc.stdout
except Exception as e:
raise Exception(
"Cannot create image representation by running dot from string: {}"
"".format(dot_string)
) from e
except OSError as e:
raise Exception("Cannot find the dot binary from Graphviz package") from e
class DependencyGraphError(Exception):
"""Dependency graph exception."""
def demo():
malt_demo()
conll_demo()
conll_file_demo()
cycle_finding_demo()
def malt_demo(nx=False):
"""
A demonstration of the result of reading a dependency
version of the first sentence of the Penn Treebank.
"""
dg = DependencyGraph(
"""Pierre NNP 2 NMOD
Vinken NNP 8 SUB
, , 2 P
61 CD 5 NMOD
years NNS 6 AMOD
old JJ 2 NMOD
, , 2 P
will MD 0 ROOT
join VB 8 VC
the DT 11 NMOD
board NN 9 OBJ
as IN 9 VMOD
a DT 15 NMOD
nonexecutive JJ 15 NMOD
director NN 12 PMOD
Nov. NNP 9 VMOD
29 CD 16 NMOD
. . 9 VMOD
"""
)
tree = dg.tree()
tree.pprint()
if nx:
# currently doesn't work
import networkx
from matplotlib import pylab
g = dg.nx_graph()
g.info()
pos = networkx.spring_layout(g, dim=1)
networkx.draw_networkx_nodes(g, pos, node_size=50)
# networkx.draw_networkx_edges(g, pos, edge_color='k', width=8)
networkx.draw_networkx_labels(g, pos, dg.nx_labels)
pylab.xticks([])
pylab.yticks([])
pylab.savefig("tree.png")
pylab.show()
def conll_demo():
"""
A demonstration of how to read a string representation of
a CoNLL format dependency tree.
"""
dg = DependencyGraph(conll_data1)
tree = dg.tree()
tree.pprint()
print(dg)
print(dg.to_conll(4))
def conll_file_demo():
print("Mass conll_read demo...")
graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
for graph in graphs:
tree = graph.tree()
print("\n")
tree.pprint()
def cycle_finding_demo():
dg = DependencyGraph(treebank_data)
print(dg.contains_cycle())
cyclic_dg = DependencyGraph()
cyclic_dg.add_node({"word": None, "deps": [1], "rel": "TOP", "address": 0})
cyclic_dg.add_node({"word": None, "deps": [2], "rel": "NTOP", "address": 1})
cyclic_dg.add_node({"word": None, "deps": [4], "rel": "NTOP", "address": 2})
cyclic_dg.add_node({"word": None, "deps": [1], "rel": "NTOP", "address": 3})
cyclic_dg.add_node({"word": None, "deps": [3], "rel": "NTOP", "address": 4})
print(cyclic_dg.contains_cycle())
treebank_data = """Pierre NNP 2 NMOD
Vinken NNP 8 SUB
, , 2 P
61 CD 5 NMOD
years NNS 6 AMOD
old JJ 2 NMOD
, , 2 P
will MD 0 ROOT
join VB 8 VC
the DT 11 NMOD
board NN 9 OBJ
as IN 9 VMOD
a DT 15 NMOD
nonexecutive JJ 15 NMOD
director NN 12 PMOD
Nov. NNP 9 VMOD
29 CD 16 NMOD
. . 9 VMOD
"""
conll_data1 = """
1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _
3 met met Prep Prep voor 8 mod _ _
4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _
5 moeder moeder N N soort|ev|neut 3 obj1 _ _
6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _
7 gaan ga V V hulp|inf 6 vc _ _
8 winkelen winkel V V intrans|inf 11 cnj _ _
9 , , Punc Punc komma 8 punct _ _
10 zwemmen zwem V V intrans|inf 11 cnj _ _
11 of of Conj Conj neven 7 vc _ _
12 terrassen terras N N soort|mv|neut 11 cnj _ _
13 . . Punc Punc punt 12 punct _ _
"""
conll_data2 = """1 Cathy Cathy N N eigen|ev|neut 2 su _ _
2 zag zie V V trans|ovt|1of2of3|ev 0 ROOT _ _
3 hen hen Pron Pron per|3|mv|datofacc 2 obj1 _ _
4 wild wild Adj Adj attr|stell|onverv 5 mod _ _
5 zwaaien zwaai N N soort|mv|neut 2 vc _ _
6 . . Punc Punc punt 5 punct _ _
1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _
3 met met Prep Prep voor 8 mod _ _
4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _
5 moeder moeder N N soort|ev|neut 3 obj1 _ _
6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _
7 gaan ga V V hulp|inf 6 vc _ _
8 winkelen winkel V V intrans|inf 11 cnj _ _
9 , , Punc Punc komma 8 punct _ _
10 zwemmen zwem V V intrans|inf 11 cnj _ _
11 of of Conj Conj neven 7 vc _ _
12 terrassen terras N N soort|mv|neut 11 cnj _ _
13 . . Punc Punc punt 12 punct _ _
1 Dat dat Pron Pron aanw|neut|attr 2 det _ _
2 werkwoord werkwoord N N soort|ev|neut 6 obj1 _ _
3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _
4 ze ze Pron Pron per|3|evofmv|nom 6 su _ _
5 zelf zelf Pron Pron aanw|neut|attr|wzelf 3 predm _ _
6 uitgevonden vind V V trans|verldw|onverv 3 vc _ _
7 . . Punc Punc punt 6 punct _ _
1 Het het Pron Pron onbep|neut|zelfst 2 su _ _
2 hoorde hoor V V trans|ovt|1of2of3|ev 0 ROOT _ _
3 bij bij Prep Prep voor 2 ld _ _
4 de de Art Art bep|zijdofmv|neut 6 det _ _
5 warme warm Adj Adj attr|stell|vervneut 6 mod _ _
6 zomerdag zomerdag N N soort|ev|neut 3 obj1 _ _
7 die die Pron Pron betr|neut|zelfst 6 mod _ _
8 ze ze Pron Pron per|3|evofmv|nom 12 su _ _
9 ginds ginds Adv Adv gew|aanw 12 mod _ _
10 achter achter Adv Adv gew|geenfunc|stell|onverv 12 svp _ _
11 had heb V V hulp|ovt|1of2of3|ev 7 body _ _
12 gelaten laat V V trans|verldw|onverv 11 vc _ _
13 . . Punc Punc punt 12 punct _ _
1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
2 hadden heb V V trans|ovt|1of2of3|mv 0 ROOT _ _
3 languit languit Adv Adv gew|geenfunc|stell|onverv 11 mod _ _
4 naast naast Prep Prep voor 11 mod _ _
5 elkaar elkaar Pron Pron rec|neut 4 obj1 _ _
6 op op Prep Prep voor 11 ld _ _
7 de de Art Art bep|zijdofmv|neut 8 det _ _
8 strandstoelen strandstoel N N soort|mv|neut 6 obj1 _ _
9 kunnen kan V V hulp|inf 2 vc _ _
10 gaan ga V V hulp|inf 9 vc _ _
11 liggen lig V V intrans|inf 10 vc _ _
12 . . Punc Punc punt 11 punct _ _
1 Zij zij Pron Pron per|3|evofmv|nom 2 su _ _
2 zou zal V V hulp|ovt|1of2of3|ev 7 cnj _ _
3 mams mams N N soort|ev|neut 4 det _ _
4 rug rug N N soort|ev|neut 5 obj1 _ _
5 ingewreven wrijf V V trans|verldw|onverv 6 vc _ _
6 hebben heb V V hulp|inf 2 vc _ _
7 en en Conj Conj neven 0 ROOT _ _
8 mam mam V V trans|ovt|1of2of3|ev 7 cnj _ _
9 de de Art Art bep|zijdofmv|neut 10 det _ _
10 hare hare Pron Pron bez|3|ev|neut|attr 8 obj1 _ _
11 . . Punc Punc punt 10 punct _ _
1 Of of Conj Conj onder|metfin 0 ROOT _ _
2 ze ze Pron Pron per|3|evofmv|nom 3 su _ _
3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _
4 gewoon gewoon Adj Adj adv|stell|onverv 10 mod _ _
5 met met Prep Prep voor 10 mod _ _
6 haar haar Pron Pron bez|3|ev|neut|attr 7 det _ _
7 vriendinnen vriendin N N soort|mv|neut 5 obj1 _ _
8 rond rond Adv Adv deelv 10 svp _ _
9 kunnen kan V V hulp|inf 3 vc _ _
10 slenteren slenter V V intrans|inf 9 vc _ _
11 in in Prep Prep voor 10 mod _ _
12 de de Art Art bep|zijdofmv|neut 13 det _ _
13 buurt buurt N N soort|ev|neut 11 obj1 _ _
14 van van Prep Prep voor 13 mod _ _
15 Trafalgar_Square Trafalgar_Square MWU N_N eigen|ev|neut_eigen|ev|neut 14 obj1 _ _
16 . . Punc Punc punt 15 punct _ _
"""
if __name__ == "__main__":
demo()

@@ -0,0 +1,552 @@
# Natural Language Toolkit: An Incremental Earley Chart Parser
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
# Rob Speer <rspeer@mit.edu>
# Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Jean Mark Gawron <gawron@mail.sdsu.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Data classes and parser implementations for *incremental* chart
parsers, which use dynamic programming to efficiently parse a text.
A "chart parser" derives parse trees for a text by iteratively adding
\"edges\" to a \"chart\". Each "edge" represents a hypothesis about the tree
structure for a subsequence of the text. The "chart" is a
\"blackboard\" for composing and combining these hypotheses.
A parser is "incremental", if it guarantees that for all i, j where i < j,
all edges ending at i are built before any edges ending at j.
This is appealing for, say, speech recognizer hypothesis filtering.
The main parser class is ``EarleyChartParser``, which is a top-down
algorithm, originally formulated by Jay Earley (1970).
"""
from time import perf_counter
from nltk.parse.chart import (
BottomUpPredictCombineRule,
BottomUpPredictRule,
CachedTopDownPredictRule,
Chart,
ChartParser,
EdgeI,
EmptyPredictRule,
FilteredBottomUpPredictCombineRule,
FilteredSingleEdgeFundamentalRule,
LeafEdge,
LeafInitRule,
SingleEdgeFundamentalRule,
TopDownInitRule,
)
from nltk.parse.featurechart import (
FeatureBottomUpPredictCombineRule,
FeatureBottomUpPredictRule,
FeatureChart,
FeatureChartParser,
FeatureEmptyPredictRule,
FeatureSingleEdgeFundamentalRule,
FeatureTopDownInitRule,
FeatureTopDownPredictRule,
)
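# Editorial sketch (not part of the original file): the incremental Earley
# parser defined below is used like any other NLTK chart parser. The toy
# grammar is an assumption made for illustration.
def _earley_usage_sketch():
    from nltk.grammar import CFG

    grammar = CFG.fromstring(
        """
        S -> NP VP
        NP -> 'I' | Det N
        VP -> V NP
        Det -> 'an'
        N -> 'apple'
        V -> 'ate'
        """
    )
    parser = EarleyChartParser(grammar, trace=0)
    # Prints: (S (NP I) (VP (V ate) (NP (Det an) (N apple))))
    for tree in parser.parse("I ate an apple".split()):
        print(tree)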
# ////////////////////////////////////////////////////////////
# Incremental Chart
# ////////////////////////////////////////////////////////////
class IncrementalChart(Chart):
def initialize(self):
# A sequence of edge lists contained in this chart.
self._edgelists = tuple([] for x in self._positions())
# The set of child pointer lists associated with each edge.
self._edge_to_cpls = {}
# Indexes mapping attribute values to lists of edges
# (used by select()).
self._indexes = {}
def edges(self):
return list(self.iteredges())
def iteredges(self):
return (edge for edgelist in self._edgelists for edge in edgelist)
def select(self, end, **restrictions):
edgelist = self._edgelists[end]
# If there are no restrictions, then return all edges.
if restrictions == {}:
return iter(edgelist)
# Find the index corresponding to the given restrictions.
restr_keys = sorted(restrictions.keys())
restr_keys = tuple(restr_keys)
# If it doesn't exist, then create it.
if restr_keys not in self._indexes:
self._add_index(restr_keys)
vals = tuple(restrictions[key] for key in restr_keys)
return iter(self._indexes[restr_keys][end].get(vals, []))
def _add_index(self, restr_keys):
# Make sure it's a valid index.
for key in restr_keys:
if not hasattr(EdgeI, key):
raise ValueError("Bad restriction: %s" % key)
# Create the index.
index = self._indexes[restr_keys] = tuple({} for x in self._positions())
# Add all existing edges to the index.
for end, edgelist in enumerate(self._edgelists):
this_index = index[end]
for edge in edgelist:
vals = tuple(getattr(edge, key)() for key in restr_keys)
this_index.setdefault(vals, []).append(edge)
def _register_with_indexes(self, edge):
end = edge.end()
for restr_keys, index in self._indexes.items():
vals = tuple(getattr(edge, key)() for key in restr_keys)
index[end].setdefault(vals, []).append(edge)
def _append_edge(self, edge):
self._edgelists[edge.end()].append(edge)
def _positions(self):
return range(self.num_leaves() + 1)
class FeatureIncrementalChart(IncrementalChart, FeatureChart):
def select(self, end, **restrictions):
edgelist = self._edgelists[end]
# If there are no restrictions, then return all edges.
if restrictions == {}:
return iter(edgelist)
# Find the index corresponding to the given restrictions.
restr_keys = sorted(restrictions.keys())
restr_keys = tuple(restr_keys)
# If it doesn't exist, then create it.
if restr_keys not in self._indexes:
self._add_index(restr_keys)
vals = tuple(
self._get_type_if_possible(restrictions[key]) for key in restr_keys
)
return iter(self._indexes[restr_keys][end].get(vals, []))
def _add_index(self, restr_keys):
# Make sure it's a valid index.
for key in restr_keys:
if not hasattr(EdgeI, key):
raise ValueError("Bad restriction: %s" % key)
# Create the index.
index = self._indexes[restr_keys] = tuple({} for x in self._positions())
# Add all existing edges to the index.
for end, edgelist in enumerate(self._edgelists):
this_index = index[end]
for edge in edgelist:
vals = tuple(
self._get_type_if_possible(getattr(edge, key)())
for key in restr_keys
)
this_index.setdefault(vals, []).append(edge)
def _register_with_indexes(self, edge):
end = edge.end()
for restr_keys, index in self._indexes.items():
vals = tuple(
self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
)
index[end].setdefault(vals, []).append(edge)
# ////////////////////////////////////////////////////////////
# Incremental CFG Rules
# ////////////////////////////////////////////////////////////
class CompleteFundamentalRule(SingleEdgeFundamentalRule):
def _apply_incomplete(self, chart, grammar, left_edge):
end = left_edge.end()
# When the chart is incremental, we only have to look for
# empty complete edges here.
for right_edge in chart.select(
start=end, end=end, is_complete=True, lhs=left_edge.nextsym()
):
new_edge = left_edge.move_dot_forward(right_edge.end())
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
yield new_edge
class CompleterRule(CompleteFundamentalRule):
_fundamental_rule = CompleteFundamentalRule()
def apply(self, chart, grammar, edge):
if not isinstance(edge, LeafEdge):
yield from self._fundamental_rule.apply(chart, grammar, edge)
class ScannerRule(CompleteFundamentalRule):
_fundamental_rule = CompleteFundamentalRule()
def apply(self, chart, grammar, edge):
if isinstance(edge, LeafEdge):
yield from self._fundamental_rule.apply(chart, grammar, edge)
class PredictorRule(CachedTopDownPredictRule):
pass
class FilteredCompleteFundamentalRule(FilteredSingleEdgeFundamentalRule):
def apply(self, chart, grammar, edge):
# Since the Filtered rule only works for grammars without empty productions,
# we only have to bother with complete edges here.
if edge.is_complete():
yield from self._apply_complete(chart, grammar, edge)
# ////////////////////////////////////////////////////////////
# Incremental FCFG Rules
# ////////////////////////////////////////////////////////////
class FeatureCompleteFundamentalRule(FeatureSingleEdgeFundamentalRule):
def _apply_incomplete(self, chart, grammar, left_edge):
fr = self._fundamental_rule
end = left_edge.end()
# When the chart is incremental, we only have to look for
# empty complete edges here.
for right_edge in chart.select(
start=end, end=end, is_complete=True, lhs=left_edge.nextsym()
):
yield from fr.apply(chart, grammar, left_edge, right_edge)
class FeatureCompleterRule(CompleterRule):
_fundamental_rule = FeatureCompleteFundamentalRule()
class FeatureScannerRule(ScannerRule):
_fundamental_rule = FeatureCompleteFundamentalRule()
class FeaturePredictorRule(FeatureTopDownPredictRule):
pass
# ////////////////////////////////////////////////////////////
# Incremental CFG Chart Parsers
# ////////////////////////////////////////////////////////////
EARLEY_STRATEGY = [
LeafInitRule(),
TopDownInitRule(),
CompleterRule(),
ScannerRule(),
PredictorRule(),
]
TD_INCREMENTAL_STRATEGY = [
LeafInitRule(),
TopDownInitRule(),
CachedTopDownPredictRule(),
CompleteFundamentalRule(),
]
BU_INCREMENTAL_STRATEGY = [
LeafInitRule(),
EmptyPredictRule(),
BottomUpPredictRule(),
CompleteFundamentalRule(),
]
BU_LC_INCREMENTAL_STRATEGY = [
LeafInitRule(),
EmptyPredictRule(),
BottomUpPredictCombineRule(),
CompleteFundamentalRule(),
]
LC_INCREMENTAL_STRATEGY = [
LeafInitRule(),
FilteredBottomUpPredictCombineRule(),
FilteredCompleteFundamentalRule(),
]
class IncrementalChartParser(ChartParser):
"""
An *incremental* chart parser implementing Jay Earley's
parsing algorithm:
| For each index end in [0, 1, ..., N]:
| For each edge such that edge.end = end:
| If edge is incomplete and edge.next is not a part of speech:
| Apply PredictorRule to edge
| If edge is incomplete and edge.next is a part of speech:
| Apply ScannerRule to edge
| If edge is complete:
| Apply CompleterRule to edge
| Return any complete parses in the chart
"""
def __init__(
self,
grammar,
strategy=BU_LC_INCREMENTAL_STRATEGY,
trace=0,
trace_chart_width=50,
chart_class=IncrementalChart,
):
"""
Create a new Earley chart parser that uses ``grammar`` to
parse texts.
:type grammar: CFG
:param grammar: The grammar used to parse texts.
:type trace: int
:param trace: The level of tracing that should be used when
parsing a text. ``0`` will generate no tracing output;
and higher numbers will produce more verbose tracing
output.
:type trace_chart_width: int
:param trace_chart_width: The default total width reserved for
the chart in trace output. The remainder of each line will
be used to display edges.
:param chart_class: The class that should be used to create
the charts used by this parser.
"""
self._grammar = grammar
self._trace = trace
self._trace_chart_width = trace_chart_width
self._chart_class = chart_class
self._axioms = []
self._inference_rules = []
for rule in strategy:
if rule.NUM_EDGES == 0:
self._axioms.append(rule)
elif rule.NUM_EDGES == 1:
self._inference_rules.append(rule)
else:
raise ValueError(
"Incremental inference rules must have " "NUM_EDGES == 0 or 1"
)
def chart_parse(self, tokens, trace=None):
if trace is None:
trace = self._trace
trace_new_edges = self._trace_new_edges
tokens = list(tokens)
self._grammar.check_coverage(tokens)
chart = self._chart_class(tokens)
grammar = self._grammar
# Width, for printing trace edges.
trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1)
if trace:
print(chart.pretty_format_leaves(trace_edge_width))
for axiom in self._axioms:
new_edges = list(axiom.apply(chart, grammar))
trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width)
inference_rules = self._inference_rules
for end in range(chart.num_leaves() + 1):
if trace > 1:
print("\n* Processing queue:", end, "\n")
agenda = list(chart.select(end=end))
while agenda:
edge = agenda.pop()
for rule in inference_rules:
new_edges = list(rule.apply(chart, grammar, edge))
trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
for new_edge in new_edges:
if new_edge.end() == end:
agenda.append(new_edge)
return chart
class EarleyChartParser(IncrementalChartParser):
def __init__(self, grammar, **parser_args):
IncrementalChartParser.__init__(self, grammar, EARLEY_STRATEGY, **parser_args)
class IncrementalTopDownChartParser(IncrementalChartParser):
def __init__(self, grammar, **parser_args):
IncrementalChartParser.__init__(
self, grammar, TD_INCREMENTAL_STRATEGY, **parser_args
)
class IncrementalBottomUpChartParser(IncrementalChartParser):
def __init__(self, grammar, **parser_args):
IncrementalChartParser.__init__(
self, grammar, BU_INCREMENTAL_STRATEGY, **parser_args
)
class IncrementalBottomUpLeftCornerChartParser(IncrementalChartParser):
def __init__(self, grammar, **parser_args):
IncrementalChartParser.__init__(
self, grammar, BU_LC_INCREMENTAL_STRATEGY, **parser_args
)
class IncrementalLeftCornerChartParser(IncrementalChartParser):
def __init__(self, grammar, **parser_args):
if not grammar.is_nonempty():
raise ValueError(
"IncrementalLeftCornerParser only works for grammars "
"without empty productions."
)
IncrementalChartParser.__init__(
self, grammar, LC_INCREMENTAL_STRATEGY, **parser_args
)
# ////////////////////////////////////////////////////////////
# Incremental FCFG Chart Parsers
# ////////////////////////////////////////////////////////////
EARLEY_FEATURE_STRATEGY = [
LeafInitRule(),
FeatureTopDownInitRule(),
FeatureCompleterRule(),
FeatureScannerRule(),
FeaturePredictorRule(),
]
TD_INCREMENTAL_FEATURE_STRATEGY = [
LeafInitRule(),
FeatureTopDownInitRule(),
FeatureTopDownPredictRule(),
FeatureCompleteFundamentalRule(),
]
BU_INCREMENTAL_FEATURE_STRATEGY = [
LeafInitRule(),
FeatureEmptyPredictRule(),
FeatureBottomUpPredictRule(),
FeatureCompleteFundamentalRule(),
]
BU_LC_INCREMENTAL_FEATURE_STRATEGY = [
LeafInitRule(),
FeatureEmptyPredictRule(),
FeatureBottomUpPredictCombineRule(),
FeatureCompleteFundamentalRule(),
]
class FeatureIncrementalChartParser(IncrementalChartParser, FeatureChartParser):
def __init__(
self,
grammar,
strategy=BU_LC_INCREMENTAL_FEATURE_STRATEGY,
trace_chart_width=20,
chart_class=FeatureIncrementalChart,
**parser_args
):
IncrementalChartParser.__init__(
self,
grammar,
strategy=strategy,
trace_chart_width=trace_chart_width,
chart_class=chart_class,
**parser_args
)
class FeatureEarleyChartParser(FeatureIncrementalChartParser):
def __init__(self, grammar, **parser_args):
FeatureIncrementalChartParser.__init__(
self, grammar, EARLEY_FEATURE_STRATEGY, **parser_args
)
class FeatureIncrementalTopDownChartParser(FeatureIncrementalChartParser):
def __init__(self, grammar, **parser_args):
FeatureIncrementalChartParser.__init__(
self, grammar, TD_INCREMENTAL_FEATURE_STRATEGY, **parser_args
)
class FeatureIncrementalBottomUpChartParser(FeatureIncrementalChartParser):
def __init__(self, grammar, **parser_args):
FeatureIncrementalChartParser.__init__(
self, grammar, BU_INCREMENTAL_FEATURE_STRATEGY, **parser_args
)
class FeatureIncrementalBottomUpLeftCornerChartParser(FeatureIncrementalChartParser):
def __init__(self, grammar, **parser_args):
FeatureIncrementalChartParser.__init__(
self, grammar, BU_LC_INCREMENTAL_FEATURE_STRATEGY, **parser_args
)
# ////////////////////////////////////////////////////////////
# Demonstration
# ////////////////////////////////////////////////////////////
def demo(
print_times=True,
print_grammar=False,
print_trees=True,
trace=2,
sent="I saw John with a dog with my cookie",
numparses=5,
):
"""
A demonstration of the Earley parsers.
"""
import sys
import time
from nltk.parse.chart import demo_grammar
# The grammar for ChartParser and SteppingChartParser:
grammar = demo_grammar()
if print_grammar:
print("* Grammar")
print(grammar)
# Tokenize the sample sentence.
print("* Sentence:")
print(sent)
tokens = sent.split()
print(tokens)
print()
# Do the parsing.
earley = EarleyChartParser(grammar, trace=trace)
t = perf_counter()
chart = earley.chart_parse(tokens)
parses = list(chart.parses(grammar.start()))
t = perf_counter() - t
# Print results.
if numparses:
assert len(parses) == numparses, "Not all parses found"
if print_trees:
for tree in parses:
print(tree)
else:
print("Nr trees:", len(parses))
if print_times:
print("Time:", t)
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,129 @@
# Natural Language Toolkit: evaluation of dependency parser
#
# Author: Long Duong <longdt219@gmail.com>
#
# Copyright (C) 2001-2025 NLTK Project
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import unicodedata
class DependencyEvaluator:
"""
Class for measuring labelled and unlabelled attachment score for
dependency parsing. Note that the evaluation ignores punctuation.
>>> from nltk.parse import DependencyGraph, DependencyEvaluator
>>> gold_sent = DependencyGraph(\"""
... Pierre NNP 2 NMOD
... Vinken NNP 8 SUB
... , , 2 P
... 61 CD 5 NMOD
... years NNS 6 AMOD
... old JJ 2 NMOD
... , , 2 P
... will MD 0 ROOT
... join VB 8 VC
... the DT 11 NMOD
... board NN 9 OBJ
... as IN 9 VMOD
... a DT 15 NMOD
... nonexecutive JJ 15 NMOD
... director NN 12 PMOD
... Nov. NNP 9 VMOD
... 29 CD 16 NMOD
... . . 9 VMOD
... \""")
>>> parsed_sent = DependencyGraph(\"""
... Pierre NNP 8 NMOD
... Vinken NNP 1 SUB
... , , 3 P
... 61 CD 6 NMOD
... years NNS 6 AMOD
... old JJ 2 NMOD
... , , 3 AMOD
... will MD 0 ROOT
... join VB 8 VC
... the DT 11 AMOD
... board NN 9 OBJECT
... as IN 9 NMOD
... a DT 15 NMOD
... nonexecutive JJ 15 NMOD
... director NN 12 PMOD
... Nov. NNP 9 VMOD
... 29 CD 16 NMOD
... . . 9 VMOD
... \""")
>>> de = DependencyEvaluator([parsed_sent],[gold_sent])
>>> las, uas = de.eval()
>>> las
0.6
>>> uas
0.8
>>> abs(uas - 0.8) < 0.00001
True
"""
def __init__(self, parsed_sents, gold_sents):
"""
:param parsed_sents: the list of parsed sentences as output by the parser
:type parsed_sents: list(DependencyGraph)
:param gold_sents: the list of gold-standard (reference) sentences
:type gold_sents: list(DependencyGraph)
"""
self._parsed_sents = parsed_sents
self._gold_sents = gold_sents
def _remove_punct(self, inStr):
"""
Remove punctuation from a Unicode string.
:param inStr: the input string
:return: the Unicode string with all punctuation removed
"""
punc_cat = {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"}
return "".join(x for x in inStr if unicodedata.category(x) not in punc_cat)
def eval(self):
"""
Return the Labeled Attachment Score (LAS) and Unlabeled Attachment Score (UAS)
:return: tuple(float, float)
"""
if len(self._parsed_sents) != len(self._gold_sents):
raise ValueError(
" Number of parsed sentence is different with number of gold sentence."
)
corr = 0
corrL = 0
total = 0
for i in range(len(self._parsed_sents)):
parsed_sent_nodes = self._parsed_sents[i].nodes
gold_sent_nodes = self._gold_sents[i].nodes
if len(parsed_sent_nodes) != len(gold_sent_nodes):
raise ValueError("Sentences must have equal length.")
for parsed_node_address, parsed_node in parsed_sent_nodes.items():
gold_node = gold_sent_nodes[parsed_node_address]
if parsed_node["word"] is None:
continue
if parsed_node["word"] != gold_node["word"]:
raise ValueError("Sentence sequence is not matched.")
# Ignore if word is punctuation by default
# if (parsed_sent[j]["word"] in string.punctuation):
if self._remove_punct(parsed_node["word"]) == "":
continue
total += 1
if parsed_node["head"] == gold_node["head"]:
corr += 1
if parsed_node["rel"] == gold_node["rel"]:
corrL += 1
return corrL / total, corr / total
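# A small worked sketch (illustration only; not part of the original module):
# the attachment scores are plain ratios over the scorable (non-punctuation)
# tokens. With 10 such tokens, 8 correctly attached heads, and 6 of those also
# carrying the correct relation label, eval() would report UAS = 8/10 = 0.8
# and LAS = 6/10 = 0.6, matching the doctest above.
def _attachment_score_sketch(total=10, correct_heads=8, correct_labelled=6):
    las = correct_labelled / total  # labeled attachment score
    uas = correct_heads / total  # unlabeled attachment score
    return las, uas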

View File

@@ -0,0 +1,674 @@
# Natural Language Toolkit: Chart Parser for Feature-Based Grammars
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Rob Speer <rspeer@mit.edu>
# Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Extension of chart parsing implementation to handle grammars with
feature structures as nodes.
"""
from time import perf_counter
from nltk.featstruct import TYPE, FeatStruct, find_variables, unify
from nltk.grammar import (
CFG,
FeatStructNonterminal,
Nonterminal,
Production,
is_nonterminal,
is_terminal,
)
from nltk.parse.chart import (
BottomUpPredictCombineRule,
BottomUpPredictRule,
CachedTopDownPredictRule,
Chart,
ChartParser,
EdgeI,
EmptyPredictRule,
FundamentalRule,
LeafInitRule,
SingleEdgeFundamentalRule,
TopDownInitRule,
TreeEdge,
)
from nltk.sem import logic
from nltk.tree import Tree
# ////////////////////////////////////////////////////////////
# Tree Edge
# ////////////////////////////////////////////////////////////
class FeatureTreeEdge(TreeEdge):
"""
A specialized tree edge that allows shared variable bindings
between nonterminals on the left-hand side and right-hand side.
Each ``FeatureTreeEdge`` contains a set of ``bindings``, i.e., a
dictionary mapping from variables to values. If the edge is not
complete, then these bindings are simply stored. However, if the
edge is complete, then the constructor applies these bindings to
every nonterminal in the edge whose symbol implements the
interface ``SubstituteBindingsI``.
"""
def __init__(self, span, lhs, rhs, dot=0, bindings=None):
"""
Construct a new edge. If the edge is incomplete (i.e., if
``dot<len(rhs)``), then store the bindings as-is. If the edge
is complete (i.e., if ``dot==len(rhs)``), then apply the
bindings to all nonterminals in ``lhs`` and ``rhs``, and then
clear the bindings. See ``TreeEdge`` for a description of
the other arguments.
"""
if bindings is None:
bindings = {}
# If the edge is complete, then substitute in the bindings,
# and then throw them away. (If we didn't throw them away, we
# might think that 2 complete edges are different just because
# they have different bindings, even though all bindings have
# already been applied.)
if dot == len(rhs) and bindings:
lhs = self._bind(lhs, bindings)
rhs = [self._bind(elt, bindings) for elt in rhs]
bindings = {}
# Initialize the edge.
TreeEdge.__init__(self, span, lhs, rhs, dot)
self._bindings = bindings
self._comparison_key = (self._comparison_key, tuple(sorted(bindings.items())))
@staticmethod
def from_production(production, index):
"""
:return: A new ``FeatureTreeEdge`` formed from the given production.
The new edge's left-hand side and right-hand side will
be taken from ``production``; its span will be
``(index,index)``; and its dot position will be ``0``.
:rtype: FeatureTreeEdge
"""
return FeatureTreeEdge(
span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0
)
def move_dot_forward(self, new_end, bindings=None):
"""
:return: A new ``FeatureTreeEdge`` formed from this edge.
The new edge's dot position is increased by ``1``,
and its end index will be replaced by ``new_end``.
:rtype: FeatureTreeEdge
:param new_end: The new end index.
:type new_end: int
:param bindings: Bindings for the new edge.
:type bindings: dict
"""
return FeatureTreeEdge(
span=(self._span[0], new_end),
lhs=self._lhs,
rhs=self._rhs,
dot=self._dot + 1,
bindings=bindings,
)
def _bind(self, nt, bindings):
if not isinstance(nt, FeatStructNonterminal):
return nt
return nt.substitute_bindings(bindings)
def next_with_bindings(self):
return self._bind(self.nextsym(), self._bindings)
def bindings(self):
"""
Return a copy of this edge's bindings dictionary.
"""
return self._bindings.copy()
def variables(self):
"""
:return: The set of variables used by this edge.
:rtype: set(Variable)
"""
return find_variables(
[self._lhs]
+ list(self._rhs)
+ list(self._bindings.keys())
+ list(self._bindings.values()),
fs_class=FeatStruct,
)
def __str__(self):
if self.is_complete():
return super().__str__()
else:
bindings = "{%s}" % ", ".join(
"%s: %r" % item for item in sorted(self._bindings.items())
)
return f"{super().__str__()} {bindings}"
# ////////////////////////////////////////////////////////////
# A specialized Chart for feature grammars
# ////////////////////////////////////////////////////////////
# TODO: subsumes check when adding new edges
class FeatureChart(Chart):
"""
A Chart for feature grammars.
:see: ``Chart`` for more information.
"""
def select(self, **restrictions):
"""
Returns an iterator over the edges in this chart.
See ``Chart.select`` for more information about the
``restrictions`` on the edges.
"""
# If there are no restrictions, then return all edges.
if restrictions == {}:
return iter(self._edges)
# Find the index corresponding to the given restrictions.
restr_keys = sorted(restrictions.keys())
restr_keys = tuple(restr_keys)
# If it doesn't exist, then create it.
if restr_keys not in self._indexes:
self._add_index(restr_keys)
vals = tuple(
self._get_type_if_possible(restrictions[key]) for key in restr_keys
)
return iter(self._indexes[restr_keys].get(vals, []))
def _add_index(self, restr_keys):
"""
A helper function for ``select``, which creates a new index for
a given set of attributes (aka restriction keys).
"""
# Make sure it's a valid index.
for key in restr_keys:
if not hasattr(EdgeI, key):
raise ValueError("Bad restriction: %s" % key)
# Create the index.
index = self._indexes[restr_keys] = {}
# Add all existing edges to the index.
for edge in self._edges:
vals = tuple(
self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
)
index.setdefault(vals, []).append(edge)
def _register_with_indexes(self, edge):
"""
A helper function for ``insert``, which registers the new
edge with all existing indexes.
"""
for restr_keys, index in self._indexes.items():
vals = tuple(
self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
)
index.setdefault(vals, []).append(edge)
def _get_type_if_possible(self, item):
"""
Helper function which returns the ``TYPE`` feature of the ``item``,
if it exists; otherwise it returns the ``item`` itself.
"""
if isinstance(item, dict) and TYPE in item:
return item[TYPE]
else:
return item
def parses(self, start, tree_class=Tree):
for edge in self.select(start=0, end=self._num_leaves):
if (
(isinstance(edge, FeatureTreeEdge))
and (edge.lhs()[TYPE] == start[TYPE])
and (unify(edge.lhs(), start, rename_vars=True))
):
yield from self.trees(edge, complete=True, tree_class=tree_class)
# ////////////////////////////////////////////////////////////
# Fundamental Rule
# ////////////////////////////////////////////////////////////
class FeatureFundamentalRule(FundamentalRule):
r"""
A specialized version of the fundamental rule that operates on
nonterminals whose symbols are ``FeatStructNonterminal``s. Rather
than simply comparing the nonterminals for equality, they are
unified. Variable bindings from these unifications are collected
and stored in the chart using a ``FeatureTreeEdge``. When a
complete edge is generated, these bindings are applied to all
nonterminals in the edge.
The fundamental rule states that:
- ``[A -> alpha \* B1 beta][i:j]``
- ``[B2 -> gamma \*][j:k]``
licenses the edge:
- ``[A -> alpha B3 \* beta][i:k]``
assuming that B1 and B2 can be unified to generate B3.
"""
def apply(self, chart, grammar, left_edge, right_edge):
# Make sure the rule is applicable.
if not (
left_edge.end() == right_edge.start()
and left_edge.is_incomplete()
and right_edge.is_complete()
and isinstance(left_edge, FeatureTreeEdge)
):
return
found = right_edge.lhs()
nextsym = left_edge.nextsym()
if isinstance(right_edge, FeatureTreeEdge):
if not is_nonterminal(nextsym):
return
if left_edge.nextsym()[TYPE] != right_edge.lhs()[TYPE]:
return
# Create a copy of the bindings.
bindings = left_edge.bindings()
# We rename vars here, because we don't want variables
# from the two different productions to match.
found = found.rename_variables(used_vars=left_edge.variables())
# Unify B1 (left_edge.nextsym) with B2 (right_edge.lhs) to
# generate B3 (result).
result = unify(nextsym, found, bindings, rename_vars=False)
if result is None:
return
else:
if nextsym != found:
return
# Create a copy of the bindings.
bindings = left_edge.bindings()
# Construct the new edge.
new_edge = left_edge.move_dot_forward(right_edge.end(), bindings)
# Add it to the chart, with appropriate child pointers.
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
yield new_edge
class FeatureSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
"""
A specialized version of the completer / single edge fundamental rule
that operates on nonterminals whose symbols are ``FeatStructNonterminal``.
Rather than simply comparing the nonterminals for equality, they are
unified.
"""
_fundamental_rule = FeatureFundamentalRule()
def _apply_complete(self, chart, grammar, right_edge):
fr = self._fundamental_rule
for left_edge in chart.select(
end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
):
yield from fr.apply(chart, grammar, left_edge, right_edge)
def _apply_incomplete(self, chart, grammar, left_edge):
fr = self._fundamental_rule
for right_edge in chart.select(
start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
):
yield from fr.apply(chart, grammar, left_edge, right_edge)
# ////////////////////////////////////////////////////////////
# Top-Down Prediction
# ////////////////////////////////////////////////////////////
class FeatureTopDownInitRule(TopDownInitRule):
def apply(self, chart, grammar):
for prod in grammar.productions(lhs=grammar.start()):
new_edge = FeatureTreeEdge.from_production(prod, 0)
if chart.insert(new_edge, ()):
yield new_edge
class FeatureTopDownPredictRule(CachedTopDownPredictRule):
r"""
A specialized version of the (cached) top down predict rule that operates
on nonterminals whose symbols are ``FeatStructNonterminal``. Rather
than simply comparing the nonterminals for equality, they are
unified.
The top down expand rule states that:
- ``[A -> alpha \* B1 beta][i:j]``
licenses the edge:
- ``[B2 -> \* gamma][j:j]``
for each grammar production ``B2 -> gamma``, assuming that B1
and B2 can be unified.
"""
def apply(self, chart, grammar, edge):
if edge.is_complete():
return
nextsym, index = edge.nextsym(), edge.end()
if not is_nonterminal(nextsym):
return
# If we've already applied this rule to an edge with the same
# next & end, and the chart & grammar have not changed, then
# just return (no new edges to add).
nextsym_with_bindings = edge.next_with_bindings()
done = self._done.get((nextsym_with_bindings, index), (None, None))
if done[0] is chart and done[1] is grammar:
return
for prod in grammar.productions(lhs=nextsym):
# If the left corner in the predicted production is a
# leaf, it must match the input.
if prod.rhs():
first = prod.rhs()[0]
if is_terminal(first):
if index >= chart.num_leaves():
continue
if first != chart.leaf(index):
continue
# We rename vars here, because we don't want variables
# from the two different productions to match.
if unify(prod.lhs(), nextsym_with_bindings, rename_vars=True):
new_edge = FeatureTreeEdge.from_production(prod, edge.end())
if chart.insert(new_edge, ()):
yield new_edge
# Record the fact that we've applied this rule.
self._done[nextsym_with_bindings, index] = (chart, grammar)
# ////////////////////////////////////////////////////////////
# Bottom-Up Prediction
# ////////////////////////////////////////////////////////////
class FeatureBottomUpPredictRule(BottomUpPredictRule):
def apply(self, chart, grammar, edge):
if edge.is_incomplete():
return
for prod in grammar.productions(rhs=edge.lhs()):
if isinstance(edge, FeatureTreeEdge):
_next = prod.rhs()[0]
if not is_nonterminal(_next):
continue
new_edge = FeatureTreeEdge.from_production(prod, edge.start())
if chart.insert(new_edge, ()):
yield new_edge
class FeatureBottomUpPredictCombineRule(BottomUpPredictCombineRule):
def apply(self, chart, grammar, edge):
if edge.is_incomplete():
return
found = edge.lhs()
for prod in grammar.productions(rhs=found):
bindings = {}
if isinstance(edge, FeatureTreeEdge):
_next = prod.rhs()[0]
if not is_nonterminal(_next):
continue
# We rename vars here, because we don't want variables
# from the two different productions to match.
used_vars = find_variables(
(prod.lhs(),) + prod.rhs(), fs_class=FeatStruct
)
found = found.rename_variables(used_vars=used_vars)
result = unify(_next, found, bindings, rename_vars=False)
if result is None:
continue
new_edge = FeatureTreeEdge.from_production(
prod, edge.start()
).move_dot_forward(edge.end(), bindings)
if chart.insert(new_edge, (edge,)):
yield new_edge
class FeatureEmptyPredictRule(EmptyPredictRule):
def apply(self, chart, grammar):
for prod in grammar.productions(empty=True):
for index in range(chart.num_leaves() + 1):
new_edge = FeatureTreeEdge.from_production(prod, index)
if chart.insert(new_edge, ()):
yield new_edge
# ////////////////////////////////////////////////////////////
# Feature Chart Parser
# ////////////////////////////////////////////////////////////
TD_FEATURE_STRATEGY = [
LeafInitRule(),
FeatureTopDownInitRule(),
FeatureTopDownPredictRule(),
FeatureSingleEdgeFundamentalRule(),
]
BU_FEATURE_STRATEGY = [
LeafInitRule(),
FeatureEmptyPredictRule(),
FeatureBottomUpPredictRule(),
FeatureSingleEdgeFundamentalRule(),
]
BU_LC_FEATURE_STRATEGY = [
LeafInitRule(),
FeatureEmptyPredictRule(),
FeatureBottomUpPredictCombineRule(),
FeatureSingleEdgeFundamentalRule(),
]
class FeatureChartParser(ChartParser):
def __init__(
self,
grammar,
strategy=BU_LC_FEATURE_STRATEGY,
trace_chart_width=20,
chart_class=FeatureChart,
**parser_args,
):
ChartParser.__init__(
self,
grammar,
strategy=strategy,
trace_chart_width=trace_chart_width,
chart_class=chart_class,
**parser_args,
)
class FeatureTopDownChartParser(FeatureChartParser):
def __init__(self, grammar, **parser_args):
FeatureChartParser.__init__(self, grammar, TD_FEATURE_STRATEGY, **parser_args)
class FeatureBottomUpChartParser(FeatureChartParser):
def __init__(self, grammar, **parser_args):
FeatureChartParser.__init__(self, grammar, BU_FEATURE_STRATEGY, **parser_args)
class FeatureBottomUpLeftCornerChartParser(FeatureChartParser):
def __init__(self, grammar, **parser_args):
FeatureChartParser.__init__(
self, grammar, BU_LC_FEATURE_STRATEGY, **parser_args
)
# ////////////////////////////////////////////////////////////
# Instantiate Variable Chart
# ////////////////////////////////////////////////////////////
class InstantiateVarsChart(FeatureChart):
"""
A specialized chart that 'instantiates' variables whose names
start with '@', by replacing them with unique new variables.
In particular, whenever a complete edge is added to the chart, any
variables in the edge's ``lhs`` whose names start with '@' will be
replaced by unique new ``Variable`` objects.
"""
def __init__(self, tokens):
FeatureChart.__init__(self, tokens)
def initialize(self):
self._instantiated = set()
FeatureChart.initialize(self)
def insert(self, edge, child_pointer_list):
if edge in self._instantiated:
return False
self.instantiate_edge(edge)
return FeatureChart.insert(self, edge, child_pointer_list)
def instantiate_edge(self, edge):
"""
If the edge is a ``FeatureTreeEdge``, and it is complete,
then instantiate all variables whose names start with '@',
by replacing them with unique new variables.
Note that instantiation is done in-place, since the
parsing algorithms might already hold a reference to
the edge for future use.
"""
# If the edge is a leaf, or is not complete, or is
# already in the chart, then just return it as-is.
if not isinstance(edge, FeatureTreeEdge):
return
if not edge.is_complete():
return
if edge in self._edge_to_cpls:
return
# Get a list of variables that need to be instantiated.
# If there are none, then return as-is.
inst_vars = self.inst_vars(edge)
if not inst_vars:
return
# Instantiate the edge!
self._instantiated.add(edge)
edge._lhs = edge.lhs().substitute_bindings(inst_vars)
def inst_vars(self, edge):
return {
var: logic.unique_variable()
for var in edge.lhs().variables()
if var.name.startswith("@")
}
# ////////////////////////////////////////////////////////////
# Demo
# ////////////////////////////////////////////////////////////
def demo_grammar():
from nltk.grammar import FeatureGrammar
return FeatureGrammar.fromstring(
"""
S -> NP VP
PP -> Prep NP
NP -> NP PP
VP -> VP PP
VP -> Verb NP
VP -> Verb
NP -> Det[pl=?x] Noun[pl=?x]
NP -> "John"
NP -> "I"
Det -> "the"
Det -> "my"
Det[-pl] -> "a"
Noun[-pl] -> "dog"
Noun[-pl] -> "cookie"
Verb -> "ate"
Verb -> "saw"
Prep -> "with"
Prep -> "under"
"""
)
def demo(
print_times=True,
print_grammar=True,
print_trees=True,
print_sentence=True,
trace=1,
parser=FeatureChartParser,
sent="I saw John with a dog with my cookie",
):
import sys
import time
print()
grammar = demo_grammar()
if print_grammar:
print(grammar)
print()
print("*", parser.__name__)
if print_sentence:
print("Sentence:", sent)
tokens = sent.split()
t = perf_counter()
cp = parser(grammar, trace=trace)
chart = cp.chart_parse(tokens)
trees = list(chart.parses(grammar.start()))
if print_times:
print("Time: %s" % (perf_counter() - t))
if print_trees:
for tree in trees:
print(tree)
else:
print("Nr trees:", len(trees))
def run_profile():
import profile
profile.run("for i in range(1): demo()", "/tmp/profile.out")
import pstats
p = pstats.Stats("/tmp/profile.out")
p.strip_dirs().sort_stats("time", "cum").print_stats(60)
p.strip_dirs().sort_stats("cum", "time").print_stats(60)
if __name__ == "__main__":
from nltk.data import load
demo()
print()
grammar = load("grammars/book_grammars/feat0.fcfg")
cp = FeatureChartParser(grammar, trace=2)
sent = "Kim likes children"
tokens = sent.split()
trees = cp.parse(tokens)
for tree in trees:
print(tree)

View File

@@ -0,0 +1,88 @@
# Natural Language Toolkit: Generating from a CFG
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
# Eric Kafe <kafe.eric@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
import itertools
import sys
from nltk.grammar import Nonterminal
def generate(grammar, start=None, depth=None, n=None):
"""
Generates an iterator of all sentences from a CFG.
:param grammar: The Grammar used to generate sentences.
:param start: The Nonterminal from which to start generating sentences.
:param depth: The maximal depth of the generated tree.
:param n: The maximum number of sentences to return.
:return: An iterator of lists of terminal tokens.
"""
if not start:
start = grammar.start()
if depth is None:
# Safe default, assuming the grammar may be recursive:
depth = (sys.getrecursionlimit() // 3) - 3
iter = _generate_all(grammar, [start], depth)
if n:
iter = itertools.islice(iter, n)
return iter
def _generate_all(grammar, items, depth):
if items:
try:
for frag1 in _generate_one(grammar, items[0], depth):
for frag2 in _generate_all(grammar, items[1:], depth):
yield frag1 + frag2
except RecursionError as error:
# Helpful error message while still showing the recursion stack.
raise RuntimeError(
"The grammar has rule(s) that yield infinite recursion!\n\
Consider using a lower 'depth', or a higher 'sys.setrecursionlimit()'."
) from error
else:
yield []
def _generate_one(grammar, item, depth):
if depth > 0:
if isinstance(item, Nonterminal):
for prod in grammar.productions(lhs=item):
yield from _generate_all(grammar, prod.rhs(), depth - 1)
else:
yield [item]
demo_grammar = """
S -> NP VP
NP -> Det N
PP -> P NP
VP -> 'slept' | 'saw' NP | 'walked' PP
Det -> 'the' | 'a'
N -> 'man' | 'park' | 'dog'
P -> 'in' | 'with'
"""
def demo(N=23):
from nltk.grammar import CFG
print("Generating the first %d sentences for demo grammar:" % (N,))
print(demo_grammar)
grammar = CFG.fromstring(demo_grammar)
for n, sent in enumerate(generate(grammar, n=N), 1):
print("%3d. %s" % (n, " ".join(sent)))
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,393 @@
# Natural Language Toolkit: Interface to MaltParser
#
# Author: Dan Garrette <dhgarrette@gmail.com>
# Contributor: Liling Tan, Mustufain, osamamukhtar11
#
# Copyright (C) 2001-2025 NLTK Project
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import inspect
import os
import subprocess
import sys
import tempfile
from nltk.data import ZipFilePathPointer
from nltk.internals import find_dir, find_file, find_jars_within_path
from nltk.parse.api import ParserI
from nltk.parse.dependencygraph import DependencyGraph
from nltk.parse.util import taggedsents_to_conll
def malt_regex_tagger():
from nltk.tag import RegexpTagger
_tagger = RegexpTagger(
[
(r"\.$", "."),
(r"\,$", ","),
(r"\?$", "?"), # fullstop, comma, Qmark
(r"\($", "("),
(r"\)$", ")"), # round brackets
(r"\[$", "["),
(r"\]$", "]"), # square brackets
(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
(r"(The|the|A|a|An|an)$", "DT"), # articles
(r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"), # pronouns
(r"(His|his|Her|her|Its|its)$", "PRP$"), # possessive
(r"(my|Your|your|Yours|yours)$", "PRP$"), # possessive
(r"(on|On|in|In|at|At|since|Since)$", "IN"), # time prepopsitions
(r"(for|For|ago|Ago|before|Before)$", "IN"), # time prepopsitions
(r"(till|Till|until|Until)$", "IN"), # time prepopsitions
(r"(by|By|beside|Beside)$", "IN"), # space prepopsitions
(r"(under|Under|below|Below)$", "IN"), # space prepopsitions
(r"(over|Over|above|Above)$", "IN"), # space prepopsitions
(r"(across|Across|through|Through)$", "IN"), # space prepopsitions
(r"(into|Into|towards|Towards)$", "IN"), # space prepopsitions
(r"(onto|Onto|from|From)$", "IN"), # space prepopsitions
(r".*able$", "JJ"), # adjectives
(r".*ness$", "NN"), # nouns formed from adjectives
(r".*ly$", "RB"), # adverbs
(r".*s$", "NNS"), # plural nouns
(r".*ing$", "VBG"), # gerunds
(r".*ed$", "VBD"), # past tense verbs
(r".*", "NN"), # nouns (default)
]
)
return _tagger.tag
def find_maltparser(parser_dirname):
"""
A function to find the MaltParser .jar file and its dependencies.
"""
if os.path.exists(parser_dirname): # If a full path is given.
_malt_dir = parser_dirname
else: # Try to find path to maltparser directory in environment variables.
_malt_dir = find_dir(parser_dirname, env_vars=("MALT_PARSER",))
# Check that the found directory contains all the necessary .jar files.
_malt_jars = set(find_jars_within_path(_malt_dir))
_jars = {os.path.split(jar)[1] for jar in _malt_jars}
malt_dependencies = {"log4j.jar", "libsvm.jar", "liblinear-1.8.jar"}
assert malt_dependencies.issubset(_jars)
assert any(
filter(lambda i: i.startswith("maltparser-") and i.endswith(".jar"), _jars)
)
return list(_malt_jars)
def find_malt_model(model_filename):
"""
A function to find a pre-trained MaltParser model.
"""
if model_filename is None:
return "malt_temp.mco"
elif os.path.exists(model_filename): # If a full path is given.
return model_filename
else: # Try to find path to malt model in environment variables.
return find_file(model_filename, env_vars=("MALT_MODEL",), verbose=False)
class MaltParser(ParserI):
"""
A class for dependency parsing with MaltParser. Its inputs are:
- (optionally) a maltparser directory
- (optionally) the path to a pre-trained MaltParser .mco model file
- (optionally) the tagger to use for POS tagging before parsing
- (optionally) additional Java arguments
Example:
>>> from nltk.parse import malt
>>> # With MALT_PARSER and MALT_MODEL environment set.
>>> mp = malt.MaltParser(model_filename='engmalt.linear-1.7.mco') # doctest: +SKIP
>>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
(shot I (elephant an) (in (pajamas my)) .)
>>> # Without MALT_PARSER and MALT_MODEL environment.
>>> mp = malt.MaltParser('/home/user/maltparser-1.9.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP
>>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
(shot I (elephant an) (in (pajamas my)) .)
"""
def __init__(
self,
parser_dirname="",
model_filename=None,
tagger=None,
additional_java_args=None,
):
"""
An interface for parsing with the Malt Parser.
:param parser_dirname: The path to the maltparser directory that
contains the maltparser-1.x.jar
:type parser_dirname: str
:param model_filename: The name of the pre-trained model with .mco file
extension. If provided, training will not be required.
(see http://www.maltparser.org/mco/mco.html and
see http://www.patful.com/chalk/node/185)
:type model_filename: str
:param tagger: The tagger used to POS tag the raw string before
formatting to CONLL format. It should behave like `nltk.pos_tag`
:type tagger: function
:param additional_java_args: Additional Java arguments to pass when
calling MaltParser; usually these are heap-size limits,
e.g. `additional_java_args=['-Xmx1024m']`
(see https://javarevisited.blogspot.com/2011/05/java-heap-space-memory-size-jvm.html)
:type additional_java_args: list
"""
# Find all the necessary jar files for MaltParser.
self.malt_jars = find_maltparser(parser_dirname)
# Initialize additional java arguments.
self.additional_java_args = (
additional_java_args if additional_java_args is not None else []
)
# Initialize model.
self.model = find_malt_model(model_filename)
self._trained = self.model != "malt_temp.mco"
# Set the working_dir parameters i.e. `-w` from MaltParser's option.
self.working_dir = tempfile.gettempdir()
# Initialize POS tagger.
self.tagger = tagger if tagger is not None else malt_regex_tagger()
def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null"):
"""
Use MaltParser to parse multiple POS tagged sentences. Takes multiple
sentences where each sentence is a list of (word, tag) tuples.
The sentences must have already been tokenized and tagged.
:param sentences: Input sentences to parse
:type sentences: list(list(tuple(str, str)))
:return: iter(iter(``DependencyGraph``)) the dependency graph
representation of each sentence
"""
if not self._trained:
raise Exception("Parser has not been trained. Call train() first.")
with tempfile.NamedTemporaryFile(
prefix="malt_input.conll.", dir=self.working_dir, mode="w", delete=False
) as input_file:
with tempfile.NamedTemporaryFile(
prefix="malt_output.conll.",
dir=self.working_dir,
mode="w",
delete=False,
) as output_file:
# Convert list of sentences to CONLL format.
for line in taggedsents_to_conll(sentences):
input_file.write(str(line))
input_file.close()
# Generate command to run maltparser.
cmd = self.generate_malt_command(
input_file.name, output_file.name, mode="parse"
)
# This is a MaltParser quirk: it needs to be run from the directory
# where the model file is; otherwise it runs into awkward
# missing-.jar or strange -w working_dir problems.
_current_path = os.getcwd() # Remembers the current path.
try: # Change to modelfile path
os.chdir(os.path.split(self.model)[0])
except OSError:
pass
ret = self._execute(cmd, verbose) # Run command.
os.chdir(_current_path) # Change back to current path.
if ret != 0:
raise Exception(
"MaltParser parsing (%s) failed with exit "
"code %d" % (" ".join(cmd), ret)
)
# Must return iter(iter(Tree))
with open(output_file.name) as infile:
for tree_str in infile.read().split("\n\n"):
yield (
iter(
[
DependencyGraph(
tree_str, top_relation_label=top_relation_label
)
]
)
)
os.remove(input_file.name)
os.remove(output_file.name)
def parse_sents(self, sentences, verbose=False, top_relation_label="null"):
"""
Use MaltParser to parse multiple sentences.
Takes a list of sentences, where each sentence is a list of words.
Each sentence will be automatically tagged with this
MaltParser instance's tagger.
:param sentences: Input sentences to parse
:type sentences: list(list(str))
:return: iter(DependencyGraph)
"""
tagged_sentences = (self.tagger(sentence) for sentence in sentences)
return self.parse_tagged_sents(
tagged_sentences, verbose, top_relation_label=top_relation_label
)
def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
"""
This function generates the MaltParser command to run at the terminal.
:param inputfilename: path to the input file
:type inputfilename: str
:param outputfilename: path to the output file
:type outputfilename: str
"""
cmd = ["java"]
cmd += self.additional_java_args # Adds additional java arguments
# Join classpaths with ";" on Windows and with ":" on Linux/Mac.
classpaths_separator = ";" if sys.platform.startswith("win") else ":"
cmd += [
"-cp",
classpaths_separator.join(self.malt_jars),
] # Adds classpaths for jars
cmd += ["org.maltparser.Malt"] # Adds the main function.
# Adds the model file.
if os.path.exists(self.model): # when parsing
cmd += ["-c", os.path.split(self.model)[-1]]
else: # when learning
cmd += ["-c", self.model]
cmd += ["-i", inputfilename]
if mode == "parse":
cmd += ["-o", outputfilename]
cmd += ["-m", mode] # mode use to generate parses.
return cmd
@staticmethod
def _execute(cmd, verbose=False):
output = None if verbose else subprocess.PIPE
p = subprocess.Popen(cmd, stdout=output, stderr=output)
return p.wait()
def train(self, depgraphs, verbose=False):
"""
Train MaltParser from a list of ``DependencyGraph`` objects
:param depgraphs: list of ``DependencyGraph`` objects for training input data
:type depgraphs: list(DependencyGraph)
"""
# Write the conll_str to malt_train.conll file in /tmp/
with tempfile.NamedTemporaryFile(
prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
) as input_file:
input_str = "\n".join(dg.to_conll(10) for dg in depgraphs)
input_file.write(str(input_str))
# Trains the model with the malt_train.conll
self.train_from_file(input_file.name, verbose=verbose)
# Removes the malt_train.conll once training finishes.
os.remove(input_file.name)
def train_from_file(self, conll_file, verbose=False):
"""
Train MaltParser from a file
:param conll_file: str for the filename of the training input data
:type conll_file: str
"""
# If conll_file is a ZipFilePathPointer,
# then we need to do some extra massaging
if isinstance(conll_file, ZipFilePathPointer):
with tempfile.NamedTemporaryFile(
prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
) as input_file:
with conll_file.open() as conll_input_file:
conll_str = conll_input_file.read()
input_file.write(str(conll_str))
return self.train_from_file(input_file.name, verbose=verbose)
# Generate command to run maltparser.
cmd = self.generate_malt_command(conll_file, mode="learn")
ret = self._execute(cmd, verbose)
if ret != 0:
raise Exception(
"MaltParser training (%s) failed with exit "
"code %d" % (" ".join(cmd), ret)
)
self._trained = True
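# A minimal sketch of the fallback tagger (illustration only; not part of the
# original module). ``malt_regex_tagger()`` returns a plain tagging function,
# so it can be tried without a MaltParser installation; it is what
# ``MaltParser`` falls back to when no ``tagger`` argument is supplied.
def _regex_tagger_sketch():
    tag = malt_regex_tagger()
    # Expected tags for this toy input:
    # [('I', 'PRP'), ('walked', 'VBD'), ('the', 'DT'), ('dog', 'NN'), ('.', '.')]
    return tag("I walked the dog .".split())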
if __name__ == "__main__":
"""
A demonstration function to show how NLTK users can use the malt parser API.
>>> from nltk import pos_tag
>>> assert 'MALT_PARSER' in os.environ, str(
... "Please set MALT_PARSER in your global environment, e.g.:\n"
... "$ export MALT_PARSER='/home/user/maltparser-1.9.2/'")
>>>
>>> assert 'MALT_MODEL' in os.environ, str(
... "Please set MALT_MODEL in your global environment, e.g.:\n"
... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'")
>>>
>>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
... "2 sees _ VB _ _ 0 ROOT _ _\n"
... "3 a _ DT _ _ 4 SPEC _ _\n"
... "4 dog _ NN _ _ 2 OBJ _ _\n"
... "5 . _ . _ _ 2 PUNCT _ _\n")
>>>
>>>
>>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
... "2 walks _ VB _ _ 0 ROOT _ _\n"
... "3 . _ . _ _ 2 PUNCT _ _\n")
>>> dg1 = DependencyGraph(_dg1_str)
>>> dg2 = DependencyGraph(_dg2_str)
>>> # Initialize a MaltParser object
>>> mp = MaltParser()
>>>
>>> # Trains a model.
>>> mp.train([dg1,dg2], verbose=False)
>>> sent1 = ['John','sees','Mary', '.']
>>> sent2 = ['John', 'walks', 'a', 'dog', '.']
>>>
>>> # Parse a single sentence.
>>> parsed_sent1 = mp.parse_one(sent1)
>>> parsed_sent2 = mp.parse_one(sent2)
>>> print(parsed_sent1.tree())
(sees John Mary .)
>>> print(parsed_sent2.tree())
(walks John (dog a) .)
>>>
>>> # Parsing multiple sentences.
>>> sentences = [sent1,sent2]
>>> parsed_sents = mp.parse_sents(sentences)
>>> print(next(next(parsed_sents)).tree())
(sees John Mary .)
>>> print(next(next(parsed_sents)).tree())
(walks John (dog a) .)
>>>
>>> # Initialize a MaltParser object with an English pre-trained model.
>>> parser_dirname = 'maltparser-1.9.2'
>>> model_name = 'engmalt.linear-1.7.mco'
>>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag)
>>> sent1 = 'I shot an elephant in my pajamas .'.split()
>>> sent2 = 'Time flies like banana .'.split()
>>> # Parse a single sentence.
>>> print(mp.parse_one(sent1).tree())
(shot I (elephant an) (in (pajamas my)) .)
>>> # Parsing multiple sentences.
>>> sentences = [sent1,sent2]
>>> parsed_sents = mp.parse_sents(sentences)
>>> print(next(next(parsed_sents)).tree())
(shot I (elephant an) (in (pajamas my)) .)
>>> print(next(next(parsed_sents)).tree())
(flies Time (like banana) .)
"""
import doctest
doctest.testmod()

View File

@@ -0,0 +1,772 @@
# Natural Language Toolkit: Dependency Grammars
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Jason Narad <jason.narad@gmail.com>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
import logging
import math
from nltk.parse.dependencygraph import DependencyGraph
logger = logging.getLogger(__name__)
#################################################################
# DependencyScorerI - Interface for Graph-Edge Weight Calculation
#################################################################
class DependencyScorerI:
"""
A scorer for calculating the weights on the edges of a weighted
dependency graph. This is used by a
``ProbabilisticNonprojectiveParser`` to initialize the edge
weights of a ``DependencyGraph``. While typically this would be done
by training a binary classifier, any class that can return a
multidimensional list representation of the edge weights can
implement this interface. As such, it has no necessary
fields.
"""
def __init__(self):
if self.__class__ == DependencyScorerI:
raise TypeError("DependencyScorerI is an abstract interface")
def train(self, graphs):
"""
:type graphs: list(DependencyGraph)
:param graphs: A list of dependency graphs to train the scorer.
Typically the edges present in the graphs can be used as
positive training examples, and the edges not present as negative
examples.
"""
raise NotImplementedError()
def score(self, graph):
"""
:type graph: DependencyGraph
:param graph: A dependency graph whose set of edges need to be
scored.
:rtype: A three-dimensional list of numbers.
:return: The scores are returned as a three-dimensional list, where
the outer dimension indexes the head and the inner dimension
indexes the dependent. For instance,
scores[0][1] would reference the list of scores corresponding to
arcs from node 0 to node 1. The node's 'address' field can be used
to determine its numeric identifier.
For further illustration, a score list corresponding to Fig.2 of
Keith Hall's 'K-best Spanning Tree Parsing' paper::
scores = [[[], [5], [1], [1]],
[[], [], [11], [4]],
[[], [10], [], [5]],
[[], [8], [8], []]]
When used in conjunction with a MaxEntClassifier, each score would
correspond to the confidence of a particular edge being classified
with the positive training examples.
"""
raise NotImplementedError()
#################################################################
# NaiveBayesDependencyScorer
#################################################################
class NaiveBayesDependencyScorer(DependencyScorerI):
"""
A dependency scorer built around a probabilistic classifier; in this
particular class that classifier is a ``NaiveBayesClassifier``.
It uses head-word, head-tag, child-word, and child-tag features
for classification.
>>> from nltk.parse.dependencygraph import DependencyGraph, conll_data2
>>> graphs = [DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry]
>>> npp = ProbabilisticNonprojectiveParser()
>>> npp.train(graphs, NaiveBayesDependencyScorer())
>>> parses = npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc'])
>>> len(list(parses))
1
"""
def __init__(self):
pass # Do nothing without throwing error
def train(self, graphs):
"""
Trains a ``NaiveBayesClassifier`` using the edges present in
graphs list as positive examples, the edges not present as
negative examples. Uses a feature vector of head-word,
head-tag, child-word, and child-tag.
:type graphs: list(DependencyGraph)
:param graphs: A list of dependency graphs to train the scorer.
"""
from nltk.classify import NaiveBayesClassifier
# Create labeled training examples
labeled_examples = []
for graph in graphs:
for head_node in graph.nodes.values():
for child_index, child_node in graph.nodes.items():
if child_index in head_node["deps"]:
label = "T"
else:
label = "F"
labeled_examples.append(
(
dict(
a=head_node["word"],
b=head_node["tag"],
c=child_node["word"],
d=child_node["tag"],
),
label,
)
)
self.classifier = NaiveBayesClassifier.train(labeled_examples)
def score(self, graph):
"""
Converts the graph into a feature-based representation of
each edge, and then assigns a score to each based on the
confidence of the classifier in assigning it to the
positive label. Scores are returned in a multidimensional list.
:type graph: DependencyGraph
:param graph: A dependency graph to score.
:rtype: 3 dimensional list
:return: Edge scores for the graph parameter.
"""
# Convert graph to feature representation
edges = []
for head_node in graph.nodes.values():
for child_node in graph.nodes.values():
edges.append(
dict(
a=head_node["word"],
b=head_node["tag"],
c=child_node["word"],
d=child_node["tag"],
)
)
# Score edges
edge_scores = []
row = []
count = 0
for pdist in self.classifier.prob_classify_many(edges):
logger.debug("%.4f %.4f", pdist.prob("T"), pdist.prob("F"))
# smoothing in case the probability = 0
row.append([math.log(pdist.prob("T") + 0.00000000001)])
count += 1
if count == len(graph.nodes):
edge_scores.append(row)
row = []
count = 0
return edge_scores
#################################################################
# A Scorer for Demo Purposes
#################################################################
# A short class necessary to show parsing example from paper
class DemoScorer(DependencyScorerI):
def train(self, graphs):
print("Training...")
def score(self, graph):
# scores for Keith Hall 'K-best Spanning Tree Parsing' paper
return [
[[], [5], [1], [1]],
[[], [], [11], [4]],
[[], [10], [], [5]],
[[], [8], [8], []],
]
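# A small reading aid (illustration only; not part of the original module):
# the score matrix returned by a ``DependencyScorerI`` is indexed as
# ``scores[head][dep]``, the list of scores for arcs from the node at address
# ``head`` to the node at address ``dep``. A scorer is free to ignore the
# graph argument, as ``DemoScorer`` above does.
def _score_lookup_sketch():
    scores = DemoScorer().score(None)
    return scores[2][1]  # the score list [10] for the arc 2 -> 1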
#################################################################
# Non-Projective Probabilistic Parsing
#################################################################
class ProbabilisticNonprojectiveParser:
"""A probabilistic non-projective dependency parser.
Nonprojective dependencies allow for "crossing branches" in the parse tree
which is necessary for representing particular linguistic phenomena, or even
typical parses in some languages. This parser follows the MST parsing
algorithm, outlined in McDonald(2005), which likens the search for the best
non-projective parse to finding the maximum spanning tree in a weighted
directed graph.
>>> class Scorer(DependencyScorerI):
... def train(self, graphs):
... pass
...
... def score(self, graph):
... return [
... [[], [5], [1], [1]],
... [[], [], [11], [4]],
... [[], [10], [], [5]],
... [[], [8], [8], []],
... ]
>>> npp = ProbabilisticNonprojectiveParser()
>>> npp.train([], Scorer())
>>> parses = npp.parse(['v1', 'v2', 'v3'], [None, None, None])
>>> len(list(parses))
1
Rule based example
>>> from nltk.grammar import DependencyGrammar
>>> grammar = DependencyGrammar.fromstring('''
... 'taught' -> 'play' | 'man'
... 'man' -> 'the' | 'in'
... 'in' -> 'corner'
... 'corner' -> 'the'
... 'play' -> 'golf' | 'dachshund' | 'to'
... 'dachshund' -> 'his'
... ''')
>>> ndp = NonprojectiveDependencyParser(grammar)
>>> parses = ndp.parse(['the', 'man', 'in', 'the', 'corner', 'taught', 'his', 'dachshund', 'to', 'play', 'golf'])
>>> len(list(parses))
4
"""
def __init__(self):
"""
Creates a new non-projective parser.
"""
logging.debug("initializing prob. nonprojective...")
def train(self, graphs, dependency_scorer):
"""
Trains a ``DependencyScorerI`` from a set of ``DependencyGraph`` objects,
and establishes this as the parser's scorer. This is used to
initialize the scores on a ``DependencyGraph`` during the parsing
procedure.
:type graphs: list(DependencyGraph)
:param graphs: A list of dependency graphs to train the scorer.
:type dependency_scorer: DependencyScorerI
:param dependency_scorer: A scorer which implements the
``DependencyScorerI`` interface.
"""
self._scorer = dependency_scorer
self._scorer.train(graphs)
def initialize_edge_scores(self, graph):
"""
Assigns a score to every edge in the ``DependencyGraph`` graph.
These scores are generated via the parser's scorer which
was assigned during the training process.
:type graph: DependencyGraph
:param graph: A dependency graph to assign scores to.
"""
self.scores = self._scorer.score(graph)
def collapse_nodes(self, new_node, cycle_path, g_graph, b_graph, c_graph):
"""
Takes a list of nodes that have been identified to belong to a cycle,
        and collapses them into one larger node. The arcs of all nodes in
the graph must be updated to account for this.
:type new_node: Node.
:param new_node: A Node (Dictionary) to collapse the cycle nodes into.
:type cycle_path: A list of integers.
:param cycle_path: A list of node addresses, each of which is in the cycle.
:type g_graph, b_graph, c_graph: DependencyGraph
:param g_graph, b_graph, c_graph: Graphs which need to be updated.
"""
logger.debug("Collapsing nodes...")
# Collapse all cycle nodes into v_n+1 in G_Graph
for cycle_node_index in cycle_path:
g_graph.remove_by_address(cycle_node_index)
g_graph.add_node(new_node)
g_graph.redirect_arcs(cycle_path, new_node["address"])
def update_edge_scores(self, new_node, cycle_path):
"""
Updates the edge scores to reflect a collapse operation into
new_node.
:type new_node: A Node.
:param new_node: The node which cycle nodes are collapsed into.
:type cycle_path: A list of integers.
:param cycle_path: A list of node addresses that belong to the cycle.
"""
logger.debug("cycle %s", cycle_path)
cycle_path = self.compute_original_indexes(cycle_path)
logger.debug("old cycle %s", cycle_path)
logger.debug("Prior to update: %s", self.scores)
for i, row in enumerate(self.scores):
for j, column in enumerate(self.scores[i]):
logger.debug(self.scores[i][j])
if j in cycle_path and i not in cycle_path and self.scores[i][j]:
subtract_val = self.compute_max_subtract_score(j, cycle_path)
logger.debug("%s - %s", self.scores[i][j], subtract_val)
new_vals = []
for cur_val in self.scores[i][j]:
new_vals.append(cur_val - subtract_val)
self.scores[i][j] = new_vals
for i, row in enumerate(self.scores):
for j, cell in enumerate(self.scores[i]):
if i in cycle_path and j in cycle_path:
self.scores[i][j] = []
logger.debug("After update: %s", self.scores)
def compute_original_indexes(self, new_indexes):
"""
As nodes are collapsed into others, they are replaced
by the new node in the graph, but it's still necessary
to keep track of what these original nodes were. This
takes a list of node addresses and replaces any collapsed
node addresses with their original addresses.
:type new_indexes: A list of integers.
:param new_indexes: A list of node addresses to check for
subsumed nodes.
"""
swapped = True
while swapped:
originals = []
swapped = False
for new_index in new_indexes:
if new_index in self.inner_nodes:
for old_val in self.inner_nodes[new_index]:
if old_val not in originals:
originals.append(old_val)
swapped = True
else:
originals.append(new_index)
new_indexes = originals
return new_indexes
def compute_max_subtract_score(self, column_index, cycle_indexes):
"""
        When collapsing a cycle, the maximum score of the arcs entering a
        cycle node from within the cycle is subtracted from every arc that
        enters that node from outside the cycle. This returns that maximum
        for the given column.
:type column_index: integer.
        :param column_index: An index representing the column of incoming arcs
            to a particular node being updated.
:type cycle_indexes: A list of integers.
:param cycle_indexes: Only arcs from cycle nodes are considered. This
            is a list of such node addresses.
"""
max_score = -100000
for row_index in cycle_indexes:
for subtract_val in self.scores[row_index][column_index]:
if subtract_val > max_score:
max_score = subtract_val
return max_score
def best_incoming_arc(self, node_index):
"""
Returns the source of the best incoming arc to the
node with address: node_index
:type node_index: integer.
:param node_index: The address of the 'destination' node,
the node that is arced to.
"""
originals = self.compute_original_indexes([node_index])
logger.debug("originals: %s", originals)
max_arc = None
max_score = None
for row_index in range(len(self.scores)):
for col_index in range(len(self.scores[row_index])):
if col_index in originals and (
max_score is None or self.scores[row_index][col_index] > max_score
):
max_score = self.scores[row_index][col_index]
max_arc = row_index
logger.debug("%s, %s", row_index, col_index)
logger.debug(max_score)
for key in self.inner_nodes:
replaced_nodes = self.inner_nodes[key]
if max_arc in replaced_nodes:
return key
return max_arc
def original_best_arc(self, node_index):
originals = self.compute_original_indexes([node_index])
max_arc = None
max_score = None
max_orig = None
for row_index in range(len(self.scores)):
for col_index in range(len(self.scores[row_index])):
if col_index in originals and (
max_score is None or self.scores[row_index][col_index] > max_score
):
max_score = self.scores[row_index][col_index]
max_arc = row_index
max_orig = col_index
return [max_arc, max_orig]
def parse(self, tokens, tags):
"""
        Parses a list of tokens in accordance with the MST parsing algorithm
for non-projective dependency parses. Assumes that the tokens to
be parsed have already been tagged and those tags are provided. Various
scoring methods can be used by implementing the ``DependencyScorerI``
interface and passing it to the training algorithm.
:type tokens: list(str)
:param tokens: A list of words or punctuation to be parsed.
:type tags: list(str)
:param tags: A list of tags corresponding by index to the words in the tokens list.
:return: An iterator of non-projective parses.
:rtype: iter(DependencyGraph)
"""
self.inner_nodes = {}
# Initialize g_graph
g_graph = DependencyGraph()
for index, token in enumerate(tokens):
g_graph.nodes[index + 1].update(
{"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
)
# Fully connect non-root nodes in g_graph
g_graph.connect_graph()
original_graph = DependencyGraph()
for index, token in enumerate(tokens):
original_graph.nodes[index + 1].update(
{"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
)
b_graph = DependencyGraph()
c_graph = DependencyGraph()
for index, token in enumerate(tokens):
c_graph.nodes[index + 1].update(
{"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
)
# Assign initial scores to g_graph edges
self.initialize_edge_scores(g_graph)
logger.debug(self.scores)
# Initialize a list of unvisited vertices (by node address)
unvisited_vertices = [vertex["address"] for vertex in c_graph.nodes.values()]
# Iterate over unvisited vertices
nr_vertices = len(tokens)
betas = {}
while unvisited_vertices:
# Mark current node as visited
current_vertex = unvisited_vertices.pop(0)
logger.debug("current_vertex: %s", current_vertex)
# Get corresponding node n_i to vertex v_i
current_node = g_graph.get_by_address(current_vertex)
logger.debug("current_node: %s", current_node)
# Get best in-edge node b for current node
best_in_edge = self.best_incoming_arc(current_vertex)
betas[current_vertex] = self.original_best_arc(current_vertex)
logger.debug("best in arc: %s --> %s", best_in_edge, current_vertex)
# b_graph = Union(b_graph, b)
for new_vertex in [current_vertex, best_in_edge]:
b_graph.nodes[new_vertex].update(
{"word": "TEMP", "rel": "NTOP", "address": new_vertex}
)
b_graph.add_arc(best_in_edge, current_vertex)
# Beta(current node) = b - stored for parse recovery
# If b_graph contains a cycle, collapse it
cycle_path = b_graph.contains_cycle()
if cycle_path:
# Create a new node v_n+1 with address = len(nodes) + 1
new_node = {"word": "NONE", "rel": "NTOP", "address": nr_vertices + 1}
# c_graph = Union(c_graph, v_n+1)
c_graph.add_node(new_node)
# Collapse all nodes in cycle C into v_n+1
self.update_edge_scores(new_node, cycle_path)
self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph)
for cycle_index in cycle_path:
c_graph.add_arc(new_node["address"], cycle_index)
# self.replaced_by[cycle_index] = new_node['address']
self.inner_nodes[new_node["address"]] = cycle_path
# Add v_n+1 to list of unvisited vertices
unvisited_vertices.insert(0, nr_vertices + 1)
# increment # of nodes counter
nr_vertices += 1
# Remove cycle nodes from b_graph; B = B - cycle c
for cycle_node_address in cycle_path:
b_graph.remove_by_address(cycle_node_address)
logger.debug("g_graph: %s", g_graph)
logger.debug("b_graph: %s", b_graph)
logger.debug("c_graph: %s", c_graph)
logger.debug("Betas: %s", betas)
logger.debug("replaced nodes %s", self.inner_nodes)
# Recover parse tree
logger.debug("Final scores: %s", self.scores)
logger.debug("Recovering parse...")
for i in range(len(tokens) + 1, nr_vertices + 1):
betas[betas[i][1]] = betas[i]
logger.debug("Betas: %s", betas)
for node in original_graph.nodes.values():
            # TODO: It's dangerous to assume that deps is a dictionary
# because it's a default dictionary. Ideally, here we should not
# be concerned how dependencies are stored inside of a dependency
# graph.
node["deps"] = {}
for i in range(1, len(tokens) + 1):
original_graph.add_arc(betas[i][0], betas[i][1])
logger.debug("Done.")
yield original_graph
#################################################################
# Rule-based Non-Projective Parser
#################################################################
class NonprojectiveDependencyParser:
"""
A non-projective, rule-based, dependency parser. This parser
will return the set of all possible non-projective parses based on
the word-to-word relations defined in the parser's dependency
grammar, and will allow the branches of the parse tree to cross
in order to capture a variety of linguistic phenomena that a
projective parser will not.
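
    A minimal usage sketch (the tiny grammar and sentence below are
    illustrative assumptions, not part of the library)::

        from nltk.grammar import DependencyGrammar
        from nltk.parse.nonprojectivedependencyparser import (
            NonprojectiveDependencyParser,
        )

        # Toy grammar for illustration only.
        grammar = DependencyGrammar.fromstring("'zag' -> 'Cathy' | 'hen'")
        ndp = NonprojectiveDependencyParser(grammar)
        for graph in ndp.parse(['Cathy', 'zag', 'hen']):
            print(graph)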
"""
def __init__(self, dependency_grammar):
"""
Creates a new ``NonprojectiveDependencyParser``.
:param dependency_grammar: a grammar of word-to-word relations.
:type dependency_grammar: DependencyGrammar
"""
self._grammar = dependency_grammar
def parse(self, tokens):
"""
Parses the input tokens with respect to the parser's grammar. Parsing
is accomplished by representing the search-space of possible parses as
a fully-connected directed graph. Arcs that would lead to ungrammatical
parses are removed and a lattice is constructed of length n, where n is
the number of input tokens, to represent all possible grammatical
traversals. All possible paths through the lattice are then enumerated
to produce the set of non-projective parses.
        :param tokens: A list of tokens to parse.
        :type tokens: list(str)
        :return: An iterator of non-projective parses.
        :rtype: iter(DependencyGraph)
"""
# Create graph representation of tokens
self._graph = DependencyGraph()
for index, token in enumerate(tokens):
self._graph.nodes[index] = {
"word": token,
"deps": [],
"rel": "NTOP",
"address": index,
}
for head_node in self._graph.nodes.values():
deps = []
for dep_node in self._graph.nodes.values():
if (
self._grammar.contains(head_node["word"], dep_node["word"])
and head_node["word"] != dep_node["word"]
):
deps.append(dep_node["address"])
head_node["deps"] = deps
# Create lattice of possible heads
roots = []
possible_heads = []
for i, word in enumerate(tokens):
heads = []
for j, head in enumerate(tokens):
if (i != j) and self._grammar.contains(head, word):
heads.append(j)
if len(heads) == 0:
roots.append(i)
possible_heads.append(heads)
# Set roots to attempt
if len(roots) < 2:
if len(roots) == 0:
for i in range(len(tokens)):
roots.append(i)
# Traverse lattice
analyses = []
for _ in roots:
stack = []
analysis = [[] for i in range(len(possible_heads))]
i = 0
forward = True
while i >= 0:
if forward:
if len(possible_heads[i]) == 1:
analysis[i] = possible_heads[i][0]
elif len(possible_heads[i]) == 0:
analysis[i] = -1
else:
head = possible_heads[i].pop()
analysis[i] = head
stack.append([i, head])
if not forward:
index_on_stack = False
for stack_item in stack:
if stack_item[0] == i:
index_on_stack = True
orig_length = len(possible_heads[i])
if index_on_stack and orig_length == 0:
for j in range(len(stack) - 1, -1, -1):
stack_item = stack[j]
if stack_item[0] == i:
possible_heads[i].append(stack.pop(j)[1])
elif index_on_stack and orig_length > 0:
head = possible_heads[i].pop()
analysis[i] = head
stack.append([i, head])
forward = True
if i + 1 == len(possible_heads):
analyses.append(analysis[:])
forward = False
if forward:
i += 1
else:
i -= 1
# Filter parses
        # ensure there is exactly one root and everything has exactly one head
for analysis in analyses:
if analysis.count(-1) > 1:
# there are several root elements!
continue
graph = DependencyGraph()
graph.root = graph.nodes[analysis.index(-1) + 1]
for address, (token, head_index) in enumerate(
zip(tokens, analysis), start=1
):
head_address = head_index + 1
node = graph.nodes[address]
node.update({"word": token, "address": address})
if head_address == 0:
rel = "ROOT"
else:
rel = ""
graph.nodes[head_index + 1]["deps"][rel].append(address)
# TODO: check for cycles
yield graph
#################################################################
# Demos
#################################################################
def demo():
# hall_demo()
nonprojective_conll_parse_demo()
rule_based_demo()
def hall_demo():
npp = ProbabilisticNonprojectiveParser()
npp.train([], DemoScorer())
for parse_graph in npp.parse(["v1", "v2", "v3"], [None, None, None]):
print(parse_graph)
def nonprojective_conll_parse_demo():
from nltk.parse.dependencygraph import conll_data2
graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
npp = ProbabilisticNonprojectiveParser()
npp.train(graphs, NaiveBayesDependencyScorer())
for parse_graph in npp.parse(
["Cathy", "zag", "hen", "zwaaien", "."], ["N", "V", "Pron", "Adj", "N", "Punc"]
):
print(parse_graph)
def rule_based_demo():
from nltk.grammar import DependencyGrammar
grammar = DependencyGrammar.fromstring(
"""
'taught' -> 'play' | 'man'
'man' -> 'the' | 'in'
'in' -> 'corner'
'corner' -> 'the'
'play' -> 'golf' | 'dachshund' | 'to'
'dachshund' -> 'his'
"""
)
print(grammar)
ndp = NonprojectiveDependencyParser(grammar)
graphs = ndp.parse(
[
"the",
"man",
"in",
"the",
"corner",
"taught",
"his",
"dachshund",
"to",
"play",
"golf",
]
)
print("Graphs:")
for graph in graphs:
print(graph)
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,579 @@
# Natural Language Toolkit: Probabilistic Chart Parsers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Classes and interfaces for associating probabilities with tree
structures that represent the internal organization of a text. The
probabilistic parser module defines ``BottomUpProbabilisticChartParser``.
``BottomUpProbabilisticChartParser`` is an abstract class that implements
a bottom-up chart parser for ``PCFG`` grammars. It maintains a queue of edges,
and adds them to the chart one at a time. The ordering of this queue
is based on the probabilities associated with the edges, allowing the
parser to expand more likely edges before less likely ones. Each
subclass implements a different queue ordering, producing different
search strategies. Currently the following subclasses are defined:
- ``InsideChartParser`` searches edges in decreasing order of
their trees' inside probabilities.
- ``RandomChartParser`` searches edges in random order.
- ``LongestChartParser`` searches edges in decreasing order of their
location's length.
The ``BottomUpProbabilisticChartParser`` constructor has an optional
argument beam_size. If non-zero, this controls the size of the beam
(aka the edge queue). This option is most useful with InsideChartParser.
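
A minimal usage sketch (the toy grammar and sentence below are illustrative
assumptions, not part of this module)::

    from nltk.grammar import PCFG
    from nltk.parse.pchart import InsideChartParser

    # Toy grammar for illustration only.
    rules = (
        "S -> NP VP [1.0]\n"
        "NP -> 'John' [0.5] | 'Mary' [0.5]\n"
        "VP -> V NP [1.0]\n"
        "V -> 'saw' [1.0]\n"
    )
    grammar = PCFG.fromstring(rules)
    parser = InsideChartParser(grammar, beam_size=20)
    for tree in parser.parse("John saw Mary".split()):
        print(tree.prob(), tree)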
"""
##//////////////////////////////////////////////////////
## Bottom-Up PCFG Chart Parser
##//////////////////////////////////////////////////////
# [XX] This might not be implemented quite right -- it would be better
# to associate probabilities with child pointer lists.
import random
from functools import reduce
from nltk.grammar import PCFG, Nonterminal
from nltk.parse.api import ParserI
from nltk.parse.chart import AbstractChartRule, Chart, LeafEdge, TreeEdge
from nltk.tree import ProbabilisticTree, Tree
# Probabilistic edges
class ProbabilisticLeafEdge(LeafEdge):
def prob(self):
return 1.0
class ProbabilisticTreeEdge(TreeEdge):
def __init__(self, prob, *args, **kwargs):
TreeEdge.__init__(self, *args, **kwargs)
self._prob = prob
# two edges with different probabilities are not equal.
self._comparison_key = (self._comparison_key, prob)
def prob(self):
return self._prob
@staticmethod
def from_production(production, index, p):
return ProbabilisticTreeEdge(
p, (index, index), production.lhs(), production.rhs(), 0
)
# Rules using probabilistic edges
class ProbabilisticBottomUpInitRule(AbstractChartRule):
NUM_EDGES = 0
def apply(self, chart, grammar):
for index in range(chart.num_leaves()):
new_edge = ProbabilisticLeafEdge(chart.leaf(index), index)
if chart.insert(new_edge, ()):
yield new_edge
class ProbabilisticBottomUpPredictRule(AbstractChartRule):
NUM_EDGES = 1
def apply(self, chart, grammar, edge):
if edge.is_incomplete():
return
for prod in grammar.productions():
if edge.lhs() == prod.rhs()[0]:
new_edge = ProbabilisticTreeEdge.from_production(
prod, edge.start(), prod.prob()
)
if chart.insert(new_edge, ()):
yield new_edge
class ProbabilisticFundamentalRule(AbstractChartRule):
NUM_EDGES = 2
def apply(self, chart, grammar, left_edge, right_edge):
# Make sure the rule is applicable.
if not (
left_edge.end() == right_edge.start()
and left_edge.nextsym() == right_edge.lhs()
and left_edge.is_incomplete()
and right_edge.is_complete()
):
return
# Construct the new edge.
p = left_edge.prob() * right_edge.prob()
new_edge = ProbabilisticTreeEdge(
p,
span=(left_edge.start(), right_edge.end()),
lhs=left_edge.lhs(),
rhs=left_edge.rhs(),
dot=left_edge.dot() + 1,
)
# Add it to the chart, with appropriate child pointers.
changed_chart = False
for cpl1 in chart.child_pointer_lists(left_edge):
if chart.insert(new_edge, cpl1 + (right_edge,)):
changed_chart = True
# If we changed the chart, then generate the edge.
if changed_chart:
yield new_edge
class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule):
NUM_EDGES = 1
_fundamental_rule = ProbabilisticFundamentalRule()
def apply(self, chart, grammar, edge1):
fr = self._fundamental_rule
if edge1.is_incomplete():
# edge1 = left_edge; edge2 = right_edge
for edge2 in chart.select(
start=edge1.end(), is_complete=True, lhs=edge1.nextsym()
):
yield from fr.apply(chart, grammar, edge1, edge2)
else:
# edge2 = left_edge; edge1 = right_edge
for edge2 in chart.select(
end=edge1.start(), is_complete=False, nextsym=edge1.lhs()
):
yield from fr.apply(chart, grammar, edge2, edge1)
def __str__(self):
return "Fundamental Rule"
class BottomUpProbabilisticChartParser(ParserI):
"""
An abstract bottom-up parser for ``PCFG`` grammars that uses a ``Chart`` to
record partial results. ``BottomUpProbabilisticChartParser`` maintains
a queue of edges that can be added to the chart. This queue is
initialized with edges for each token in the text that is being
parsed. ``BottomUpProbabilisticChartParser`` inserts these edges into
the chart one at a time, starting with the most likely edges, and
proceeding to less likely edges. For each edge that is added to
the chart, it may become possible to insert additional edges into
the chart; these are added to the queue. This process continues
until enough complete parses have been generated, or until the
queue is empty.
The sorting order for the queue is not specified by
``BottomUpProbabilisticChartParser``. Different sorting orders will
result in different search strategies. The sorting order for the
queue is defined by the method ``sort_queue``; subclasses are required
to provide a definition for this method.
:type _grammar: PCFG
:ivar _grammar: The grammar used to parse sentences.
:type _trace: int
:ivar _trace: The level of tracing output that should be generated
when parsing a text.
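
    A minimal sketch of a custom search strategy (the subclass name is an
    assumption, shown for illustration only)::

        # Hypothetical subclass: tries shorter edges before longer ones.
        class ShortestChartParser(BottomUpProbabilisticChartParser):
            def sort_queue(self, queue, chart):
                # The parser pops edges from the end of the queue, so
                # sorting by descending length puts the shortest edge in
                # the position that is tried next.
                queue.sort(key=lambda edge: edge.length(), reverse=True)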
"""
def __init__(self, grammar, beam_size=0, trace=0):
"""
Create a new ``BottomUpProbabilisticChartParser``, that uses
``grammar`` to parse texts.
:type grammar: PCFG
:param grammar: The grammar used to parse texts.
:type beam_size: int
:param beam_size: The maximum length for the parser's edge queue.
:type trace: int
:param trace: The level of tracing that should be used when
parsing a text. ``0`` will generate no tracing output;
and higher numbers will produce more verbose tracing
output.
"""
if not isinstance(grammar, PCFG):
            raise ValueError("The grammar must be a probabilistic CFG (PCFG)")
self._grammar = grammar
self.beam_size = beam_size
self._trace = trace
def grammar(self):
return self._grammar
def trace(self, trace=2):
"""
Set the level of tracing output that should be generated when
parsing a text.
:type trace: int
:param trace: The trace level. A trace level of ``0`` will
generate no tracing output; and higher trace levels will
produce more verbose tracing output.
:rtype: None
"""
self._trace = trace
# TODO: change this to conform more with the standard ChartParser
def parse(self, tokens):
self._grammar.check_coverage(tokens)
chart = Chart(list(tokens))
grammar = self._grammar
# Chart parser rules.
bu_init = ProbabilisticBottomUpInitRule()
bu = ProbabilisticBottomUpPredictRule()
fr = SingleEdgeProbabilisticFundamentalRule()
# Our queue
queue = []
# Initialize the chart.
for edge in bu_init.apply(chart, grammar):
if self._trace > 1:
print(
" %-50s [%s]"
% (chart.pretty_format_edge(edge, width=2), edge.prob())
)
queue.append(edge)
while len(queue) > 0:
# Re-sort the queue.
self.sort_queue(queue, chart)
# Prune the queue to the correct size if a beam was defined
if self.beam_size:
self._prune(queue, chart)
# Get the best edge.
edge = queue.pop()
if self._trace > 0:
print(
" %-50s [%s]"
% (chart.pretty_format_edge(edge, width=2), edge.prob())
)
# Apply BU & FR to it.
queue.extend(bu.apply(chart, grammar, edge))
queue.extend(fr.apply(chart, grammar, edge))
# Get a list of complete parses.
parses = list(chart.parses(grammar.start(), ProbabilisticTree))
# Assign probabilities to the trees.
prod_probs = {}
for prod in grammar.productions():
prod_probs[prod.lhs(), prod.rhs()] = prod.prob()
for parse in parses:
self._setprob(parse, prod_probs)
# Sort by probability
parses.sort(reverse=True, key=lambda tree: tree.prob())
return iter(parses)
def _setprob(self, tree, prod_probs):
if tree.prob() is not None:
return
# Get the prob of the CFG production.
lhs = Nonterminal(tree.label())
rhs = []
for child in tree:
if isinstance(child, Tree):
rhs.append(Nonterminal(child.label()))
else:
rhs.append(child)
prob = prod_probs[lhs, tuple(rhs)]
# Get the probs of children.
for child in tree:
if isinstance(child, Tree):
self._setprob(child, prod_probs)
prob *= child.prob()
tree.set_prob(prob)
def sort_queue(self, queue, chart):
"""
Sort the given queue of ``Edge`` objects, placing the edge that should
be tried first at the beginning of the queue. This method
will be called after each ``Edge`` is added to the queue.
:param queue: The queue of ``Edge`` objects to sort. Each edge in
this queue is an edge that could be added to the chart by
the fundamental rule; but that has not yet been added.
:type queue: list(Edge)
:param chart: The chart being used to parse the text. This
chart can be used to provide extra information for sorting
the queue.
:type chart: Chart
:rtype: None
"""
raise NotImplementedError()
def _prune(self, queue, chart):
"""Discard items in the queue if the queue is longer than the beam."""
if len(queue) > self.beam_size:
split = len(queue) - self.beam_size
if self._trace > 2:
for edge in queue[:split]:
print(" %-50s [DISCARDED]" % chart.pretty_format_edge(edge, 2))
del queue[:split]
class InsideChartParser(BottomUpProbabilisticChartParser):
"""
A bottom-up parser for ``PCFG`` grammars that tries edges in descending
order of the inside probabilities of their trees. The "inside
probability" of a tree is simply the
probability of the entire tree, ignoring its context. In
particular, the inside probability of a tree generated by
production *p* with children *c[1], c[2], ..., c[n]* is
*P(p)P(c[1])P(c[2])...P(c[n])*; and the inside
probability of a token is 1 if it is present in the text, and 0 if
it is absent.
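    For instance, under an assumed toy grammar with P(S -> NP VP) = 1.0,
    P(NP -> 'John') = 0.5, P(VP -> V) = 0.2 and P(V -> 'ran') = 0.3, the
    inside probability of the tree (S (NP John) (VP (V ran))) is
    1.0 * 0.5 * 0.2 * 0.3 = 0.03.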
This sorting order results in a type of lowest-cost-first search
strategy.
"""
# Inherit constructor.
def sort_queue(self, queue, chart):
"""
Sort the given queue of edges, in descending order of the
inside probabilities of the edges' trees.
:param queue: The queue of ``Edge`` objects to sort. Each edge in
this queue is an edge that could be added to the chart by
the fundamental rule; but that has not yet been added.
:type queue: list(Edge)
:param chart: The chart being used to parse the text. This
chart can be used to provide extra information for sorting
the queue.
:type chart: Chart
:rtype: None
"""
queue.sort(key=lambda edge: edge.prob())
# Eventually, this will become some sort of inside-outside parser:
# class InsideOutsideParser(BottomUpProbabilisticChartParser):
# def __init__(self, grammar, trace=0):
# # Inherit docs.
# BottomUpProbabilisticChartParser.__init__(self, grammar, trace)
#
# # Find the best path from S to each nonterminal
# bestp = {}
# for production in grammar.productions(): bestp[production.lhs()]=0
# bestp[grammar.start()] = 1.0
#
# for i in range(len(grammar.productions())):
# for production in grammar.productions():
# lhs = production.lhs()
# for elt in production.rhs():
# bestp[elt] = max(bestp[lhs]*production.prob(),
# bestp.get(elt,0))
#
# self._bestp = bestp
# for (k,v) in self._bestp.items(): print(k,v)
#
# def _sortkey(self, edge):
# return edge.structure()[PROB] * self._bestp[edge.lhs()]
#
# def sort_queue(self, queue, chart):
# queue.sort(key=self._sortkey)
class RandomChartParser(BottomUpProbabilisticChartParser):
"""
A bottom-up parser for ``PCFG`` grammars that tries edges in random order.
This sorting order results in a random search strategy.
"""
# Inherit constructor
def sort_queue(self, queue, chart):
i = random.randint(0, len(queue) - 1)
(queue[-1], queue[i]) = (queue[i], queue[-1])
class UnsortedChartParser(BottomUpProbabilisticChartParser):
"""
A bottom-up parser for ``PCFG`` grammars that tries edges in whatever order.
"""
# Inherit constructor
def sort_queue(self, queue, chart):
return
class LongestChartParser(BottomUpProbabilisticChartParser):
"""
A bottom-up parser for ``PCFG`` grammars that tries longer edges before
shorter ones. This sorting order results in a type of best-first
search strategy.
"""
# Inherit constructor
def sort_queue(self, queue, chart):
queue.sort(key=lambda edge: edge.length())
##//////////////////////////////////////////////////////
## Test Code
##//////////////////////////////////////////////////////
def demo(choice=None, draw_parses=None, print_parses=None):
"""
A demonstration of the probabilistic parsers. The user is
prompted to select which demo to run, and how many parses should
be found; and then each parser is run on the same demo, and a
summary of the results are displayed.
"""
import sys
import time
from nltk import tokenize
from nltk.parse import pchart
# Define two demos. Each demo has a sentence and a grammar.
toy_pcfg1 = PCFG.fromstring(
"""
S -> NP VP [1.0]
NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
Det -> 'the' [0.8] | 'my' [0.2]
N -> 'man' [0.5] | 'telescope' [0.5]
VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
V -> 'ate' [0.35] | 'saw' [0.65]
PP -> P NP [1.0]
P -> 'with' [0.61] | 'under' [0.39]
"""
)
toy_pcfg2 = PCFG.fromstring(
"""
S -> NP VP [1.0]
VP -> V NP [.59]
VP -> V [.40]
VP -> VP PP [.01]
NP -> Det N [.41]
NP -> Name [.28]
NP -> NP PP [.31]
PP -> P NP [1.0]
V -> 'saw' [.21]
V -> 'ate' [.51]
V -> 'ran' [.28]
N -> 'boy' [.11]
N -> 'cookie' [.12]
N -> 'table' [.13]
N -> 'telescope' [.14]
N -> 'hill' [.5]
Name -> 'Jack' [.52]
Name -> 'Bob' [.48]
P -> 'with' [.61]
P -> 'under' [.39]
Det -> 'the' [.41]
Det -> 'a' [.31]
Det -> 'my' [.28]
"""
)
demos = [
("I saw John with my telescope", toy_pcfg1),
("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
]
if choice is None:
# Ask the user which demo they want to use.
print()
for i in range(len(demos)):
print(f"{i + 1:>3}: {demos[i][0]}")
print(" %r" % demos[i][1])
print()
print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
choice = int(sys.stdin.readline().strip()) - 1
try:
sent, grammar = demos[choice]
    except (IndexError, TypeError):
print("Bad sentence number")
return
# Tokenize the sentence.
tokens = sent.split()
# Define a list of parsers. We'll use all parsers.
parsers = [
pchart.InsideChartParser(grammar),
pchart.RandomChartParser(grammar),
pchart.UnsortedChartParser(grammar),
pchart.LongestChartParser(grammar),
pchart.InsideChartParser(grammar, beam_size=len(tokens) + 1), # was BeamParser
]
# Run the parsers on the tokenized sentence.
times = []
average_p = []
num_parses = []
all_parses = {}
for parser in parsers:
print(f"\ns: {sent}\nparser: {parser}\ngrammar: {grammar}")
parser.trace(3)
t = time.time()
parses = list(parser.parse(tokens))
times.append(time.time() - t)
p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
average_p.append(p)
num_parses.append(len(parses))
for p in parses:
all_parses[p.freeze()] = 1
# Print some summary statistics
print()
print(" Parser Beam | Time (secs) # Parses Average P(parse)")
print("------------------------+------------------------------------------")
for i in range(len(parsers)):
print(
"%18s %4d |%11.4f%11d%19.14f"
% (
parsers[i].__class__.__name__,
parsers[i].beam_size,
times[i],
num_parses[i],
average_p[i],
)
)
parses = all_parses.keys()
if parses:
p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
else:
p = 0
print("------------------------+------------------------------------------")
print("%18s |%11s%11d%19.14f" % ("(All Parses)", "n/a", len(parses), p))
if draw_parses is None:
# Ask the user if we should draw the parses.
print()
print("Draw parses (y/n)? ", end=" ")
draw_parses = sys.stdin.readline().strip().lower().startswith("y")
if draw_parses:
from nltk.draw.tree import draw_trees
print(" please wait...")
draw_trees(*parses)
if print_parses is None:
# Ask the user if we should print the parses.
print()
print("Print parses (y/n)? ", end=" ")
print_parses = sys.stdin.readline().strip().lower().startswith("y")
if print_parses:
for parse in parses:
print(parse)
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,716 @@
# Natural Language Toolkit: Dependency Grammars
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Jason Narad <jason.narad@gmail.com>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
from collections import defaultdict
from functools import total_ordering
from itertools import chain
from nltk.grammar import (
DependencyGrammar,
DependencyProduction,
ProbabilisticDependencyGrammar,
)
from nltk.internals import raise_unorderable_types
from nltk.parse.dependencygraph import DependencyGraph
#################################################################
# Dependency Span
#################################################################
@total_ordering
class DependencySpan:
"""
A contiguous span over some part of the input string representing
dependency (head -> modifier) relationships amongst words. An atomic
    span corresponds to only one word, so it isn't a 'span' in the conventional
    sense: its head index coincides with its start index and it exists mainly for
    concatenation purposes. All other spans are assumed to have arcs between all
    nodes
within the start and end indexes of the span, and one head index corresponding
to the head word for the entire span. This is the same as the root node if
the dependency structure were depicted as a graph.
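
    A construction sketch (the values are illustrative): the atomic span that
    covers only the word at position 0, whose single word has no head assigned
    yet (arc value -1), can be written as::

        # An atomic, single-word span.
        DependencySpan(0, 1, 0, [-1], ['null'])

    This mirrors how the projective parsers below seed their charts.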
"""
def __init__(self, start_index, end_index, head_index, arcs, tags):
self._start_index = start_index
self._end_index = end_index
self._head_index = head_index
self._arcs = arcs
self._tags = tags
self._comparison_key = (start_index, end_index, head_index, tuple(arcs))
self._hash = hash(self._comparison_key)
def head_index(self):
"""
        :return: A value indexing the head of the entire ``DependencySpan``.
:rtype: int
"""
return self._head_index
def __repr__(self):
"""
        :return: A concise string representation of the ``DependencySpan``.
:rtype: str.
"""
return "Span %d-%d; Head Index: %d" % (
self._start_index,
self._end_index,
self._head_index,
)
def __str__(self):
"""
:return: A verbose string representation of the ``DependencySpan``.
:rtype: str
"""
str = "Span %d-%d; Head Index: %d" % (
self._start_index,
self._end_index,
self._head_index,
)
for i in range(len(self._arcs)):
str += "\n%d <- %d, %s" % (i, self._arcs[i], self._tags[i])
return str
def __eq__(self, other):
return (
type(self) == type(other) and self._comparison_key == other._comparison_key
)
def __ne__(self, other):
return not self == other
def __lt__(self, other):
if not isinstance(other, DependencySpan):
raise_unorderable_types("<", self, other)
return self._comparison_key < other._comparison_key
def __hash__(self):
"""
:return: The hash value of this ``DependencySpan``.
"""
return self._hash
#################################################################
# Chart Cell
#################################################################
class ChartCell:
"""
A cell from the parse chart formed when performing the CYK algorithm.
Each cell keeps track of its x and y coordinates (though this will probably
be discarded), and a list of spans serving as the cell's entries.
"""
def __init__(self, x, y):
"""
:param x: This cell's x coordinate.
:type x: int.
:param y: This cell's y coordinate.
:type y: int.
"""
self._x = x
self._y = y
self._entries = set()
def add(self, span):
"""
Appends the given span to the list of spans
representing the chart cell's entries.
:param span: The span to add.
:type span: DependencySpan
"""
self._entries.add(span)
def __str__(self):
"""
:return: A verbose string representation of this ``ChartCell``.
:rtype: str.
"""
return "CC[%d,%d]: %s" % (self._x, self._y, self._entries)
def __repr__(self):
"""
:return: A concise string representation of this ``ChartCell``.
:rtype: str.
"""
return "%s" % self
#################################################################
# Parsing with Dependency Grammars
#################################################################
class ProjectiveDependencyParser:
"""
A projective, rule-based, dependency parser. A ProjectiveDependencyParser
is created with a DependencyGrammar, a set of productions specifying
word-to-word dependency relations. The parse() method will then
return the set of all parses, in tree representation, for a given input
    sequence of tokens. Each parse must meet the requirements of both
the grammar and the projectivity constraint which specifies that the
branches of the dependency tree are not allowed to cross. Alternatively,
this can be understood as stating that each parent node and its children
in the parse tree form a continuous substring of the input sequence.
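
    A minimal usage sketch (the grammar and sentence are illustrative
    assumptions, mirroring the demo at the bottom of this module)::

        from nltk.grammar import DependencyGrammar
        from nltk.parse.projectivedependencyparser import ProjectiveDependencyParser

        # Toy grammar for illustration only.
        rules = (
            "'scratch' -> 'cats' | 'walls'\n"
            "'walls' -> 'the'\n"
            "'cats' -> 'the'\n"
        )
        pdp = ProjectiveDependencyParser(DependencyGrammar.fromstring(rules))
        for tree in pdp.parse(['the', 'cats', 'scratch', 'the', 'walls']):
            print(tree)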
"""
def __init__(self, dependency_grammar):
"""
Create a new ProjectiveDependencyParser, from a word-to-word
dependency grammar ``DependencyGrammar``.
        :param dependency_grammar: A word-to-word relation dependency grammar.
:type dependency_grammar: DependencyGrammar
"""
self._grammar = dependency_grammar
def parse(self, tokens):
"""
Performs a projective dependency parse on the list of tokens using
a chart-based, span-concatenation algorithm similar to Eisner (1996).
:param tokens: The list of input tokens.
:type tokens: list(str)
:return: An iterator over parse trees.
:rtype: iter(Tree)
"""
self._tokens = list(tokens)
chart = []
for i in range(0, len(self._tokens) + 1):
chart.append([])
for j in range(0, len(self._tokens) + 1):
chart[i].append(ChartCell(i, j))
if i == j + 1:
chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], ["null"]))
for i in range(1, len(self._tokens) + 1):
for j in range(i - 2, -1, -1):
for k in range(i - 1, j, -1):
for span1 in chart[k][j]._entries:
for span2 in chart[i][k]._entries:
for newspan in self.concatenate(span1, span2):
chart[i][j].add(newspan)
for parse in chart[len(self._tokens)][0]._entries:
conll_format = ""
# malt_format = ""
for i in range(len(tokens)):
# malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null')
# conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], 'null', 'null', 'null', parse._arcs[i] + 1, 'null', '-', '-')
            # Modified to comply with the new DependencyGraph requirement (there must be at least a ROOT element)
conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
i + 1,
tokens[i],
tokens[i],
"null",
"null",
"null",
parse._arcs[i] + 1,
"ROOT",
"-",
"-",
)
dg = DependencyGraph(conll_format)
# if self.meets_arity(dg):
yield dg.tree()
def concatenate(self, span1, span2):
"""
Concatenates the two spans in whichever way possible. This
includes rightward concatenation (from the leftmost word of the
leftmost span to the rightmost word of the rightmost span) and
leftward concatenation (vice-versa) between adjacent spans. Unlike
Eisner's presentation of span concatenation, these spans do not
share or pivot on a particular word/word-index.
:return: A list of new spans formed through concatenation.
:rtype: list(DependencySpan)
"""
spans = []
if span1._start_index == span2._start_index:
print("Error: Mismatched spans - replace this with thrown error")
if span1._start_index > span2._start_index:
temp_span = span1
span1 = span2
span2 = temp_span
# adjacent rightward covered concatenation
new_arcs = span1._arcs + span2._arcs
new_tags = span1._tags + span2._tags
if self._grammar.contains(
self._tokens[span1._head_index], self._tokens[span2._head_index]
):
# print('Performing rightward cover %d to %d' % (span1._head_index, span2._head_index))
new_arcs[span2._head_index - span1._start_index] = span1._head_index
spans.append(
DependencySpan(
span1._start_index,
span2._end_index,
span1._head_index,
new_arcs,
new_tags,
)
)
# adjacent leftward covered concatenation
new_arcs = span1._arcs + span2._arcs
if self._grammar.contains(
self._tokens[span2._head_index], self._tokens[span1._head_index]
):
# print('performing leftward cover %d to %d' % (span2._head_index, span1._head_index))
new_arcs[span1._head_index - span1._start_index] = span2._head_index
spans.append(
DependencySpan(
span1._start_index,
span2._end_index,
span2._head_index,
new_arcs,
new_tags,
)
)
return spans
#################################################################
# Parsing with Probabilistic Dependency Grammars
#################################################################
class ProbabilisticProjectiveDependencyParser:
"""A probabilistic, projective dependency parser.
This parser returns the most probable projective parse derived from the
probabilistic dependency grammar derived from the train() method. The
probabilistic model is an implementation of Eisner's (1996) Model C, which
conditions on head-word, head-tag, child-word, and child-tag. The decoding
uses a bottom-up chart-based span concatenation algorithm that's identical
to the one utilized by the rule-based projective parser.
Usage example
>>> from nltk.parse.dependencygraph import conll_data2
>>> graphs = [
... DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry
... ]
>>> ppdp = ProbabilisticProjectiveDependencyParser()
>>> ppdp.train(graphs)
>>> sent = ['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.']
>>> list(ppdp.parse(sent))
[Tree('zag', ['Cathy', 'hen', Tree('zwaaien', ['wild', '.'])])]
"""
def __init__(self):
"""
Create a new probabilistic dependency parser. No additional
operations are necessary.
"""
def parse(self, tokens):
"""
Parses the list of tokens subject to the projectivity constraint
and the productions in the parser's grammar. This uses a method
similar to the span-concatenation algorithm defined in Eisner (1996).
It returns the most probable parse derived from the parser's
probabilistic dependency grammar.
"""
self._tokens = list(tokens)
chart = []
for i in range(0, len(self._tokens) + 1):
chart.append([])
for j in range(0, len(self._tokens) + 1):
chart[i].append(ChartCell(i, j))
if i == j + 1:
if tokens[i - 1] in self._grammar._tags:
for tag in self._grammar._tags[tokens[i - 1]]:
chart[i][j].add(
DependencySpan(i - 1, i, i - 1, [-1], [tag])
)
else:
print(
"No tag found for input token '%s', parse is impossible."
% tokens[i - 1]
)
return []
for i in range(1, len(self._tokens) + 1):
for j in range(i - 2, -1, -1):
for k in range(i - 1, j, -1):
for span1 in chart[k][j]._entries:
for span2 in chart[i][k]._entries:
for newspan in self.concatenate(span1, span2):
chart[i][j].add(newspan)
trees = []
max_parse = None
max_score = 0
for parse in chart[len(self._tokens)][0]._entries:
conll_format = ""
malt_format = ""
for i in range(len(tokens)):
malt_format += "%s\t%s\t%d\t%s\n" % (
tokens[i],
"null",
parse._arcs[i] + 1,
"null",
)
# conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'null', '-', '-')
            # Modified to comply with a recent change in DependencyGraph: there must be a ROOT element.
conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
i + 1,
tokens[i],
tokens[i],
parse._tags[i],
parse._tags[i],
"null",
parse._arcs[i] + 1,
"ROOT",
"-",
"-",
)
dg = DependencyGraph(conll_format)
score = self.compute_prob(dg)
trees.append((score, dg.tree()))
trees.sort()
return (tree for (score, tree) in trees)
def concatenate(self, span1, span2):
"""
Concatenates the two spans in whichever way possible. This
includes rightward concatenation (from the leftmost word of the
leftmost span to the rightmost word of the rightmost span) and
leftward concatenation (vice-versa) between adjacent spans. Unlike
Eisner's presentation of span concatenation, these spans do not
share or pivot on a particular word/word-index.
:return: A list of new spans formed through concatenation.
:rtype: list(DependencySpan)
"""
spans = []
if span1._start_index == span2._start_index:
print("Error: Mismatched spans - replace this with thrown error")
if span1._start_index > span2._start_index:
temp_span = span1
span1 = span2
span2 = temp_span
# adjacent rightward covered concatenation
new_arcs = span1._arcs + span2._arcs
new_tags = span1._tags + span2._tags
if self._grammar.contains(
self._tokens[span1._head_index], self._tokens[span2._head_index]
):
new_arcs[span2._head_index - span1._start_index] = span1._head_index
spans.append(
DependencySpan(
span1._start_index,
span2._end_index,
span1._head_index,
new_arcs,
new_tags,
)
)
# adjacent leftward covered concatenation
new_arcs = span1._arcs + span2._arcs
new_tags = span1._tags + span2._tags
if self._grammar.contains(
self._tokens[span2._head_index], self._tokens[span1._head_index]
):
new_arcs[span1._head_index - span1._start_index] = span2._head_index
spans.append(
DependencySpan(
span1._start_index,
span2._end_index,
span2._head_index,
new_arcs,
new_tags,
)
)
return spans
def train(self, graphs):
"""
Trains a ProbabilisticDependencyGrammar based on the list of input
DependencyGraphs. This model is an implementation of Eisner's (1996)
Model C, which derives its statistics from head-word, head-tag,
child-word, and child-tag relationships.
:param graphs: A list of dependency graphs to train from.
        :type graphs: list(DependencyGraph)
"""
productions = []
events = defaultdict(int)
tags = {}
for dg in graphs:
for node_index in range(1, len(dg.nodes)):
# children = dg.nodes[node_index]['deps']
children = list(
chain.from_iterable(dg.nodes[node_index]["deps"].values())
)
nr_left_children = dg.left_children(node_index)
nr_right_children = dg.right_children(node_index)
nr_children = nr_left_children + nr_right_children
for child_index in range(
0 - (nr_left_children + 1), nr_right_children + 2
):
head_word = dg.nodes[node_index]["word"]
head_tag = dg.nodes[node_index]["tag"]
if head_word in tags:
tags[head_word].add(head_tag)
else:
tags[head_word] = {head_tag}
child = "STOP"
child_tag = "STOP"
prev_word = "START"
prev_tag = "START"
if child_index < 0:
array_index = child_index + nr_left_children
if array_index >= 0:
child = dg.nodes[children[array_index]]["word"]
child_tag = dg.nodes[children[array_index]]["tag"]
if child_index != -1:
prev_word = dg.nodes[children[array_index + 1]]["word"]
prev_tag = dg.nodes[children[array_index + 1]]["tag"]
if child != "STOP":
productions.append(DependencyProduction(head_word, [child]))
head_event = "(head ({} {}) (mods ({}, {}, {}) left))".format(
child,
child_tag,
prev_tag,
head_word,
head_tag,
)
mod_event = "(mods ({}, {}, {}) left))".format(
prev_tag,
head_word,
head_tag,
)
events[head_event] += 1
events[mod_event] += 1
elif child_index > 0:
array_index = child_index + nr_left_children - 1
if array_index < nr_children:
child = dg.nodes[children[array_index]]["word"]
child_tag = dg.nodes[children[array_index]]["tag"]
if child_index != 1:
prev_word = dg.nodes[children[array_index - 1]]["word"]
prev_tag = dg.nodes[children[array_index - 1]]["tag"]
if child != "STOP":
productions.append(DependencyProduction(head_word, [child]))
head_event = "(head ({} {}) (mods ({}, {}, {}) right))".format(
child,
child_tag,
prev_tag,
head_word,
head_tag,
)
mod_event = "(mods ({}, {}, {}) right))".format(
prev_tag,
head_word,
head_tag,
)
events[head_event] += 1
events[mod_event] += 1
self._grammar = ProbabilisticDependencyGrammar(productions, events, tags)
def compute_prob(self, dg):
"""
Computes the probability of a dependency graph based
on the parser's probability model (defined by the parser's
statistical dependency grammar).
:param dg: A dependency graph to score.
:type dg: DependencyGraph
:return: The probability of the dependency graph.
        :rtype: float
"""
prob = 1.0
for node_index in range(1, len(dg.nodes)):
# children = dg.nodes[node_index]['deps']
children = list(chain.from_iterable(dg.nodes[node_index]["deps"].values()))
nr_left_children = dg.left_children(node_index)
nr_right_children = dg.right_children(node_index)
nr_children = nr_left_children + nr_right_children
for child_index in range(0 - (nr_left_children + 1), nr_right_children + 2):
head_word = dg.nodes[node_index]["word"]
head_tag = dg.nodes[node_index]["tag"]
child = "STOP"
child_tag = "STOP"
prev_word = "START"
prev_tag = "START"
if child_index < 0:
array_index = child_index + nr_left_children
if array_index >= 0:
child = dg.nodes[children[array_index]]["word"]
child_tag = dg.nodes[children[array_index]]["tag"]
if child_index != -1:
prev_word = dg.nodes[children[array_index + 1]]["word"]
prev_tag = dg.nodes[children[array_index + 1]]["tag"]
head_event = "(head ({} {}) (mods ({}, {}, {}) left))".format(
child,
child_tag,
prev_tag,
head_word,
head_tag,
)
mod_event = "(mods ({}, {}, {}) left))".format(
prev_tag,
head_word,
head_tag,
)
h_count = self._grammar._events[head_event]
m_count = self._grammar._events[mod_event]
# If the grammar is not covered
if m_count != 0:
prob *= h_count / m_count
else:
prob = 0.00000001 # Very small number
elif child_index > 0:
array_index = child_index + nr_left_children - 1
if array_index < nr_children:
child = dg.nodes[children[array_index]]["word"]
child_tag = dg.nodes[children[array_index]]["tag"]
if child_index != 1:
prev_word = dg.nodes[children[array_index - 1]]["word"]
prev_tag = dg.nodes[children[array_index - 1]]["tag"]
head_event = "(head ({} {}) (mods ({}, {}, {}) right))".format(
child,
child_tag,
prev_tag,
head_word,
head_tag,
)
mod_event = "(mods ({}, {}, {}) right))".format(
prev_tag,
head_word,
head_tag,
)
h_count = self._grammar._events[head_event]
m_count = self._grammar._events[mod_event]
if m_count != 0:
prob *= h_count / m_count
else:
prob = 0.00000001 # Very small number
return prob
#################################################################
# Demos
#################################################################
def demo():
projective_rule_parse_demo()
# arity_parse_demo()
projective_prob_parse_demo()
def projective_rule_parse_demo():
"""
A demonstration showing the creation and use of a
``DependencyGrammar`` to perform a projective dependency
parse.
"""
grammar = DependencyGrammar.fromstring(
"""
'scratch' -> 'cats' | 'walls'
'walls' -> 'the'
'cats' -> 'the'
"""
)
print(grammar)
pdp = ProjectiveDependencyParser(grammar)
trees = pdp.parse(["the", "cats", "scratch", "the", "walls"])
for tree in trees:
print(tree)
def arity_parse_demo():
"""
A demonstration showing the creation of a ``DependencyGrammar``
in which a specific number of modifiers is listed for a given
head. This can further constrain the number of possible parses
created by a ``ProjectiveDependencyParser``.
"""
print()
print("A grammar with no arity constraints. Each DependencyProduction")
print("specifies a relationship between one head word and only one")
print("modifier word.")
grammar = DependencyGrammar.fromstring(
"""
'fell' -> 'price' | 'stock'
'price' -> 'of' | 'the'
'of' -> 'stock'
'stock' -> 'the'
"""
)
print(grammar)
print()
print("For the sentence 'The price of the stock fell', this grammar")
print("will produce the following three parses:")
pdp = ProjectiveDependencyParser(grammar)
trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"])
for tree in trees:
print(tree)
print()
print("By contrast, the following grammar contains a ")
print("DependencyProduction that specifies a relationship")
print("between a single head word, 'price', and two modifier")
print("words, 'of' and 'the'.")
grammar = DependencyGrammar.fromstring(
"""
'fell' -> 'price' | 'stock'
'price' -> 'of' 'the'
'of' -> 'stock'
'stock' -> 'the'
"""
)
print(grammar)
print()
print(
"This constrains the number of possible parses to just one:"
) # unimplemented, soon to replace
pdp = ProjectiveDependencyParser(grammar)
trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"])
for tree in trees:
print(tree)
def projective_prob_parse_demo():
"""
A demo showing the training and use of a projective
dependency parser.
"""
from nltk.parse.dependencygraph import conll_data2
graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
ppdp = ProbabilisticProjectiveDependencyParser()
print("Training Probabilistic Projective Dependency Parser...")
ppdp.train(graphs)
sent = ["Cathy", "zag", "hen", "wild", "zwaaien", "."]
print("Parsing '", " ".join(sent), "'...")
print("Parse:")
for tree in ppdp.parse(sent):
print(tree)
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,684 @@
# Natural Language Toolkit: Recursive Descent Parser
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.grammar import Nonterminal
from nltk.parse.api import ParserI
from nltk.tree import ImmutableTree, Tree
##//////////////////////////////////////////////////////
## Recursive Descent Parser
##//////////////////////////////////////////////////////
class RecursiveDescentParser(ParserI):
"""
A simple top-down CFG parser that parses texts by recursively
expanding the fringe of a Tree, and matching it against a
text.
``RecursiveDescentParser`` uses a list of tree locations called a
"frontier" to remember which subtrees have not yet been expanded
and which leaves have not yet been matched against the text. Each
tree location consists of a list of child indices specifying the
path from the root of the tree to a subtree or a leaf; see the
reference documentation for Tree for more information
about tree locations.
When the parser begins parsing a text, it constructs a tree
containing only the start symbol, and a frontier containing the
location of the tree's root node. It then extends the tree to
cover the text, using the following recursive procedure:
- If the frontier is empty, and the text is covered by the tree,
then return the tree as a possible parse.
- If the frontier is empty, and the text is not covered by the
tree, then return no parses.
- If the first element of the frontier is a subtree, then
use CFG productions to "expand" it. For each applicable
production, add the expanded subtree's children to the
frontier, and recursively find all parses that can be
generated by the new tree and frontier.
- If the first element of the frontier is a token, then "match"
it against the next token from the text. Remove the token
from the frontier, and recursively find all parses that can be
generated by the new tree and frontier.
:see: ``nltk.grammar``
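
    A minimal usage sketch (the toy grammar and sentence are illustrative
    assumptions)::

        from nltk.grammar import CFG
        from nltk.parse.recursivedescent import RecursiveDescentParser

        # Toy grammar for illustration only (no left recursion).
        rules = (
            "S -> NP VP\n"
            "NP -> Det N\n"
            "VP -> V NP\n"
            "Det -> 'the'\n"
            "N -> 'dog' | 'cat'\n"
            "V -> 'chased'\n"
        )
        rd = RecursiveDescentParser(CFG.fromstring(rules))
        for tree in rd.parse('the dog chased the cat'.split()):
            print(tree)

    Note that left-recursive productions will cause this parser to recurse
    without bound.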
"""
def __init__(self, grammar, trace=0):
"""
Create a new ``RecursiveDescentParser``, that uses ``grammar``
to parse texts.
:type grammar: CFG
:param grammar: The grammar used to parse texts.
:type trace: int
:param trace: The level of tracing that should be used when
parsing a text. ``0`` will generate no tracing output;
and higher numbers will produce more verbose tracing
output.
"""
self._grammar = grammar
self._trace = trace
def grammar(self):
return self._grammar
def parse(self, tokens):
# Inherit docs from ParserI
tokens = list(tokens)
self._grammar.check_coverage(tokens)
# Start a recursive descent parse, with an initial tree
# containing just the start symbol.
start = self._grammar.start().symbol()
initial_tree = Tree(start, [])
frontier = [()]
if self._trace:
self._trace_start(initial_tree, frontier, tokens)
return self._parse(tokens, initial_tree, frontier)
def _parse(self, remaining_text, tree, frontier):
"""
        Recursively expand and match each element of ``tree``
specified by ``frontier``, to cover ``remaining_text``. Return
a list of all parses found.
:return: An iterator of all parses that can be generated by
matching and expanding the elements of ``tree``
specified by ``frontier``.
:rtype: iter(Tree)
:type tree: Tree
:param tree: A partial structure for the text that is
currently being parsed. The elements of ``tree``
that are specified by ``frontier`` have not yet been
expanded or matched.
:type remaining_text: list(str)
:param remaining_text: The portion of the text that is not yet
covered by ``tree``.
:type frontier: list(tuple(int))
:param frontier: A list of the locations within ``tree`` of
all subtrees that have not yet been expanded, and all
            leaves that have not yet been matched. This list is sorted
in left-to-right order of location within the tree.
"""
# If the tree covers the text, and there's nothing left to
# expand, then we've found a complete parse; return it.
if len(remaining_text) == 0 and len(frontier) == 0:
if self._trace:
self._trace_succeed(tree, frontier)
yield tree
# If there's still text, but nothing left to expand, we failed.
elif len(frontier) == 0:
if self._trace:
self._trace_backtrack(tree, frontier)
# If the next element on the frontier is a tree, expand it.
elif isinstance(tree[frontier[0]], Tree):
yield from self._expand(remaining_text, tree, frontier)
# If the next element on the frontier is a token, match it.
else:
yield from self._match(remaining_text, tree, frontier)
def _match(self, rtext, tree, frontier):
"""
:rtype: iter(Tree)
:return: an iterator of all parses that can be generated by
matching the first element of ``frontier`` against the
first token in ``rtext``. In particular, if the first
element of ``frontier`` has the same type as the first
token in ``rtext``, then substitute the token into
``tree``; and return all parses that can be generated by
matching and expanding the remaining elements of
``frontier``. If the first element of ``frontier`` does not
have the same type as the first token in ``rtext``, then
            return an empty list.
:type tree: Tree
:param tree: A partial structure for the text that is
currently being parsed. The elements of ``tree``
that are specified by ``frontier`` have not yet been
expanded or matched.
:type rtext: list(str)
:param rtext: The portion of the text that is not yet
covered by ``tree``.
:type frontier: list of tuple of int
:param frontier: A list of the locations within ``tree`` of
all subtrees that have not yet been expanded, and all
leaves that have not yet been matched.
"""
tree_leaf = tree[frontier[0]]
if len(rtext) > 0 and tree_leaf == rtext[0]:
# If it's a terminal that matches rtext[0], then substitute
# in the token, and continue parsing.
newtree = tree.copy(deep=True)
newtree[frontier[0]] = rtext[0]
if self._trace:
self._trace_match(newtree, frontier[1:], rtext[0])
yield from self._parse(rtext[1:], newtree, frontier[1:])
else:
# If it's a non-matching terminal, fail.
if self._trace:
self._trace_backtrack(tree, frontier, rtext[:1])
def _expand(self, remaining_text, tree, frontier, production=None):
"""
:rtype: iter(Tree)
:return: An iterator of all parses that can be generated by
expanding the first element of ``frontier`` with
``production``. In particular, if the first element of
``frontier`` is a subtree whose node type is equal to
``production``'s left hand side, then add a child to that
subtree for each element of ``production``'s right hand
side; and return all parses that can be generated by
matching and expanding the remaining elements of
``frontier``. If the first element of ``frontier`` is not a
subtree whose node type is equal to ``production``'s left
hand side, then return an empty list. If ``production`` is
not specified, then return a list of all parses that can
be generated by expanding the first element of ``frontier``
with *any* CFG production.
:type tree: Tree
:param tree: A partial structure for the text that is
currently being parsed. The elements of ``tree``
that are specified by ``frontier`` have not yet been
expanded or matched.
:type remaining_text: list(str)
:param remaining_text: The portion of the text that is not yet
covered by ``tree``.
:type frontier: list(tuple(int))
:param frontier: A list of the locations within ``tree`` of
all subtrees that have not yet been expanded, and all
leaves that have not yet been matched.
"""
if production is None:
productions = self._grammar.productions()
else:
productions = [production]
for production in productions:
lhs = production.lhs().symbol()
if lhs == tree[frontier[0]].label():
subtree = self._production_to_tree(production)
if frontier[0] == ():
newtree = subtree
else:
newtree = tree.copy(deep=True)
newtree[frontier[0]] = subtree
new_frontier = [
frontier[0] + (i,) for i in range(len(production.rhs()))
]
if self._trace:
self._trace_expand(newtree, new_frontier, production)
yield from self._parse(
remaining_text, newtree, new_frontier + frontier[1:]
)
def _production_to_tree(self, production):
"""
:rtype: Tree
:return: The Tree that is licensed by ``production``.
In particular, given the production ``[lhs -> elt[1] ... elt[n]]``
return a tree that has a node ``lhs.symbol``, and
``n`` children. For each nonterminal element
``elt[i]`` in the production, the tree token has a
childless subtree with node value ``elt[i].symbol``; and
for each terminal element ``elt[j]``, the tree token has
a leaf token with type ``elt[j]``.
:param production: The CFG production that licenses the tree
token that should be returned.
:type production: Production
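
        For example, assuming ``parser`` is a ``RecursiveDescentParser`` and
        ``prod`` is the production ``NP -> Det N`` (both hypothetical names):

        >>> parser._production_to_tree(prod)   # doctest: +SKIP
        Tree('NP', [Tree('Det', []), Tree('N', [])])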
"""
children = []
for elt in production.rhs():
if isinstance(elt, Nonterminal):
children.append(Tree(elt.symbol(), []))
else:
# This will be matched.
children.append(elt)
return Tree(production.lhs().symbol(), children)
def trace(self, trace=2):
"""
Set the level of tracing output that should be generated when
parsing a text.
:type trace: int
:param trace: The trace level. A trace level of ``0`` will
generate no tracing output; and higher trace levels will
produce more verbose tracing output.
:rtype: None
"""
self._trace = trace
def _trace_fringe(self, tree, treeloc=None):
"""
Print trace output displaying the fringe of ``tree``. The
fringe of ``tree`` consists of all of its leaves and all of
its childless subtrees.
:rtype: None
"""
if treeloc == ():
print("*", end=" ")
if isinstance(tree, Tree):
if len(tree) == 0:
print(repr(Nonterminal(tree.label())), end=" ")
for i in range(len(tree)):
if treeloc is not None and i == treeloc[0]:
self._trace_fringe(tree[i], treeloc[1:])
else:
self._trace_fringe(tree[i])
else:
print(repr(tree), end=" ")
def _trace_tree(self, tree, frontier, operation):
"""
Print trace output displaying the parser's current state.
:param operation: A character identifying the operation that
generated the current state.
:rtype: None
"""
if self._trace == 2:
print(" %c [" % operation, end=" ")
else:
print(" [", end=" ")
if len(frontier) > 0:
self._trace_fringe(tree, frontier[0])
else:
self._trace_fringe(tree)
print("]")
def _trace_start(self, tree, frontier, text):
print("Parsing %r" % " ".join(text))
if self._trace > 2:
print("Start:")
if self._trace > 1:
self._trace_tree(tree, frontier, " ")
def _trace_expand(self, tree, frontier, production):
if self._trace > 2:
print("Expand: %s" % production)
if self._trace > 1:
self._trace_tree(tree, frontier, "E")
def _trace_match(self, tree, frontier, tok):
if self._trace > 2:
print("Match: %r" % tok)
if self._trace > 1:
self._trace_tree(tree, frontier, "M")
def _trace_succeed(self, tree, frontier):
if self._trace > 2:
print("GOOD PARSE:")
if self._trace == 1:
print("Found a parse:\n%s" % tree)
if self._trace > 1:
self._trace_tree(tree, frontier, "+")
def _trace_backtrack(self, tree, frontier, toks=None):
if self._trace > 2:
if toks:
print("Backtrack: %r match failed" % toks[0])
else:
print("Backtrack")
##//////////////////////////////////////////////////////
## Stepping Recursive Descent Parser
##//////////////////////////////////////////////////////
class SteppingRecursiveDescentParser(RecursiveDescentParser):
"""
A ``RecursiveDescentParser`` that allows you to step through the
parsing process, performing a single operation at a time.
The ``initialize`` method is used to start parsing a text.
``expand`` expands the first element on the frontier using a single
CFG production, and ``match`` matches the first element on the
frontier against the next text token. ``backtrack`` undoes the most
recent expand or match operation. ``step`` performs a single
expand, match, or backtrack operation. ``parses`` returns the set
of parses that have been found by the parser.
    :ivar _history: A list of ``(rtext, tree, frontier)`` triples,
containing the previous states of the parser. This history is
used to implement the ``backtrack`` operation.
:ivar _tried_e: A record of all productions that have been tried
for a given tree. This record is used by ``expand`` to perform
the next untried production.
:ivar _tried_m: A record of what tokens have been matched for a
given tree. This record is used by ``step`` to decide whether
or not to match a token.
:see: ``nltk.grammar``
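
    A minimal stepping sketch, assuming a toy ``CFG`` bound to ``grammar``
    and a tokenized sentence ``sent`` (both hypothetical names):

    >>> srd = SteppingRecursiveDescentParser(grammar)  # doctest: +SKIP
    >>> srd.initialize(sent)                           # doctest: +SKIP
    >>> while srd.step() is not None:                  # doctest: +SKIP
    ...     pass
    >>> for tree in srd.parses():                      # doctest: +SKIP
    ...     print(tree)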
"""
def __init__(self, grammar, trace=0):
super().__init__(grammar, trace)
self._rtext = None
self._tree = None
self._frontier = [()]
self._tried_e = {}
self._tried_m = {}
self._history = []
self._parses = []
# [XX] TEMPORARY HACK WARNING! This should be replaced with
# something nicer when we get the chance.
def _freeze(self, tree):
c = tree.copy()
# for pos in c.treepositions('leaves'):
# c[pos] = c[pos].freeze()
return ImmutableTree.convert(c)
def parse(self, tokens):
tokens = list(tokens)
self.initialize(tokens)
while self.step() is not None:
pass
return self.parses()
def initialize(self, tokens):
"""
Start parsing a given text. This sets the parser's tree to
the start symbol, its frontier to the root node, and its
        remaining text to ``tokens``.
"""
self._rtext = tokens
start = self._grammar.start().symbol()
self._tree = Tree(start, [])
self._frontier = [()]
self._tried_e = {}
self._tried_m = {}
self._history = []
self._parses = []
if self._trace:
self._trace_start(self._tree, self._frontier, self._rtext)
def remaining_text(self):
"""
:return: The portion of the text that is not yet covered by the
tree.
:rtype: list(str)
"""
return self._rtext
def frontier(self):
"""
:return: A list of the tree locations of all subtrees that
have not yet been expanded, and all leaves that have not
yet been matched.
:rtype: list(tuple(int))
"""
return self._frontier
def tree(self):
"""
:return: A partial structure for the text that is
currently being parsed. The elements specified by the
frontier have not yet been expanded or matched.
:rtype: Tree
"""
return self._tree
def step(self):
"""
Perform a single parsing operation. If an untried match is
possible, then perform the match, and return the matched
token. If an untried expansion is possible, then perform the
expansion, and return the production that it is based on. If
backtracking is possible, then backtrack, and return True.
Otherwise, return None.
:return: None if no operation was performed; a token if a match
was performed; a production if an expansion was performed;
and True if a backtrack operation was performed.
        :rtype: Production or str or bool
"""
# Try matching (if we haven't already)
if self.untried_match():
token = self.match()
if token is not None:
return token
# Try expanding.
production = self.expand()
if production is not None:
return production
# Try backtracking
if self.backtrack():
self._trace_backtrack(self._tree, self._frontier)
return True
# Nothing left to do.
return None
def expand(self, production=None):
"""
Expand the first element of the frontier. In particular, if
the first element of the frontier is a subtree whose node type
is equal to ``production``'s left hand side, then add a child
to that subtree for each element of ``production``'s right hand
side. If ``production`` is not specified, then use the first
untried expandable production. If all expandable productions
have been tried, do nothing.
:return: The production used to expand the frontier, if an
expansion was performed. If no expansion was performed,
return None.
:rtype: Production or None
"""
# Make sure we *can* expand.
if len(self._frontier) == 0:
return None
if not isinstance(self._tree[self._frontier[0]], Tree):
return None
# If they didn't specify a production, check all untried ones.
if production is None:
productions = self.untried_expandable_productions()
else:
productions = [production]
parses = []
for prod in productions:
# Record that we've tried this production now.
self._tried_e.setdefault(self._freeze(self._tree), []).append(prod)
# Try expanding.
for _result in self._expand(self._rtext, self._tree, self._frontier, prod):
return prod
# We didn't expand anything.
return None
def match(self):
"""
Match the first element of the frontier. In particular, if
the first element of the frontier has the same type as the
next text token, then substitute the text token into the tree.
:return: The token matched, if a match operation was
performed. If no match was performed, return None
:rtype: str or None
"""
# Record that we've tried matching this token.
tok = self._rtext[0]
self._tried_m.setdefault(self._freeze(self._tree), []).append(tok)
# Make sure we *can* match.
if len(self._frontier) == 0:
return None
if isinstance(self._tree[self._frontier[0]], Tree):
return None
for _result in self._match(self._rtext, self._tree, self._frontier):
# Return the token we just matched.
return self._history[-1][0][0]
return None
def backtrack(self):
"""
Return the parser to its state before the most recent
        match or expand operation. Calling ``backtrack`` repeatedly returns
        the parser to successively earlier states. If no match or
        expand operations have been performed, ``backtrack`` will make no
changes.
:return: true if an operation was successfully undone.
:rtype: bool
"""
if len(self._history) == 0:
return False
(self._rtext, self._tree, self._frontier) = self._history.pop()
return True
def expandable_productions(self):
"""
:return: A list of all the productions for which expansions
are available for the current parser state.
:rtype: list(Production)
"""
# Make sure we *can* expand.
if len(self._frontier) == 0:
return []
frontier_child = self._tree[self._frontier[0]]
if len(self._frontier) == 0 or not isinstance(frontier_child, Tree):
return []
return [
p
for p in self._grammar.productions()
if p.lhs().symbol() == frontier_child.label()
]
def untried_expandable_productions(self):
"""
:return: A list of all the untried productions for which
expansions are available for the current parser state.
:rtype: list(Production)
"""
tried_expansions = self._tried_e.get(self._freeze(self._tree), [])
return [p for p in self.expandable_productions() if p not in tried_expansions]
def untried_match(self):
"""
:return: Whether the first element of the frontier is a token
that has not yet been matched.
:rtype: bool
"""
if len(self._rtext) == 0:
return False
tried_matches = self._tried_m.get(self._freeze(self._tree), [])
return self._rtext[0] not in tried_matches
def currently_complete(self):
"""
:return: Whether the parser's current state represents a
complete parse.
:rtype: bool
"""
return len(self._frontier) == 0 and len(self._rtext) == 0
def _parse(self, remaining_text, tree, frontier):
"""
A stub version of ``_parse`` that sets the parsers current
state to the given arguments. In ``RecursiveDescentParser``,
the ``_parse`` method is used to recursively continue parsing a
text. ``SteppingRecursiveDescentParser`` overrides it to
capture these recursive calls. It records the parser's old
state in the history (to allow for backtracking), and updates
the parser's new state using the given arguments. Finally, it
returns ``[1]``, which is used by ``match`` and ``expand`` to
detect whether their operations were successful.
:return: ``[1]``
        :rtype: list(int)
"""
self._history.append((self._rtext, self._tree, self._frontier))
self._rtext = remaining_text
self._tree = tree
self._frontier = frontier
# Is it a good parse? If so, record it.
if len(frontier) == 0 and len(remaining_text) == 0:
self._parses.append(tree)
self._trace_succeed(self._tree, self._frontier)
return [1]
def parses(self):
"""
:return: An iterator of the parses that have been found by this
parser so far.
        :rtype: iter(Tree)
"""
return iter(self._parses)
def set_grammar(self, grammar):
"""
Change the grammar used to parse texts.
:param grammar: The new grammar.
:type grammar: CFG
"""
self._grammar = grammar
##//////////////////////////////////////////////////////
## Demonstration Code
##//////////////////////////////////////////////////////
def demo():
"""
A demonstration of the recursive descent parser.
"""
from nltk import CFG, parse
grammar = CFG.fromstring(
"""
S -> NP VP
NP -> Det N | Det N PP
VP -> V NP | V NP PP
PP -> P NP
NP -> 'I'
N -> 'man' | 'park' | 'telescope' | 'dog'
Det -> 'the' | 'a'
P -> 'in' | 'with'
V -> 'saw'
"""
)
for prod in grammar.productions():
print(prod)
sent = "I saw a man in the park".split()
parser = parse.RecursiveDescentParser(grammar, trace=2)
for p in parser.parse(sent):
print(p)
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,478 @@
# Natural Language Toolkit: Shift-Reduce Parser
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.grammar import Nonterminal
from nltk.parse.api import ParserI
from nltk.tree import Tree
##//////////////////////////////////////////////////////
## Shift/Reduce Parser
##//////////////////////////////////////////////////////
class ShiftReduceParser(ParserI):
"""
A simple bottom-up CFG parser that uses two operations, "shift"
and "reduce", to find a single parse for a text.
``ShiftReduceParser`` maintains a stack, which records the
structure of a portion of the text. This stack is a list of
strings and Trees that collectively cover a portion of
the text. For example, while parsing the sentence "the dog saw
the man" with a typical grammar, ``ShiftReduceParser`` will produce
the following stack, which covers "the dog saw"::
[(NP: (Det: 'the') (N: 'dog')), (V: 'saw')]
``ShiftReduceParser`` attempts to extend the stack to cover the
entire text, and to combine the stack elements into a single tree,
producing a complete parse for the sentence.
Initially, the stack is empty. It is extended to cover the text,
from left to right, by repeatedly applying two operations:
- "shift" moves a token from the beginning of the text to the
end of the stack.
- "reduce" uses a CFG production to combine the rightmost stack
elements into a single Tree.
Often, more than one operation can be performed on a given stack.
In this case, ``ShiftReduceParser`` uses the following heuristics
to decide which operation to perform:
- Only shift if no reductions are available.
- If multiple reductions are available, then apply the reduction
whose CFG production is listed earliest in the grammar.
Note that these heuristics are not guaranteed to choose an
operation that leads to a parse of the text. Also, if multiple
    parses exist, ``ShiftReduceParser`` will return at most one of
them.
:see: ``nltk.grammar``
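
    A minimal usage sketch, assuming a toy ``CFG`` bound to ``grammar``
    (a hypothetical name) whose productions cover the sentence:

    >>> sr = ShiftReduceParser(grammar)                       # doctest: +SKIP
    >>> for tree in sr.parse("the dog saw the man".split()):  # doctest: +SKIP
    ...     print(tree)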
"""
def __init__(self, grammar, trace=0):
"""
Create a new ``ShiftReduceParser``, that uses ``grammar`` to
parse texts.
:type grammar: Grammar
:param grammar: The grammar used to parse texts.
:type trace: int
:param trace: The level of tracing that should be used when
parsing a text. ``0`` will generate no tracing output;
and higher numbers will produce more verbose tracing
output.
"""
self._grammar = grammar
self._trace = trace
self._check_grammar()
def grammar(self):
return self._grammar
def parse(self, tokens):
tokens = list(tokens)
self._grammar.check_coverage(tokens)
# initialize the stack.
stack = []
remaining_text = tokens
# Trace output.
if self._trace:
print("Parsing %r" % " ".join(tokens))
self._trace_stack(stack, remaining_text)
# iterate through the text, pushing the token onto
# the stack, then reducing the stack.
while len(remaining_text) > 0:
self._shift(stack, remaining_text)
while self._reduce(stack, remaining_text):
pass
# Did we reduce everything?
if len(stack) == 1:
# Did we end up with the right category?
if stack[0].label() == self._grammar.start().symbol():
yield stack[0]
def _shift(self, stack, remaining_text):
"""
Move a token from the beginning of ``remaining_text`` to the
end of ``stack``.
:type stack: list(str and Tree)
:param stack: A list of strings and Trees, encoding
the structure of the text that has been parsed so far.
:type remaining_text: list(str)
:param remaining_text: The portion of the text that is not yet
covered by ``stack``.
:rtype: None
"""
stack.append(remaining_text[0])
remaining_text.remove(remaining_text[0])
if self._trace:
self._trace_shift(stack, remaining_text)
def _match_rhs(self, rhs, rightmost_stack):
"""
:rtype: bool
:return: true if the right hand side of a CFG production
matches the rightmost elements of the stack. ``rhs``
matches ``rightmost_stack`` if they are the same length,
and each element of ``rhs`` matches the corresponding
element of ``rightmost_stack``. A nonterminal element of
``rhs`` matches any Tree whose node value is equal
to the nonterminal's symbol. A terminal element of ``rhs``
matches any string whose type is equal to the terminal.
:type rhs: list(terminal and Nonterminal)
:param rhs: The right hand side of a CFG production.
:type rightmost_stack: list(string and Tree)
:param rightmost_stack: The rightmost elements of the parser's
stack.
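
        For example, assuming ``sr`` is a ``ShiftReduceParser`` and ``prod``
        is the production ``NP -> Det N`` (both hypothetical names):

        >>> sr._match_rhs(prod.rhs(), [Tree('Det', ['the']), Tree('N', ['dog'])])  # doctest: +SKIP
        True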
"""
if len(rightmost_stack) != len(rhs):
return False
for i in range(len(rightmost_stack)):
if isinstance(rightmost_stack[i], Tree):
if not isinstance(rhs[i], Nonterminal):
return False
if rightmost_stack[i].label() != rhs[i].symbol():
return False
else:
if isinstance(rhs[i], Nonterminal):
return False
if rightmost_stack[i] != rhs[i]:
return False
return True
def _reduce(self, stack, remaining_text, production=None):
"""
Find a CFG production whose right hand side matches the
rightmost stack elements; and combine those stack elements
into a single Tree, with the node specified by the
production's left-hand side. If more than one CFG production
matches the stack, then use the production that is listed
earliest in the grammar. The new Tree replaces the
elements in the stack.
:rtype: Production or None
:return: If a reduction is performed, then return the CFG
production that the reduction is based on; otherwise,
            return None.
:type stack: list(string and Tree)
:param stack: A list of strings and Trees, encoding
the structure of the text that has been parsed so far.
:type remaining_text: list(str)
:param remaining_text: The portion of the text that is not yet
covered by ``stack``.
"""
if production is None:
productions = self._grammar.productions()
else:
productions = [production]
# Try each production, in order.
for production in productions:
rhslen = len(production.rhs())
# check if the RHS of a production matches the top of the stack
if self._match_rhs(production.rhs(), stack[-rhslen:]):
# combine the tree to reflect the reduction
tree = Tree(production.lhs().symbol(), stack[-rhslen:])
stack[-rhslen:] = [tree]
# We reduced something
if self._trace:
self._trace_reduce(stack, production, remaining_text)
return production
# We didn't reduce anything
return None
def trace(self, trace=2):
"""
Set the level of tracing output that should be generated when
parsing a text.
:type trace: int
:param trace: The trace level. A trace level of ``0`` will
generate no tracing output; and higher trace levels will
produce more verbose tracing output.
:rtype: None
"""
# 1: just show shifts.
# 2: show shifts & reduces
        # 3: display which tokens & productions are shifted/reduced
self._trace = trace
def _trace_stack(self, stack, remaining_text, marker=" "):
"""
Print trace output displaying the given stack and text.
:rtype: None
:param marker: A character that is printed to the left of the
stack. This is used with trace level 2 to print 'S'
before shifted stacks and 'R' before reduced stacks.
"""
s = " " + marker + " [ "
for elt in stack:
if isinstance(elt, Tree):
s += repr(Nonterminal(elt.label())) + " "
else:
s += repr(elt) + " "
s += "* " + " ".join(remaining_text) + "]"
print(s)
def _trace_shift(self, stack, remaining_text):
"""
Print trace output displaying that a token has been shifted.
:rtype: None
"""
if self._trace > 2:
print("Shift %r:" % stack[-1])
if self._trace == 2:
self._trace_stack(stack, remaining_text, "S")
elif self._trace > 0:
self._trace_stack(stack, remaining_text)
def _trace_reduce(self, stack, production, remaining_text):
"""
Print trace output displaying that ``production`` was used to
reduce ``stack``.
:rtype: None
"""
if self._trace > 2:
            rhs = " ".join(str(el) for el in production.rhs())
print(f"Reduce {production.lhs()!r} <- {rhs}")
if self._trace == 2:
self._trace_stack(stack, remaining_text, "R")
elif self._trace > 1:
self._trace_stack(stack, remaining_text)
def _check_grammar(self):
"""
Check to make sure that all of the CFG productions are
potentially useful. If any productions can never be used,
then print a warning.
:rtype: None
"""
productions = self._grammar.productions()
# Any production whose RHS is an extension of another production's RHS
# will never be used.
for i in range(len(productions)):
for j in range(i + 1, len(productions)):
rhs1 = productions[i].rhs()
rhs2 = productions[j].rhs()
if rhs1[: len(rhs2)] == rhs2:
print("Warning: %r will never be used" % productions[i])
##//////////////////////////////////////////////////////
## Stepping Shift/Reduce Parser
##//////////////////////////////////////////////////////
class SteppingShiftReduceParser(ShiftReduceParser):
"""
    A ``ShiftReduceParser`` that allows you to step through the parsing
process, performing a single operation at a time. It also allows
you to change the parser's grammar midway through parsing a text.
The ``initialize`` method is used to start parsing a text.
``shift`` performs a single shift operation, and ``reduce`` performs
a single reduce operation. ``step`` will perform a single reduce
operation if possible; otherwise, it will perform a single shift
operation. ``parses`` returns the set of parses that have been
found by the parser.
:ivar _history: A list of ``(stack, remaining_text)`` pairs,
containing all of the previous states of the parser. This
history is used to implement the ``undo`` operation.
:see: ``nltk.grammar``
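
    A minimal stepping sketch, assuming a toy ``CFG`` bound to ``grammar``
    and a tokenized sentence ``sent`` (both hypothetical names):

    >>> srp = SteppingShiftReduceParser(grammar)   # doctest: +SKIP
    >>> srp.initialize(sent)                       # doctest: +SKIP
    >>> while srp.step():                          # doctest: +SKIP
    ...     pass
    >>> list(srp.parses())                         # doctest: +SKIP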
"""
def __init__(self, grammar, trace=0):
super().__init__(grammar, trace)
self._stack = None
self._remaining_text = None
self._history = []
def parse(self, tokens):
tokens = list(tokens)
self.initialize(tokens)
while self.step():
pass
return self.parses()
def stack(self):
"""
:return: The parser's stack.
:rtype: list(str and Tree)
"""
return self._stack
def remaining_text(self):
"""
:return: The portion of the text that is not yet covered by the
stack.
:rtype: list(str)
"""
return self._remaining_text
def initialize(self, tokens):
"""
Start parsing a given text. This sets the parser's stack to
``[]`` and sets its remaining text to ``tokens``.
"""
self._stack = []
self._remaining_text = tokens
self._history = []
def step(self):
"""
Perform a single parsing operation. If a reduction is
possible, then perform that reduction, and return the
production that it is based on. Otherwise, if a shift is
possible, then perform it, and return True. Otherwise,
return False.
:return: False if no operation was performed; True if a shift was
performed; and the CFG production used to reduce if a
reduction was performed.
:rtype: Production or bool
"""
return self.reduce() or self.shift()
def shift(self):
"""
Move a token from the beginning of the remaining text to the
end of the stack. If there are no more tokens in the
remaining text, then do nothing.
:return: True if the shift operation was successful.
:rtype: bool
"""
if len(self._remaining_text) == 0:
return False
self._history.append((self._stack[:], self._remaining_text[:]))
self._shift(self._stack, self._remaining_text)
return True
def reduce(self, production=None):
"""
Use ``production`` to combine the rightmost stack elements into
a single Tree. If ``production`` does not match the
rightmost stack elements, then do nothing.
:return: The production used to reduce the stack, if a
reduction was performed. If no reduction was performed,
return None.
:rtype: Production or None
"""
self._history.append((self._stack[:], self._remaining_text[:]))
return_val = self._reduce(self._stack, self._remaining_text, production)
if not return_val:
self._history.pop()
return return_val
def undo(self):
"""
Return the parser to its state before the most recent
        shift or reduce operation. Calling ``undo`` repeatedly returns
the parser to successively earlier states. If no shift or
reduce operations have been performed, ``undo`` will make no
changes.
:return: true if an operation was successfully undone.
:rtype: bool
"""
if len(self._history) == 0:
return False
(self._stack, self._remaining_text) = self._history.pop()
return True
def reducible_productions(self):
"""
:return: A list of the productions for which reductions are
available for the current parser state.
:rtype: list(Production)
"""
productions = []
for production in self._grammar.productions():
rhslen = len(production.rhs())
if self._match_rhs(production.rhs(), self._stack[-rhslen:]):
productions.append(production)
return productions
def parses(self):
"""
:return: An iterator of the parses that have been found by this
parser so far.
:rtype: iter(Tree)
"""
if (
len(self._remaining_text) == 0
and len(self._stack) == 1
and self._stack[0].label() == self._grammar.start().symbol()
):
yield self._stack[0]
# copied from nltk.parser
def set_grammar(self, grammar):
"""
Change the grammar used to parse texts.
:param grammar: The new grammar.
:type grammar: CFG
"""
self._grammar = grammar
##//////////////////////////////////////////////////////
## Demonstration Code
##//////////////////////////////////////////////////////
def demo():
"""
A demonstration of the shift-reduce parser.
"""
from nltk import CFG, parse
grammar = CFG.fromstring(
"""
S -> NP VP
NP -> Det N | Det N PP
VP -> V NP | V NP PP
PP -> P NP
NP -> 'I'
N -> 'man' | 'park' | 'telescope' | 'dog'
Det -> 'the' | 'a'
P -> 'in' | 'with'
V -> 'saw'
"""
)
sent = "I saw a man in the park".split()
parser = parse.ShiftReduceParser(grammar, trace=2)
for p in parser.parse(sent):
print(p)
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,468 @@
# Natural Language Toolkit: Interface to the Stanford Parser
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Xu <xxu@student.unimelb.edu.au>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import os
import tempfile
import warnings
from subprocess import PIPE
from nltk.internals import (
_java_options,
config_java,
find_jar_iter,
find_jars_within_path,
java,
)
from nltk.parse.api import ParserI
from nltk.parse.dependencygraph import DependencyGraph
from nltk.tree import Tree
_stanford_url = "https://nlp.stanford.edu/software/lex-parser.shtml"
class GenericStanfordParser(ParserI):
"""Interface to the Stanford Parser"""
_MODEL_JAR_PATTERN = r"stanford-parser-(\d+)(\.(\d+))+-models\.jar"
_JAR = r"stanford-parser\.jar"
_MAIN_CLASS = "edu.stanford.nlp.parser.lexparser.LexicalizedParser"
_USE_STDIN = False
_DOUBLE_SPACED_OUTPUT = False
def __init__(
self,
path_to_jar=None,
path_to_models_jar=None,
model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
encoding="utf8",
verbose=False,
java_options="-mx4g",
corenlp_options="",
):
# find the most recent code and model jar
stanford_jar = max(
find_jar_iter(
self._JAR,
path_to_jar,
env_vars=("STANFORD_PARSER", "STANFORD_CORENLP"),
searchpath=(),
url=_stanford_url,
verbose=verbose,
is_regex=True,
),
key=lambda model_path: os.path.dirname(model_path),
)
model_jar = max(
find_jar_iter(
self._MODEL_JAR_PATTERN,
path_to_models_jar,
env_vars=("STANFORD_MODELS", "STANFORD_CORENLP"),
searchpath=(),
url=_stanford_url,
verbose=verbose,
is_regex=True,
),
key=lambda model_path: os.path.dirname(model_path),
)
# self._classpath = (stanford_jar, model_jar)
# Adding logging jar files to classpath
stanford_dir = os.path.split(stanford_jar)[0]
self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir))
self.model_path = model_path
self._encoding = encoding
self.corenlp_options = corenlp_options
self.java_options = java_options
def _parse_trees_output(self, output_):
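        """
        Split the raw output of the Stanford tool into per-sentence
        iterators of trees. When ``_DOUBLE_SPACED_OUTPUT`` is set, a single
        blank line separates trees within a sentence and a double blank
        line separates sentences; otherwise each blank line ends a
        sentence containing a single tree.
        """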
res = []
cur_lines = []
cur_trees = []
blank = False
for line in output_.splitlines(False):
if line == "":
if blank:
res.append(iter(cur_trees))
cur_trees = []
blank = False
elif self._DOUBLE_SPACED_OUTPUT:
cur_trees.append(self._make_tree("\n".join(cur_lines)))
cur_lines = []
blank = True
else:
res.append(iter([self._make_tree("\n".join(cur_lines))]))
cur_lines = []
else:
cur_lines.append(line)
blank = False
return iter(res)
def parse_sents(self, sentences, verbose=False):
"""
Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
list where each sentence is a list of words.
Each sentence will be automatically tagged with this StanfordParser instance's
tagger.
        If whitespace exists inside a token, then the token will be treated as
separate tokens.
:param sentences: Input sentences to parse
:type sentences: list(list(str))
:rtype: iter(iter(Tree))
"""
cmd = [
self._MAIN_CLASS,
"-model",
self.model_path,
"-sentences",
"newline",
"-outputFormat",
self._OUTPUT_FORMAT,
"-tokenized",
"-escaper",
"edu.stanford.nlp.process.PTBEscapingProcessor",
]
return self._parse_trees_output(
self._execute(
cmd, "\n".join(" ".join(sentence) for sentence in sentences), verbose
)
)
def raw_parse(self, sentence, verbose=False):
"""
Use StanfordParser to parse a sentence. Takes a sentence as a string;
before parsing, it will be automatically tokenized and tagged by
the Stanford Parser.
:param sentence: Input sentence to parse
:type sentence: str
:rtype: iter(Tree)
"""
return next(self.raw_parse_sents([sentence], verbose))
def raw_parse_sents(self, sentences, verbose=False):
"""
Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
list of strings.
Each sentence will be automatically tokenized and tagged by the Stanford Parser.
:param sentences: Input sentences to parse
:type sentences: list(str)
:rtype: iter(iter(Tree))
"""
cmd = [
self._MAIN_CLASS,
"-model",
self.model_path,
"-sentences",
"newline",
"-outputFormat",
self._OUTPUT_FORMAT,
]
return self._parse_trees_output(
self._execute(cmd, "\n".join(sentences), verbose)
)
def tagged_parse(self, sentence, verbose=False):
"""
Use StanfordParser to parse a sentence. Takes a sentence as a list of
(word, tag) tuples; the sentence must have already been tokenized and
tagged.
:param sentence: Input sentence to parse
:type sentence: list(tuple(str, str))
:rtype: iter(Tree)
"""
return next(self.tagged_parse_sents([sentence], verbose))
def tagged_parse_sents(self, sentences, verbose=False):
"""
Use StanfordParser to parse multiple sentences. Takes multiple sentences
where each sentence is a list of (word, tag) tuples.
The sentences must have already been tokenized and tagged.
:param sentences: Input sentences to parse
:type sentences: list(list(tuple(str, str)))
:rtype: iter(iter(Tree))
"""
tag_separator = "/"
cmd = [
self._MAIN_CLASS,
"-model",
self.model_path,
"-sentences",
"newline",
"-outputFormat",
self._OUTPUT_FORMAT,
"-tokenized",
"-tagSeparator",
tag_separator,
"-tokenizerFactory",
"edu.stanford.nlp.process.WhitespaceTokenizer",
"-tokenizerMethod",
"newCoreLabelTokenizerFactory",
]
# We don't need to escape slashes as "splitting is done on the last instance of the character in the token"
return self._parse_trees_output(
self._execute(
cmd,
"\n".join(
" ".join(tag_separator.join(tagged) for tagged in sentence)
for sentence in sentences
),
verbose,
)
)
def _execute(self, cmd, input_, verbose=False):
encoding = self._encoding
cmd.extend(["-encoding", encoding])
if self.corenlp_options:
cmd.extend(self.corenlp_options.split())
default_options = " ".join(_java_options)
# Configure java.
config_java(options=self.java_options, verbose=verbose)
# Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
# Write the actual sentences to the temporary input file
if isinstance(input_, str) and encoding:
input_ = input_.encode(encoding)
input_file.write(input_)
input_file.flush()
# Run the tagger and get the output.
if self._USE_STDIN:
input_file.seek(0)
stdout, stderr = java(
cmd,
classpath=self._classpath,
stdin=input_file,
stdout=PIPE,
stderr=PIPE,
)
else:
cmd.append(input_file.name)
stdout, stderr = java(
cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE
)
stdout = stdout.replace(b"\xc2\xa0", b" ")
stdout = stdout.replace(b"\x00\xa0", b" ")
stdout = stdout.decode(encoding)
os.unlink(input_file.name)
# Return java configurations to their default values.
config_java(options=default_options, verbose=False)
return stdout
class StanfordParser(GenericStanfordParser):
"""
>>> parser=StanfordParser(
... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
... ) # doctest: +SKIP
>>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE +SKIP
[Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]
>>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((
... "the quick brown fox jumps over the lazy dog",
... "the quick grey wolf jumps over the lazy fox"
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
[Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',
[Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP',
[Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']),
Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]
>>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((
... "I 'm a dog".split(),
... "This is my friends ' cat ( the tabby )".split(),
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
[Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
[Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),
Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', [Tree('', []),
Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', [])])])])])])])]
>>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
... (
... ("The", "DT"),
... ("quick", "JJ"),
... ("brown", "JJ"),
... ("fox", "NN"),
... ("jumped", "VBD"),
... ("over", "IN"),
... ("the", "DT"),
... ("lazy", "JJ"),
... ("dog", "NN"),
... (".", "."),
... ),
... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP
[Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',
[Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
"""
_OUTPUT_FORMAT = "penn"
def __init__(self, *args, **kwargs):
warnings.warn(
"The StanfordParser will be deprecated\n"
"Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.",
DeprecationWarning,
stacklevel=2,
)
super().__init__(*args, **kwargs)
def _make_tree(self, result):
return Tree.fromstring(result)
class StanfordDependencyParser(GenericStanfordParser):
"""
>>> dep_parser=StanfordDependencyParser(
... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
... ) # doctest: +SKIP
>>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
[Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]
>>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
[[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
>>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
... "The quick brown fox jumps over the lazy dog.",
... "The quick grey wolf jumps over the lazy fox."
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
[Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]),
Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])]
>>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
... "I 'm a dog".split(),
... "This is my friends ' cat ( the tabby )".split(),
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
[Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])]
>>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents((
... (
... ("The", "DT"),
... ("quick", "JJ"),
... ("brown", "JJ"),
... ("fox", "NN"),
... ("jumped", "VBD"),
... ("over", "IN"),
... ("the", "DT"),
... ("lazy", "JJ"),
... ("dog", "NN"),
... (".", "."),
... ),
... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP
[[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
"""
_OUTPUT_FORMAT = "conll2007"
def __init__(self, *args, **kwargs):
warnings.warn(
"The StanfordDependencyParser will be deprecated\n"
"Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
DeprecationWarning,
stacklevel=2,
)
super().__init__(*args, **kwargs)
def _make_tree(self, result):
return DependencyGraph(result, top_relation_label="root")
class StanfordNeuralDependencyParser(GenericStanfordParser):
"""
>>> from nltk.parse.stanford import StanfordNeuralDependencyParser # doctest: +SKIP
>>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')# doctest: +SKIP
>>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
[Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])]
>>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
[[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det',
(u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'),
u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')),
((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det',
(u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')), ((u'jumps', u'VBZ'),
u'punct', (u'.', u'.'))]]
>>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
... "The quick brown fox jumps over the lazy dog.",
... "The quick grey wolf jumps over the lazy fox."
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
[Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over',
'the', 'lazy']), '.']), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']),
Tree('fox', ['over', 'the', 'lazy']), '.'])]
>>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
... "I 'm a dog".split(),
... "This is my friends ' cat ( the tabby )".split(),
... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
[Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends',
['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]
"""
_OUTPUT_FORMAT = "conll"
_MAIN_CLASS = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
_JAR = r"stanford-corenlp-(\d+)(\.(\d+))+\.jar"
_MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)(\.(\d+))+-models\.jar"
_USE_STDIN = True
_DOUBLE_SPACED_OUTPUT = True
def __init__(self, *args, **kwargs):
warnings.warn(
"The StanfordNeuralDependencyParser will be deprecated\n"
"Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
DeprecationWarning,
stacklevel=2,
)
super().__init__(*args, **kwargs)
self.corenlp_options += "-annotators tokenize,ssplit,pos,depparse"
def tagged_parse_sents(self, sentences, verbose=False):
"""
Currently unimplemented because the neural dependency parser (and
the StanfordCoreNLP pipeline class) doesn't support passing in pre-
tagged tokens.
"""
raise NotImplementedError(
"tagged_parse[_sents] is not supported by "
"StanfordNeuralDependencyParser; use "
"parse[_sents] or raw_parse[_sents] instead."
)
def _make_tree(self, result):
return DependencyGraph(result, top_relation_label="ROOT")

View File

@@ -0,0 +1,793 @@
# Natural Language Toolkit: Arc-Standard and Arc-eager Transition Based Parsers
#
# Author: Long Duong <longdt219@gmail.com>
#
# Copyright (C) 2001-2025 NLTK Project
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import pickle
import tempfile
from copy import deepcopy
from operator import itemgetter
from os import remove
try:
from numpy import array
from scipy import sparse
from sklearn import svm
from sklearn.datasets import load_svmlight_file
except ImportError:
pass
from nltk.parse import DependencyEvaluator, DependencyGraph, ParserI
class Configuration:
"""
    Class for holding a configuration, which is a partial analysis of the input sentence.
    The transition-based parser aims at finding a sequence of operators that transforms the initial
    configuration into the terminal configuration.
The configuration includes:
    - Stack: for storing partially processed words
- Buffer: for storing remaining input words
- Set of arcs: for storing partially built dependency tree
    This class also provides a method to represent a configuration as a list of features.
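
    A minimal sketch of the initial state, assuming ``dg`` is a
    ``DependencyGraph`` (a hypothetical name) for a three-word sentence:

    >>> conf = Configuration(dg)   # doctest: +SKIP
    >>> conf.stack, conf.buffer    # doctest: +SKIP
    ([0], [1, 2, 3])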
"""
def __init__(self, dep_graph):
"""
:param dep_graph: the representation of an input in the form of dependency graph.
:type dep_graph: DependencyGraph where the dependencies are not specified.
"""
        # dep_graph.nodes contains the list of tokens for the sentence
self.stack = [0] # The root element
self.buffer = list(range(1, len(dep_graph.nodes))) # The rest is in the buffer
self.arcs = [] # empty set of arc
self._tokens = dep_graph.nodes
self._max_address = len(self.buffer)
def __str__(self):
return (
"Stack : "
+ str(self.stack)
+ " Buffer : "
+ str(self.buffer)
+ " Arcs : "
+ str(self.arcs)
)
def _check_informative(self, feat, flag=False):
"""
        Check whether a feature is informative.
        The flag controls whether "_" is informative or not.
"""
if feat is None:
return False
if feat == "":
return False
if flag is False:
if feat == "_":
return False
return True
def extract_features(self):
"""
        Extract the set of features for the current configuration. Implements the standard features described in
        Table 3.2 (page 31) of the Dependency Parsing book by Sandra Kubler, Ryan McDonald and Joakim Nivre.
Please note that these features are very basic.
:return: list(str)
"""
result = []
# Todo : can come up with more complicated features set for better
# performance.
if len(self.stack) > 0:
# Stack 0
stack_idx0 = self.stack[len(self.stack) - 1]
token = self._tokens[stack_idx0]
if self._check_informative(token["word"], True):
result.append("STK_0_FORM_" + token["word"])
if "lemma" in token and self._check_informative(token["lemma"]):
result.append("STK_0_LEMMA_" + token["lemma"])
if self._check_informative(token["tag"]):
result.append("STK_0_POS_" + token["tag"])
if "feats" in token and self._check_informative(token["feats"]):
feats = token["feats"].split("|")
for feat in feats:
result.append("STK_0_FEATS_" + feat)
# Stack 1
if len(self.stack) > 1:
stack_idx1 = self.stack[len(self.stack) - 2]
token = self._tokens[stack_idx1]
if self._check_informative(token["tag"]):
result.append("STK_1_POS_" + token["tag"])
# Left most, right most dependency of stack[0]
left_most = 1000000
right_most = -1
dep_left_most = ""
dep_right_most = ""
for wi, r, wj in self.arcs:
if wi == stack_idx0:
if (wj > wi) and (wj > right_most):
right_most = wj
dep_right_most = r
if (wj < wi) and (wj < left_most):
left_most = wj
dep_left_most = r
if self._check_informative(dep_left_most):
result.append("STK_0_LDEP_" + dep_left_most)
if self._check_informative(dep_right_most):
result.append("STK_0_RDEP_" + dep_right_most)
# Check Buffered 0
if len(self.buffer) > 0:
# Buffer 0
buffer_idx0 = self.buffer[0]
token = self._tokens[buffer_idx0]
if self._check_informative(token["word"], True):
result.append("BUF_0_FORM_" + token["word"])
if "lemma" in token and self._check_informative(token["lemma"]):
result.append("BUF_0_LEMMA_" + token["lemma"])
if self._check_informative(token["tag"]):
result.append("BUF_0_POS_" + token["tag"])
if "feats" in token and self._check_informative(token["feats"]):
feats = token["feats"].split("|")
for feat in feats:
result.append("BUF_0_FEATS_" + feat)
# Buffer 1
if len(self.buffer) > 1:
buffer_idx1 = self.buffer[1]
token = self._tokens[buffer_idx1]
if self._check_informative(token["word"], True):
result.append("BUF_1_FORM_" + token["word"])
if self._check_informative(token["tag"]):
result.append("BUF_1_POS_" + token["tag"])
if len(self.buffer) > 2:
buffer_idx2 = self.buffer[2]
token = self._tokens[buffer_idx2]
if self._check_informative(token["tag"]):
result.append("BUF_2_POS_" + token["tag"])
if len(self.buffer) > 3:
buffer_idx3 = self.buffer[3]
token = self._tokens[buffer_idx3]
if self._check_informative(token["tag"]):
result.append("BUF_3_POS_" + token["tag"])
            # Left most, right most dependency of buffer[0]
left_most = 1000000
right_most = -1
dep_left_most = ""
dep_right_most = ""
for wi, r, wj in self.arcs:
if wi == buffer_idx0:
if (wj > wi) and (wj > right_most):
right_most = wj
dep_right_most = r
if (wj < wi) and (wj < left_most):
left_most = wj
dep_left_most = r
if self._check_informative(dep_left_most):
result.append("BUF_0_LDEP_" + dep_left_most)
if self._check_informative(dep_right_most):
result.append("BUF_0_RDEP_" + dep_right_most)
return result
class Transition:
"""
    This class defines the set of transitions which are applied to a configuration to obtain another configuration.
    Note that for different parsing algorithms, the transitions are different.
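
    A hedged sketch of applying a shift, assuming ``conf`` is a
    ``Configuration`` (a hypothetical name) with a non-empty buffer:

    >>> op = Transition(TransitionParser.ARC_STANDARD)  # doctest: +SKIP
    >>> op.shift(conf)                                  # doctest: +SKIP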
"""
# Define set of transitions
LEFT_ARC = "LEFTARC"
RIGHT_ARC = "RIGHTARC"
SHIFT = "SHIFT"
REDUCE = "REDUCE"
def __init__(self, alg_option):
"""
        :param alg_option: the algorithm option of this parser. Currently supports the `arc-standard` and `arc-eager` algorithms
:type alg_option: str
"""
self._algo = alg_option
if alg_option not in [
TransitionParser.ARC_STANDARD,
TransitionParser.ARC_EAGER,
]:
raise ValueError(
" Currently we only support %s and %s "
% (TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER)
)
def left_arc(self, conf, relation):
"""
        Note that the algorithm for left-arc is quite similar for arc-standard and arc-eager, except for the precondition.
        :param conf: the current configuration
        :return: None if the transition is applied (the configuration is updated in place), or -1 if the precondition is not satisfied
"""
if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0):
return -1
if conf.buffer[0] == 0:
# here is the Root element
return -1
idx_wi = conf.stack[len(conf.stack) - 1]
flag = True
if self._algo == TransitionParser.ARC_EAGER:
for idx_parent, r, idx_child in conf.arcs:
if idx_child == idx_wi:
flag = False
if flag:
conf.stack.pop()
idx_wj = conf.buffer[0]
conf.arcs.append((idx_wj, relation, idx_wi))
else:
return -1
def right_arc(self, conf, relation):
"""
Note that the algorithm for right-arc is DIFFERENT for arc-standard and arc-eager
        :param conf: the current configuration
        :return: None if the transition is applied (the configuration is updated in place), or -1 if the precondition is not satisfied
"""
if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0):
return -1
if self._algo == TransitionParser.ARC_STANDARD:
idx_wi = conf.stack.pop()
idx_wj = conf.buffer[0]
conf.buffer[0] = idx_wi
conf.arcs.append((idx_wi, relation, idx_wj))
else: # arc-eager
idx_wi = conf.stack[len(conf.stack) - 1]
idx_wj = conf.buffer.pop(0)
conf.stack.append(idx_wj)
conf.arcs.append((idx_wi, relation, idx_wj))
def reduce(self, conf):
"""
Note that the algorithm for reduce is only available for arc-eager
        :param conf: the current configuration
        :return: None if the transition is applied (the configuration is updated in place), or -1 if the precondition is not satisfied
"""
if self._algo != TransitionParser.ARC_EAGER:
return -1
if len(conf.stack) <= 0:
return -1
idx_wi = conf.stack[len(conf.stack) - 1]
flag = False
for idx_parent, r, idx_child in conf.arcs:
if idx_child == idx_wi:
flag = True
if flag:
conf.stack.pop() # reduce it
else:
return -1
def shift(self, conf):
"""
Note that the algorithm for shift is the SAME for arc-standard and arc-eager
        :param conf: the current configuration
        :return: None if the transition is applied (the configuration is updated in place), or -1 if the precondition is not satisfied
"""
if len(conf.buffer) <= 0:
return -1
idx_wi = conf.buffer.pop(0)
conf.stack.append(idx_wi)
class TransitionParser(ParserI):
"""
    Class for a transition-based parser. Implements two algorithms, "arc-standard" and "arc-eager".
"""
ARC_STANDARD = "arc-standard"
ARC_EAGER = "arc-eager"
def __init__(self, algorithm):
"""
        :param algorithm: the algorithm option of this parser. Currently supports the `arc-standard` and `arc-eager` algorithms
:type algorithm: str
"""
if not (algorithm in [self.ARC_STANDARD, self.ARC_EAGER]):
raise ValueError(
" Currently we only support %s and %s "
% (self.ARC_STANDARD, self.ARC_EAGER)
)
self._algorithm = algorithm
self._dictionary = {}
self._transition = {}
self._match_transition = {}
def _get_dep_relation(self, idx_parent, idx_child, depgraph):
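        """
        Return the relation label of the arc ``idx_parent -> idx_child`` in
        ``depgraph``, or None if no such arc exists or if the child is the
        artificial root node.
        """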
p_node = depgraph.nodes[idx_parent]
c_node = depgraph.nodes[idx_child]
if c_node["word"] is None:
return None # Root word
if c_node["head"] == p_node["address"]:
return c_node["rel"]
else:
return None
def _convert_to_binary_features(self, features):
"""
        :param features: list of feature strings to be converted to binary features
:type features: list(str)
        :return: string of binary features in libsvm format, i.e. 'featureID:value' pairs
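
        For example, the feature IDs depend on the state of the instance's
        ``_dictionary``; on a fresh parser the first two distinct features
        are assigned IDs 0 and 1:

        >>> tp = TransitionParser('arc-standard')  # doctest: +SKIP
        >>> tp._convert_to_binary_features(['STK_0_POS_NN', 'BUF_0_POS_VB'])  # doctest: +SKIP
        '0:1.0 1:1.0'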
"""
unsorted_result = []
for feature in features:
self._dictionary.setdefault(feature, len(self._dictionary))
unsorted_result.append(self._dictionary[feature])
# Default value of each feature is 1.0
return " ".join(
str(featureID) + ":1.0" for featureID in sorted(unsorted_result)
)
def _is_projective(self, depgraph):
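        """
        Check whether ``depgraph`` is projective, i.e. whether no two
        dependency arcs cross when the words are laid out in linear order.
        """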
arc_list = []
for key in depgraph.nodes:
node = depgraph.nodes[key]
if "head" in node:
childIdx = node["address"]
parentIdx = node["head"]
if parentIdx is not None:
arc_list.append((parentIdx, childIdx))
for parentIdx, childIdx in arc_list:
# Ensure that childIdx < parentIdx
if childIdx > parentIdx:
temp = childIdx
childIdx = parentIdx
parentIdx = temp
for k in range(childIdx + 1, parentIdx):
for m in range(len(depgraph.nodes)):
if (m < childIdx) or (m > parentIdx):
if (k, m) in arc_list:
return False
if (m, k) in arc_list:
return False
return True
def _write_to_file(self, key, binary_features, input_file):
"""
        Write the binary features to the input file and update the transition dictionary.
"""
self._transition.setdefault(key, len(self._transition) + 1)
self._match_transition[self._transition[key]] = key
input_str = str(self._transition[key]) + " " + binary_features + "\n"
input_file.write(input_str.encode("utf-8"))
def _create_training_examples_arc_std(self, depgraphs, input_file):
"""
        Create the training examples in the libsvm format and write them to the input_file.
        Reference: Page 32, Chapter 3, Dependency Parsing by Sandra Kubler, Ryan McDonald and Joakim Nivre (2009)
"""
operation = Transition(self.ARC_STANDARD)
count_proj = 0
training_seq = []
for depgraph in depgraphs:
if not self._is_projective(depgraph):
continue
count_proj += 1
conf = Configuration(depgraph)
while len(conf.buffer) > 0:
b0 = conf.buffer[0]
features = conf.extract_features()
binary_features = self._convert_to_binary_features(features)
if len(conf.stack) > 0:
s0 = conf.stack[len(conf.stack) - 1]
# Left-arc operation
rel = self._get_dep_relation(b0, s0, depgraph)
if rel is not None:
key = Transition.LEFT_ARC + ":" + rel
self._write_to_file(key, binary_features, input_file)
operation.left_arc(conf, rel)
training_seq.append(key)
continue
# Right-arc operation
rel = self._get_dep_relation(s0, b0, depgraph)
if rel is not None:
precondition = True
# Get the max-index of buffer
maxID = conf._max_address
for w in range(maxID + 1):
if w != b0:
relw = self._get_dep_relation(b0, w, depgraph)
if relw is not None:
if (b0, relw, w) not in conf.arcs:
precondition = False
if precondition:
key = Transition.RIGHT_ARC + ":" + rel
self._write_to_file(key, binary_features, input_file)
operation.right_arc(conf, rel)
training_seq.append(key)
continue
# Shift operation as the default
key = Transition.SHIFT
self._write_to_file(key, binary_features, input_file)
operation.shift(conf)
training_seq.append(key)
print(" Number of training examples : " + str(len(depgraphs)))
print(" Number of valid (projective) examples : " + str(count_proj))
return training_seq
def _create_training_examples_arc_eager(self, depgraphs, input_file):
"""
        Create the training examples in the libsvm format and write them to the input_file.
        Reference: 'A Dynamic Oracle for Arc-Eager Dependency Parsing' by Yoav Goldberg and Joakim Nivre
"""
operation = Transition(self.ARC_EAGER)
countProj = 0
training_seq = []
for depgraph in depgraphs:
if not self._is_projective(depgraph):
continue
countProj += 1
conf = Configuration(depgraph)
while len(conf.buffer) > 0:
b0 = conf.buffer[0]
features = conf.extract_features()
binary_features = self._convert_to_binary_features(features)
if len(conf.stack) > 0:
s0 = conf.stack[len(conf.stack) - 1]
# Left-arc operation
rel = self._get_dep_relation(b0, s0, depgraph)
if rel is not None:
key = Transition.LEFT_ARC + ":" + rel
self._write_to_file(key, binary_features, input_file)
operation.left_arc(conf, rel)
training_seq.append(key)
continue
# Right-arc operation
rel = self._get_dep_relation(s0, b0, depgraph)
if rel is not None:
key = Transition.RIGHT_ARC + ":" + rel
self._write_to_file(key, binary_features, input_file)
operation.right_arc(conf, rel)
training_seq.append(key)
continue
# reduce operation
flag = False
for k in range(s0):
if self._get_dep_relation(k, b0, depgraph) is not None:
flag = True
if self._get_dep_relation(b0, k, depgraph) is not None:
flag = True
if flag:
key = Transition.REDUCE
self._write_to_file(key, binary_features, input_file)
operation.reduce(conf)
training_seq.append(key)
continue
# Shift operation as the default
key = Transition.SHIFT
self._write_to_file(key, binary_features, input_file)
operation.shift(conf)
training_seq.append(key)
print(" Number of training examples : " + str(len(depgraphs)))
print(" Number of valid (projective) examples : " + str(countProj))
return training_seq
def train(self, depgraphs, modelfile, verbose=True):
"""
        :param depgraphs: list of DependencyGraph as the training data
        :type depgraphs: list(DependencyGraph)
        :param modelfile: file name to save the trained model
        :type modelfile: str
"""
try:
input_file = tempfile.NamedTemporaryFile(
prefix="transition_parse.train", dir=tempfile.gettempdir(), delete=False
)
if self._algorithm == self.ARC_STANDARD:
self._create_training_examples_arc_std(depgraphs, input_file)
else:
self._create_training_examples_arc_eager(depgraphs, input_file)
input_file.close()
# Using the temporary file to train the libsvm classifier
x_train, y_train = load_svmlight_file(input_file.name)
# The parameter is set according to the paper:
# Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
# TODO: probability=True makes training very slow because it triggers
# internal cross-validation; the speed here needs to be improved.
model = svm.SVC(
kernel="poly",
degree=2,
coef0=0,
gamma=0.2,
C=0.5,
verbose=verbose,
probability=True,
)
model.fit(x_train, y_train)
# Save the model to file name (as pickle)
pickle.dump(model, open(modelfile, "wb"))
finally:
remove(input_file.name)
def parse(self, depgraphs, modelFile):
"""
:param depgraphs: the list of test sentences, each represented as a dependency graph where the 'head' information is a dummy value
:type depgraphs: list(DependencyGraph)
:param modelFile: the model file
:type modelFile: str
:return: list(DependencyGraph) with the 'head' and 'rel' information filled in
"""
result = []
# First load the model
model = pickle.load(open(modelFile, "rb"))
operation = Transition(self._algorithm)
for depgraph in depgraphs:
conf = Configuration(depgraph)
while len(conf.buffer) > 0:
features = conf.extract_features()
col = []
row = []
data = []
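# Encode the configuration as a one-row, one-hot sparse vector: every
# feature known from training (present in self._dictionary) contributes a
# 1.0 in its assigned column; unseen features are silently dropped.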
for feature in features:
if feature in self._dictionary:
col.append(self._dictionary[feature])
row.append(0)
data.append(1.0)
np_col = array(sorted(col)) # NB : index must be sorted
np_row = array(row)
np_data = array(data)
x_test = sparse.csr_matrix(
(np_data, (np_row, np_col)), shape=(1, len(self._dictionary))
)
# It would be better to use the decision function as follows, but it is not yet supported for sparse SVM
# Using decision function to build the votes array
# dec_func = model.decision_function(x_test)[0]
# votes = {}
# k = 0
# for i in range(len(model.classes_)):
# for j in range(i+1, len(model.classes_)):
# #if dec_func[k] > 0:
# votes.setdefault(i,0)
# votes[i] +=1
# else:
# votes.setdefault(j,0)
# votes[j] +=1
# k +=1
# Sort votes according to the values
# sorted_votes = sorted(votes.items(), key=itemgetter(1), reverse=True)
# We will use predict_proba instead of decision_function
prob_dict = {}
pred_prob = model.predict_proba(x_test)[0]
for i in range(len(pred_prob)):
prob_dict[i] = pred_prob[i]
sorted_Prob = sorted(prob_dict.items(), key=itemgetter(1), reverse=True)
# Note that SHIFT is always a valid operation
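# Try the predicted transitions in order of decreasing probability and
# apply the first one that is legal in the current configuration (the
# operations return -1 when their preconditions are not met).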
for y_pred_idx, confidence in sorted_Prob:
# y_pred = model.predict(x_test)[0]
# From the prediction match to the operation
y_pred = model.classes_[y_pred_idx]
if y_pred in self._match_transition:
strTransition = self._match_transition[y_pred]
baseTransition = strTransition.split(":")[0]
if baseTransition == Transition.LEFT_ARC:
if (
operation.left_arc(conf, strTransition.split(":")[1])
!= -1
):
break
elif baseTransition == Transition.RIGHT_ARC:
if (
operation.right_arc(conf, strTransition.split(":")[1])
!= -1
):
break
elif baseTransition == Transition.REDUCE:
if operation.reduce(conf) != -1:
break
elif baseTransition == Transition.SHIFT:
if operation.shift(conf) != -1:
break
else:
raise ValueError(
"The predicted transition is not recognized, expected errors"
)
# Finished with transitions; build the dependency graph from conf.arcs
new_depgraph = deepcopy(depgraph)
for key in new_depgraph.nodes:
node = new_depgraph.nodes[key]
node["rel"] = ""
# By default, every token depends on the root
node["head"] = 0
for head, rel, child in conf.arcs:
c_node = new_depgraph.nodes[child]
c_node["head"] = head
c_node["rel"] = rel
result.append(new_depgraph)
return result
def demo():
"""
>>> from nltk.parse import DependencyGraph, DependencyEvaluator
>>> from nltk.parse.transitionparser import TransitionParser, Configuration, Transition
>>> gold_sent = DependencyGraph(\"""
... Economic JJ 2 ATT
... news NN 3 SBJ
... has VBD 0 ROOT
... little JJ 5 ATT
... effect NN 3 OBJ
... on IN 5 ATT
... financial JJ 8 ATT
... markets NNS 6 PC
... . . 3 PU
... \""")
>>> conf = Configuration(gold_sent)
###################### Check the Initial Feature ########################
>>> print(', '.join(conf.extract_features()))
STK_0_POS_TOP, BUF_0_FORM_Economic, BUF_0_LEMMA_Economic, BUF_0_POS_JJ, BUF_1_FORM_news, BUF_1_POS_NN, BUF_2_POS_VBD, BUF_3_POS_JJ
###################### Check The Transition #######################
Check the Initialized Configuration
>>> print(conf)
Stack : [0] Buffer : [1, 2, 3, 4, 5, 6, 7, 8, 9] Arcs : []
A. Do some transition checks for ARC-STANDARD
>>> operation = Transition('arc-standard')
>>> operation.shift(conf)
>>> operation.left_arc(conf, "ATT")
>>> operation.shift(conf)
>>> operation.left_arc(conf,"SBJ")
>>> operation.shift(conf)
>>> operation.shift(conf)
>>> operation.left_arc(conf, "ATT")
>>> operation.shift(conf)
>>> operation.shift(conf)
>>> operation.shift(conf)
>>> operation.left_arc(conf, "ATT")
Middle Configuration and Features Check
>>> print(conf)
Stack : [0, 3, 5, 6] Buffer : [8, 9] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7)]
>>> print(', '.join(conf.extract_features()))
STK_0_FORM_on, STK_0_LEMMA_on, STK_0_POS_IN, STK_1_POS_NN, BUF_0_FORM_markets, BUF_0_LEMMA_markets, BUF_0_POS_NNS, BUF_1_FORM_., BUF_1_POS_., BUF_0_LDEP_ATT
>>> operation.right_arc(conf, "PC")
>>> operation.right_arc(conf, "ATT")
>>> operation.right_arc(conf, "OBJ")
>>> operation.shift(conf)
>>> operation.right_arc(conf, "PU")
>>> operation.right_arc(conf, "ROOT")
>>> operation.shift(conf)
Terminated Configuration Check
>>> print(conf)
Stack : [0] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7), (6, 'PC', 8), (5, 'ATT', 6), (3, 'OBJ', 5), (3, 'PU', 9), (0, 'ROOT', 3)]
B. Do some transition checks for ARC-EAGER
>>> conf = Configuration(gold_sent)
>>> operation = Transition('arc-eager')
>>> operation.shift(conf)
>>> operation.left_arc(conf,'ATT')
>>> operation.shift(conf)
>>> operation.left_arc(conf,'SBJ')
>>> operation.right_arc(conf,'ROOT')
>>> operation.shift(conf)
>>> operation.left_arc(conf,'ATT')
>>> operation.right_arc(conf,'OBJ')
>>> operation.right_arc(conf,'ATT')
>>> operation.shift(conf)
>>> operation.left_arc(conf,'ATT')
>>> operation.right_arc(conf,'PC')
>>> operation.reduce(conf)
>>> operation.reduce(conf)
>>> operation.reduce(conf)
>>> operation.right_arc(conf,'PU')
>>> print(conf)
Stack : [0, 3, 9] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (0, 'ROOT', 3), (5, 'ATT', 4), (3, 'OBJ', 5), (5, 'ATT', 6), (8, 'ATT', 7), (6, 'PC', 8), (3, 'PU', 9)]
###################### Check The Training Function #######################
A. Check the ARC-STANDARD training
>>> import tempfile
>>> import os
>>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False)
>>> parser_std = TransitionParser('arc-standard')
>>> print(', '.join(parser_std._create_training_examples_arc_std([gold_sent], input_file)))
Number of training examples : 1
Number of valid (projective) examples : 1
SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, SHIFT, SHIFT, LEFTARC:ATT, SHIFT, SHIFT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, RIGHTARC:ATT, RIGHTARC:OBJ, SHIFT, RIGHTARC:PU, RIGHTARC:ROOT, SHIFT
>>> parser_std.train([gold_sent],'temp.arcstd.model', verbose=False)
Number of training examples : 1
Number of valid (projective) examples : 1
>>> input_file.close()
>>> remove(input_file.name)
B. Check the ARC-EAGER training
>>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(),delete=False)
>>> parser_eager = TransitionParser('arc-eager')
>>> print(', '.join(parser_eager._create_training_examples_arc_eager([gold_sent], input_file)))
Number of training examples : 1
Number of valid (projective) examples : 1
SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, RIGHTARC:ROOT, SHIFT, LEFTARC:ATT, RIGHTARC:OBJ, RIGHTARC:ATT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, REDUCE, REDUCE, REDUCE, RIGHTARC:PU
>>> parser_eager.train([gold_sent],'temp.arceager.model', verbose=False)
Number of training examples : 1
Number of valid (projective) examples : 1
>>> input_file.close()
>>> remove(input_file.name)
###################### Check The Parsing Function ########################
A. Check the ARC-STANDARD parser
>>> result = parser_std.parse([gold_sent], 'temp.arcstd.model')
>>> de = DependencyEvaluator(result, [gold_sent])
>>> de.eval() >= (0, 0)
True
B. Check the ARC-EAGER parser
>>> result = parser_eager.parse([gold_sent], 'temp.arceager.model')
>>> de = DependencyEvaluator(result, [gold_sent])
>>> de.eval() >= (0, 0)
True
Remove test temporary files
>>> remove('temp.arceager.model')
>>> remove('temp.arcstd.model')
Note that the result is very poor because there is only one training example.
"""

View File

@@ -0,0 +1,234 @@
# Natural Language Toolkit: Parser Utility Functions
#
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Tom Aarsen <>
#
# Copyright (C) 2001-2025 NLTK Project
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Utility functions for parsers.
"""
from nltk.data import load
from nltk.grammar import CFG, PCFG, FeatureGrammar
from nltk.parse.chart import Chart, ChartParser
from nltk.parse.featurechart import FeatureChart, FeatureChartParser
from nltk.parse.pchart import InsideChartParser
def load_parser(
grammar_url, trace=0, parser=None, chart_class=None, beam_size=0, **load_args
):
"""
Load a grammar from a file, and build a parser based on that grammar.
The parser depends on the grammar format, and might also depend
on properties of the grammar itself.
The following grammar formats are currently supported:
- ``'cfg'`` (CFGs: ``CFG``)
- ``'pcfg'`` (probabilistic CFGs: ``PCFG``)
- ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``)
:type grammar_url: str
:param grammar_url: A URL specifying where the grammar is located.
The default protocol is ``"nltk:"``, which searches for the file
in the NLTK data package.
:type trace: int
:param trace: The level of tracing that should be used when
parsing a text. ``0`` will generate no tracing output;
and higher numbers will produce more verbose tracing output.
:param parser: The class used for parsing; should be ``ChartParser``
or a subclass.
If None, the class depends on the grammar format.
:param chart_class: The class used for storing the chart;
should be ``Chart`` or a subclass.
Only used for CFGs and feature CFGs.
If None, the chart class depends on the grammar format.
:type beam_size: int
:param beam_size: The maximum length for the parser's edge queue.
Only used for probabilistic CFGs.
:param load_args: Keyword parameters used when loading the grammar.
See ``data.load`` for more information.
"""
grammar = load(grammar_url, **load_args)
if not isinstance(grammar, CFG):
raise ValueError("The grammar must be a CFG, " "or a subclass thereof.")
if isinstance(grammar, PCFG):
if parser is None:
parser = InsideChartParser
return parser(grammar, trace=trace, beam_size=beam_size)
elif isinstance(grammar, FeatureGrammar):
if parser is None:
parser = FeatureChartParser
if chart_class is None:
chart_class = FeatureChart
return parser(grammar, trace=trace, chart_class=chart_class)
else: # Plain CFG.
if parser is None:
parser = ChartParser
if chart_class is None:
chart_class = Chart
return parser(grammar, trace=trace, chart_class=chart_class)
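# A minimal usage sketch for load_parser(), assuming the NLTK data package
# with its bundled book grammars is installed; the grammar URL and the
# sentence follow the feature-grammar examples from the NLTK book and are
# purely illustrative.
def _load_parser_sketch():
    cp = load_parser("grammars/book_grammars/feat0.fcfg", trace=0)
    for tree in cp.parse("Kim likes children".split()):
        print(tree)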
def taggedsent_to_conll(sentence):
"""
Convert a single POS-tagged sentence into CONLL format.
>>> from nltk import word_tokenize, pos_tag
>>> text = "This is a foobar sentence."
>>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))): # doctest: +NORMALIZE_WHITESPACE
... print(line, end="")
1 This _ DT DT _ 0 a _ _
2 is _ VBZ VBZ _ 0 a _ _
3 a _ DT DT _ 0 a _ _
4 foobar _ JJ JJ _ 0 a _ _
5 sentence _ NN NN _ 0 a _ _
6 . _ . . _ 0 a _ _
:param sentence: A single input sentence to parse
:type sentence: list(tuple(str, str))
:rtype: iter(str)
:return: a generator yielding a single sentence in CONLL format.
"""
for i, (word, tag) in enumerate(sentence, start=1):
input_str = [str(i), word, "_", tag, tag, "_", "0", "a", "_", "_"]
input_str = "\t".join(input_str) + "\n"
yield input_str
def taggedsents_to_conll(sentences):
"""
Convert a POS-tagged document stream (i.e. a list of sentences, each a
list of (word, tag) tuples) and yield lines in CONLL format: one line
per word, with two newlines marking the end of each sentence.
>>> from nltk import word_tokenize, sent_tokenize, pos_tag
>>> text = "This is a foobar sentence. Is that right?"
>>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)]
>>> for line in taggedsents_to_conll(sentences): # doctest: +NORMALIZE_WHITESPACE
... if line:
... print(line, end="")
1 This _ DT DT _ 0 a _ _
2 is _ VBZ VBZ _ 0 a _ _
3 a _ DT DT _ 0 a _ _
4 foobar _ JJ JJ _ 0 a _ _
5 sentence _ NN NN _ 0 a _ _
6 . _ . . _ 0 a _ _
<BLANKLINE>
<BLANKLINE>
1 Is _ VBZ VBZ _ 0 a _ _
2 that _ IN IN _ 0 a _ _
3 right _ NN NN _ 0 a _ _
4 ? _ . . _ 0 a _ _
<BLANKLINE>
<BLANKLINE>
:param sentences: Input sentences to parse
:type sentences: list(list(tuple(str, str)))
:rtype: iter(str)
:return: a generator yielding sentences in CONLL format.
"""
for sentence in sentences:
yield from taggedsent_to_conll(sentence)
yield "\n\n"
######################################################################
# { Test Suites
######################################################################
class TestGrammar:
"""
Unit tests for CFG.
"""
def __init__(self, grammar, suite, accept=None, reject=None):
self.test_grammar = grammar
self.cp = load_parser(grammar, trace=0)
self.suite = suite
self._accept = accept
self._reject = reject
def run(self, show_trees=False):
"""
Sentences in the test suite are divided into two classes:
- grammatical (``accept``) and
- ungrammatical (``reject``).
If a sentence should parse according to the grammar, the value of
``trees`` will be a non-empty list. If a sentence should be rejected
according to the grammar, then the value of ``trees`` will be an empty list.
"""
for test in self.suite:
print(test["doc"] + ":", end=" ")
for key in ["accept", "reject"]:
for sent in test[key]:
tokens = sent.split()
trees = list(self.cp.parse(tokens))
if show_trees and trees:
print()
print(sent)
for tree in trees:
print(tree)
if key == "accept":
if trees == []:
raise ValueError("Sentence '%s' failed to parse'" % sent)
else:
accepted = True
else:
if trees:
raise ValueError("Sentence '%s' received a parse'" % sent)
else:
rejected = True
if accepted and rejected:
print("All tests passed!")
def extract_test_sentences(string, comment_chars="#%;", encoding=None):
"""
Parses a string with one test sentence per line.
Lines can optionally begin with:
- a bool, saying if the sentence is grammatical or not, or
- an int, giving the number of parse trees it should have.
The result information is followed by a colon, and then the sentence.
Empty lines and lines beginning with a comment char are ignored.
:return: a list of (sentence, result) tuples,
    where a sentence is a list of str,
    and a result is None, a bool, or an int
:param comment_chars: ``str`` of possible comment characters.
:param encoding: the encoding of the string, if it is binary
"""
if encoding is not None:
string = string.decode(encoding)
sentences = []
for sentence in string.split("\n"):
if sentence == "" or sentence[0] in comment_chars:
continue
split_info = sentence.split(":", 1)
result = None
if len(split_info) == 2:
if split_info[0] in ["True", "true", "False", "false"]:
result = split_info[0] in ["True", "true"]
sentence = split_info[1]
else:
result = int(split_info[0])
sentence = split_info[1]
tokens = sentence.split()
if tokens == []:
continue
sentences += [(tokens, result)]
return sentences
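# A short illustration of the line format parsed by extract_test_sentences():
# an optional bool (grammaticality) or int (expected number of parses), then
# a colon, then the sentence; comment lines and blank lines are ignored. The
# sentences below are arbitrary examples.
def _extract_test_sentences_sketch():
    suite = (
        "# comment lines and blank lines are skipped\n"
        "True: the dog barks\n"
        "False: dog the barks\n"
        "2: I saw the man with the telescope\n"
        "a line with no expected result\n"
    )
    return extract_test_sentences(suite)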

View File

@@ -0,0 +1,453 @@
# Natural Language Toolkit: Viterbi Probabilistic Parser
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from functools import reduce
from nltk.parse.api import ParserI
from nltk.tree import ProbabilisticTree, Tree
##//////////////////////////////////////////////////////
## Viterbi PCFG Parser
##//////////////////////////////////////////////////////
class ViterbiParser(ParserI):
"""
A bottom-up ``PCFG`` parser that uses dynamic programming to find
the single most likely parse for a text. The ``ViterbiParser`` parser
parses texts by filling in a "most likely constituent table".
This table records the most probable tree representation for any
given span and node value. In particular, it has an entry for
every start index, end index, and node value, recording the most
likely subtree that spans from the start index to the end index,
and has the given node value.
The ``ViterbiParser`` parser fills in this table incrementally. It starts
by filling in all entries for constituents that span one element
of text (i.e., entries where the end index is one greater than the
start index). After it has filled in all table entries for
constituents that span one element of text, it fills in the
entries for constituents that span two elements of text. It
continues filling in the entries for constituents spanning larger
and larger portions of the text, until the entire table has been
filled. Finally, it returns the table entry for a constituent
spanning the entire text, whose node value is the grammar's start
symbol.
In order to find the most likely constituent with a given span and
node value, the ``ViterbiParser`` parser considers all productions that
could produce that node value. For each production, it finds all
children that collectively cover the span and have the node values
specified by the production's right hand side. If the probability
of the tree formed by applying the production to the children is
greater than the probability of the current entry in the table,
then the table is updated with this new tree.
A pseudo-code description of the algorithm used by
``ViterbiParser`` is:
| Create an empty most likely constituent table, *MLC*.
| For width in 1...len(text):
| For start in 1...len(text)-width:
| For prod in grammar.productions:
| For each sequence of subtrees [t[1], t[2], ..., t[n]] in MLC,
| where t[i].label()==prod.rhs[i],
| and the sequence covers [start:start+width]:
| old_p = MLC[start, start+width, prod.lhs]
| new_p = P(t[1])P(t[2])...P(t[n])P(prod)
| if new_p > old_p:
| new_tree = Tree(prod.lhs, t[1], t[2], ..., t[n])
| MLC[start, start+width, prod.lhs] = new_tree
| Return MLC[0, len(text), start_symbol]
:type _grammar: PCFG
:ivar _grammar: The grammar used to parse sentences.
:type _trace: int
:ivar _trace: The level of tracing output that should be generated
when parsing a text.
"""
def __init__(self, grammar, trace=0):
"""
Create a new ``ViterbiParser`` parser, that uses ``grammar`` to
parse texts.
:type grammar: PCFG
:param grammar: The grammar used to parse texts.
:type trace: int
:param trace: The level of tracing that should be used when
parsing a text. ``0`` will generate no tracing output;
and higher numbers will produce more verbose tracing
output.
"""
self._grammar = grammar
self._trace = trace
def grammar(self):
return self._grammar
def trace(self, trace=2):
"""
Set the level of tracing output that should be generated when
parsing a text.
:type trace: int
:param trace: The trace level. A trace level of ``0`` will
generate no tracing output; and higher trace levels will
produce more verbose tracing output.
:rtype: None
"""
self._trace = trace
def parse(self, tokens):
# Inherit docs from ParserI
tokens = list(tokens)
self._grammar.check_coverage(tokens)
# The most likely constituent table. This table specifies the
# most likely constituent for a given span and type.
# Constituents can be either Trees or tokens. For Trees,
# the "type" is the Nonterminal for the tree's root node
# value. For Tokens, the "type" is the token's type.
# The table is stored as a dictionary, since it is sparse.
constituents = {}
# Initialize the constituents dictionary with the words from
# the text.
if self._trace:
print("Inserting tokens into the most likely" + " constituents table...")
for index in range(len(tokens)):
token = tokens[index]
constituents[index, index + 1, token] = token
if self._trace > 1:
self._trace_lexical_insertion(token, index, len(tokens))
# Consider each span of length 1, 2, ..., n; and add any trees
# that might cover that span to the constituents dictionary.
for length in range(1, len(tokens) + 1):
if self._trace:
print(
"Finding the most likely constituents"
+ " spanning %d text elements..." % length
)
for start in range(len(tokens) - length + 1):
span = (start, start + length)
self._add_constituents_spanning(span, constituents, tokens)
# Return the tree that spans the entire text and has the right category
tree = constituents.get((0, len(tokens), self._grammar.start()))
if tree is not None:
yield tree
def _add_constituents_spanning(self, span, constituents, tokens):
"""
Find any constituents that might cover ``span``, and add them
to the most likely constituents table.
:rtype: None
:type span: tuple(int, int)
:param span: The section of the text for which we are
trying to find possible constituents. The span is
specified as a pair of integers, where the first integer
is the index of the first token that should be included in
the constituent; and the second integer is the index of
the first token that should not be included in the
constituent. I.e., the constituent should cover
``text[span[0]:span[1]]``, where ``text`` is the text
that we are parsing.
:type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
:param constituents: The most likely constituents table. This
table records the most probable tree representation for
any given span and node value. In particular,
``constituents(s,e,nv)`` is the most likely
``ProbabilisticTree`` that covers ``text[s:e]``
and has a node value ``nv.symbol()``, where ``text``
is the text that we are parsing. When
``_add_constituents_spanning`` is called, ``constituents``
should contain all possible constituents that are shorter
than ``span``.
:type tokens: list of tokens
:param tokens: The text we are parsing. This is only used for
trace output.
"""
# Since some of the grammar productions may be unary, we need to
# repeatedly try all of the productions until none of them add any
# new constituents.
changed = True
while changed:
changed = False
# Find all instantiations of the grammar productions that
# cover the span.
instantiations = self._find_instantiations(span, constituents)
# For each production instantiation, add a new
# ProbabilisticTree whose probability is the product
# of the children's probabilities and the production's
# probability.
for production, children in instantiations:
subtrees = [c for c in children if isinstance(c, Tree)]
p = reduce(lambda pr, t: pr * t.prob(), subtrees, production.prob())
node = production.lhs().symbol()
tree = ProbabilisticTree(node, children, prob=p)
# If it's a new or more probable constituent, then add it to the
# constituents dictionary.
c = constituents.get((span[0], span[1], production.lhs()))
if self._trace > 1:
if c is None or c != tree:
if c is None or c.prob() < tree.prob():
print(" Insert:", end=" ")
else:
print(" Discard:", end=" ")
self._trace_production(production, p, span, len(tokens))
if c is None or c.prob() < tree.prob():
constituents[span[0], span[1], production.lhs()] = tree
changed = True
def _find_instantiations(self, span, constituents):
"""
:return: a list of the production instantiations that cover a
given span of the text. A "production instantiation" is
a tuple containing a production and a list of children,
where the production's right hand side matches the list of
children; and the children cover ``span``.
:rtype: list(tuple(Production, list(ProbabilisticTree or token)))
:type span: tuple(int, int)
:param span: The section of the text for which we are
trying to find production instantiations. The span is
specified as a pair of integers, where the first integer
is the index of the first token that should be covered by
the production instantiation; and the second integer is
the index of the first token that should not be covered by
the production instantiation.
:type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
:param constituents: The most likely constituents table. This
table records the most probable tree representation for
any given span and node value. See the module
documentation for more information.
"""
rv = []
for production in self._grammar.productions():
childlists = self._match_rhs(production.rhs(), span, constituents)
for childlist in childlists:
rv.append((production, childlist))
return rv
def _match_rhs(self, rhs, span, constituents):
"""
:return: a set of all the lists of children that cover ``span``
and that match ``rhs``.
:rtype: list(list(ProbabilisticTree or token))
:type rhs: list(Nonterminal or any)
:param rhs: The list specifying what kinds of children need to
cover ``span``. Each nonterminal in ``rhs`` specifies
that the corresponding child should be a tree whose node
value is that nonterminal's symbol. Each terminal in ``rhs``
specifies that the corresponding child should be a token
whose type is that terminal.
:type span: tuple(int, int)
:param span: The section of the text for which we are
trying to find child lists. The span is specified as a
pair of integers, where the first integer is the index of
the first token that should be covered by the child list;
and the second integer is the index of the first token
that should not be covered by the child list.
:type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
:param constituents: The most likely constituents table. This
table records the most probable tree representation for
any given span and node value. See the module
documentation for more information.
"""
(start, end) = span
# Base case
if start >= end and rhs == ():
return [[]]
if start >= end or rhs == ():
return []
# Find everything that matches the 1st symbol of the RHS
childlists = []
for split in range(start, end + 1):
l = constituents.get((start, split, rhs[0]))
if l is not None:
rights = self._match_rhs(rhs[1:], (split, end), constituents)
childlists += [[l] + r for r in rights]
return childlists
def _trace_production(self, production, p, span, width):
"""
Print trace output indicating that a given production has been
applied at a given location.
:param production: The production that has been applied
:type production: Production
:param p: The probability of the tree produced by the production.
:type p: float
:param span: The span of the production
:type span: tuple
:param width: The total number of tokens in the text being parsed;
    used to pad the trace output.
:type width: int
:rtype: None
"""
str = "|" + "." * span[0]
str += "=" * (span[1] - span[0])
str += "." * (width - span[1]) + "| "
str += "%s" % production
if self._trace > 2:
str = f"{str:<40} {p:12.10f} "
print(str)
def _trace_lexical_insertion(self, token, index, width):
str = " Insert: |" + "." * index + "=" + "." * (width - index - 1) + "| "
str += f"{token}"
print(str)
def __repr__(self):
return "<ViterbiParser for %r>" % self._grammar
##//////////////////////////////////////////////////////
## Test Code
##//////////////////////////////////////////////////////
def demo():
"""
A demonstration of the Viterbi probabilistic parser. The user is
prompted to select which demo to run; the parser is then run on the
selected sentence and grammar, and a summary of the results is
displayed.
"""
import sys
import time
from nltk import tokenize
from nltk.grammar import PCFG
from nltk.parse import ViterbiParser
toy_pcfg1 = PCFG.fromstring(
"""
S -> NP VP [1.0]
NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
Det -> 'the' [0.8] | 'my' [0.2]
N -> 'man' [0.5] | 'telescope' [0.5]
VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
V -> 'ate' [0.35] | 'saw' [0.65]
PP -> P NP [1.0]
P -> 'with' [0.61] | 'under' [0.39]
"""
)
toy_pcfg2 = PCFG.fromstring(
"""
S -> NP VP [1.0]
VP -> V NP [.59]
VP -> V [.40]
VP -> VP PP [.01]
NP -> Det N [.41]
NP -> Name [.28]
NP -> NP PP [.31]
PP -> P NP [1.0]
V -> 'saw' [.21]
V -> 'ate' [.51]
V -> 'ran' [.28]
N -> 'boy' [.11]
N -> 'cookie' [.12]
N -> 'table' [.13]
N -> 'telescope' [.14]
N -> 'hill' [.5]
Name -> 'Jack' [.52]
Name -> 'Bob' [.48]
P -> 'with' [.61]
P -> 'under' [.39]
Det -> 'the' [.41]
Det -> 'a' [.31]
Det -> 'my' [.28]
"""
)
# Define two demos. Each demo has a sentence and a grammar.
demos = [
("I saw the man with my telescope", toy_pcfg1),
("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
]
# Ask the user which demo they want to use.
print()
for i in range(len(demos)):
print(f"{i + 1:>3}: {demos[i][0]}")
print(" %r" % demos[i][1])
print()
print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
try:
snum = int(sys.stdin.readline().strip()) - 1
sent, grammar = demos[snum]
except (ValueError, IndexError):
print("Bad sentence number")
return
# Tokenize the sentence.
tokens = sent.split()
parser = ViterbiParser(grammar)
all_parses = {}
print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
parser.trace(3)
t = time.time()
parses = parser.parse_all(tokens)
elapsed = time.time() - t
average = (
reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
)
num_parses = len(parses)
for p in parses:
all_parses[p.freeze()] = 1
# Print some summary statistics
print()
print("Time (secs) # Parses Average P(parse)")
print("-----------------------------------------")
print("%11.4f%11d%19.14f" % (time, num_parses, average))
parses = all_parses.keys()
if parses:
p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
else:
p = 0
print("------------------------------------------")
print("%11s%11d%19.14f" % ("n/a", len(parses), p))
# Ask the user if we should draw the parses.
print()
print("Draw parses (y/n)? ", end=" ")
if sys.stdin.readline().strip().lower().startswith("y"):
from nltk.draw.tree import draw_trees
print(" please wait...")
draw_trees(*parses)
# Ask the user if we should print the parses.
print()
print("Print parses (y/n)? ", end=" ")
if sys.stdin.readline().strip().lower().startswith("y"):
for parse in parses:
print(parse)
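# A minimal non-interactive sketch of ViterbiParser, using a tiny PCFG in the
# same style as the grammars in demo() above; the grammar and sentence are
# illustrative only.
def _viterbi_sketch():
    from nltk.grammar import PCFG

    grammar = PCFG.fromstring(
        """
        S -> NP VP [1.0]
        NP -> 'I' [0.4] | Det N [0.6]
        Det -> 'the' [1.0]
        N -> 'dog' [1.0]
        VP -> V NP [1.0]
        V -> 'saw' [1.0]
        """
    )
    parser = ViterbiParser(grammar, trace=0)
    # parse() yields at most one tree: the single most likely parse.
    for tree in parser.parse("I saw the dog".split()):
        print(tree, tree.prob())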
if __name__ == "__main__":
demo()