updates
480  Backend/venv/lib/python3.12/site-packages/nltk/ccg/chart.py  Normal file
@@ -0,0 +1,480 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
The lexicon is constructed by calling
``lexicon.fromstring(<lexicon string>)``.

In order to construct a parser, you also need a rule set.
The standard English rules are provided in chart as
``chart.DefaultRuleSet``.

The parser can then be constructed by calling, for example:
``parser = chart.CCGChartParser(<lexicon>, <ruleset>)``

Parsing is then performed by running
``parser.parse(<sentence>.split())``.

While this returns a list of trees, the default representation
of the produced trees is not very enlightening, particularly
given that it uses the same tree class as the CFG parsers.
It is probably better to call:
``chart.printCCGDerivation(<parse tree extracted from list>)``
which should print a nice representation of the derivation.

This entire process is shown far more clearly in the demonstration:
python chart.py
"""

import itertools

from nltk.ccg.combinator import *
from nltk.ccg.combinator import (
    BackwardApplication,
    BackwardBx,
    BackwardComposition,
    BackwardSx,
    BackwardT,
    ForwardApplication,
    ForwardComposition,
    ForwardSubstitution,
    ForwardT,
)
from nltk.ccg.lexicon import Token, fromstring
from nltk.ccg.logic import *
from nltk.parse import ParserI
from nltk.parse.chart import AbstractChartRule, Chart, EdgeI
from nltk.sem.logic import *
from nltk.tree import Tree


# Based on the EdgeI class from NLTK.
# A number of the properties of the EdgeI interface don't
# transfer well to CCGs, however.
class CCGEdge(EdgeI):
    def __init__(self, span, categ, rule):
        self._span = span
        self._categ = categ
        self._rule = rule
        self._comparison_key = (span, categ, rule)

    # Accessors
    def lhs(self):
        return self._categ

    def span(self):
        return self._span

    def start(self):
        return self._span[0]

    def end(self):
        return self._span[1]

    def length(self):
        return self._span[1] - self._span[0]

    def rhs(self):
        return ()

    def dot(self):
        return 0

    def is_complete(self):
        return True

    def is_incomplete(self):
        return False

    def nextsym(self):
        return None

    def categ(self):
        return self._categ

    def rule(self):
        return self._rule


class CCGLeafEdge(EdgeI):
    """
    Class representing leaf edges in a CCG derivation.
    """

    def __init__(self, pos, token, leaf):
        self._pos = pos
        self._token = token
        self._leaf = leaf
        self._comparison_key = (pos, token.categ(), leaf)

    # Accessors
    def lhs(self):
        return self._token.categ()

    def span(self):
        return (self._pos, self._pos + 1)

    def start(self):
        return self._pos

    def end(self):
        return self._pos + 1

    def length(self):
        return 1

    def rhs(self):
        return self._leaf

    def dot(self):
        return 0

    def is_complete(self):
        return True

    def is_incomplete(self):
        return False

    def nextsym(self):
        return None

    def token(self):
        return self._token

    def categ(self):
        return self._token.categ()

    def leaf(self):
        return self._leaf


class BinaryCombinatorRule(AbstractChartRule):
    """
    Class implementing application of a binary combinator to a chart.
    Takes the directed combinator to apply.
    """

    NUMEDGES = 2

    def __init__(self, combinator):
        self._combinator = combinator

    # Apply a combinator
    def apply(self, chart, grammar, left_edge, right_edge):
        # The left & right edges must be touching.
        if not (left_edge.end() == right_edge.start()):
            return

        # Check if the two edges are permitted to combine.
        # If so, generate the corresponding edge.
        if self._combinator.can_combine(left_edge.categ(), right_edge.categ()):
            for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
                new_edge = CCGEdge(
                    span=(left_edge.start(), right_edge.end()),
                    categ=res,
                    rule=self._combinator,
                )
                if chart.insert(new_edge, (left_edge, right_edge)):
                    yield new_edge

    # The representation of the combinator (for printing derivations)
    def __str__(self):
        return "%s" % self._combinator


# Type-raising must be handled slightly differently to the other rules, as the
# resulting edge only spans a single input edge, rather than both edges.
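# For example, forward type raising can lift an NP edge to S/(S\NP); the new
# edge covers only the NP's own span, with the adjacent edge consulted just to
# decide how (and whether) to raise.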


class ForwardTypeRaiseRule(AbstractChartRule):
    """
    Class for applying forward type raising
    """

    NUMEDGES = 2

    def __init__(self):
        self._combinator = ForwardT

    def apply(self, chart, grammar, left_edge, right_edge):
        if not (left_edge.end() == right_edge.start()):
            return

        for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
            new_edge = CCGEdge(span=left_edge.span(), categ=res, rule=self._combinator)
            if chart.insert(new_edge, (left_edge,)):
                yield new_edge

    def __str__(self):
        return "%s" % self._combinator


class BackwardTypeRaiseRule(AbstractChartRule):
    """
    Class for applying backward type raising.
    """

    NUMEDGES = 2

    def __init__(self):
        self._combinator = BackwardT

    def apply(self, chart, grammar, left_edge, right_edge):
        if not (left_edge.end() == right_edge.start()):
            return

        for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
            new_edge = CCGEdge(span=right_edge.span(), categ=res, rule=self._combinator)
            if chart.insert(new_edge, (right_edge,)):
                yield new_edge

    def __str__(self):
        return "%s" % self._combinator


# Common sets of combinators used for English derivations.
ApplicationRuleSet = [
    BinaryCombinatorRule(ForwardApplication),
    BinaryCombinatorRule(BackwardApplication),
]
CompositionRuleSet = [
    BinaryCombinatorRule(ForwardComposition),
    BinaryCombinatorRule(BackwardComposition),
    BinaryCombinatorRule(BackwardBx),
]
SubstitutionRuleSet = [
    BinaryCombinatorRule(ForwardSubstitution),
    BinaryCombinatorRule(BackwardSx),
]
TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()]

# The standard English rule set.
DefaultRuleSet = (
    ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet + TypeRaiseRuleSet
)
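
# The rule sets above are plain lists, so smaller or custom grammars can be
# assembled by concatenation (a sketch; `my_lexicon` is a placeholder):
#
#   ab_rules = ApplicationRuleSet                      # a pure application (AB) grammar
#   ab_parser = CCGChartParser(my_lexicon, ab_rules)
#
#   no_substitution = ApplicationRuleSet + CompositionRuleSet + TypeRaiseRuleSet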


class CCGChartParser(ParserI):
    """
    Chart parser for CCGs.
    Based largely on the ChartParser class from NLTK.
    """

    def __init__(self, lexicon, rules, trace=0):
        self._lexicon = lexicon
        self._rules = rules
        self._trace = trace

    def lexicon(self):
        return self._lexicon

    # Implements the CYK algorithm
    def parse(self, tokens):
        tokens = list(tokens)
        chart = CCGChart(list(tokens))
        lex = self._lexicon

        # Initialize leaf edges.
        for index in range(chart.num_leaves()):
            for token in lex.categories(chart.leaf(index)):
                new_edge = CCGLeafEdge(index, token, chart.leaf(index))
                chart.insert(new_edge, ())

        # Select a span for the new edges
        for span in range(2, chart.num_leaves() + 1):
            for start in range(0, chart.num_leaves() - span + 1):
                # Try all possible pairs of edges that could generate
                # an edge for that span
                for part in range(1, span):
                    lstart = start
                    mid = start + part
                    rend = start + span

                    for left in chart.select(span=(lstart, mid)):
                        for right in chart.select(span=(mid, rend)):
                            # Generate all possible combinations of the two edges
                            for rule in self._rules:
                                edges_added_by_rule = 0
                                for newedge in rule.apply(chart, lex, left, right):
                                    edges_added_by_rule += 1

        # Output the resulting parses
        return chart.parses(lex.start())
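
# Usage note (a sketch; it assumes the demo lexicon `lex` defined at the bottom
# of this module, and is not executed at import time):
#
#   parser = CCGChartParser(lex, ApplicationRuleSet + CompositionRuleSet)
#   parses = list(parser.parse("you will eat the mushrooms".split()))
#   print(len(parses))   # number of derivations found with this reduced rule set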


class CCGChart(Chart):
    def __init__(self, tokens):
        Chart.__init__(self, tokens)

    # Constructs the trees for a given parse. Unfortunately, the parse trees need to be
    # constructed slightly differently to those in the default Chart class, so it has to
    # be reimplemented.
    def _trees(self, edge, complete, memo, tree_class):
        assert complete, "CCGChart cannot build incomplete trees"

        if edge in memo:
            return memo[edge]

        if isinstance(edge, CCGLeafEdge):
            word = tree_class(edge.token(), [self._tokens[edge.start()]])
            leaf = tree_class((edge.token(), "Leaf"), [word])
            memo[edge] = [leaf]
            return [leaf]

        memo[edge] = []
        trees = []

        for cpl in self.child_pointer_lists(edge):
            child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl]
            for children in itertools.product(*child_choices):
                lhs = (
                    Token(
                        self._tokens[edge.start() : edge.end()],
                        edge.lhs(),
                        compute_semantics(children, edge),
                    ),
                    str(edge.rule()),
                )
                trees.append(tree_class(lhs, children))

        memo[edge] = trees
        return trees
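
# Note: each node built by CCGChart._trees is labelled with a (Token, rule-name)
# pair ("Leaf" for lexical edges); printCCGDerivation below relies on exactly
# this label structure when rendering derivations.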


def compute_semantics(children, edge):
    if children[0].label()[0].semantics() is None:
        return None

    if len(children) == 2:
        if isinstance(edge.rule(), BackwardCombinator):
            children = [children[1], children[0]]

        combinator = edge.rule()._combinator
        function = children[0].label()[0].semantics()
        argument = children[1].label()[0].semantics()

        if isinstance(combinator, UndirectedFunctionApplication):
            return compute_function_semantics(function, argument)
        elif isinstance(combinator, UndirectedComposition):
            return compute_composition_semantics(function, argument)
        elif isinstance(combinator, UndirectedSubstitution):
            return compute_substitution_semantics(function, argument)
        else:
            raise AssertionError("Unsupported combinator '" + str(combinator) + "'")
    else:
        return compute_type_raised_semantics(children[0].label()[0].semantics())


# --------
# Displaying derivations
# --------
def printCCGDerivation(tree):
    # Get the leaves and initial categories
    leafcats = tree.pos()
    leafstr = ""
    catstr = ""

    # Construct a string with both the leaf word and corresponding
    # category aligned.
    for leaf, cat in leafcats:
        str_cat = "%s" % cat
        nextlen = 2 + max(len(leaf), len(str_cat))
        lcatlen = (nextlen - len(str_cat)) // 2
        rcatlen = lcatlen + (nextlen - len(str_cat)) % 2
        catstr += " " * lcatlen + str_cat + " " * rcatlen
        lleaflen = (nextlen - len(leaf)) // 2
        rleaflen = lleaflen + (nextlen - len(leaf)) % 2
        leafstr += " " * lleaflen + leaf + " " * rleaflen
    print(leafstr.rstrip())
    print(catstr.rstrip())

    # Display the derivation steps
    printCCGTree(0, tree)


# Prints the sequence of derivation steps.
def printCCGTree(lwidth, tree):
    rwidth = lwidth

    # Is a leaf (word).
    # Increment the span by the space occupied by the leaf.
    if not isinstance(tree, Tree):
        return 2 + lwidth + len(tree)

    # Find the width of the current derivation step
    for child in tree:
        rwidth = max(rwidth, printCCGTree(rwidth, child))

    # Is a leaf node.
    # Don't print anything, but account for the space occupied.
    if not isinstance(tree.label(), tuple):
        return max(
            rwidth, 2 + lwidth + len("%s" % tree.label()), 2 + lwidth + len(tree[0])
        )

    (token, op) = tree.label()

    if op == "Leaf":
        return rwidth

    # Pad to the left with spaces, followed by a sequence of '-'
    # and the derivation rule.
    print(lwidth * " " + (rwidth - lwidth) * "-" + "%s" % op)
    # Print the resulting category on a new line.
    str_res = "%s" % (token.categ())
    if token.semantics() is not None:
        str_res += " {" + str(token.semantics()) + "}"
    respadlen = (rwidth - lwidth - len(str_res)) // 2 + lwidth
    print(respadlen * " " + str_res)
    return rwidth


### Demonstration code

# Construct the lexicon
lex = fromstring(
    """
    :- S, NP, N, VP    # Primitive categories, S is the target primitive

    Det :: NP/N         # Family of words
    Pro :: NP
    TV :: VP/NP
    Modal :: (S\\NP)/VP # Backslashes need to be escaped

    I => Pro             # Word -> Category mapping
    you => Pro

    the => Det

    # Variables have the special keyword 'var'
    # '.' prevents permutation
    # ',' prevents composition
    and => var\\.,var/.,var

    which => (N\\N)/(S/NP)

    will => Modal       # Categories can be either explicit, or families.
    might => Modal

    cook => TV
    eat => TV

    mushrooms => N
    parsnips => N
    bacon => N
    """
)
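
# The resulting lexicon can be inspected directly (a sketch, comment only):
#
#   for tok in lex.categories("will"):
#       print(tok.categ())   # the category contributed by the Modal family, (S\NP)/VP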


def demo():
    parser = CCGChartParser(lex, DefaultRuleSet)
    for parse in parser.parse("I might cook and eat the bacon".split()):
        printCCGDerivation(parse)


if __name__ == "__main__":
    demo()