This commit is contained in:
Iliyan Angelov
2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions

View File

@@ -0,0 +1,34 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Combinatory Categorial Grammar.
For more information see nltk/doc/contrib/ccg/ccg.pdf
"""
from nltk.ccg.chart import CCGChart, CCGChartParser, CCGEdge, CCGLeafEdge
from nltk.ccg.combinator import (
BackwardApplication,
BackwardBx,
BackwardCombinator,
BackwardComposition,
BackwardSx,
BackwardT,
DirectedBinaryCombinator,
ForwardApplication,
ForwardCombinator,
ForwardComposition,
ForwardSubstitution,
ForwardT,
UndirectedBinaryCombinator,
UndirectedComposition,
UndirectedFunctionApplication,
UndirectedSubstitution,
UndirectedTypeRaise,
)
from nltk.ccg.lexicon import CCGLexicon

View File

@@ -0,0 +1,358 @@
# Natural Language Toolkit: CCG Categories
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from abc import ABCMeta, abstractmethod
from functools import total_ordering
from nltk.internals import raise_unorderable_types
@total_ordering
class AbstractCCGCategory(metaclass=ABCMeta):
    """
    Abstract base class shared by every category type in a combinatory grammar.

    Equality, ordering and hashing are all derived from the
    ``_comparison_key`` attribute, which concrete subclasses must set.
    """

    @abstractmethod
    def is_primitive(self):
        """Return True when this category is primitive."""

    @abstractmethod
    def is_function(self):
        """Return True when this category is a function application."""

    @abstractmethod
    def is_var(self):
        """Return True when this category is a variable."""

    @abstractmethod
    def substitute(self, substitutions):
        """
        Apply a collection of (var, category) substitutions, replacing each
        occurrence of a variable with the category paired with it.
        """

    @abstractmethod
    def can_unify(self, other):
        """
        Decide whether this category can be unified with ``other``.

        - Returns None when unification is impossible.
        - Returns the list of required substitutions otherwise.
        """

    # Utility functions: comparison, strings and hashing.
    @abstractmethod
    def __str__(self):
        """Return the textual form of the category."""

    def __eq__(self, other):
        # Categories of different concrete classes are never equal.
        if self.__class__ is not other.__class__:
            return False
        return self._comparison_key == other._comparison_key

    def __ne__(self, other):
        return not (self == other)

    def __lt__(self, other):
        if not isinstance(other, AbstractCCGCategory):
            raise_unorderable_types("<", self, other)
        own_class, other_class = self.__class__, other.__class__
        if own_class is other_class:
            return self._comparison_key < other._comparison_key
        # Different category classes are ordered by class name.
        return own_class.__name__ < other_class.__name__

    def __hash__(self):
        # The hash is computed on first use and cached on the instance.
        if not hasattr(self, "_hash"):
            self._hash = hash(self._comparison_key)
        return self._hash
class CCGVar(AbstractCCGCategory):
    """
    A variable CCG category.

    Used for conjunctions (and possibly type-raising, if implemented as a
    unary rule).
    """

    # Next identifier to hand out; shared by every variable.
    _maxID = 0

    def __init__(self, prim_only=False):
        """Create a variable carrying a freshly allocated identifier.

        :param prim_only: whether the variable is restricted to primitives
        :type prim_only: bool
        """
        self._id = self.new_id()
        self._prim_only = prim_only
        self._comparison_key = self._id

    @classmethod
    def new_id(cls):
        """
        Allocate and return a unique variable identifier.
        """
        allocated = cls._maxID
        cls._maxID = allocated + 1
        return allocated

    @classmethod
    def reset_id(cls):
        """Restart identifier allocation from zero."""
        cls._maxID = 0

    def is_primitive(self):
        return False

    def is_function(self):
        return False

    def is_var(self):
        return True

    def substitute(self, substitutions):
        """Return the category substituted for this variable, or the
        variable itself when no substitution mentions it.
        """
        return next((cat for var, cat in substitutions if var == self), self)

    def can_unify(self, other):
        """Return the single substitution binding this variable to ``other``,
        or None when the variable is primitive-only and ``other`` is not
        primitive.
        """
        acceptable = other.is_primitive() or not self._prim_only
        return [(self, other)] if acceptable else None

    def id(self):
        """Return the variable's identifier."""
        return self._id

    def __str__(self):
        return "_var" + str(self._id)
@total_ordering
class Direction:
    """
    The direction of a function application.

    Also maintains information as to which combinators may be used with
    the category.
    """

    def __init__(self, dir, restrictions):
        """
        :param dir: the slash symbol, ``"/"`` (forward) or ``"\\"`` (backward)
        :param restrictions: iterable of restriction markers (see ``restrs``)
        """
        self._dir = dir
        self._restrs = restrictions
        self._comparison_key = (dir, tuple(restrictions))

    # Testing the application direction
    def is_forward(self):
        return self._dir == "/"

    def is_backward(self):
        return self._dir == "\\"

    def dir(self):
        """Return the raw slash symbol."""
        return self._dir

    def restrs(self):
        """Return the restrictions on the combinators.

        '.' denotes that permuting operations are disallowed
        ',' denotes that function composition is disallowed
        '_' denotes that the direction has variable restrictions.
        (This is redundant in the current implementation of type-raising)
        """
        return self._restrs

    def is_variable(self):
        return self._restrs == "_"

    # Unification and substitution of variable directions.
    # Used only if type-raising is implemented as a unary rule, as it
    # must inherit restrictions from the argument category.
    def can_unify(self, other):
        """Return the substitutions unifying the two directions' restrictions,
        or None when both are fixed and differ.
        """
        if other.is_variable():
            return [("_", self.restrs())]
        if self.is_variable():
            return [("_", other.restrs())]
        if self.restrs() == other.restrs():
            return []
        return None

    def substitute(self, subs):
        """Return a direction with variable restrictions replaced by the first
        ``"_"`` entry in ``subs``; fixed directions are returned unchanged.
        """
        if self.is_variable():
            for var, restrs in subs:
                if var == "_":
                    return Direction(self._dir, restrs)
        return self

    # Testing permitted combinators
    def can_compose(self):
        return "," not in self._restrs

    def can_cross(self):
        return "." not in self._restrs

    def __eq__(self, other):
        if self.__class__ is not other.__class__:
            return False
        return self._comparison_key == other._comparison_key

    def __ne__(self, other):
        return not (self == other)

    def __lt__(self, other):
        if not isinstance(other, Direction):
            raise_unorderable_types("<", self, other)
        own_class, other_class = self.__class__, other.__class__
        if own_class is other_class:
            return self._comparison_key < other._comparison_key
        return own_class.__name__ < other_class.__name__

    def __hash__(self):
        # Computed on first use and cached on the instance.
        if not hasattr(self, "_hash"):
            self._hash = hash(self._comparison_key)
        return self._hash

    def __str__(self):
        markers = "".join("%s" % r for r in self._restrs)
        return f"{self._dir}{markers}"

    # The negation operator reverses the direction of the application
    def __neg__(self):
        flipped = "\\" if self._dir == "/" else "/"
        return Direction(flipped, self._restrs)
class PrimitiveCategory(AbstractCCGCategory):
    """
    A primitive (atomic) category.

    Takes a string representation of the category, and a
    list of strings specifying the morphological subcategories.
    """

    def __init__(self, categ, restrictions=None):
        """
        :param categ: string naming the category
        :param restrictions: list of morphological subcategory strings;
            omitted or None means "no restrictions"
        """
        self._categ = categ
        # A fresh list per instance: a literal ``[]`` default would be one
        # list object shared by every category created without restrictions.
        self._restrs = [] if restrictions is None else restrictions
        self._comparison_key = (categ, tuple(self._restrs))

    def is_primitive(self):
        return True

    def is_function(self):
        return False

    def is_var(self):
        return False

    def restrs(self):
        """Return the list of morphological subcategories."""
        return self._restrs

    def categ(self):
        """Return the base category string."""
        return self._categ

    # Substitution does nothing to a primitive category
    def substitute(self, subs):
        return self

    # A primitive can be unified with a class of the same
    # base category, given that the other category shares all
    # of its subclasses, or with a variable.
    def can_unify(self, other):
        """
        Return ``[]`` when ``other`` is a primitive with the same base
        category carrying every restriction of this one, None otherwise.
        """
        if not other.is_primitive():
            return None
        # NOTE(review): CCGVar.is_primitive() returns False, so for CCGVar
        # the guard above has already returned and this branch is never
        # taken. Behaviour is left unchanged here; confirm whether
        # primitives are meant to unify with variables.
        if other.is_var():
            return [(other, self)]
        if other.categ() == self.categ():
            for restr in self._restrs:
                if restr not in other.restrs():
                    return None
            return []
        return None

    def __str__(self):
        # Truthiness covers any empty sequence, not only the empty list.
        if not self._restrs:
            return "%s" % self._categ
        restrictions = "[%s]" % ",".join(repr(r) for r in self._restrs)
        return f"{self._categ}{restrictions}"
class FunctionalCategory(AbstractCCGCategory):
    """
    A function application category.

    Consists of argument and result categories, together with
    an application direction.
    """

    def __init__(self, res, arg, dir):
        """
        :param res: the result category
        :param arg: the argument category
        :param dir: the application direction (an object offering
            ``substitute`` and ``can_unify``, as used below)
        """
        self._res = res
        self._arg = arg
        self._dir = dir
        self._comparison_key = (arg, dir, res)

    def is_primitive(self):
        return False

    def is_function(self):
        return True

    def is_var(self):
        return False

    # Substitution returns the category consisting of the
    # substitution applied to each of its constituents.
    def substitute(self, subs):
        """Return a new category with ``subs`` applied to the result, the
        direction and the argument.
        """
        sub_res = self._res.substitute(subs)
        sub_dir = self._dir.substitute(subs)
        sub_arg = self._arg.substitute(subs)
        # The substituted direction is used; previously it was computed
        # and then discarded in favour of the original direction.
        return FunctionalCategory(sub_res, sub_arg, sub_dir)

    # A function can unify with another function, so long as its
    # constituents can unify, or with an unrestricted variable.
    def can_unify(self, other):
        """
        Return the substitutions unifying this category with ``other``, or
        None when they cannot be unified.
        """
        if other.is_var():
            return [(other, self)]
        if other.is_function():
            sa = self._res.can_unify(other.res())
            sd = self._dir.can_unify(other.dir())
            if sa is not None and sd is not None:
                # The result substitutions are applied to both arguments
                # before the arguments themselves are unified.
                sb = self._arg.substitute(sa).can_unify(other.arg().substitute(sa))
                if sb is not None:
                    return sa + sb
        return None

    # Constituent accessors
    def arg(self):
        return self._arg

    def res(self):
        return self._res

    def dir(self):
        return self._dir

    def __str__(self):
        return f"({self._res}{self._dir}{self._arg})"

View File

@@ -0,0 +1,480 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
The lexicon is constructed by calling
``lexicon.fromstring(<lexicon string>)``.
In order to construct a parser, you also need a rule set.
The standard English rules are provided in chart as
``chart.DefaultRuleSet``.
The parser can then be constructed by calling, for example:
``parser = chart.CCGChartParser(<lexicon>, <ruleset>)``
Parsing is then performed by running
``parser.parse(<sentence>.split())``.
While this returns a list of trees, the default representation
of the produced trees is not very enlightening, particularly
given that it uses the same tree class as the CFG parsers.
It is probably better to call:
``chart.printCCGDerivation(<parse tree extracted from list>)``
which should print a nice representation of the derivation.
This entire process is shown far more clearly in the demonstration:
python chart.py
"""
import itertools
from nltk.ccg.combinator import *
from nltk.ccg.combinator import (
BackwardApplication,
BackwardBx,
BackwardComposition,
BackwardSx,
BackwardT,
ForwardApplication,
ForwardComposition,
ForwardSubstitution,
ForwardT,
)
from nltk.ccg.lexicon import Token, fromstring
from nltk.ccg.logic import *
from nltk.parse import ParserI
from nltk.parse.chart import AbstractChartRule, Chart, EdgeI
from nltk.sem.logic import *
from nltk.tree import Tree
# Based on the EdgeI class from NLTK.
# A number of the properties of the EdgeI interface don't
# transfer well to CCGs, however.
class CCGEdge(EdgeI):
    """
    Chart edge for a constituent produced by applying a rule.

    The edge records the span it covers, the category derived for that span
    and the rule that produced it. The dotted-rule parts of the ``EdgeI``
    interface are answered with fixed values: the edge is always complete,
    its right-hand side is empty and its dot position is 0.
    """

    def __init__(self, span, categ, rule):
        """
        :param span: ``(start, end)`` pair of positions covered by the edge
        :param categ: the category assigned to the span
        :param rule: the rule/combinator that produced the edge
        """
        self._span = span
        self._categ = categ
        self._rule = rule
        self._comparison_key = (span, categ, rule)

    # Accessors
    def lhs(self):
        return self._categ

    def span(self):
        return self._span

    def start(self):
        return self._span[0]

    def end(self):
        return self._span[1]

    def length(self):
        # Read the stored tuple; subscripting the bound method ``self.span``
        # (as the code previously did) raises TypeError.
        return self._span[1] - self._span[0]

    def rhs(self):
        return ()

    def dot(self):
        return 0

    def is_complete(self):
        return True

    def is_incomplete(self):
        return False

    def nextsym(self):
        return None

    def categ(self):
        return self._categ

    def rule(self):
        return self._rule
class CCGLeafEdge(EdgeI):
    """
    Chart edge for a single leaf (word) of a CCG derivation.

    A leaf edge sits at one position, covers exactly one token and takes its
    category from the lexicon token it was built from.
    """

    def __init__(self, pos, token, leaf):
        """
        :param pos: index of the leaf in the sentence
        :param token: lexicon token supplying the category via ``categ()``
        :param leaf: the leaf (word) itself
        """
        self._pos = pos
        self._token = token
        self._leaf = leaf
        self._comparison_key = (pos, token.categ(), leaf)

    # Accessors
    def lhs(self):
        return self._token.categ()

    def span(self):
        first = self._pos
        return (first, first + 1)

    def start(self):
        return self._pos

    def end(self):
        return self._pos + 1

    def length(self):
        return 1

    def rhs(self):
        return self._leaf

    def dot(self):
        return 0

    def is_complete(self):
        return True

    def is_incomplete(self):
        return False

    def nextsym(self):
        return None

    def token(self):
        return self._token

    def categ(self):
        return self._token.categ()

    def leaf(self):
        return self._leaf
class BinaryCombinatorRule(AbstractChartRule):
    """
    Chart rule that applies one directed binary combinator to a pair of
    adjacent edges.
    """

    NUMEDGES = 2

    def __init__(self, combinator):
        """
        :param combinator: the directed combinator to apply
        """
        self._combinator = combinator

    # Apply a combinator
    def apply(self, chart, grammar, left_edge, right_edge):
        """
        Yield each new edge inserted into ``chart`` by combining the two
        edges; yields nothing when they are not adjacent or cannot combine.
        """
        # The left & right edges must be touching.
        if left_edge.end() != right_edge.start():
            return

        left_categ = left_edge.categ()
        right_categ = right_edge.categ()
        # Check if the two edges are permitted to combine.
        if not self._combinator.can_combine(left_categ, right_categ):
            return

        for result in self._combinator.combine(left_categ, right_categ):
            candidate = CCGEdge(
                span=(left_edge.start(), right_edge.end()),
                categ=result,
                rule=self._combinator,
            )
            # Only edges the chart actually accepted are reported.
            if chart.insert(candidate, (left_edge, right_edge)):
                yield candidate

    # The representation of the combinator (for printing derivations)
    def __str__(self):
        return str(self._combinator)
# Type-raising must be handled slightly differently to the other rules, as the
# resulting rules only span a single edge, rather than both edges.
class ForwardTypeRaiseRule(AbstractChartRule):
    """
    Chart rule applying forward type raising (the ``ForwardT`` combinator).

    Two adjacent edges are inspected, but every resulting edge covers only
    the left edge's span and points back to the left edge alone.
    """

    NUMEDGES = 2

    def __init__(self):
        self._combinator = ForwardT

    def apply(self, chart, grammar, left_edge, right_edge):
        """Yield each type-raised edge that the chart accepts."""
        if left_edge.end() != right_edge.start():
            return
        # combine() is called directly; can_combine() is not consulted here.
        raised_categories = self._combinator.combine(
            left_edge.categ(), right_edge.categ()
        )
        for raised in raised_categories:
            raised_edge = CCGEdge(
                span=left_edge.span(), categ=raised, rule=self._combinator
            )
            if chart.insert(raised_edge, (left_edge,)):
                yield raised_edge

    def __str__(self):
        return str(self._combinator)
class BackwardTypeRaiseRule(AbstractChartRule):
    """
    Chart rule applying backward type raising (the ``BackwardT`` combinator).

    Two adjacent edges are inspected, but every resulting edge covers only
    the right edge's span and points back to the right edge alone.
    """

    NUMEDGES = 2

    def __init__(self):
        self._combinator = BackwardT

    def apply(self, chart, grammar, left_edge, right_edge):
        """Yield each type-raised edge that the chart accepts."""
        if left_edge.end() != right_edge.start():
            return
        # combine() is called directly; can_combine() is not consulted here.
        raised_categories = self._combinator.combine(
            left_edge.categ(), right_edge.categ()
        )
        for raised in raised_categories:
            raised_edge = CCGEdge(
                span=right_edge.span(), categ=raised, rule=self._combinator
            )
            if chart.insert(raised_edge, (right_edge,)):
                yield raised_edge

    def __str__(self):
        return str(self._combinator)
# Common sets of combinators used for English derivations.
# Function application in both directions.
ApplicationRuleSet = [
    BinaryCombinatorRule(combinator)
    for combinator in (ForwardApplication, BackwardApplication)
]
# Forward and backward composition, plus backward crossed composition.
CompositionRuleSet = [
    BinaryCombinatorRule(combinator)
    for combinator in (ForwardComposition, BackwardComposition, BackwardBx)
]
# Forward substitution and backward crossed substitution.
SubstitutionRuleSet = [
    BinaryCombinatorRule(combinator)
    for combinator in (ForwardSubstitution, BackwardSx)
]
# Type raising in both directions.
TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()]
# The standard English rule set.
DefaultRuleSet = [
    *ApplicationRuleSet,
    *CompositionRuleSet,
    *SubstitutionRuleSet,
    *TypeRaiseRuleSet,
]
class CCGChartParser(ParserI):
    """
    Chart parser for CCGs.
    Based largely on the ChartParser class from NLTK.
    """

    def __init__(self, lexicon, rules, trace=0):
        """
        :param lexicon: lexicon providing ``categories(word)`` and ``start()``
        :param rules: chart rules tried on every pair of adjacent edges
        :param trace: stored on the parser; not read by ``parse``
        """
        self._lexicon = lexicon
        self._rules = rules
        self._trace = trace

    def lexicon(self):
        """Return the lexicon the parser was built with."""
        return self._lexicon

    # Implements the CYK algorithm
    def parse(self, tokens):
        """
        Parse ``tokens`` and return the chart's parses for the lexicon's
        start category.

        :param tokens: iterable of words; it is copied into a list once
        """
        chart = CCGChart(list(tokens))
        lex = self._lexicon

        # Initialize leaf edges.
        for index in range(chart.num_leaves()):
            for token in lex.categories(chart.leaf(index)):
                new_edge = CCGLeafEdge(index, token, chart.leaf(index))
                chart.insert(new_edge, ())

        # Select a span for the new edges
        for span in range(2, chart.num_leaves() + 1):
            for start in range(0, chart.num_leaves() - span + 1):
                # Try all possible pairs of edges that could generate
                # an edge for that span
                for part in range(1, span):
                    lstart = start
                    mid = start + part
                    rend = start + span

                    for left in chart.select(span=(lstart, mid)):
                        for right in chart.select(span=(mid, rend)):
                            # Generate all possible combinations of the two edges
                            for rule in self._rules:
                                # The rules in this file implement apply() as
                                # a generator that inserts edges as it is
                                # consumed, so it must be exhausted even
                                # though the yielded edges are not needed.
                                for _ in rule.apply(chart, lex, left, right):
                                    pass

        # Output the resulting parses
        return chart.parses(lex.start())
class CCGChart(Chart):
    """Chart specialised for building CCG derivation trees."""

    def __init__(self, tokens):
        super().__init__(tokens)

    # Constructs the trees for a given parse. Unfortunately, the parse trees need to be
    # constructed slightly differently to those in the default Chart class, so it has to
    # be reimplemented
    def _trees(self, edge, complete, memo, tree_class):
        """
        Return the list of trees rooted at ``edge``, memoised in ``memo``.

        Leaf edges become a two-level tree (a ``"Leaf"``-labelled node over
        a token-labelled node over the word); other edges get one tree per
        combination of their children's trees.
        """
        assert complete, "CCGChart cannot build incomplete trees"

        if edge in memo:
            return memo[edge]

        if isinstance(edge, CCGLeafEdge):
            word_node = tree_class(edge.token(), [self._tokens[edge.start()]])
            leaf_node = tree_class((edge.token(), "Leaf"), [word_node])
            memo[edge] = [leaf_node]
            return [leaf_node]

        # Register an empty result before recursing, so that a recursive
        # lookup of this same edge finds an entry.
        memo[edge] = []
        built = []

        for pointer_list in self.child_pointer_lists(edge):
            per_child_trees = [
                self._trees(child_edge, complete, memo, tree_class)
                for child_edge in pointer_list
            ]
            for children in itertools.product(*per_child_trees):
                node_token = Token(
                    self._tokens[edge.start() : edge.end()],
                    edge.lhs(),
                    compute_semantics(children, edge),
                )
                node_label = (node_token, str(edge.rule()))
                built.append(tree_class(node_label, children))

        memo[edge] = built
        return built
def compute_semantics(children, edge):
    """
    Compute the semantics of the constituent built from ``children``.

    :param children: child trees; each label's first element must offer
        ``semantics()``
    :param edge: the edge being built; its ``rule()`` selects the operation
    :return: None when the first child carries no semantics, otherwise the
        result of the matching semantic composition helper
    :raises AssertionError: when two children were joined by a combinator
        that is not application, composition or substitution
    """
    if children[0].label()[0].semantics() is None:
        return None

    if len(children) == 2:
        # For backward rules the functor is the right child, so swap the
        # children to put the function first.
        if isinstance(edge.rule(), BackwardCombinator):
            children = [children[1], children[0]]

        combinator = edge.rule()._combinator
        function = children[0].label()[0].semantics()
        argument = children[1].label()[0].semantics()

        if isinstance(combinator, UndirectedFunctionApplication):
            return compute_function_semantics(function, argument)
        elif isinstance(combinator, UndirectedComposition):
            return compute_composition_semantics(function, argument)
        elif isinstance(combinator, UndirectedSubstitution):
            return compute_substitution_semantics(function, argument)
        else:
            # Format the combinator instead of concatenating it: adding a
            # str to a combinator object raises TypeError and would hide
            # the intended AssertionError.
            raise AssertionError(f"Unsupported combinator '{combinator}'")
    else:
        # A single child means the edge came from type raising.
        return compute_type_raised_semantics(children[0].label()[0].semantics())
# --------
# Displaying derivations
# --------
def printCCGDerivation(tree):
    """
    Print a derivation: one line of words, one line of their categories
    centred in matching columns, then the derivation steps.
    """
    # Get the leaves and initial categories
    leaf_cells = []
    cat_cells = []

    # Build one column per leaf, wide enough for the longer of the word and
    # its category plus two spaces of padding.
    for leaf, cat in tree.pos():
        str_cat = "%s" % cat
        column_width = 2 + max(len(leaf), len(str_cat))

        # Any odd leftover space goes to the right-hand side.
        cat_left, cat_extra = divmod(column_width - len(str_cat), 2)
        cat_cells.append(" " * cat_left + str_cat + " " * (cat_left + cat_extra))

        leaf_left, leaf_extra = divmod(column_width - len(leaf), 2)
        leaf_cells.append(" " * leaf_left + leaf + " " * (leaf_left + leaf_extra))

    print("".join(leaf_cells).rstrip())
    print("".join(cat_cells).rstrip())

    # Display the derivation steps
    printCCGTree(0, tree)
# Prints the sequence of derivation steps.
def printCCGTree(lwidth, tree):
    """
    Print the derivation steps below ``tree`` and return the right-hand
    column reached.

    :param lwidth: column at which this subtree starts
    :param tree: a ``Tree`` node, or a bare leaf (word)
    """
    # Is a leaf (word).
    # Increment the span by the space occupied by the leaf.
    if not isinstance(tree, Tree):
        return 2 + lwidth + len(tree)

    # Find the width of the current derivation step
    rwidth = lwidth
    for child in tree:
        rwidth = max(rwidth, printCCGTree(rwidth, child))

    label = tree.label()

    # Is a leaf node.
    # Don't print anything, but account for the space occupied.
    if not isinstance(label, tuple):
        label_end = 2 + lwidth + len("%s" % label)
        word_end = 2 + lwidth + len(tree[0])
        return max(rwidth, label_end, word_end)

    (token, op) = label

    if op == "Leaf":
        return rwidth

    # Pad to the left with spaces, followed by a sequence of '-'
    # and the derivation rule.
    rule_line = lwidth * " " + (rwidth - lwidth) * "-" + "%s" % op
    print(rule_line)

    # Print the resulting category on a new line.
    str_res = "%s" % (token.categ())
    if token.semantics() is not None:
        str_res += " {" + str(token.semantics()) + "}"
    respadlen = (rwidth - lwidth - len(str_res)) // 2 + lwidth
    print(respadlen * " " + str_res)
    return rwidth
### Demonstration code
# Construct the lexicon
# Demonstration lexicon, built when this module is loaded by handing the
# text below to ``fromstring``; ``demo()`` passes it to the parser.
# The text is an ordinary (non-raw) string literal, so each ``\\`` written
# below reaches ``fromstring`` as a single backslash.
lex = fromstring(
"""
:- S, NP, N, VP # Primitive categories, S is the target primitive
Det :: NP/N # Family of words
Pro :: NP
TV :: VP/NP
Modal :: (S\\NP)/VP # Backslashes need to be escaped
I => Pro # Word -> Category mapping
you => Pro
the => Det
# Variables have the special keyword 'var'
# '.' prevents permutation
# ',' prevents composition
and => var\\.,var/.,var
which => (N\\N)/(S/NP)
will => Modal # Categories can be either explicit, or families.
might => Modal
cook => TV
eat => TV
mushrooms => N
parsnips => N
bacon => N
"""
)
def demo():
    """Parse a sample sentence with the demonstration lexicon and the
    default rule set, printing every derivation found.
    """
    sentence = "I might cook and eat the bacon"
    parser = CCGChartParser(lex, DefaultRuleSet)
    for derivation in parser.parse(sentence.split()):
        printCCGDerivation(derivation)


if __name__ == "__main__":
    demo()

View File

@@ -0,0 +1,340 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CCG Combinators
"""
from abc import ABCMeta, abstractmethod
from nltk.ccg.api import FunctionalCategory
class UndirectedBinaryCombinator(metaclass=ABCMeta):
    """
    Abstract interface of a binary combinator that ignores direction.

    It only says whether a function and an argument can be combined, and
    what the resulting category is. Because nothing is assumed about
    direction, an undirected combinator covers the backward, forward and
    crossed variations alike; those restrictions must be added in the rule
    class.
    """

    @abstractmethod
    def can_combine(self, function, argument):
        """Return whether ``function`` and ``argument`` can be combined."""

    @abstractmethod
    def combine(self, function, argument):
        """Produce the categories resulting from the combination."""
class DirectedBinaryCombinator(metaclass=ABCMeta):
    """
    Abstract interface of a direction-aware wrapper around an undirected
    binary combinator.

    Given a left and a right category, an implementation decides which one
    acts as the function and which as the argument, and then whether the
    two can be combined.
    """

    @abstractmethod
    def can_combine(self, left, right):
        """Return whether ``left`` and ``right`` can be combined."""

    @abstractmethod
    def combine(self, left, right):
        """Produce the categories resulting from the combination."""
class ForwardCombinator(DirectedBinaryCombinator):
    """
    Directed combinator whose primary functor is the left category.

    Wraps an undirected combinator together with a predicate that adds
    constraints restricting the cases in which it may apply.
    """

    def __init__(self, combinator, predicate, suffix=""):
        """
        :param combinator: the undirected combinator to delegate to
        :param predicate: callable ``(left, right)`` giving extra constraints
        :param suffix: text appended to the combinator's printed name
        """
        self._combinator = combinator
        self._predicate = predicate
        self._suffix = suffix

    def can_combine(self, left, right):
        # The left category is passed to the combinator as the function.
        combinable = self._combinator.can_combine(left, right)
        return combinable and self._predicate(left, right)

    def combine(self, left, right):
        for result in self._combinator.combine(left, right):
            yield result

    def __str__(self):
        return f">{self._combinator}{self._suffix}"
class BackwardCombinator(DirectedBinaryCombinator):
    """
    The backward counterpart of ForwardCombinator: the right category is
    handed to the wrapped combinator as the function.
    """

    def __init__(self, combinator, predicate, suffix=""):
        """
        :param combinator: the undirected combinator to delegate to
        :param predicate: callable ``(left, right)`` giving extra constraints
        :param suffix: text appended to the combinator's printed name
        """
        self._combinator = combinator
        self._predicate = predicate
        self._suffix = suffix

    def can_combine(self, left, right):
        # Arguments are swapped for the combinator but not for the predicate.
        combinable = self._combinator.can_combine(right, left)
        return combinable and self._predicate(left, right)

    def combine(self, left, right):
        for result in self._combinator.combine(right, left):
            yield result

    def __str__(self):
        return f"<{self._combinator}{self._suffix}"
class UndirectedFunctionApplication(UndirectedBinaryCombinator):
    """
    Function application.

    Implements rules of the form:
    X/Y Y -> X (>)
    and the corresponding backwards application rule.
    """

    def can_combine(self, function, argument):
        """Return True when ``function`` is a function whose argument
        category unifies with ``argument``.
        """
        if function.is_function():
            return function.arg().can_unify(argument) is not None
        return False

    def combine(self, function, argument):
        """Yield the function's result category with the unifying
        substitutions applied; yield nothing when application fails.
        """
        if not function.is_function():
            return
        subs = function.arg().can_unify(argument)
        if subs is not None:
            yield function.res().substitute(subs)

    def __str__(self):
        return ""
# Predicates for function application.
# Ensures the left functor takes an argument on the right
def forwardOnly(left, right):
    """Return whether the left category's direction is forward; ``right``
    is not inspected.
    """
    left_direction = left.dir()
    return left_direction.is_forward()
# Ensures the right functor takes an argument on the left
def backwardOnly(left, right):
    """Return whether the right category's direction is backward; ``left``
    is not inspected.
    """
    right_direction = right.dir()
    return right_direction.is_backward()
# Application combinator instances
# Each pairs one UndirectedFunctionApplication with the predicate that
# checks the functor's direction: forwardOnly for the forward instance,
# backwardOnly for the backward instance.
ForwardApplication = ForwardCombinator(UndirectedFunctionApplication(), forwardOnly)
BackwardApplication = BackwardCombinator(UndirectedFunctionApplication(), backwardOnly)
class UndirectedComposition(UndirectedBinaryCombinator):
    """
    Functional composition (harmonic) combinator.

    Implements rules of the form
    X/Y Y/Z -> X/Z (B>)
    and the corresponding backwards and crossed variations.
    """

    def can_combine(self, function, argument):
        """Return True when both categories are functions that allow
        composition and the function's argument unifies with the
        argument's result.
        """
        # Can only combine two functions, and both functions must
        # allow composition.
        both_functions = function.is_function() and argument.is_function()
        if not both_functions:
            return False
        composable = function.dir().can_compose() and argument.dir().can_compose()
        if not composable:
            return False
        return function.arg().can_unify(argument.res()) is not None

    def combine(self, function, argument):
        """Yield the composed category, or nothing when composition does
        not apply.
        """
        both_functions = function.is_function() and argument.is_function()
        if not both_functions:
            return
        composable = function.dir().can_compose() and argument.dir().can_compose()
        if not composable:
            return
        subs = function.arg().can_unify(argument.res())
        if subs is None:
            return
        # The composed category keeps the argument's direction.
        yield FunctionalCategory(
            function.res().substitute(subs),
            argument.arg().substitute(subs),
            argument.dir(),
        )

    def __str__(self):
        return "B"
# Predicates for restricting application of straight composition.
def bothForward(left, right):
    """Return whether both categories have a forward direction."""
    left_forward = left.dir().is_forward()
    return left_forward and right.dir().is_forward()
def bothBackward(left, right):
    """Return whether both categories have a backward direction."""
    left_backward = left.dir().is_backward()
    return left_backward and right.dir().is_backward()
# Predicates for crossed composition
def crossedDirs(left, right):
    """Return whether the left category is forward and the right one is
    backward.
    """
    return left.dir().is_forward() and right.dir().is_backward()


def backwardBxConstraint(left, right):
    """
    Predicate for backward crossed composition.

    :return: True only when the functors are crossed inwards, both
        directions permit permuting combinators, and the left category's
        argument is primitive.
    """
    # The functors must be crossed inwards
    if not crossedDirs(left, right):
        return False
    # Permuting combinators must be allowed by BOTH categories. The
    # parentheses matter: ``not a and b`` parses as ``(not a) and b``, which
    # let the rule through whenever the right category forbade crossing.
    if not (left.dir().can_cross() and right.dir().can_cross()):
        return False
    # The resulting argument category is restricted to be primitive
    return left.arg().is_primitive()
# Straight composition combinators
# NOTE(review): these are guarded by forwardOnly/backwardOnly, each of which
# inspects only one category's direction; bothForward/bothBackward are
# defined above but not used here. Confirm that this is intended.
ForwardComposition = ForwardCombinator(UndirectedComposition(), forwardOnly)
BackwardComposition = BackwardCombinator(UndirectedComposition(), backwardOnly)
# Backward crossed composition
# The "x" suffix is appended to the combinator's printed name.
BackwardBx = BackwardCombinator(
UndirectedComposition(), backwardBxConstraint, suffix="x"
)
class UndirectedSubstitution(UndirectedBinaryCombinator):
    r"""
    Substitution (permutation) combinator.

    Implements rules of the form
    Y/Z (X\Y)/Z -> X/Z (<Sx)
    and other variations.
    """

    def can_combine(self, function, argument):
        """Return whether substitution applies to the two categories."""
        if function.is_primitive() or argument.is_primitive():
            return False

        # These could potentially be moved to the predicates, as the
        # constraints may not be general to all languages.
        if function.res().is_primitive() or not function.arg().is_primitive():
            return False

        if not function.dir().can_compose() or not argument.dir().can_compose():
            return False

        inner_matches = function.res().arg() == argument.res()
        return inner_matches and (function.arg() == argument.arg())

    def combine(self, function, argument):
        """Yield the substituted category, or nothing when ``can_combine``
        rejects the pair.
        """
        if not self.can_combine(function, argument):
            return
        yield FunctionalCategory(
            function.res().res(), argument.arg(), argument.dir()
        )

    def __str__(self):
        return "S"
# Predicate for forward substitution
def forwardSConstraint(left, right):
    """Predicate for forward substitution: both categories must be forward,
    the left category's result must itself be forward, and the left
    category's argument must be primitive.
    """
    if bothForward(left, right):
        return left.res().dir().is_forward() and left.arg().is_primitive()
    return False
# Predicate for backward crossed substitution
def backwardSxConstraint(left, right):
    """
    Predicate for backward crossed substitution.

    :return: True only when both directions permit permuting combinators,
        both categories are forward, the right category's result is
        backward and the right category's argument is primitive.
    """
    # Crossing must be allowed by BOTH categories. The parentheses matter:
    # ``not a and b`` parses as ``(not a) and b``, which let the rule through
    # whenever the right category forbade crossing.
    if not (left.dir().can_cross() and right.dir().can_cross()):
        return False
    if not bothForward(left, right):
        return False
    return right.res().dir().is_backward() and right.arg().is_primitive()
# Instances of substitution combinators
# Each pairs one UndirectedSubstitution with its direction-specific
# predicate; the backward crossed variant passes "x" as the suffix appended
# to its printed name.
ForwardSubstitution = ForwardCombinator(UndirectedSubstitution(), forwardSConstraint)
BackwardSx = BackwardCombinator(UndirectedSubstitution(), backwardSxConstraint, "x")
# Retrieves the left-most functional category.
# ie, (N\N)/(S/NP) => N\N
def innermostFunction(categ):
    """Follow result categories downwards and return the last category whose
    result is not itself a function.
    """
    result = categ.res()
    while result.is_function():
        categ = result
        result = categ.res()
    return categ
class UndirectedTypeRaise(UndirectedBinaryCombinator):
    """
    Undirected combinator for type raising.
    """

    def can_combine(self, function, arg):
        """
        Return whether ``function`` can be type-raised over ``arg``.

        Uses the same conditions as ``combine``: ``function`` must be
        primitive, ``arg`` and its result must both be functions, and
        ``function`` must unify with the argument of ``arg``'s innermost
        function.
        """
        # The argument must be a function.
        # The restriction that arg.res() must be a function
        # merely reduces redundant type-raising; if arg.res() is
        # primitive, we have:
        # X Y\X =>(<T) Y/(Y\X) Y\X =>(>) Y
        # which is equivalent to
        # X Y\X =>(<) Y
        if not (
            function.is_primitive() and arg.is_function() and arg.res().is_function()
        ):
            return False

        # Type-raising matches only the innermost application. The earlier
        # code referred to the undefined names ``left`` and ``arg_categ``
        # here and so raised NameError whenever this point was reached.
        arg = innermostFunction(arg)
        return function.can_unify(arg.arg()) is not None

    def combine(self, function, arg):
        """
        Yield the type-raised category for ``function``, or nothing when
        type raising does not apply.
        """
        if not (
            function.is_primitive() and arg.is_function() and arg.res().is_function()
        ):
            return

        # Type-raising matches only the innermost application.
        arg = innermostFunction(arg)

        subs = function.can_unify(arg.arg())
        if subs is not None:
            xcat = arg.res().substitute(subs)
            # The raised category takes the direction opposite to the
            # innermost function's; its argument keeps that function's
            # direction.
            yield FunctionalCategory(
                xcat, FunctionalCategory(xcat, function, arg.dir()), -(arg.dir())
            )

    def __str__(self):
        return "T"
# Predicates for type-raising
# The direction of the innermost category must be towards
# the primary functor.
# The restriction that the variable must be primitive is not
# common to all versions of CCGs; some authors have other restrictions.
def forwardTConstraint(left, right):
    """Predicate for forward type raising: the innermost function of the
    right category must be backward with a primitive result.
    """
    innermost = innermostFunction(right)
    points_backward = innermost.dir().is_backward()
    return points_backward and innermost.res().is_primitive()
def backwardTConstraint(left, right):
    """Predicate for backward type raising: the innermost function of the
    left category must be forward with a primitive result.
    """
    innermost = innermostFunction(left)
    points_forward = innermost.dir().is_forward()
    return points_forward and innermost.res().is_primitive()
# Instances of type-raising combinators
# Each pairs one UndirectedTypeRaise with its direction-specific predicate.
ForwardT = ForwardCombinator(UndirectedTypeRaise(), forwardTConstraint)
BackwardT = BackwardCombinator(UndirectedTypeRaise(), backwardTConstraint)

View File

@@ -0,0 +1,338 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CCG Lexicons
"""
import re
from collections import defaultdict
from nltk.ccg.api import CCGVar, Direction, FunctionalCategory, PrimitiveCategory
from nltk.internals import deprecated
from nltk.sem.logic import Expression
# ------------
# Regular expressions used for parsing components of the lexicon
# ------------
# Parses a primitive category and subscripts:
# group 1 is a run of letters, group 2 an optional bracketed list of
# letters and commas (brackets included).
PRIM_RE = re.compile(r"""([A-Za-z]+)(\[[A-Za-z,]+\])?""")
# Separates the next primitive category from the remainder of the
# string: group 1 is the primitive with any bracketed subscripts,
# group 2 is everything after it.
NEXTPRIM_RE = re.compile(r"""([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)""")
# Separates the next application operator from the remainder:
# group 1 is the slash or backslash, groups 2 and 3 are each an optional
# '.' or ',' modifier, group 4 is the rest of the string.
APP_RE = re.compile(r"""([\\/])([.,]?)([.,]?)(.*)""")
# Parses the definition of the right-hand side (rhs) of either a word or a family:
# group 1 is the identifier, group 2 the separator ('::' or an arrow made of
# '-'/'=' characters followed by '>'), group 3 the right-hand side.
LEX_RE = re.compile(r"""([\S_]+)\s*(::|[-=]+>)\s*(.+)""", re.UNICODE)
# Parses the right hand side that contains category and maybe semantic predicate:
# group 1 is the brace-free category text, group 2 the optional '{...}' part.
RHS_RE = re.compile(r"""([^{}]*[^ {}])\s*(\{[^}]+\})?""", re.UNICODE)
# Parses the semantic predicate: group 1 is the text between the braces.
SEMANTICS_RE = re.compile(r"""\{([^}]+)\}""", re.UNICODE)
# Strips comments from a line: group 1 is the text before the first '#'.
COMMENTS_RE = re.compile("""([^#]*)(?:#.*)?""")
class Token:
    """
    Class representing a token.

    token => category {semantics}
    e.g. eat => S\\var[pl]/var {\\x y.eat(x,y)}

    * `token`: the token (word) itself
    * `categ`: the token's category
    * `semantics`: the token's semantics, or None when it has none
    """

    def __init__(self, token, categ, semantics=None):
        self._token = token
        self._categ = categ
        self._semantics = semantics

    def categ(self):
        """Return the token's category."""
        return self._categ

    def semantics(self):
        """Return the token's semantics, or None."""
        return self._semantics

    def __str__(self):
        """Return the category, followed by ``" {semantics}"`` when
        semantics are present.
        """
        semantics_str = ""
        if self._semantics is not None:
            semantics_str = " {" + str(self._semantics) + "}"
        return "" + str(self._categ) + semantics_str

    def __cmp__(self, other):
        """
        Three-way comparison on ``(categ, semantics)``.

        Python 3 never calls ``__cmp__`` implicitly; it is kept for callers
        that invoke it explicitly. It previously relied on the builtin
        ``cmp``, which does not exist in Python 3, and passed it three
        arguments.

        :return: -1, 0 or 1; -1 when ``other`` is not a Token
        """
        if not isinstance(other, Token):
            return -1
        mine = (self._categ, self._semantics)
        theirs = (other.categ(), other.semantics())
        # Spelled-out equivalent of the removed ``cmp(mine, theirs)``.
        return (mine > theirs) - (mine < theirs)
class CCGLexicon:
    """
    Class representing a lexicon for CCG grammars.

    * `primitives`: The list of primitive categories for the lexicon
    * `families`: Families of categories
    * `entries`: A mapping of words to possible categories
    """

    def __init__(self, start, primitives, families, entries):
        """
        :param start: name of the target category; wrapped in a
            ``PrimitiveCategory``
        :param primitives: the primitive categories of the lexicon
        :param families: the families of categories
        :param entries: mapping of words to their possible categories
        """
        self._start = PrimitiveCategory(start)
        self._primitives = primitives
        self._families = families
        self._entries = entries

    def categories(self, word):
        """
        Returns all the possible categories for a word
        """
        return self._entries[word]

    def start(self):
        """
        Return the target category for the parser
        """
        return self._start

    def __str__(self):
        """
        String representation of the lexicon. Used for debugging.

        One line per word, in sorted order, of the form
        ``word => cat1 | cat2``.
        """
        # Joining keeps line and category separators independent. The
        # earlier loop shared one ``first`` flag between both levels, so the
        # newline after a word with no categories was dropped.
        return "\n".join(
            ident + " => " + " | ".join("%s" % cat for cat in self._entries[ident])
            for ident in sorted(self._entries)
        )
# -----------
# Parsing lexicons
# -----------
def matchBrackets(string):
    """
    Separate the contents matching the first set of brackets from the rest of
    the input.

    The first character of ``string`` is taken to be the opening bracket
    (it is not checked).  Returns ``(bracketed, rest)`` where ``bracketed``
    runs from ``"("`` to its matching ``")"`` inclusive and ``rest`` is the
    text after it.

    :raise AssertionError: if a bracket is never closed; the message quotes
        the text starting at the innermost unclosed bracket.
    """
    # Positions of the brackets that are currently open.  A single linear
    # scan replaces per-level recursion, so nesting depth is unbounded and
    # no intermediate strings are rebuilt.
    open_positions = [0]
    for pos in range(1, len(string)):
        char = string[pos]
        if char == "(":
            open_positions.append(pos)
        elif char == ")":
            open_positions.pop()
            if not open_positions:
                return ("(" + string[1 : pos + 1], string[pos + 1 :])
    raise AssertionError(
        "Unmatched bracket in string '" + string[open_positions[-1] :] + "'"
    )
def nextCategory(string):
    """
    Split off the leading category of ``string``.

    Returns a pair ``(category_text, remainder)``: a bracketed group is cut
    out with ``matchBrackets``; anything else is split with ``NEXTPRIM_RE``.
    """
    if not string.startswith("("):
        primitive_match = NEXTPRIM_RE.match(string)
        return primitive_match.groups()
    return matchBrackets(string)
def parseApplication(app):
    """
    Build the ``Direction`` for an application operator.

    ``app`` is a sequence whose first item is the slash and whose remaining
    items are the modifiers (``augParseCategory`` passes the first three
    groups of an ``APP_RE`` match).
    """
    slash = app[0]
    modifiers = app[1:]
    return Direction(slash, modifiers)
def parseSubscripts(subscr):
    """
    Turn a subscript string such as ``"[sg,pl]"`` into a list of names.

    The first and last characters (the brackets) are dropped and the rest
    is split on commas.  A missing or empty subscript gives ``[]``.
    """
    if not subscr:
        return []
    inner = subscr[1:-1]
    return inner.split(",")
def parsePrimitiveCategory(chunks, primitives, families, var):
    """
    Resolve one primitive chunk to a category.

    ``chunks`` is a ``(name, subscript_text)`` pair.  Returns
    ``(category, var)`` where ``var`` is the CCG variable in effect after
    this chunk:

    * the bare name ``var`` (no subscript) yields the current variable,
      creating a fresh ``CCGVar`` when none exists yet;
    * a family name yields the family's category, with the family's own
      variable replaced by the current one when there is one;
    * a primitive name yields a ``PrimitiveCategory`` with its subscripts.

    :raise AssertionError: if the name is neither a family nor a primitive.
    """
    catstr = chunks[0]

    # ``var`` with a subscript deliberately falls through to the lookups below.
    if catstr == "var" and chunks[1] is None:
        shared = CCGVar() if var is None else var
        return (shared, shared)

    if catstr in families:
        (family_cat, family_var) = families[catstr]
        if var is not None:
            return (family_cat.substitute([(family_var, var)]), var)
        return (family_cat, family_var)

    if catstr not in primitives:
        raise AssertionError(
            "String '" + catstr + "' is neither a family nor primitive category."
        )

    return (PrimitiveCategory(catstr, parseSubscripts(chunks[1])), var)
def _parseOperand(cat_string, primitives, families, var):
    """
    Parse a single operand: a parenthesised sub-category (parsed
    recursively without its outer brackets) or a primitive category.
    """
    if cat_string.startswith("("):
        return augParseCategory(cat_string[1:-1], primitives, families, var)
    return parsePrimitiveCategory(
        PRIM_RE.match(cat_string).groups(), primitives, families, var
    )


def augParseCategory(line, primitives, families, var=None):
    """
    Parse the text of a category.

    Returns ``(category, var)``, where ``var`` is the CCG variable used by
    the category, if any.  Operators are applied left to right, so each new
    argument wraps everything parsed so far.
    """
    (head, remainder) = nextCategory(line)
    (res, var) = _parseOperand(head, primitives, families, var)

    while remainder != "":
        (slash, first_mod, second_mod, after_op) = APP_RE.match(remainder).groups()
        direction = parseApplication((slash, first_mod, second_mod))

        (operand, remainder) = nextCategory(after_op)
        (arg, var) = _parseOperand(operand, primitives, families, var)
        res = FunctionalCategory(res, arg, direction)

    return (res, var)
def fromstring(lex_str, include_semantics=False):
    """
    Build a ``CCGLexicon`` from its textual description.

    Each non-empty line (after ``#`` comments are removed) is one of:

    * ``:- S, N, NP`` -- primitive categories; the first one declared
      becomes the target category;
    * ``Name :: category`` -- a family definition;
    * ``word => category {semantics}`` -- a word definition.

    Semantics are parsed only when ``include_semantics`` is exactly
    ``True``; in that case every word definition must carry a ``{...}``
    part.

    :raise AssertionError: if ``include_semantics`` is ``True`` and a word
        definition has no semantics.
    """
    CCGVar.reset_id()
    primitives = []
    families = {}
    entries = defaultdict(list)

    for raw_line in lex_str.splitlines():
        # Drop the comment, then surrounding whitespace.
        line = COMMENTS_RE.match(raw_line).group(1).strip()
        if line == "":
            continue

        if line.startswith(":-"):
            # Primitive declaration, e.g. ":- S, N, NP, VP".
            declared = line[2:].strip().split(",")
            primitives = primitives + [prim.strip() for prim in declared]
            continue

        # Anything else defines either a family or a word.
        (ident, sep, rhs) = LEX_RE.match(line).groups()
        (catstr, semantics_str) = RHS_RE.match(rhs).groups()
        (cat, var) = augParseCategory(catstr, primitives, families)

        if sep == "::":
            # Family definition, e.g. "Det :: NP/N".
            families[ident] = (cat, var)
            continue

        # Word definition, e.g. "which => (N\N)/(S/NP)".
        semantics = None
        if include_semantics is True:
            if semantics_str is None:
                raise AssertionError(
                    line
                    + " must contain semantics because include_semantics is set to True"
                )
            semantics = Expression.fromstring(
                SEMANTICS_RE.match(semantics_str).group(1)
            )
        entries[ident].append(Token(ident, cat, semantics))

    return CCGLexicon(primitives[0], primitives, families, entries)
@deprecated("Use fromstring() instead.")
def parseLexicon(lex_str):
    """
    Deprecated alias for ``fromstring``.

    Forwards ``lex_str`` unchanged and leaves ``include_semantics`` at its
    default, so the resulting lexicon carries no semantics.
    """
    return fromstring(lex_str)
# Sample lexicon, parsed once when this module is imported, from the
# definition embedded below: a primitive declaration, several families,
# then the word entries.
openccg_tinytiny = fromstring(
"""
# Rather minimal lexicon based on the openccg `tinytiny' grammar.
# Only incorporates a subset of the morphological subcategories, however.
:- S,NP,N # Primitive categories
Det :: NP/N # Determiners
Pro :: NP
IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
IntransVpl :: S\\NP[pl] # Plural
TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
TransVpl :: S\\NP[pl]/NP # Plural
the => NP[sg]/N[sg]
the => NP[pl]/N[pl]
I => Pro
me => Pro
we => Pro
us => Pro
book => N[sg]
books => N[pl]
peach => N[sg]
peaches => N[pl]
policeman => N[sg]
policemen => N[pl]
boy => N[sg]
boys => N[pl]
sleep => IntransVsg
sleep => IntransVpl
eat => IntransVpl
eat => TransVpl
eats => IntransVsg
eats => TransVsg
see => TransVpl
sees => TransVsg
"""
)

View File

@@ -0,0 +1,63 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Tanin Na Nakorn (@tanin)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Helper functions for CCG semantics computation
"""
import copy
from nltk.sem.logic import *
def compute_type_raised_semantics(semantics):
    """
    Return the type-raised form of ``semantics``.

    Works on a deep copy, so the argument is left untouched.  The innermost
    body (below any leading lambdas) is wrapped in an application of a new
    function variable, and the whole expression is then abstracted over
    that variable.  The variable starts as ``F`` and is renamed until it is
    not free in the body.
    """
    raised = copy.deepcopy(semantics)

    # Walk down the chain of lambdas, remembering the last one seen.
    innermost_lambda = None
    body = raised
    while isinstance(body, LambdaExpression):
        innermost_lambda, body = body, body.term

    func_var = Variable("F")
    while func_var in body.free():
        func_var = unique_variable(pattern=func_var)

    wrapped = ApplicationExpression(FunctionVariableExpression(func_var), body)
    if innermost_lambda is None:
        # No lambdas at all: the wrapped body is the whole expression.
        raised = wrapped
    else:
        innermost_lambda.term = wrapped

    return LambdaExpression(func_var, raised)
def compute_function_semantics(function, argument):
    """
    Return the simplified application of ``function`` to ``argument``.
    """
    application = ApplicationExpression(function, argument)
    return application.simplify()
def compute_composition_semantics(function, argument):
    """
    Return the semantics of composing ``function`` with ``argument``.

    The result is a lambda over ``argument``'s variable whose body is the
    simplified application of ``function`` to ``argument``'s body.

    :raise AssertionError: if ``argument`` is not a ``LambdaExpression``.
    """
    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped when Python runs with -O.
    if not isinstance(argument, LambdaExpression):
        raise AssertionError("`" + str(argument) + "` must be a lambda expression")
    return LambdaExpression(
        argument.variable, ApplicationExpression(function, argument.term).simplify()
    )
def compute_substitution_semantics(function, argument):
    """
    Return the semantics of the substitution combinator.

    ``argument`` is applied to ``function``'s outer variable, ``function``'s
    inner lambda is applied to that result, and the outcome is abstracted
    over ``function``'s outer variable again.

    :raise AssertionError: if ``function`` is not a lambda whose body is
        also a lambda, or if ``argument`` is not a ``LambdaExpression``.
    """
    # Raised explicitly rather than via ``assert`` so the checks are not
    # stripped when Python runs with -O.
    if not (
        isinstance(function, LambdaExpression)
        and isinstance(function.term, LambdaExpression)
    ):
        raise AssertionError(
            "`" + str(function) + "` must be a lambda expression with 2 arguments"
        )
    if not isinstance(argument, LambdaExpression):
        raise AssertionError("`" + str(argument) + "` must be a lambda expression")

    new_argument = ApplicationExpression(
        argument, VariableExpression(function.variable)
    ).simplify()
    new_term = ApplicationExpression(function.term, new_argument).simplify()
    return LambdaExpression(function.variable, new_term)