Iliyan Angelov
2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions


@@ -0,0 +1 @@
3.9.2


@@ -0,0 +1,208 @@
# Natural Language Toolkit (NLTK)
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
The Natural Language Toolkit (NLTK) is an open source Python library
for Natural Language Processing. A free online book is available.
(If you use the library for academic research, please cite the book.)
Steven Bird, Ewan Klein, and Edward Loper (2009).
Natural Language Processing with Python. O'Reilly Media Inc.
https://www.nltk.org/book/
isort:skip_file
"""
import os
import importlib
# //////////////////////////////////////////////////////
# Metadata
# //////////////////////////////////////////////////////
# Version. For each new release, the version number should be updated
# in the file VERSION.
try:
# If a VERSION file exists, use it!
version_file = os.path.join(os.path.dirname(__file__), "VERSION")
with open(version_file) as infile:
__version__ = infile.read().strip()
except NameError:
__version__ = "unknown (running code interactively?)"
except OSError as ex:
__version__ = "unknown (%s)" % ex
if __doc__ is not None: # fix for ``python -OO``, which strips docstrings
__doc__ += "\n@version: " + __version__
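# A quick check of which release is installed (the VERSION file in this
# commit pins it to 3.9.2), shown as a doctest-style sketch:
#
#     >>> import nltk
#     >>> nltk.__version__
#     '3.9.2'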
# Copyright notice
__copyright__ = """\
Copyright (C) 2001-2025 NLTK Project.
Distributed and Licensed under the Apache License, Version 2.0,
which is included by reference.
"""
__license__ = "Apache License, Version 2.0"
# Description of the toolkit, keywords, and the project's primary URL.
__longdescr__ = """\
The Natural Language Toolkit (NLTK) is a Python package for
natural language processing. NLTK requires Python 3.9, 3.10, 3.11, 3.12 or 3.13."""
__keywords__ = [
"NLP",
"CL",
"natural language processing",
"computational linguistics",
"parsing",
"tagging",
"tokenizing",
"syntax",
"linguistics",
"language",
"natural language",
"text analytics",
]
__url__ = "https://www.nltk.org/"
# Maintainer, contributors, etc.
__maintainer__ = "NLTK Team"
__maintainer_email__ = "nltk.team@gmail.com"
__author__ = __maintainer__
__author_email__ = __maintainer_email__
# "Trove" classifiers for Python Package Index.
__classifiers__ = [
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: Information Technology",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Topic :: Scientific/Engineering",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Human Machine Interfaces",
"Topic :: Scientific/Engineering :: Information Analysis",
"Topic :: Text Processing",
"Topic :: Text Processing :: Filters",
"Topic :: Text Processing :: General",
"Topic :: Text Processing :: Indexing",
"Topic :: Text Processing :: Linguistic",
]
from nltk.internals import config_java
# Support numpy on PyPy, where it is provided as ``numpypy``
try:
import numpypy
except ImportError:
pass
# Stub out subprocess attributes that are missing on restricted environments such as Google App Engine (GAE).
import subprocess
if not hasattr(subprocess, "PIPE"):
def _fake_PIPE(*args, **kwargs):
raise NotImplementedError("subprocess.PIPE is not supported.")
subprocess.PIPE = _fake_PIPE
if not hasattr(subprocess, "Popen"):
def _fake_Popen(*args, **kwargs):
raise NotImplementedError("subprocess.Popen is not supported.")
subprocess.Popen = _fake_Popen
###########################################################
# TOP-LEVEL MODULES
###########################################################
# Import top-level functionality into top-level namespace
from nltk.collocations import *
from nltk.decorators import decorator, memoize
from nltk.featstruct import *
from nltk.grammar import *
from nltk.probability import *
from nltk.text import *
from nltk.util import *
from nltk.jsontags import *
###########################################################
# PACKAGES
###########################################################
from nltk.chunk import *
from nltk.classify import *
from nltk.inference import *
from nltk.metrics import *
from nltk.parse import *
from nltk.tag import *
from nltk.tokenize import *
from nltk.translate import *
from nltk.tree import *
from nltk.sem import *
from nltk.stem import *
# Packages which can be lazily imported
# (a) we don't import *
# (b) they're slow to import or have run-time dependencies
# that can safely fail at run time
from nltk import lazyimport
app = lazyimport.LazyModule("app", locals(), globals())
chat = lazyimport.LazyModule("chat", locals(), globals())
corpus = lazyimport.LazyModule("corpus", locals(), globals())
draw = lazyimport.LazyModule("draw", locals(), globals())
toolbox = lazyimport.LazyModule("toolbox", locals(), globals())
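# A minimal sketch of how these lazy proxies behave (assumption: the Brown
# corpus has been fetched with nltk.download("brown")).  The real package is
# only imported on first attribute access:
#
#     >>> import nltk
#     >>> nltk.corpus.brown.words()[:3]  # first access triggers the import
#     ['The', 'Fulton', 'County']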
# Optional loading
try:
import numpy
except ImportError:
pass
else:
from nltk import cluster
from nltk.downloader import download, download_shell
# Check if tkinter exists without importing it to avoid crashes after
# forks on macOS. Only nltk.app, nltk.draw, and demo modules should
# have top-level tkinter imports. See #2949 for more details.
if importlib.util.find_spec("tkinter"):
try:
from nltk.downloader import download_gui
except RuntimeError as e:
import warnings
warnings.warn(
"Corpus downloader GUI not loaded "
"(RuntimeError during import: %s)" % str(e)
)
# explicitly import all top-level modules (ensuring
# they override the same names inadvertently imported
# from a subpackage)
from nltk import ccg, chunk, classify, collocations
from nltk import data, featstruct, grammar, help, inference, metrics
from nltk import misc, parse, probability, sem, stem, wsd
from nltk import tag, tbl, text, tokenize, translate, tree, util
# FIXME: override any accidentally imported demo, see https://github.com/nltk/nltk/issues/2116
def demo():
print("To run the demo code for a module, type nltk.module.demo()")


@@ -0,0 +1,47 @@
# Natural Language Toolkit: Applications package
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Interactive NLTK Applications:
chartparser: Chart Parser
chunkparser: Regular-Expression Chunk Parser
collocations: Find collocations in text
concordance: Part-of-speech concordancer
nemo: Finding (and Replacing) Nemo regular expression tool
rdparser: Recursive Descent Parser
srparser: Shift-Reduce Parser
wordnet: WordNet Browser
"""
# Import Tkinter-based modules if Tkinter is installed
try:
import tkinter
except ImportError:
import warnings
warnings.warn("nltk.app package not loaded (please install Tkinter library).")
else:
from nltk.app.chartparser_app import app as chartparser
from nltk.app.chunkparser_app import app as chunkparser
from nltk.app.collocations_app import app as collocations
from nltk.app.concordance_app import app as concordance
from nltk.app.nemo_app import app as nemo
from nltk.app.rdparser_app import app as rdparser
from nltk.app.srparser_app import app as srparser
from nltk.app.wordnet_app import app as wordnet
try:
from matplotlib import pylab
except ImportError:
import warnings
warnings.warn("nltk.app.wordfreq not loaded (requires the matplotlib library).")
else:
from nltk.app.wordfreq_app import app as wordfreq

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,438 @@
# Natural Language Toolkit: Collocations Application
# Much of the GUI code is imported from concordance.py; we intend to merge these tools together.
# Copyright (C) 2001-2025 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
import queue as q
import threading
from tkinter import (
END,
LEFT,
SUNKEN,
Button,
Frame,
IntVar,
Label,
Menu,
OptionMenu,
Scrollbar,
StringVar,
Text,
Tk,
)
from tkinter.font import Font
from nltk.corpus import (
alpino,
brown,
cess_cat,
cess_esp,
floresta,
indian,
mac_morpho,
machado,
nps_chat,
sinica_treebank,
treebank,
)
from nltk.probability import FreqDist
from nltk.util import in_idle
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
POLL_INTERVAL = 100
_DEFAULT = "English: Brown Corpus (Humor)"
_CORPORA = {
"Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
"English: Brown Corpus": lambda: brown.words(),
"English: Brown Corpus (Press)": lambda: brown.words(
categories=["news", "editorial", "reviews"]
),
"English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
"English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
"English: Brown Corpus (Science Fiction)": lambda: brown.words(
categories="science_fiction"
),
"English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
"English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
"English: NPS Chat Corpus": lambda: nps_chat.words(),
"English: Wall Street Journal Corpus": lambda: treebank.words(),
"Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
"Dutch: Alpino Corpus": lambda: alpino.words(),
"Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
"Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
"Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
"Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
"Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}
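# NB: each corpus above is wrapped in a lambda so that nothing is loaded at
# import time; the selected corpus is only read when CollocationsModel's
# LoadCorpus thread calls it in the background.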
class CollocationsView:
_BACKGROUND_COLOUR = "#FFF" # white
def __init__(self):
self.queue = q.Queue()
self.model = CollocationsModel(self.queue)
self.top = Tk()
self._init_top(self.top)
self._init_menubar()
self._init_widgets(self.top)
self.load_corpus(self.model.DEFAULT_CORPUS)
self.after = self.top.after(POLL_INTERVAL, self._poll)
def _init_top(self, top):
top.geometry("550x650+50+50")
top.title("NLTK Collocations List")
top.bind("<Control-q>", self.destroy)
top.protocol("WM_DELETE_WINDOW", self.destroy)
top.minsize(550, 650)
def _init_widgets(self, parent):
self.main_frame = Frame(
parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
)
self._init_corpus_select(self.main_frame)
self._init_results_box(self.main_frame)
self._init_paging(self.main_frame)
self._init_status(self.main_frame)
self.main_frame.pack(fill="both", expand=True)
def _init_corpus_select(self, parent):
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
self.var = StringVar(innerframe)
self.var.set(self.model.DEFAULT_CORPUS)
Label(
innerframe,
justify=LEFT,
text=" Corpus: ",
background=self._BACKGROUND_COLOUR,
padx=2,
pady=1,
border=0,
).pack(side="left")
om = OptionMenu(
innerframe,
self.var,
self.model.DEFAULT_CORPUS,
command=self.corpus_selected,
*self.model.non_default_corpora()
)
om["borderwidth"] = 0
om["highlightthickness"] = 1
om.pack(side="left")
innerframe.pack(side="top", fill="x", anchor="n")
def _init_status(self, parent):
self.status = Label(
parent,
justify=LEFT,
relief=SUNKEN,
background=self._BACKGROUND_COLOUR,
border=0,
padx=1,
pady=0,
)
self.status.pack(side="top", anchor="sw")
def _init_menubar(self):
self._result_size = IntVar(self.top)
menubar = Menu(self.top)
filemenu = Menu(menubar, tearoff=0, borderwidth=0)
filemenu.add_command(
label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
)
menubar.add_cascade(label="File", underline=0, menu=filemenu)
editmenu = Menu(menubar, tearoff=0)
rescntmenu = Menu(editmenu, tearoff=0)
rescntmenu.add_radiobutton(
label="20",
variable=self._result_size,
underline=0,
value=20,
command=self.set_result_size,
)
rescntmenu.add_radiobutton(
label="50",
variable=self._result_size,
underline=0,
value=50,
command=self.set_result_size,
)
rescntmenu.add_radiobutton(
label="100",
variable=self._result_size,
underline=0,
value=100,
command=self.set_result_size,
)
rescntmenu.invoke(1)
editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)
menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
self.top.config(menu=menubar)
def set_result_size(self, **kwargs):
self.model.result_count = self._result_size.get()
def _init_results_box(self, parent):
innerframe = Frame(parent)
i1 = Frame(innerframe)
i2 = Frame(innerframe)
vscrollbar = Scrollbar(i1, borderwidth=1)
hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
self.results_box = Text(
i1,
font=Font(family="courier", size="16"),
state="disabled",
borderwidth=1,
yscrollcommand=vscrollbar.set,
xscrollcommand=hscrollbar.set,
wrap="none",
width="40",
height="20",
exportselection=1,
)
self.results_box.pack(side="left", fill="both", expand=True)
vscrollbar.pack(side="left", fill="y", anchor="e")
vscrollbar.config(command=self.results_box.yview)
hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
hscrollbar.config(command=self.results_box.xview)
# There is no other way to avoid the scrollbars overlapping while using the pack layout manager.
Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack(
side="left", anchor="e"
)
i1.pack(side="top", fill="both", expand=True, anchor="n")
i2.pack(side="bottom", fill="x", anchor="s")
innerframe.pack(side="top", fill="both", expand=True)
def _init_paging(self, parent):
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
self.prev = prev = Button(
innerframe,
text="Previous",
command=self.previous,
width="10",
borderwidth=1,
highlightthickness=1,
state="disabled",
)
prev.pack(side="left", anchor="center")
self.next = next = Button(
innerframe,
text="Next",
command=self.__next__,
width="10",
borderwidth=1,
highlightthickness=1,
state="disabled",
)
next.pack(side="right", anchor="center")
innerframe.pack(side="top", fill="y")
self.reset_current_page()
def reset_current_page(self):
self.current_page = -1
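# Corpus loading happens in a background thread (CollocationsModel.LoadCorpus),
# which reports back through self.queue; _poll checks that queue every
# POLL_INTERVAL milliseconds from the Tk event loop so that all widget updates
# stay on the main thread.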
def _poll(self):
try:
event = self.queue.get(block=False)
except q.Empty:
pass
else:
if event == CORPUS_LOADED_EVENT:
self.handle_corpus_loaded(event)
elif event == ERROR_LOADING_CORPUS_EVENT:
self.handle_error_loading_corpus(event)
self.after = self.top.after(POLL_INTERVAL, self._poll)
def handle_error_loading_corpus(self, event):
self.status["text"] = "Error in loading " + self.var.get()
self.unfreeze_editable()
self.clear_results_box()
self.freeze_editable()
self.reset_current_page()
def handle_corpus_loaded(self, event):
self.status["text"] = self.var.get() + " is loaded"
self.unfreeze_editable()
self.clear_results_box()
self.reset_current_page()
# self.next()
collocations = self.model.next(self.current_page + 1)
self.write_results(collocations)
self.current_page += 1
def corpus_selected(self, *args):
new_selection = self.var.get()
self.load_corpus(new_selection)
def previous(self):
self.freeze_editable()
collocations = self.model.prev(self.current_page - 1)
self.current_page = self.current_page - 1
self.clear_results_box()
self.write_results(collocations)
self.unfreeze_editable()
def __next__(self):
self.freeze_editable()
collocations = self.model.next(self.current_page + 1)
self.clear_results_box()
self.write_results(collocations)
self.current_page += 1
self.unfreeze_editable()
def load_corpus(self, selection):
if self.model.selected_corpus != selection:
self.status["text"] = "Loading " + selection + "..."
self.freeze_editable()
self.model.load_corpus(selection)
def freeze_editable(self):
self.prev["state"] = "disabled"
self.next["state"] = "disabled"
def clear_results_box(self):
self.results_box["state"] = "normal"
self.results_box.delete("1.0", END)
self.results_box["state"] = "disabled"
def fire_event(self, event):
# Firing an event so that rendering of widgets happens in the mainloop thread
self.top.event_generate(event, when="tail")
def destroy(self, *e):
if self.top is None:
return
self.top.after_cancel(self.after)
self.top.destroy()
self.top = None
def mainloop(self, *args, **kwargs):
if in_idle():
return
self.top.mainloop(*args, **kwargs)
def unfreeze_editable(self):
self.set_paging_button_states()
def set_paging_button_states(self):
if self.current_page == -1 or self.current_page == 0:
self.prev["state"] = "disabled"
else:
self.prev["state"] = "normal"
if self.model.is_last_page(self.current_page):
self.next["state"] = "disabled"
else:
self.next["state"] = "normal"
def write_results(self, results):
self.results_box["state"] = "normal"
row = 1
for each in results:
self.results_box.insert(str(row) + ".0", each[0] + " " + each[1] + "\n")
row += 1
self.results_box["state"] = "disabled"
class CollocationsModel:
def __init__(self, queue):
self.result_count = None
self.selected_corpus = None
self.collocations = None
self.CORPORA = _CORPORA
self.DEFAULT_CORPUS = _DEFAULT
self.queue = queue
self.reset_results()
def reset_results(self):
self.result_pages = []
self.results_returned = 0
def load_corpus(self, name):
self.selected_corpus = name
self.collocations = None
runner_thread = self.LoadCorpus(name, self)
runner_thread.start()
self.reset_results()
def non_default_corpora(self):
copy = []
copy.extend(list(self.CORPORA.keys()))
copy.remove(self.DEFAULT_CORPUS)
copy.sort()
return copy
def is_last_page(self, number):
if number < len(self.result_pages):
return False
return self.results_returned + (
number - len(self.result_pages)
) * self.result_count >= len(self.collocations)
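# Result pages are built lazily: next() slices the next result_count
# collocations off self.collocations the first time a page is requested and
# caches each slice in self.result_pages, so prev() can simply return a
# previously cached page.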
def next(self, page):
if (len(self.result_pages) - 1) < page:
for i in range(page - (len(self.result_pages) - 1)):
self.result_pages.append(
self.collocations[
self.results_returned : self.results_returned
+ self.result_count
]
)
self.results_returned += self.result_count
return self.result_pages[page]
def prev(self, page):
if page == -1:
return []
return self.result_pages[page]
class LoadCorpus(threading.Thread):
def __init__(self, name, model):
threading.Thread.__init__(self)
self.model, self.name = model, name
def run(self):
try:
words = self.model.CORPORA[self.name]()
from operator import itemgetter
text = [w for w in words if len(w) > 2]
fd = FreqDist(tuple(text[i : i + 2]) for i in range(len(text) - 1))
vocab = FreqDist(text)
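# Score each adjacent word pair with a simple association measure:
# freq(w1, w2) ** 3 / (freq(w1) * freq(w2)).  Cubing the bigram count favours
# pairs that co-occur often in absolute terms; the highest-scoring pairs are
# shown first as collocations.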
scored = [
((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2]))
for w1, w2 in fd
]
scored.sort(key=itemgetter(1), reverse=True)
self.model.collocations = list(map(itemgetter(0), scored))
self.model.queue.put(CORPUS_LOADED_EVENT)
except Exception as e:
print(e)
self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)
# def collocations():
# colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]]
def app():
c = CollocationsView()
c.mainloop()
if __name__ == "__main__":
app()
__all__ = ["app"]


@@ -0,0 +1,709 @@
# Natural Language Toolkit: Concordance Application
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import queue as q
import re
import threading
from tkinter import (
END,
LEFT,
SUNKEN,
Button,
Entry,
Frame,
IntVar,
Label,
Menu,
OptionMenu,
Scrollbar,
StringVar,
Text,
Tk,
)
from tkinter.font import Font
from nltk.corpus import (
alpino,
brown,
cess_cat,
cess_esp,
floresta,
indian,
mac_morpho,
nps_chat,
sinica_treebank,
treebank,
)
from nltk.draw.util import ShowText
from nltk.util import in_idle
WORD_OR_TAG = "[^/ ]+"
BOUNDARY = r"\b"
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>"
SEARCH_ERROR_EVENT = "<<SE_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
POLL_INTERVAL = 50
# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.
_DEFAULT = "English: Brown Corpus (Humor, simplified)"
_CORPORA = {
"Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(
tagset="universal"
),
"English: Brown Corpus": lambda: brown.tagged_sents(),
"English: Brown Corpus (simplified)": lambda: brown.tagged_sents(
tagset="universal"
),
"English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(
categories=["news", "editorial", "reviews"], tagset="universal"
),
"English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(
categories="religion", tagset="universal"
),
"English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(
categories="learned", tagset="universal"
),
"English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
categories="science_fiction", tagset="universal"
),
"English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(
categories="romance", tagset="universal"
),
"English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(
categories="humor", tagset="universal"
),
"English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
"English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(
tagset="universal"
),
"English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
"English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(
tagset="universal"
),
"Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
"Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
tagset="universal"
),
"Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
"Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
tagset="universal"
),
"Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
"Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(
files="hindi.pos", tagset="universal"
),
"Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
"Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(
tagset="universal"
),
"Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
"Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(
tagset="universal"
),
"Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(
tagset="universal"
),
}
class ConcordanceSearchView:
_BACKGROUND_COLOUR = "#FFF" # white
# Colour of highlighted results
_HIGHLIGHT_WORD_COLOUR = "#F00" # red
_HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"
_HIGHLIGHT_LABEL_COLOUR = "#C0C0C0" # dark grey
_HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG"
# Percentage of text left of the scrollbar position
_FRACTION_LEFT_TEXT = 0.30
def __init__(self):
self.queue = q.Queue()
self.model = ConcordanceSearchModel(self.queue)
self.top = Tk()
self._init_top(self.top)
self._init_menubar()
self._init_widgets(self.top)
self.load_corpus(self.model.DEFAULT_CORPUS)
self.after = self.top.after(POLL_INTERVAL, self._poll)
def _init_top(self, top):
top.geometry("950x680+50+50")
top.title("NLTK Concordance Search")
top.bind("<Control-q>", self.destroy)
top.protocol("WM_DELETE_WINDOW", self.destroy)
top.minsize(950, 680)
def _init_widgets(self, parent):
self.main_frame = Frame(
parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
)
self._init_corpus_select(self.main_frame)
self._init_query_box(self.main_frame)
self._init_results_box(self.main_frame)
self._init_paging(self.main_frame)
self._init_status(self.main_frame)
self.main_frame.pack(fill="both", expand=True)
def _init_menubar(self):
self._result_size = IntVar(self.top)
self._cntx_bf_len = IntVar(self.top)
self._cntx_af_len = IntVar(self.top)
menubar = Menu(self.top)
filemenu = Menu(menubar, tearoff=0, borderwidth=0)
filemenu.add_command(
label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
)
menubar.add_cascade(label="File", underline=0, menu=filemenu)
editmenu = Menu(menubar, tearoff=0)
rescntmenu = Menu(editmenu, tearoff=0)
rescntmenu.add_radiobutton(
label="20",
variable=self._result_size,
underline=0,
value=20,
command=self.set_result_size,
)
rescntmenu.add_radiobutton(
label="50",
variable=self._result_size,
underline=0,
value=50,
command=self.set_result_size,
)
rescntmenu.add_radiobutton(
label="100",
variable=self._result_size,
underline=0,
value=100,
command=self.set_result_size,
)
rescntmenu.invoke(1)
editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)
cntxmenu = Menu(editmenu, tearoff=0)
cntxbfmenu = Menu(cntxmenu, tearoff=0)
cntxbfmenu.add_radiobutton(
label="60 characters",
variable=self._cntx_bf_len,
underline=0,
value=60,
command=self.set_cntx_bf_len,
)
cntxbfmenu.add_radiobutton(
label="80 characters",
variable=self._cntx_bf_len,
underline=0,
value=80,
command=self.set_cntx_bf_len,
)
cntxbfmenu.add_radiobutton(
label="100 characters",
variable=self._cntx_bf_len,
underline=0,
value=100,
command=self.set_cntx_bf_len,
)
cntxbfmenu.invoke(1)
cntxmenu.add_cascade(label="Before", underline=0, menu=cntxbfmenu)
cntxafmenu = Menu(cntxmenu, tearoff=0)
cntxafmenu.add_radiobutton(
label="70 characters",
variable=self._cntx_af_len,
underline=0,
value=70,
command=self.set_cntx_af_len,
)
cntxafmenu.add_radiobutton(
label="90 characters",
variable=self._cntx_af_len,
underline=0,
value=90,
command=self.set_cntx_af_len,
)
cntxafmenu.add_radiobutton(
label="110 characters",
variable=self._cntx_af_len,
underline=0,
value=110,
command=self.set_cntx_af_len,
)
cntxafmenu.invoke(1)
cntxmenu.add_cascade(label="After", underline=0, menu=cntxafmenu)
editmenu.add_cascade(label="Context", underline=0, menu=cntxmenu)
menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
self.top.config(menu=menubar)
def set_result_size(self, **kwargs):
self.model.result_count = self._result_size.get()
def set_cntx_af_len(self, **kwargs):
self._char_after = self._cntx_af_len.get()
def set_cntx_bf_len(self, **kwargs):
self._char_before = self._cntx_bf_len.get()
def _init_corpus_select(self, parent):
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
self.var = StringVar(innerframe)
self.var.set(self.model.DEFAULT_CORPUS)
Label(
innerframe,
justify=LEFT,
text=" Corpus: ",
background=self._BACKGROUND_COLOUR,
padx=2,
pady=1,
border=0,
).pack(side="left")
om = OptionMenu(
innerframe,
self.var,
self.model.DEFAULT_CORPUS,
command=self.corpus_selected,
*self.model.non_default_corpora()
)
om["borderwidth"] = 0
om["highlightthickness"] = 1
om.pack(side="left")
innerframe.pack(side="top", fill="x", anchor="n")
def _init_status(self, parent):
self.status = Label(
parent,
justify=LEFT,
relief=SUNKEN,
background=self._BACKGROUND_COLOUR,
border=0,
padx=1,
pady=0,
)
self.status.pack(side="top", anchor="sw")
def _init_query_box(self, parent):
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
another = Frame(innerframe, background=self._BACKGROUND_COLOUR)
self.query_box = Entry(another, width=60)
self.query_box.pack(side="left", fill="x", pady=25, anchor="center")
self.search_button = Button(
another,
text="Search",
command=self.search,
borderwidth=1,
highlightthickness=1,
)
self.search_button.pack(side="left", fill="x", pady=25, anchor="center")
self.query_box.bind("<KeyPress-Return>", self.search_enter_keypress_handler)
another.pack()
innerframe.pack(side="top", fill="x", anchor="n")
def search_enter_keypress_handler(self, *event):
self.search()
def _init_results_box(self, parent):
innerframe = Frame(parent)
i1 = Frame(innerframe)
i2 = Frame(innerframe)
vscrollbar = Scrollbar(i1, borderwidth=1)
hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
self.results_box = Text(
i1,
font=Font(family="courier", size="16"),
state="disabled",
borderwidth=1,
yscrollcommand=vscrollbar.set,
xscrollcommand=hscrollbar.set,
wrap="none",
width="40",
height="20",
exportselection=1,
)
self.results_box.pack(side="left", fill="both", expand=True)
self.results_box.tag_config(
self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR
)
self.results_box.tag_config(
self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR
)
vscrollbar.pack(side="left", fill="y", anchor="e")
vscrollbar.config(command=self.results_box.yview)
hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
hscrollbar.config(command=self.results_box.xview)
# There is no other way to avoid the scrollbars overlapping while using the pack layout manager.
Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack(
side="left", anchor="e"
)
i1.pack(side="top", fill="both", expand=True, anchor="n")
i2.pack(side="bottom", fill="x", anchor="s")
innerframe.pack(side="top", fill="both", expand=True)
def _init_paging(self, parent):
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
self.prev = prev = Button(
innerframe,
text="Previous",
command=self.previous,
width="10",
borderwidth=1,
highlightthickness=1,
state="disabled",
)
prev.pack(side="left", anchor="center")
self.next = next = Button(
innerframe,
text="Next",
command=self.__next__,
width="10",
borderwidth=1,
highlightthickness=1,
state="disabled",
)
next.pack(side="right", anchor="center")
innerframe.pack(side="top", fill="y")
self.current_page = 0
def previous(self):
self.clear_results_box()
self.freeze_editable()
self.model.prev(self.current_page - 1)
def __next__(self):
self.clear_results_box()
self.freeze_editable()
self.model.next(self.current_page + 1)
def about(self, *e):
ABOUT = "NLTK Concordance Search Demo\n"
TITLE = "About: NLTK Concordance Search Demo"
try:
from tkinter.messagebox import Message
Message(message=ABOUT, title=TITLE, parent=self.main_frame).show()
except:
ShowText(self.top, TITLE, ABOUT)
def _bind_event_handlers(self):
self.top.bind(CORPUS_LOADED_EVENT, self.handle_corpus_loaded)
self.top.bind(SEARCH_TERMINATED_EVENT, self.handle_search_terminated)
self.top.bind(SEARCH_ERROR_EVENT, self.handle_search_error)
self.top.bind(ERROR_LOADING_CORPUS_EVENT, self.handle_error_loading_corpus)
def _poll(self):
try:
event = self.queue.get(block=False)
except q.Empty:
pass
else:
if event == CORPUS_LOADED_EVENT:
self.handle_corpus_loaded(event)
elif event == SEARCH_TERMINATED_EVENT:
self.handle_search_terminated(event)
elif event == SEARCH_ERROR_EVENT:
self.handle_search_error(event)
elif event == ERROR_LOADING_CORPUS_EVENT:
self.handle_error_loading_corpus(event)
self.after = self.top.after(POLL_INTERVAL, self._poll)
def handle_error_loading_corpus(self, event):
self.status["text"] = "Error in loading " + self.var.get()
self.unfreeze_editable()
self.clear_all()
self.freeze_editable()
def handle_corpus_loaded(self, event):
self.status["text"] = self.var.get() + " is loaded"
self.unfreeze_editable()
self.clear_all()
self.query_box.focus_set()
def handle_search_terminated(self, event):
# todo: refactor the model such that it is less state sensitive
results = self.model.get_results()
self.write_results(results)
self.status["text"] = ""
if len(results) == 0:
self.status["text"] = "No results found for " + self.model.query
else:
self.current_page = self.model.last_requested_page
self.unfreeze_editable()
self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT)
def handle_search_error(self, event):
self.status["text"] = "Error in query " + self.model.query
self.unfreeze_editable()
def corpus_selected(self, *args):
new_selection = self.var.get()
self.load_corpus(new_selection)
def load_corpus(self, selection):
if self.model.selected_corpus != selection:
self.status["text"] = "Loading " + selection + "..."
self.freeze_editable()
self.model.load_corpus(selection)
def search(self):
self.current_page = 0
self.clear_results_box()
self.model.reset_results()
query = self.query_box.get()
if len(query.strip()) == 0:
return
self.status["text"] = "Searching for " + query
self.freeze_editable()
self.model.search(query, self.current_page + 1)
def write_results(self, results):
self.results_box["state"] = "normal"
row = 1
for each in results:
sent, pos1, pos2 = each[0].strip(), each[1], each[2]
if len(sent) != 0:
if pos1 < self._char_before:
sent, pos1, pos2 = self.pad(sent, pos1, pos2)
sentence = sent[pos1 - self._char_before : pos1 + self._char_after]
if not row == len(results):
sentence += "\n"
self.results_box.insert(str(row) + ".0", sentence)
word_markers, label_markers = self.words_and_labels(sent, pos1, pos2)
for marker in word_markers:
self.results_box.tag_add(
self._HIGHLIGHT_WORD_TAG,
str(row) + "." + str(marker[0]),
str(row) + "." + str(marker[1]),
)
for marker in label_markers:
self.results_box.tag_add(
self._HIGHLIGHT_LABEL_TAG,
str(row) + "." + str(marker[0]),
str(row) + "." + str(marker[1]),
)
row += 1
self.results_box["state"] = "disabled"
def words_and_labels(self, sentence, pos1, pos2):
search_exp = sentence[pos1:pos2]
words, labels = [], []
labeled_words = search_exp.split(" ")
index = 0
for each in labeled_words:
if each == "":
index += 1
else:
word, label = each.split("/")
words.append(
(self._char_before + index, self._char_before + index + len(word))
)
index += len(word) + 1
labels.append(
(self._char_before + index, self._char_before + index + len(label))
)
index += len(label)
index += 1
return words, labels
def pad(self, sent, hstart, hend):
if hstart >= self._char_before:
return sent, hstart, hend
d = self._char_before - hstart
sent = "".join([" "] * d) + sent
return sent, hstart + d, hend + d
def destroy(self, *e):
if self.top is None:
return
self.top.after_cancel(self.after)
self.top.destroy()
self.top = None
def clear_all(self):
self.query_box.delete(0, END)
self.model.reset_query()
self.clear_results_box()
def clear_results_box(self):
self.results_box["state"] = "normal"
self.results_box.delete("1.0", END)
self.results_box["state"] = "disabled"
def freeze_editable(self):
self.query_box["state"] = "disabled"
self.search_button["state"] = "disabled"
self.prev["state"] = "disabled"
self.next["state"] = "disabled"
def unfreeze_editable(self):
self.query_box["state"] = "normal"
self.search_button["state"] = "normal"
self.set_paging_button_states()
def set_paging_button_states(self):
if self.current_page == 0 or self.current_page == 1:
self.prev["state"] = "disabled"
else:
self.prev["state"] = "normal"
if self.model.has_more_pages(self.current_page):
self.next["state"] = "normal"
else:
self.next["state"] = "disabled"
def fire_event(self, event):
# Firing an event so that rendering of widgets happens in the mainloop thread
self.top.event_generate(event, when="tail")
def mainloop(self, *args, **kwargs):
if in_idle():
return
self.top.mainloop(*args, **kwargs)
class ConcordanceSearchModel:
def __init__(self, queue):
self.queue = queue
self.CORPORA = _CORPORA
self.DEFAULT_CORPUS = _DEFAULT
self.selected_corpus = None
self.reset_query()
self.reset_results()
self.result_count = None
self.last_sent_searched = 0
def non_default_corpora(self):
copy = []
copy.extend(list(self.CORPORA.keys()))
copy.remove(self.DEFAULT_CORPUS)
copy.sort()
return copy
def load_corpus(self, name):
self.selected_corpus = name
self.tagged_sents = []
runner_thread = self.LoadCorpus(name, self)
runner_thread.start()
def search(self, query, page):
self.query = query
self.last_requested_page = page
self.SearchCorpus(self, page, self.result_count).start()
def next(self, page):
self.last_requested_page = page
if len(self.results) < page:
self.search(self.query, page)
else:
self.queue.put(SEARCH_TERMINATED_EVENT)
def prev(self, page):
self.last_requested_page = page
self.queue.put(SEARCH_TERMINATED_EVENT)
def reset_results(self):
self.last_sent_searched = 0
self.results = []
self.last_page = None
def reset_query(self):
self.query = None
def set_results(self, page, resultset):
self.results.insert(page - 1, resultset)
def get_results(self):
return self.results[self.last_requested_page - 1]
def has_more_pages(self, page):
if self.results == [] or self.results[0] == []:
return False
if self.last_page is None:
return True
return page < self.last_page
class LoadCorpus(threading.Thread):
def __init__(self, name, model):
threading.Thread.__init__(self)
self.model, self.name = model, name
def run(self):
try:
ts = self.model.CORPORA[self.name]()
self.model.tagged_sents = [
" ".join(w + "/" + t for (w, t) in sent) for sent in ts
]
self.model.queue.put(CORPUS_LOADED_EVENT)
except Exception as e:
print(e)
self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)
class SearchCorpus(threading.Thread):
def __init__(self, model, page, count):
self.model, self.count, self.page = model, count, page
threading.Thread.__init__(self)
def run(self):
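# Searching is incremental: matching starts at self.model.last_sent_searched
# and stops once roughly one page of hits (self.count) has been collected, so
# a request for the next page resumes where the previous search left off
# instead of re-scanning the whole corpus.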
q = self.processed_query()
sent_pos, i, sent_count = [], 0, 0
for sent in self.model.tagged_sents[self.model.last_sent_searched :]:
try:
m = re.search(q, sent)
except re.error:
self.model.reset_results()
self.model.queue.put(SEARCH_ERROR_EVENT)
return
if m:
sent_pos.append((sent, m.start(), m.end()))
i += 1
if i > self.count:
self.model.last_sent_searched += sent_count - 1
break
sent_count += 1
if self.count >= len(sent_pos):
self.model.last_sent_searched += sent_count - 1
self.model.last_page = self.page
self.model.set_results(self.page, sent_pos)
else:
self.model.set_results(self.page, sent_pos[:-1])
self.model.queue.put(SEARCH_TERMINATED_EVENT)
def processed_query(self):
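# Build a regular expression over "word/TAG" tokens: an all-caps query term
# is treated as a bare POS tag, a term that already contains "/" as an
# explicit word/tag pair, and anything else as a word with any tag.  A "." in
# a term is widened to match any character other than "/" or space.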
new = []
for term in self.model.query.split():
term = re.sub(r"\.", r"[^/ ]", term)
if re.match("[A-Z]+$", term):
new.append(BOUNDARY + WORD_OR_TAG + "/" + term + BOUNDARY)
elif "/" in term:
new.append(BOUNDARY + term + BOUNDARY)
else:
new.append(BOUNDARY + term + "/" + WORD_OR_TAG + BOUNDARY)
return " ".join(new)
def app():
d = ConcordanceSearchView()
d.mainloop()
if __name__ == "__main__":
app()
__all__ = ["app"]


@@ -0,0 +1,163 @@
# Finding (and Replacing) Nemo, Version 1.1, Aristide Grange 2006/06/06
# https://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496783
"""
Finding (and Replacing) Nemo
Instant Regular Expressions
Created by Aristide Grange
"""
import itertools
import re
from tkinter import SEL_FIRST, SEL_LAST, Frame, Label, PhotoImage, Scrollbar, Text, Tk
windowTitle = "Finding (and Replacing) Nemo"
initialFind = r"n(.*?)e(.*?)m(.*?)o"
initialRepl = r"M\1A\2K\3I"
initialText = """\
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
"""
images = {
"FIND": "R0lGODlhMAAiAPcAMf/////37//35//n1v97Off///f/9/f37/fexvfOvfeEQvd7QvdrQvdrKfdaKfdSMfdSIe/v9+/v7+/v5+/n3u/e1u/Wxu/Gre+1lO+tnO+thO+Ua+97Y+97Oe97Me9rOe9rMe9jOe9jMe9jIe9aMefe5+fe3ufezuece+eEWudzQudaIedSIedKMedKIedCKedCId7e1t7Wzt7Oxt7Gvd69vd69rd61pd6ljN6UjN6Ue96EY95zY95rUt5rQt5jMd5SId5KIdbn59be3tbGztbGvda1rdaEa9Z7a9Z7WtZzQtZzOdZzMdZjMdZaQtZSOdZSMdZKMdZCKdZCGNY5Ic7W1s7Oxs7Gtc69xs69tc69rc6tpc6llM6clM6cjM6Ue86EY85zWs5rSs5SKc5KKc5KGMa1tcatrcalvcalnMaUpcZ7c8ZzMcZrUsZrOcZrMcZaQsZSOcZSMcZKMcZCKcZCGMYxIcYxGL3Gxr21tb21rb2lpb2crb2cjL2UnL2UlL2UhL2Ec717Wr17Ur1zWr1rMb1jUr1KMb1KIb1CIb0xGLWlrbWlpbWcnLWEe7V7c7VzY7VzUrVSKbVKMbVCMbVCIbU5KbUxIbUxEK2lta2lpa2clK2UjK2MnK2MlK2Ea617e61za61rY61rMa1jSq1aUq1aSq1SQq1KKa0xEKWlnKWcnKWUnKWUhKWMjKWEa6Vza6VrWqVjMaVaUqVaKaVSMaVCMaU5KaUxIaUxGJyclJyMe5yElJyEhJx7e5x7c5xrOZxaQpxSOZxKQpw5IZSMhJSEjJR7c5Rre5RrY5RrUpRSQpRSKZRCOZRCKZQxKZQxIYyEhIx7hIxza4xzY4xrc4xjUoxaa4xaUoxSSoxKQoxCMYw5GIR7c4Rzc4Rre4RjY4RjWoRaa4RSWoRSUoRSMYRKQoRCOYQ5KYQxIXtra3taY3taSntKOXtCMXtCKXNCMXM5MXMxIWtSUmtKSmtKQmtCOWs5MWs5KWs5IWNCKWMxIVIxKUIQCDkhGAAAACH+AS4ALAAAAAAwACIAAAj/AAEIHEiwoMGDCBMqXMiwoUOHMqxIeEiRoZVp7cpZ29WrF4WKIAd208dGAQEVbiTVChUjZMU9+pYQmPmBZpxgvVw+nDdKwQICNVcIXQEkTgKdDdUJ+/nggVAXK1xI3TEA6UIr2uJ8iBqka1cXXTlkqGoVYRZ7iLyqBSs0iiEtZQVKiDGxBI1u3NR6lUpGDKg8MSgEQCphU7Z22vhg0dILXRCpYLuSCcYJT4wqXASBQaBzU7klHxC127OHD7ZDJFpERqRt0x5OnwQpmZmCLEhrbgg4WIHO1RY+nbQ9WRGEDJlmnXwJ+9FBgXMCIzYMVijBBgYMFxIMqJBMSc0Ht7qh/+Gjpte2rnYsYeNlasWIBgQ6yCewIoPCCp/cyP/wgUGbXVu0QcADZNBDnh98gHMLGXYQUw02w61QU3wdbNWDbQVVIIhMMwFF1DaZiPLBAy7E04kafrjSizaK3LFNNc0AAYRQDsAHHQlJ2IDQJ2zE1+EKDjiAijShkECCC8Qgw4cr7ZgyzC2WaHPNLWWoNeNWPiRAw0QFWQFMhz8C+QQ20yAiVSrY+MGOJCsccsst2GCzoHFxxEGGC+8hgs0MB2kyCpgzrUDCbs1Es41UdtATHFFkWELMOtsoQsYcgvRRQw5RSDgGOjZMR1AvPQIq6KCo9AKOJWDd48owQlHR4DXEKP9iyRrK+DNNBTu4RwIPFeTAGUG7hAomkA84gEg1m6ADljy9PBKGGJY4ig0xlsTBRSn98FOFDUC8pwQOPkgHbCGAzhTkA850s0c7j6Hjix9+gBIrMXLeAccWXUCyiRBcBEECdEJ98KtAqtBCYQc/OvDENnl4gYpUxISCIjjzylkGGV9okYUVNogRhAOBuuAEhjG08wOgDYzAgA5bCjIoCe5uwUk80RKTTSppPREGGGCIISOQ9AXBg6cC6WIywvCpoMHAocRBwhP4bHLFLujYkV42xNxBRhAyGrc113EgYtRBerDDDHMoDCyQEL5sE083EkgwQyBhxGFHMM206DUixGxmE0wssbQjCQ4JCaFKFwgQTVAVVhQUwAVPIFJKrHfYYRwi6OCDzzuIJIFhXAD0EccPsYRiSyqKSDpFcWSMIcZRoBMkQyA2BGZDIKSYcggih8TRRg4VxM5QABVYYLxgwiev/PLMCxQQADs=",
"find": "R0lGODlhMAAiAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OSkpKRgYGAAAAAAAAAAAAAAAAAAAACH+AS4ALAAAAAAwACIAAAX/ICCOZGmeaKquY2AGLiuvMCAUBuHWc48Kh0iFInEYCb4kSQCxPBiMxkMigRQEgJiSFVBYHNGG0RiZOHjblWAiiY4fkDhEYoBp06dAWfyAQyKAgAwDaHgnB0RwgYASgQ0IhDuGJDAIFhMRVFSLEX8QCJJ4AQM5AgQHTZqqjBAOCQQEkWkCDRMUFQsICQ4Vm5maEwwHOAsPDTpKMAsUDlO4CssTcb+2DAp8YGCyNFoCEsZwFQ3QDRTTVBRS0g1QbgsCd5QAAwgIBwYFAwStzQ8UEdCKVchky0yVBw7YuXkAKt4IAg74vXHVagqFBRgXSCAyYWAVCH0SNhDTitCJfSL5/4RbAPKPhQYYjVCYYAvCP0BxEDaD8CheAAHNwqh8MMGPSwgLeJWhwHSjqkYI+xg4MMCEgQjtRvZ7UAYCpghMF7CxONOWJkYR+rCpY4JlVpVxKDwYWEactKW9mhYRtqCTgwgWEMArERSK1j5q//6T8KXonFsShpiJkAECgQYVjykooCVA0JGHEWNiYCHThTFeb3UkoiCCBgwGEKQ1kuAJlhFwhA71h5SukwUM5qqeCSGBgicEWkfNiWSERtBad4JNIBaQBaQah1ToyGZBAnsIuIJs1qnqiAIVjIE2gnAB1T5x0icgzXT79ipgMOOEH6HBbREBMJCeGEY08IoLAkzB1YYFwjxwSUGSNULQJnNUwRYlCcyEkALIxECAP9cNMMABYpRhy3ZsSLDaR70oUAiABGCkAxowCGCAAfDYIQACXoElGRsdXWDBdg2Y90IWktDYGYAB9PWHP0PMdFZaF07SQgAFNDAMAQg0QA1UC8xoZQl22JGFPgWkOUCOL1pZQyhjxinnnCWEAAA7",
"REPL": "R0lGODlhMAAjAPcAMf/////3//+lOf+UKf+MEPf///f39/f35/fv7/ecQvecOfecKfeUIfeUGPeUEPeUCPeMAO/37+/v9+/v3u/n3u/n1u+9jO+9c++1hO+ta++tY++tWu+tUu+tSu+lUu+lQu+lMe+UMe+UKe+UGO+UEO+UAO+MCOfv5+fvxufn7+fn5+fnzue9lOe9c+e1jOe1e+e1c+e1a+etWuetUuelQuecOeeUUueUCN7e597e3t7e1t7ezt7evd7Wzt7Oxt7Ovd7Otd7Opd7OnN7Gtd7Gpd69lN61hN6ta96lStbextberdbW3tbWztbWxtbOvdbOrda1hNalUtaECM7W1s7Ozs7Oxs7Otc7Gxs7Gvc69tc69rc69pc61jM6lc8bWlMbOvcbGxsbGpca9tca9pca1nMaMAL3OhL3Gtb21vb21tb2tpb2tnL2tlLW9tbW9pbW9e7W1pbWtjLWcKa21nK2tra2tnK2tlK2lpa2llK2ljK2le6WlnKWljKWUe6WUc6WUY5y1QpyclJycjJychJyUc5yMY5StY5SUe5SMhJSMe5SMc5SMWpSEa5SESoyUe4yMhIyEY4SlKYScWoSMe4SEe4SEa4R7c4R7Y3uMY3uEe3t7e3t7c3tza3tzY3trKXtjIXOcAHOUMXOEY3Nzc3NzWnNrSmulCGuUMWuMGGtzWmtrY2taMWtaGGOUOWOMAGNzUmNjWmNjSmNaUmNaQmNaOWNaIWNSCFqcAFpjUlpSMVpSIVpSEFpKKVKMAFJSUlJSSlJSMVJKMVJKGFJKAFI5CEqUAEqEAEpzQkpKIUpCQkpCGEpCAEo5EEoxAEJjOUJCOUJCAEI5IUIxADl7ADlaITlCOTkxMTkxKTkxEDkhADFzADFrGDE5OTExADEpEClrCCkxKSkpKSkpISkpACkhCCkhACkYACFzACFrACEhCCEYGBhjEBhjABghABgYCBgYABgQEBgQABAQABAIAAhjAAhSAAhKAAgIEAgICABaAABCAAAhAAAQAAAIAAAAAAAAACH+AS4ALAAAAAAwACMAAAj/AAEIHEiwoMGDCBMqXMiwocOHAA4cgEixIIIJO3JMmAjADIqKFU/8MHIkg5EgYXx4iaTkI0iHE6wE2TCggYILQayEAgXIy8uGCKz8sDCAQAMRG3iEcXULlJkJPwli3OFjh9UdYYLE6NBhA04UXHoVA2XoTZgfPKBWlOBDphAWOdfMcfMDLloeO3hIMjbWVCQ5Fn6E2UFxgpsgFjYIEBADrZU6luqEEfqjTqpt54z1uuWqTIcgWAk7PECGzIUQDRosDmxlUrVJkwQJkqVuX71v06YZcyUlROAdbnLAJKPFyAYFAhoMwFlnEh0rWkpz8raPHm7dqKKc/KFFkBUrVn1M/ziBcEIeLUEQI8/AYk0i9Be4sqjsrN66c9/OnbobhpR3HkIUoZ0WVnBE0AGLFKKFD0HAFUQe77HQgQI1hRBDEHMcY0899bBzihZuCPILJD8EccEGGzwAQhFaUHHQH82sUkgeNHISDBk8WCCCcsqFUEQWmOyzjz3sUGNNOO5Y48YOEgowAAQhnBScQV00k82V47jzjy9CXZBcjziFoco//4CDiSOyhPMPLkJZkEBqJmRQxA9uZGEQD8Ncmc044/zzDF2IZQBCCDYE8QMZz/iiCSx0neHGI7BIhhhNn+1gxRpokEcQAp7seWU7/PwTyxqG/iCEEVzQmUombnDRxRExzP9nBR2PCKLFD3UJwcMPa/SRqUGNWJmNOVn+M44ukMRB4KGcWDNLVhuUMEIJAlzwA3DJBHMJIXm4sQYhqyxCRQQGLSIsn1qac2UzysQSyzX/hLMGD0F0IMCODYAQBA9W/PKPOcRiw0wzwxTiokF9dLMnuv/Mo+fCZF7jBr0xbDDCACWEYKgb1vzjDp/jZNOMLX0IZxAKq2TZTjtaOjwOsXyG+s8sZJTIQsUdIGHoJPf8w487QI/TDSt5mGwQFZxc406o8HiDJchk/ltLHpSlJwSvz5DpTjvmuGNOM57koelBOaAhiCaaPBLL0wwbm003peRBnBZqJMJL1ECz/HXYYx/NdAIOOVCxQyLorswymU93o0wuwfAiTDNR/xz0MLXU0XdCE+UwSTRZAq2lsSATu+4wkGvt+TjNzPLrQyegAUku2Hij5cd8LhxyM8QIg4w18HgcdC6BTBFSDmfQqsovttveDcG7lFLHI75cE841sARCxeWsnxC4G9HADPK6ywzDCRqBo0EHHWhMgT1IJzziNci1N7PMKnSYfML96/90AiJKey/0KtbLX1QK0rrNnQ541xugQ7SHhkXBghN0SKACWRc4KlAhBwKcIOYymJCAAAA7",
"repl": "R0lGODlhMAAjAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OTExMSkpKSEhIRgYGBAQEAgICAAAACH+AS4ALAAAAAAwACMAAAX/ICCOZGmeaKqubOu+gCDANBkIQ1EMQhAghFptYEAkEgjEwXBo7ISvweGgWCwUysPjwTgEoCafTySYIhYMxgLBjEQgCULvCw0QdAZdoVhUIJUFChISEAxYeQM1N1OMTAp+UwZ5eA4TEhFbDWYFdC4ECVMJjwl5BwsQa0umEhUVlhESDgqlBp0rAn5nVpBMDxeZDRQbHBgWFBSWDgtLBnFjKwRYCI9VqQsPs0YKEcMXFq0UEalFDWx4BAO2IwPjppAKDkrTWKYUGd7fEJJFEZpM00cOzCgh4EE8SaoWxKNixQooBRMyZMBwAYIRBhUgLDGS4MoBJeoANMhAgQsaCRZm/5lqaCUJhA4cNHjDoKEDBlJUHqkBlYBTiQUZNGjYMMxDhY3VWk6R4MEDBoMUak5AqoYBqANIBo4wcGGDUKIeLlzVZmWJggsVIkwAZaQSA3kdZzlKkIiEAAlDvW5oOkEBs488JTw44oeUIwdvVTFTUK7uiAAPgubt8GFDhQepqETAQCFU1UMGzlqAgFhUsAcCS0AO6lUDhw8xNRSbENGDhgWSHjWUe6ACbKITizmopZoBa6KvOwj9uuHDhwxyj3xekgDDhw5EvWKo0IB4iQLCOCC/njc7ZQ8UeGvza+ABZZgcxJNc4FO1gc0cOsCUrHevc8tdIMTIAhc4F198G2Qwwd8CBIQUAwEINABBBJUwR9R5wElgVRLwWODBBx4cGB8GEzDQIAo33CGJA8gh+JoH/clUgQU0YvDhdfmJdwEFC6Sjgg8yEPAABsPkh2F22cl2AQbn6QdTghTQ5eAJAQyQAAQV0MSBB9gRVZ4GE1mw5JZOAmiAVi1UWcAZDrDyZXYTeaOhA/bIVuIBPtKQ4h7ViYekUPdcEAEbzTzCRp5CADmAAwj+ORGPBcgwAAHo9ABGCYtm0ChwFHShlRiXhmHlkAcCiOeUodqQw5W0oXLAiamy4MOkjOyAaqxUymApDCEAADs=",
}
colors = ["#FF7B39", "#80F121"]
emphColors = ["#DAFC33", "#F42548"]
fieldParams = {
"height": 3,
"width": 70,
"font": ("monaco", 14),
"highlightthickness": 0,
"borderwidth": 0,
"background": "white",
}
textParams = {
"bg": "#F7E0D4",
"fg": "#2321F1",
"highlightthickness": 0,
"width": 1,
"height": 10,
"font": ("verdana", 16),
"wrap": "word",
}
class Zone:
def __init__(self, image, initialField, initialText):
frm = Frame(root)
frm.config(background="white")
self.image = PhotoImage(format="gif", data=images[image.upper()])
self.imageDimmed = PhotoImage(format="gif", data=images[image])
self.img = Label(frm)
self.img.config(borderwidth=0)
self.img.pack(side="left")
self.fld = Text(frm, **fieldParams)
self.initScrollText(frm, self.fld, initialField)
frm = Frame(root)
self.txt = Text(frm, **textParams)
self.initScrollText(frm, self.txt, initialText)
for i in range(2):
self.txt.tag_config(colors[i], background=colors[i])
self.txt.tag_config("emph" + colors[i], foreground=emphColors[i])
def initScrollText(self, frm, txt, contents):
scl = Scrollbar(frm)
scl.config(command=txt.yview)
scl.pack(side="right", fill="y")
txt.pack(side="left", expand=True, fill="x")
txt.config(yscrollcommand=scl.set)
txt.insert("1.0", contents)
frm.pack(fill="x")
Frame(height=2, bd=1, relief="ridge").pack(fill="x")
def refresh(self):
self.colorCycle = itertools.cycle(colors)
try:
self.substitute()
self.img.config(image=self.image)
except re.error:
self.img.config(image=self.imageDimmed)
class FindZone(Zone):
def addTags(self, m):
color = next(self.colorCycle)
self.txt.tag_add(color, "1.0+%sc" % m.start(), "1.0+%sc" % m.end())
try:
self.txt.tag_add(
"emph" + color, "1.0+%sc" % m.start("emph"), "1.0+%sc" % m.end("emph")
)
except:
pass
def substitute(self, *args):
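# Recompile the pattern from the find field; if part of the pattern is
# selected, wrap the selected subexpression in a named group "emph" so that
# addTags can colour that sub-span differently from the rest of the match.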
for color in colors:
self.txt.tag_remove(color, "1.0", "end")
self.txt.tag_remove("emph" + color, "1.0", "end")
self.rex = re.compile("") # default value in case of malformed regexp
self.rex = re.compile(self.fld.get("1.0", "end")[:-1], re.MULTILINE)
try:
re.compile("(?P<emph>%s)" % self.fld.get(SEL_FIRST, SEL_LAST))
self.rexSel = re.compile(
"%s(?P<emph>%s)%s"
% (
self.fld.get("1.0", SEL_FIRST),
self.fld.get(SEL_FIRST, SEL_LAST),
self.fld.get(SEL_LAST, "end")[:-1],
),
re.MULTILINE,
)
except:
self.rexSel = self.rex
self.rexSel.sub(self.addTags, self.txt.get("1.0", "end"))
class ReplaceZone(Zone):
def addTags(self, m):
s = sz.rex.sub(self.repl, m.group())
self.txt.delete(
"1.0+%sc" % (m.start() + self.diff), "1.0+%sc" % (m.end() + self.diff)
)
self.txt.insert("1.0+%sc" % (m.start() + self.diff), s, next(self.colorCycle))
self.diff += len(s) - (m.end() - m.start())
def substitute(self):
self.txt.delete("1.0", "end")
self.txt.insert("1.0", sz.txt.get("1.0", "end")[:-1])
self.diff = 0
self.repl = rex0.sub(r"\\g<\1>", self.fld.get("1.0", "end")[:-1])
sz.rex.sub(self.addTags, sz.txt.get("1.0", "end")[:-1])
def launchRefresh(_):
sz.fld.after_idle(sz.refresh)
rz.fld.after_idle(rz.refresh)
def app():
global root, sz, rz, rex0
root = Tk()
root.resizable(height=False, width=True)
root.title(windowTitle)
root.minsize(width=250, height=0)
sz = FindZone("find", initialFind, initialText)
sz.fld.bind("<Button-1>", launchRefresh)
sz.fld.bind("<ButtonRelease-1>", launchRefresh)
sz.fld.bind("<B1-Motion>", launchRefresh)
sz.rexSel = re.compile("")
rz = ReplaceZone("repl", initialRepl, "")
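# rex0 matches unescaped numeric backreferences such as \1 in the replacement
# field so that substitute() can rewrite them to the \g<1> form, which re.sub
# accepts even when a digit immediately follows the reference.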
rex0 = re.compile(r"(?<!\\)\\([0-9]+)")
root.bind_all("<Key>", launchRefresh)
launchRefresh(None)
root.mainloop()
if __name__ == "__main__":
app()
__all__ = ["app"]

File diff suppressed because it is too large


@@ -0,0 +1,937 @@
# Natural Language Toolkit: Shift-Reduce Parser Application
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A graphical tool for exploring the shift-reduce parser.
The shift-reduce parser maintains a stack, which records the structure
of the portion of the text that has been parsed. The stack is
initially empty. Its contents are shown on the left side of the main
canvas.
On the right side of the main canvas is the remaining text. This is
the portion of the text which has not yet been considered by the
parser.
The parser builds up a tree structure for the text using two
operations:
- "shift" moves the first token from the remaining text to the top
of the stack. In the demo, the top of the stack is its right-hand
side.
- "reduce" uses a grammar production to combine the rightmost stack
elements into a single tree token.
You can control the parser's operation by using the "shift" and
"reduce" buttons; or you can use the "step" button to let the parser
automatically decide which operation to apply. The parser uses the
following rules to decide which operation to apply:
- Only shift if no reductions are available.
- If multiple reductions are available, then apply the reduction
whose CFG production is listed earliest in the grammar.
The "reduce" button applies the reduction whose CFG production is
listed earliest in the grammar. There are two ways to manually choose
which reduction to apply:
- Click on a CFG production from the list of available reductions,
on the left side of the main window. The reduction based on that
production will be applied to the top of the stack.
- Click on one of the stack elements. A popup window will appear,
containing all available reductions. Select one, and it will be
applied to the top of the stack.
Note that reductions can only be applied to the top of the stack.
Keyboard Shortcuts::
[Space]\t Perform the next shift or reduce operation
[s]\t Perform a shift operation
[r]\t Perform a reduction operation
[Ctrl-z]\t Undo most recent operation
[Delete]\t Reset the parser
[g]\t Show/hide available production list
[Ctrl-a]\t Toggle animations
[h]\t Help
[Ctrl-p]\t Print
[q]\t Quit
"""
from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk
from tkinter.font import Font
from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment
from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget
from nltk.parse import SteppingShiftReduceParser
from nltk.tree import Tree
from nltk.util import in_idle
"""
Possible future improvements:
- button/window to change and/or select text. Just pop up a window
with an entry, and let them modify the text; and then retokenize
it? Maybe give a warning if it contains tokens whose types are
not in the grammar.
- button/window to change and/or select grammar. Select from
several alternative grammars? Or actually change the grammar? If
the later, then I'd want to define nltk.draw.cfg, which would be
responsible for that.
"""
class ShiftReduceApp:
"""
A graphical tool for exploring the shift-reduce parser. The tool
displays the parser's stack and the remaining text, and allows the
user to control the parser's operation. In particular, the user
can shift tokens onto the stack, and can perform reductions on the
top elements of the stack. A "step" button simply steps through
the parsing process, performing the operations that
``nltk.parse.ShiftReduceParser`` would use.
"""
def __init__(self, grammar, sent, trace=0):
self._sent = sent
self._parser = SteppingShiftReduceParser(grammar, trace)
# Set up the main window.
self._top = Tk()
self._top.title("Shift Reduce Parser Application")
# Animations. animating_lock is a lock to prevent the demo
# from performing new operations while it's animating.
self._animating_lock = 0
self._animate = IntVar(self._top)
self._animate.set(10) # = medium
# The user can hide the grammar.
self._show_grammar = IntVar(self._top)
self._show_grammar.set(1)
# Initialize fonts.
self._init_fonts(self._top)
# Set up key bindings.
self._init_bindings()
# Create the basic frames.
self._init_menubar(self._top)
self._init_buttons(self._top)
self._init_feedback(self._top)
self._init_grammar(self._top)
self._init_canvas(self._top)
# A popup menu for reducing.
self._reduce_menu = Menu(self._canvas, tearoff=0)
# Reset the demo, and set the feedback frame to empty.
self.reset()
self._lastoper1["text"] = ""
#########################################
## Initialization Helpers
#########################################
def _init_fonts(self, root):
# See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
self._sysfont = Font(font=Button()["font"])
root.option_add("*Font", self._sysfont)
# What's our font size? (default = same as sysfont)
self._size = IntVar(root)
self._size.set(self._sysfont.cget("size"))
self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
self._font = Font(family="helvetica", size=self._size.get())
def _init_grammar(self, parent):
# Grammar view.
self._prodframe = listframe = Frame(parent)
self._prodframe.pack(fill="both", side="left", padx=2)
self._prodlist_label = Label(
self._prodframe, font=self._boldfont, text="Available Reductions"
)
self._prodlist_label.pack()
self._prodlist = Listbox(
self._prodframe,
selectmode="single",
relief="groove",
background="white",
foreground="#909090",
font=self._font,
selectforeground="#004040",
selectbackground="#c0f0c0",
)
self._prodlist.pack(side="right", fill="both", expand=1)
self._productions = list(self._parser.grammar().productions())
for production in self._productions:
self._prodlist.insert("end", (" %s" % production))
self._prodlist.config(height=min(len(self._productions), 25))
# Add a scrollbar if there are more than 25 productions.
if 1: # len(self._productions) > 25:
listscroll = Scrollbar(self._prodframe, orient="vertical")
self._prodlist.config(yscrollcommand=listscroll.set)
listscroll.config(command=self._prodlist.yview)
listscroll.pack(side="left", fill="y")
# If they select a production, apply it.
self._prodlist.bind("<<ListboxSelect>>", self._prodlist_select)
# When they hover over a production, highlight it.
self._hover = -1
self._prodlist.bind("<Motion>", self._highlight_hover)
self._prodlist.bind("<Leave>", self._clear_hover)
def _init_bindings(self):
# Quit
self._top.bind("<Control-q>", self.destroy)
self._top.bind("<Control-x>", self.destroy)
self._top.bind("<Alt-q>", self.destroy)
self._top.bind("<Alt-x>", self.destroy)
# Ops (step, shift, reduce, undo)
self._top.bind("<space>", self.step)
self._top.bind("<s>", self.shift)
self._top.bind("<Alt-s>", self.shift)
self._top.bind("<Control-s>", self.shift)
self._top.bind("<r>", self.reduce)
self._top.bind("<Alt-r>", self.reduce)
self._top.bind("<Control-r>", self.reduce)
self._top.bind("<Delete>", self.reset)
self._top.bind("<u>", self.undo)
self._top.bind("<Alt-u>", self.undo)
self._top.bind("<Control-u>", self.undo)
self._top.bind("<Control-z>", self.undo)
self._top.bind("<BackSpace>", self.undo)
# Misc
self._top.bind("<Control-p>", self.postscript)
self._top.bind("<Control-h>", self.help)
self._top.bind("<F1>", self.help)
self._top.bind("<Control-g>", self.edit_grammar)
self._top.bind("<Control-t>", self.edit_sentence)
# Animation speed control
self._top.bind("-", lambda e, a=self._animate: a.set(20))
self._top.bind("=", lambda e, a=self._animate: a.set(10))
self._top.bind("+", lambda e, a=self._animate: a.set(4))
def _init_buttons(self, parent):
# Set up the frames.
self._buttonframe = buttonframe = Frame(parent)
buttonframe.pack(fill="none", side="bottom")
Button(
buttonframe,
text="Step",
background="#90c0d0",
foreground="black",
command=self.step,
).pack(side="left")
Button(
buttonframe,
text="Shift",
underline=0,
background="#90f090",
foreground="black",
command=self.shift,
).pack(side="left")
Button(
buttonframe,
text="Reduce",
underline=0,
background="#90f090",
foreground="black",
command=self.reduce,
).pack(side="left")
Button(
buttonframe,
text="Undo",
underline=0,
background="#f0a0a0",
foreground="black",
command=self.undo,
).pack(side="left")
def _init_menubar(self, parent):
menubar = Menu(parent)
filemenu = Menu(menubar, tearoff=0)
filemenu.add_command(
label="Reset Parser", underline=0, command=self.reset, accelerator="Del"
)
filemenu.add_command(
label="Print to Postscript",
underline=0,
command=self.postscript,
accelerator="Ctrl-p",
)
filemenu.add_command(
label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
)
menubar.add_cascade(label="File", underline=0, menu=filemenu)
editmenu = Menu(menubar, tearoff=0)
editmenu.add_command(
label="Edit Grammar",
underline=5,
command=self.edit_grammar,
accelerator="Ctrl-g",
)
editmenu.add_command(
label="Edit Text",
underline=5,
command=self.edit_sentence,
accelerator="Ctrl-t",
)
menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
rulemenu = Menu(menubar, tearoff=0)
rulemenu.add_command(
label="Step", underline=1, command=self.step, accelerator="Space"
)
rulemenu.add_separator()
rulemenu.add_command(
label="Shift", underline=0, command=self.shift, accelerator="Ctrl-s"
)
rulemenu.add_command(
label="Reduce", underline=0, command=self.reduce, accelerator="Ctrl-r"
)
rulemenu.add_separator()
rulemenu.add_command(
label="Undo", underline=0, command=self.undo, accelerator="Ctrl-u"
)
menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)
viewmenu = Menu(menubar, tearoff=0)
viewmenu.add_checkbutton(
label="Show Grammar",
underline=0,
variable=self._show_grammar,
command=self._toggle_grammar,
)
viewmenu.add_separator()
viewmenu.add_radiobutton(
label="Tiny",
variable=self._size,
underline=0,
value=10,
command=self.resize,
)
viewmenu.add_radiobutton(
label="Small",
variable=self._size,
underline=0,
value=12,
command=self.resize,
)
viewmenu.add_radiobutton(
label="Medium",
variable=self._size,
underline=0,
value=14,
command=self.resize,
)
viewmenu.add_radiobutton(
label="Large",
variable=self._size,
underline=0,
value=18,
command=self.resize,
)
viewmenu.add_radiobutton(
label="Huge",
variable=self._size,
underline=0,
value=24,
command=self.resize,
)
menubar.add_cascade(label="View", underline=0, menu=viewmenu)
animatemenu = Menu(menubar, tearoff=0)
animatemenu.add_radiobutton(
label="No Animation", underline=0, variable=self._animate, value=0
)
animatemenu.add_radiobutton(
label="Slow Animation",
underline=0,
variable=self._animate,
value=20,
accelerator="-",
)
animatemenu.add_radiobutton(
label="Normal Animation",
underline=0,
variable=self._animate,
value=10,
accelerator="=",
)
animatemenu.add_radiobutton(
label="Fast Animation",
underline=0,
variable=self._animate,
value=4,
accelerator="+",
)
menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
helpmenu = Menu(menubar, tearoff=0)
helpmenu.add_command(label="About", underline=0, command=self.about)
helpmenu.add_command(
label="Instructions", underline=0, command=self.help, accelerator="F1"
)
menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
parent.config(menu=menubar)
def _init_feedback(self, parent):
self._feedbackframe = feedbackframe = Frame(parent)
feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3)
self._lastoper_label = Label(
feedbackframe, text="Last Operation:", font=self._font
)
self._lastoper_label.pack(side="left")
lastoperframe = Frame(feedbackframe, relief="sunken", border=1)
lastoperframe.pack(fill="x", side="right", expand=1, padx=5)
self._lastoper1 = Label(
lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font
)
self._lastoper2 = Label(
lastoperframe,
anchor="w",
width=30,
foreground="#004040",
background="#f0f0f0",
font=self._font,
)
self._lastoper1.pack(side="left")
self._lastoper2.pack(side="left", fill="x", expand=1)
def _init_canvas(self, parent):
self._cframe = CanvasFrame(
parent,
background="white",
width=525,
closeenough=10,
border=2,
relief="sunken",
)
self._cframe.pack(expand=1, fill="both", side="top", pady=2)
canvas = self._canvas = self._cframe.canvas()
self._stackwidgets = []
self._rtextwidgets = []
self._titlebar = canvas.create_rectangle(
0, 0, 0, 0, fill="#c0f0f0", outline="black"
)
self._exprline = canvas.create_line(0, 0, 0, 0, dash=".")
self._stacktop = canvas.create_line(0, 0, 0, 0, fill="#408080")
size = self._size.get() + 4
self._stacklabel = TextWidget(
canvas, "Stack", color="#004040", font=self._boldfont
)
self._rtextlabel = TextWidget(
canvas, "Remaining Text", color="#004040", font=self._boldfont
)
self._cframe.add_widget(self._stacklabel)
self._cframe.add_widget(self._rtextlabel)
#########################################
## Main draw procedure
#########################################
def _redraw(self):
scrollregion = self._canvas["scrollregion"].split()
(cx1, cy1, cx2, cy2) = (int(c) for c in scrollregion)
# Delete the old stack & rtext widgets.
for stackwidget in self._stackwidgets:
self._cframe.destroy_widget(stackwidget)
self._stackwidgets = []
for rtextwidget in self._rtextwidgets:
self._cframe.destroy_widget(rtextwidget)
self._rtextwidgets = []
# Position the titlebar & exprline
(x1, y1, x2, y2) = self._stacklabel.bbox()
y = y2 - y1 + 10
self._canvas.coords(self._titlebar, -5000, 0, 5000, y - 4)
self._canvas.coords(self._exprline, 0, y * 2 - 10, 5000, y * 2 - 10)
# Position the titlebar labels..
(x1, y1, x2, y2) = self._stacklabel.bbox()
self._stacklabel.move(5 - x1, 3 - y1)
(x1, y1, x2, y2) = self._rtextlabel.bbox()
self._rtextlabel.move(cx2 - x2 - 5, 3 - y1)
# Draw the stack.
stackx = 5
for tok in self._parser.stack():
if isinstance(tok, Tree):
attribs = {
"tree_color": "#4080a0",
"tree_width": 2,
"node_font": self._boldfont,
"node_color": "#006060",
"leaf_color": "#006060",
"leaf_font": self._font,
}
widget = tree_to_treesegment(self._canvas, tok, **attribs)
widget.label()["color"] = "#000000"
else:
widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
widget.bind_click(self._popup_reduce)
self._stackwidgets.append(widget)
self._cframe.add_widget(widget, stackx, y)
stackx = widget.bbox()[2] + 10
# Draw the remaining text.
rtextwidth = 0
for tok in self._parser.remaining_text():
widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
self._rtextwidgets.append(widget)
self._cframe.add_widget(widget, rtextwidth, y)
rtextwidth = widget.bbox()[2] + 4
# Allow enough room to shift the next token (for animations)
if len(self._rtextwidgets) > 0:
stackx += self._rtextwidgets[0].width()
# Move the remaining text to the correct location (keep it
# right-justified, when possible); and move the remaining text
# label, if necessary.
stackx = max(stackx, self._stacklabel.width() + 25)
rlabelwidth = self._rtextlabel.width() + 10
if stackx >= cx2 - max(rtextwidth, rlabelwidth):
cx2 = stackx + max(rtextwidth, rlabelwidth)
for rtextwidget in self._rtextwidgets:
rtextwidget.move(4 + cx2 - rtextwidth, 0)
self._rtextlabel.move(cx2 - self._rtextlabel.bbox()[2] - 5, 0)
midx = (stackx + cx2 - max(rtextwidth, rlabelwidth)) / 2
self._canvas.coords(self._stacktop, midx, 0, midx, 5000)
(x1, y1, x2, y2) = self._stacklabel.bbox()
# Set up binding to allow them to shift a token by dragging it.
if len(self._rtextwidgets) > 0:
def drag_shift(widget, midx=midx, self=self):
if widget.bbox()[0] < midx:
self.shift()
else:
self._redraw()
self._rtextwidgets[0].bind_drag(drag_shift)
self._rtextwidgets[0].bind_click(self.shift)
# Draw the stack top.
self._highlight_productions()
def _draw_stack_top(self, widget):
# hack..
midx = widget.bbox()[2] + 50
self._canvas.coords(self._stacktop, midx, 0, midx, 5000)
def _highlight_productions(self):
# Highlight the productions that can be reduced.
self._prodlist.selection_clear(0, "end")
for prod in self._parser.reducible_productions():
index = self._productions.index(prod)
self._prodlist.selection_set(index)
#########################################
## Button Callbacks
#########################################
def destroy(self, *e):
if self._top is None:
return
self._top.destroy()
self._top = None
def reset(self, *e):
self._parser.initialize(self._sent)
self._lastoper1["text"] = "Reset App"
self._lastoper2["text"] = ""
self._redraw()
def step(self, *e):
if self.reduce():
return True
elif self.shift():
return True
else:
if list(self._parser.parses()):
self._lastoper1["text"] = "Finished:"
self._lastoper2["text"] = "Success"
else:
self._lastoper1["text"] = "Finished:"
self._lastoper2["text"] = "Failure"
def shift(self, *e):
if self._animating_lock:
return
if self._parser.shift():
tok = self._parser.stack()[-1]
self._lastoper1["text"] = "Shift:"
self._lastoper2["text"] = "%r" % tok
if self._animate.get():
self._animate_shift()
else:
self._redraw()
return True
return False
def reduce(self, *e):
if self._animating_lock:
return
production = self._parser.reduce()
if production:
self._lastoper1["text"] = "Reduce:"
self._lastoper2["text"] = "%s" % production
if self._animate.get():
self._animate_reduce()
else:
self._redraw()
return production
def undo(self, *e):
if self._animating_lock:
return
if self._parser.undo():
self._redraw()
def postscript(self, *e):
self._cframe.print_to_file()
def mainloop(self, *args, **kwargs):
"""
Enter the Tkinter mainloop. This function must be called if
this demo is created from a non-interactive program (e.g.
from a script); otherwise, the demo will close as soon as
the script completes.
"""
if in_idle():
return
self._top.mainloop(*args, **kwargs)
#########################################
## Menubar callbacks
#########################################
def resize(self, size=None):
if size is not None:
self._size.set(size)
size = self._size.get()
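# Tk treats negative font sizes as pixel sizes rather than point sizes.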
self._font.configure(size=-(abs(size)))
self._boldfont.configure(size=-(abs(size)))
self._sysfont.configure(size=-(abs(size)))
# self._stacklabel['font'] = ('helvetica', -size-4, 'bold')
# self._rtextlabel['font'] = ('helvetica', -size-4, 'bold')
# self._lastoper_label['font'] = ('helvetica', -size)
# self._lastoper1['font'] = ('helvetica', -size)
# self._lastoper2['font'] = ('helvetica', -size)
# self._prodlist['font'] = ('helvetica', -size)
# self._prodlist_label['font'] = ('helvetica', -size-2, 'bold')
self._redraw()
def help(self, *e):
# The default font's not very legible; try using 'fixed' instead.
try:
ShowText(
self._top,
"Help: Shift-Reduce Parser Application",
(__doc__ or "").strip(),
width=75,
font="fixed",
)
except:
ShowText(
self._top,
"Help: Shift-Reduce Parser Application",
(__doc__ or "").strip(),
width=75,
)
def about(self, *e):
ABOUT = "NLTK Shift-Reduce Parser Application\n" + "Written by Edward Loper"
TITLE = "About: Shift-Reduce Parser Application"
try:
from tkinter.messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except:
ShowText(self._top, TITLE, ABOUT)
def edit_grammar(self, *e):
CFGEditor(self._top, self._parser.grammar(), self.set_grammar)
def set_grammar(self, grammar):
self._parser.set_grammar(grammar)
self._productions = list(grammar.productions())
self._prodlist.delete(0, "end")
for production in self._productions:
self._prodlist.insert("end", (" %s" % production))
def edit_sentence(self, *e):
sentence = " ".join(self._sent)
title = "Edit Text"
instr = "Enter a new sentence to parse."
EntryDialog(self._top, sentence, instr, self.set_sentence, title)
def set_sentence(self, sent):
self._sent = sent.split() # [XX] use tagged?
self.reset()
#########################################
## Reduce Production Selection
#########################################
def _toggle_grammar(self, *e):
if self._show_grammar.get():
self._prodframe.pack(
fill="both", side="left", padx=2, after=self._feedbackframe
)
self._lastoper1["text"] = "Show Grammar"
else:
self._prodframe.pack_forget()
self._lastoper1["text"] = "Hide Grammar"
self._lastoper2["text"] = ""
def _prodlist_select(self, event):
selection = self._prodlist.curselection()
if len(selection) != 1:
return
index = int(selection[0])
production = self._parser.reduce(self._productions[index])
if production:
self._lastoper1["text"] = "Reduce:"
self._lastoper2["text"] = "%s" % production
if self._animate.get():
self._animate_reduce()
else:
self._redraw()
else:
# Reset the production selections.
self._prodlist.selection_clear(0, "end")
for prod in self._parser.reducible_productions():
index = self._productions.index(prod)
self._prodlist.selection_set(index)
def _popup_reduce(self, widget):
# Remove old commands.
productions = self._parser.reducible_productions()
if len(productions) == 0:
return
self._reduce_menu.delete(0, "end")
for production in productions:
self._reduce_menu.add_command(label=str(production), command=self.reduce)
self._reduce_menu.post(
self._canvas.winfo_pointerx(), self._canvas.winfo_pointery()
)
#########################################
## Animations
#########################################
def _animate_shift(self):
# What widget are we shifting?
widget = self._rtextwidgets[0]
# Where are we shifting from & to?
right = widget.bbox()[0]
if len(self._stackwidgets) == 0:
left = 5
else:
left = self._stackwidgets[-1].bbox()[2] + 10
# Start animating.
dt = self._animate.get()
dx = (left - right) * 1.0 / dt
self._animate_shift_frame(dt, widget, dx)
def _animate_shift_frame(self, frame, widget, dx):
if frame > 0:
self._animating_lock = 1
widget.move(dx, 0)
self._top.after(10, self._animate_shift_frame, frame - 1, widget, dx)
else:
# but: stacktop??
# Shift the widget to the stack.
del self._rtextwidgets[0]
self._stackwidgets.append(widget)
self._animating_lock = 0
# Display the available productions.
self._draw_stack_top(widget)
self._highlight_productions()
def _animate_reduce(self):
# What widgets are we shifting?
numwidgets = len(self._parser.stack()[-1]) # number of children
widgets = self._stackwidgets[-numwidgets:]
# How far are we moving?
if isinstance(widgets[0], TreeSegmentWidget):
ydist = 15 + widgets[0].label().height()
else:
ydist = 15 + widgets[0].height()
# Start animating.
dt = self._animate.get()
dy = ydist * 2.0 / dt
self._animate_reduce_frame(dt / 2, widgets, dy)
def _animate_reduce_frame(self, frame, widgets, dy):
if frame > 0:
self._animating_lock = 1
for widget in widgets:
widget.move(0, dy)
self._top.after(10, self._animate_reduce_frame, frame - 1, widgets, dy)
else:
del self._stackwidgets[-len(widgets) :]
for widget in widgets:
self._cframe.remove_widget(widget)
tok = self._parser.stack()[-1]
if not isinstance(tok, Tree):
raise ValueError()
label = TextWidget(
self._canvas, str(tok.label()), color="#006060", font=self._boldfont
)
widget = TreeSegmentWidget(self._canvas, label, widgets, width=2)
(x1, y1, x2, y2) = self._stacklabel.bbox()
y = y2 - y1 + 10
if not self._stackwidgets:
x = 5
else:
x = self._stackwidgets[-1].bbox()[2] + 10
self._cframe.add_widget(widget, x, y)
self._stackwidgets.append(widget)
# Display the available productions.
self._draw_stack_top(widget)
self._highlight_productions()
# # Delete the old widgets..
# del self._stackwidgets[-len(widgets):]
# for widget in widgets:
# self._cframe.destroy_widget(widget)
#
# # Make a new one.
# tok = self._parser.stack()[-1]
# if isinstance(tok, Tree):
# attribs = {'tree_color': '#4080a0', 'tree_width': 2,
# 'node_font': bold, 'node_color': '#006060',
# 'leaf_color': '#006060', 'leaf_font':self._font}
# widget = tree_to_treesegment(self._canvas, tok.type(),
# **attribs)
# widget.node()['color'] = '#000000'
# else:
# widget = TextWidget(self._canvas, tok.type(),
# color='#000000', font=self._font)
# widget.bind_click(self._popup_reduce)
# (x1, y1, x2, y2) = self._stacklabel.bbox()
# y = y2-y1+10
# if not self._stackwidgets: x = 5
# else: x = self._stackwidgets[-1].bbox()[2] + 10
# self._cframe.add_widget(widget, x, y)
# self._stackwidgets.append(widget)
# self._redraw()
self._animating_lock = 0
#########################################
## Hovering.
#########################################
def _highlight_hover(self, event):
# What production are we hovering over?
index = self._prodlist.nearest(event.y)
if self._hover == index:
return
# Clear any previous hover highlighting.
self._clear_hover()
# If the production corresponds to an available reduction,
# highlight the stack.
selection = [int(s) for s in self._prodlist.curselection()]
if index in selection:
rhslen = len(self._productions[index].rhs())
for stackwidget in self._stackwidgets[-rhslen:]:
if isinstance(stackwidget, TreeSegmentWidget):
stackwidget.label()["color"] = "#00a000"
else:
stackwidget["color"] = "#00a000"
# Remember what production we're hovering over.
self._hover = index
def _clear_hover(self, *event):
# Clear any previous hover highlighting.
if self._hover == -1:
return
self._hover = -1
for stackwidget in self._stackwidgets:
if isinstance(stackwidget, TreeSegmentWidget):
stackwidget.label()["color"] = "black"
else:
stackwidget["color"] = "black"
def app():
"""
Create a shift-reduce parser app, using a simple grammar and
text.
"""
from nltk.grammar import CFG, Nonterminal, Production
nonterminals = "S VP NP PP P N Name V Det"
(S, VP, NP, PP, P, N, Name, V, Det) = (Nonterminal(s) for s in nonterminals.split())
productions = (
# Syntactic Productions
Production(S, [NP, VP]),
Production(NP, [Det, N]),
Production(NP, [NP, PP]),
Production(VP, [VP, PP]),
Production(VP, [V, NP, PP]),
Production(VP, [V, NP]),
Production(PP, [P, NP]),
# Lexical Productions
Production(NP, ["I"]),
Production(Det, ["the"]),
Production(Det, ["a"]),
Production(N, ["man"]),
Production(V, ["saw"]),
Production(P, ["in"]),
Production(P, ["with"]),
Production(N, ["park"]),
Production(N, ["dog"]),
Production(N, ["statue"]),
Production(Det, ["my"]),
)
grammar = CFG(S, productions)
# tokenize the sentence
sent = "my dog saw a man in the park with a statue".split()
ShiftReduceApp(grammar, sent).mainloop()
if __name__ == "__main__":
app()
__all__ = ["app"]

View File

@@ -0,0 +1,36 @@
# Natural Language Toolkit: Wordfreq Application
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from matplotlib import pylab
from nltk.corpus import gutenberg
from nltk.text import Text
def plot_word_freq_dist(text):
fd = text.vocab()
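# Take the 50 most frequent words and plot the cumulative percentage of the
# text's tokens that are covered by the top i most frequent words.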
samples = [item for item, _ in fd.most_common(50)]
values = [fd[sample] for sample in samples]
values = [sum(values[: i + 1]) * 100.0 / fd.N() for i in range(len(values))]
pylab.title(text.name)
pylab.xlabel("Samples")
pylab.ylabel("Cumulative Percentage")
pylab.plot(values)
pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90)
pylab.show()
def app():
t1 = Text(gutenberg.words("melville-moby_dick.txt"))
plot_word_freq_dist(t1)
if __name__ == "__main__":
app()
__all__ = ["app"]

File diff suppressed because it is too large

View File

@@ -0,0 +1,213 @@
# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus import (
genesis,
gutenberg,
inaugural,
nps_chat,
treebank,
webtext,
wordnet,
)
from nltk.probability import FreqDist
from nltk.text import Text
from nltk.util import bigrams
print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")
text1 = Text(gutenberg.words("melville-moby_dick.txt"))
print("text1:", text1.name)
text2 = Text(gutenberg.words("austen-sense.txt"))
print("text2:", text2.name)
text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
print("text3:", text3.name)
text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)
text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)
text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
print("text6:", text6.name)
text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)
text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
print("text8:", text8.name)
text9 = Text(gutenberg.words("chesterton-thursday.txt"))
print("text9:", text9.name)
def texts():
print("text1:", text1.name)
print("text2:", text2.name)
print("text3:", text3.name)
print("text4:", text4.name)
print("text5:", text5.name)
print("text6:", text6.name)
print("text7:", text7.name)
print("text8:", text8.name)
print("text9:", text9.name)
sent1 = ["Call", "me", "Ishmael", "."]
sent2 = [
"The",
"family",
"of",
"Dashwood",
"had",
"long",
"been",
"settled",
"in",
"Sussex",
".",
]
sent3 = [
"In",
"the",
"beginning",
"God",
"created",
"the",
"heaven",
"and",
"the",
"earth",
".",
]
sent4 = [
"Fellow",
"-",
"Citizens",
"of",
"the",
"Senate",
"and",
"of",
"the",
"House",
"of",
"Representatives",
":",
]
sent5 = [
"I",
"have",
"a",
"problem",
"with",
"people",
"PMing",
"me",
"to",
"lol",
"JOIN",
]
sent6 = [
"SCENE",
"1",
":",
"[",
"wind",
"]",
"[",
"clop",
"clop",
"clop",
"]",
"KING",
"ARTHUR",
":",
"Whoa",
"there",
"!",
]
sent7 = [
"Pierre",
"Vinken",
",",
"61",
"years",
"old",
",",
"will",
"join",
"the",
"board",
"as",
"a",
"nonexecutive",
"director",
"Nov.",
"29",
".",
]
sent8 = [
"25",
"SEXY",
"MALE",
",",
"seeks",
"attrac",
"older",
"single",
"lady",
",",
"for",
"discreet",
"encounters",
".",
]
sent9 = [
"THE",
"suburb",
"of",
"Saffron",
"Park",
"lay",
"on",
"the",
"sunset",
"side",
"of",
"London",
",",
"as",
"red",
"and",
"ragged",
"as",
"a",
"cloud",
"of",
"sunset",
".",
]
def sents():
print("sent1:", " ".join(sent1))
print("sent2:", " ".join(sent2))
print("sent3:", " ".join(sent3))
print("sent4:", " ".join(sent4))
print("sent5:", " ".join(sent5))
print("sent6:", " ".join(sent6))
print("sent7:", " ".join(sent7))
print("sent8:", " ".join(sent8))
print("sent9:", " ".join(sent9))

View File

@@ -0,0 +1,34 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Combinatory Categorial Grammar.
For more information see nltk/doc/contrib/ccg/ccg.pdf
"""
from nltk.ccg.chart import CCGChart, CCGChartParser, CCGEdge, CCGLeafEdge
from nltk.ccg.combinator import (
BackwardApplication,
BackwardBx,
BackwardCombinator,
BackwardComposition,
BackwardSx,
BackwardT,
DirectedBinaryCombinator,
ForwardApplication,
ForwardCombinator,
ForwardComposition,
ForwardSubstitution,
ForwardT,
UndirectedBinaryCombinator,
UndirectedComposition,
UndirectedFunctionApplication,
UndirectedSubstitution,
UndirectedTypeRaise,
)
from nltk.ccg.lexicon import CCGLexicon

View File

@@ -0,0 +1,358 @@
# Natural Language Toolkit: CCG Categories
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from abc import ABCMeta, abstractmethod
from functools import total_ordering
from nltk.internals import raise_unorderable_types
@total_ordering
class AbstractCCGCategory(metaclass=ABCMeta):
"""
Interface for categories in combinatory grammars.
"""
@abstractmethod
def is_primitive(self):
"""
Returns true if the category is primitive.
"""
@abstractmethod
def is_function(self):
"""
Returns true if the category is a function application.
"""
@abstractmethod
def is_var(self):
"""
Returns true if the category is a variable.
"""
@abstractmethod
def substitute(self, substitutions):
"""
Takes a set of (var, category) substitutions, and replaces every
occurrence of the variable with the corresponding category.
"""
@abstractmethod
def can_unify(self, other):
"""
Determines whether two categories can be unified.
- Returns None if they cannot be unified
- Returns a list of necessary substitutions if they can.
"""
# Utility functions: comparison, strings and hashing.
@abstractmethod
def __str__(self):
pass
def __eq__(self, other):
return (
self.__class__ is other.__class__
and self._comparison_key == other._comparison_key
)
def __ne__(self, other):
return not self == other
def __lt__(self, other):
if not isinstance(other, AbstractCCGCategory):
raise_unorderable_types("<", self, other)
if self.__class__ is other.__class__:
return self._comparison_key < other._comparison_key
else:
return self.__class__.__name__ < other.__class__.__name__
def __hash__(self):
try:
return self._hash
except AttributeError:
self._hash = hash(self._comparison_key)
return self._hash
class CCGVar(AbstractCCGCategory):
"""
Class representing a variable CCG category.
Used for conjunctions (and possibly type-raising, if implemented as a
unary rule).
"""
_maxID = 0
def __init__(self, prim_only=False):
"""Initialize a variable (selects a new identifier)
:param prim_only: a boolean that determines whether the variable is
restricted to primitives
:type prim_only: bool
"""
self._id = self.new_id()
self._prim_only = prim_only
self._comparison_key = self._id
@classmethod
def new_id(cls):
"""
A class method allowing generation of unique variable identifiers.
"""
cls._maxID = cls._maxID + 1
return cls._maxID - 1
@classmethod
def reset_id(cls):
cls._maxID = 0
def is_primitive(self):
return False
def is_function(self):
return False
def is_var(self):
return True
def substitute(self, substitutions):
"""If there is a substitution corresponding to this variable,
return the substituted category.
"""
for var, cat in substitutions:
if var == self:
return cat
return self
def can_unify(self, other):
"""If the variable can be replaced with other
a substitution is returned.
"""
if other.is_primitive() or not self._prim_only:
return [(self, other)]
return None
def id(self):
return self._id
def __str__(self):
return "_var" + str(self._id)
@total_ordering
class Direction:
"""
Class representing the direction of a function application.
Also maintains information about which combinators
may be used with the category.
"""
def __init__(self, dir, restrictions):
self._dir = dir
self._restrs = restrictions
self._comparison_key = (dir, tuple(restrictions))
# Testing the application direction
def is_forward(self):
return self._dir == "/"
def is_backward(self):
return self._dir == "\\"
def dir(self):
return self._dir
def restrs(self):
"""A list of restrictions on the combinators.
'.' denotes that permuting operations are disallowed
',' denotes that function composition is disallowed
'_' denotes that the direction has variable restrictions.
(This is redundant in the current implementation of type-raising)
"""
return self._restrs
def is_variable(self):
return self._restrs == "_"
# Unification and substitution of variable directions.
# Used only if type-raising is implemented as a unary rule, as it
# must inherit restrictions from the argument category.
def can_unify(self, other):
if other.is_variable():
return [("_", self.restrs())]
elif self.is_variable():
return [("_", other.restrs())]
else:
if self.restrs() == other.restrs():
return []
return None
def substitute(self, subs):
if not self.is_variable():
return self
for var, restrs in subs:
if var == "_":
return Direction(self._dir, restrs)
return self
# Testing permitted combinators
def can_compose(self):
return "," not in self._restrs
def can_cross(self):
return "." not in self._restrs
def __eq__(self, other):
return (
self.__class__ is other.__class__
and self._comparison_key == other._comparison_key
)
def __ne__(self, other):
return not self == other
def __lt__(self, other):
if not isinstance(other, Direction):
raise_unorderable_types("<", self, other)
if self.__class__ is other.__class__:
return self._comparison_key < other._comparison_key
else:
return self.__class__.__name__ < other.__class__.__name__
def __hash__(self):
try:
return self._hash
except AttributeError:
self._hash = hash(self._comparison_key)
return self._hash
def __str__(self):
r_str = ""
for r in self._restrs:
r_str = r_str + "%s" % r
return f"{self._dir}{r_str}"
# The negation operator reverses the direction of the application
def __neg__(self):
if self._dir == "/":
return Direction("\\", self._restrs)
else:
return Direction("/", self._restrs)
class PrimitiveCategory(AbstractCCGCategory):
"""
Class representing primitive categories.
Takes a string representation of the category, and a
list of strings specifying the morphological subcategories.
"""
def __init__(self, categ, restrictions=[]):
self._categ = categ
self._restrs = restrictions
self._comparison_key = (categ, tuple(restrictions))
def is_primitive(self):
return True
def is_function(self):
return False
def is_var(self):
return False
def restrs(self):
return self._restrs
def categ(self):
return self._categ
# Substitution does nothing to a primitive category
def substitute(self, subs):
return self
# A primitive category can unify with a primitive of the same base
# category, provided the other category carries all of this category's
# restrictions, or with a variable.
def can_unify(self, other):
if not other.is_primitive():
return None
if other.is_var():
return [(other, self)]
if other.categ() == self.categ():
for restr in self._restrs:
if restr not in other.restrs():
return None
return []
return None
def __str__(self):
if self._restrs == []:
return "%s" % self._categ
restrictions = "[%s]" % ",".join(repr(r) for r in self._restrs)
return f"{self._categ}{restrictions}"
class FunctionalCategory(AbstractCCGCategory):
"""
Class that represents a function application category.
Consists of argument and result categories, together with
an application direction.
"""
def __init__(self, res, arg, dir):
self._res = res
self._arg = arg
self._dir = dir
self._comparison_key = (arg, dir, res)
def is_primitive(self):
return False
def is_function(self):
return True
def is_var(self):
return False
# Substitution returns the category consisting of the
# substitution applied to each of its constituents.
def substitute(self, subs):
sub_res = self._res.substitute(subs)
sub_dir = self._dir.substitute(subs)
sub_arg = self._arg.substitute(subs)
return FunctionalCategory(sub_res, sub_arg, self._dir)
# A function can unify with another function, so long as its
# constituents can unify, or with an unrestricted variable.
def can_unify(self, other):
if other.is_var():
return [(other, self)]
if other.is_function():
sa = self._res.can_unify(other.res())
sd = self._dir.can_unify(other.dir())
if sa is not None and sd is not None:
sb = self._arg.substitute(sa).can_unify(other.arg().substitute(sa))
if sb is not None:
return sa + sb
return None
# Constituent accessors
def arg(self):
return self._arg
def res(self):
return self._res
def dir(self):
return self._dir
def __str__(self):
return f"({self._res}{self._dir}{self._arg})"

View File

@@ -0,0 +1,480 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
The lexicon is constructed by calling
``lexicon.fromstring(<lexicon string>)``.
In order to construct a parser, you also need a rule set.
The standard English rules are provided in chart as
``chart.DefaultRuleSet``.
The parser can then be constructed by calling, for example:
``parser = chart.CCGChartParser(<lexicon>, <ruleset>)``
Parsing is then performed by running
``parser.parse(<sentence>.split())``.
While this returns a list of trees, the default representation
of the produced trees is not very enlightening, particularly
given that it uses the same tree class as the CFG parsers.
It is probably better to call:
``chart.printCCGDerivation(<parse tree extracted from list>)``
which should print a nice representation of the derivation.
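
For example (an illustrative sketch, using the demonstration lexicon ``lex``
defined near the bottom of this module):

    parser = CCGChartParser(lex, DefaultRuleSet)
    parses = list(parser.parse("I might cook and eat the bacon".split()))
    printCCGDerivation(parses[0])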
This entire process is shown far more clearly in the demonstration:
python chart.py
"""
import itertools
from nltk.ccg.combinator import *
from nltk.ccg.combinator import (
BackwardApplication,
BackwardBx,
BackwardComposition,
BackwardSx,
BackwardT,
ForwardApplication,
ForwardComposition,
ForwardSubstitution,
ForwardT,
)
from nltk.ccg.lexicon import Token, fromstring
from nltk.ccg.logic import *
from nltk.parse import ParserI
from nltk.parse.chart import AbstractChartRule, Chart, EdgeI
from nltk.sem.logic import *
from nltk.tree import Tree
# Based on the EdgeI class from NLTK.
# A number of the properties of the EdgeI interface don't
# transfer well to CCGs, however.
class CCGEdge(EdgeI):
def __init__(self, span, categ, rule):
self._span = span
self._categ = categ
self._rule = rule
self._comparison_key = (span, categ, rule)
# Accessors
def lhs(self):
return self._categ
def span(self):
return self._span
def start(self):
return self._span[0]
def end(self):
return self._span[1]
def length(self):
return self._span[1] - self._span[0]
def rhs(self):
return ()
def dot(self):
return 0
def is_complete(self):
return True
def is_incomplete(self):
return False
def nextsym(self):
return None
def categ(self):
return self._categ
def rule(self):
return self._rule
class CCGLeafEdge(EdgeI):
"""
Class representing leaf edges in a CCG derivation.
"""
def __init__(self, pos, token, leaf):
self._pos = pos
self._token = token
self._leaf = leaf
self._comparison_key = (pos, token.categ(), leaf)
# Accessors
def lhs(self):
return self._token.categ()
def span(self):
return (self._pos, self._pos + 1)
def start(self):
return self._pos
def end(self):
return self._pos + 1
def length(self):
return 1
def rhs(self):
return self._leaf
def dot(self):
return 0
def is_complete(self):
return True
def is_incomplete(self):
return False
def nextsym(self):
return None
def token(self):
return self._token
def categ(self):
return self._token.categ()
def leaf(self):
return self._leaf
class BinaryCombinatorRule(AbstractChartRule):
"""
Class implementing application of a binary combinator to a chart.
Takes the directed combinator to apply.
"""
NUMEDGES = 2
def __init__(self, combinator):
self._combinator = combinator
# Apply a combinator
def apply(self, chart, grammar, left_edge, right_edge):
# The left & right edges must be touching.
if not (left_edge.end() == right_edge.start()):
return
# Check if the two edges are permitted to combine.
# If so, generate the corresponding edge.
if self._combinator.can_combine(left_edge.categ(), right_edge.categ()):
for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
new_edge = CCGEdge(
span=(left_edge.start(), right_edge.end()),
categ=res,
rule=self._combinator,
)
if chart.insert(new_edge, (left_edge, right_edge)):
yield new_edge
# The representation of the combinator (for printing derivations)
def __str__(self):
return "%s" % self._combinator
# Type-raising must be handled slightly differently from the other rules, as the
# resulting edge spans only a single input edge, rather than both edges.
class ForwardTypeRaiseRule(AbstractChartRule):
"""
Class for applying forward type raising
"""
NUMEDGES = 2
def __init__(self):
self._combinator = ForwardT
def apply(self, chart, grammar, left_edge, right_edge):
if not (left_edge.end() == right_edge.start()):
return
for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
new_edge = CCGEdge(span=left_edge.span(), categ=res, rule=self._combinator)
if chart.insert(new_edge, (left_edge,)):
yield new_edge
def __str__(self):
return "%s" % self._combinator
class BackwardTypeRaiseRule(AbstractChartRule):
"""
Class for applying backward type raising.
"""
NUMEDGES = 2
def __init__(self):
self._combinator = BackwardT
def apply(self, chart, grammar, left_edge, right_edge):
if not (left_edge.end() == right_edge.start()):
return
for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
new_edge = CCGEdge(span=right_edge.span(), categ=res, rule=self._combinator)
if chart.insert(new_edge, (right_edge,)):
yield new_edge
def __str__(self):
return "%s" % self._combinator
# Common sets of combinators used for English derivations.
ApplicationRuleSet = [
BinaryCombinatorRule(ForwardApplication),
BinaryCombinatorRule(BackwardApplication),
]
CompositionRuleSet = [
BinaryCombinatorRule(ForwardComposition),
BinaryCombinatorRule(BackwardComposition),
BinaryCombinatorRule(BackwardBx),
]
SubstitutionRuleSet = [
BinaryCombinatorRule(ForwardSubstitution),
BinaryCombinatorRule(BackwardSx),
]
TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()]
# The standard English rule set.
DefaultRuleSet = (
ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet + TypeRaiseRuleSet
)
class CCGChartParser(ParserI):
"""
Chart parser for CCGs.
Based largely on the ChartParser class from NLTK.
"""
def __init__(self, lexicon, rules, trace=0):
self._lexicon = lexicon
self._rules = rules
self._trace = trace
def lexicon(self):
return self._lexicon
# Implements the CYK algorithm
def parse(self, tokens):
tokens = list(tokens)
chart = CCGChart(list(tokens))
lex = self._lexicon
# Initialize leaf edges.
for index in range(chart.num_leaves()):
for token in lex.categories(chart.leaf(index)):
new_edge = CCGLeafEdge(index, token, chart.leaf(index))
chart.insert(new_edge, ())
# Select a span for the new edges
for span in range(2, chart.num_leaves() + 1):
for start in range(0, chart.num_leaves() - span + 1):
# Try all possible pairs of edges that could generate
# an edge for that span
for part in range(1, span):
lstart = start
mid = start + part
rend = start + span
for left in chart.select(span=(lstart, mid)):
for right in chart.select(span=(mid, rend)):
# Generate all possible combinations of the two edges
for rule in self._rules:
edges_added_by_rule = 0
for newedge in rule.apply(chart, lex, left, right):
edges_added_by_rule += 1
# Output the resulting parses
return chart.parses(lex.start())
class CCGChart(Chart):
def __init__(self, tokens):
Chart.__init__(self, tokens)
# Constructs the trees for a given parse. Unfortunately, the parse trees need to be
# constructed slightly differently from those in the default Chart class, so this
# method has to be reimplemented.
def _trees(self, edge, complete, memo, tree_class):
assert complete, "CCGChart cannot build incomplete trees"
if edge in memo:
return memo[edge]
if isinstance(edge, CCGLeafEdge):
word = tree_class(edge.token(), [self._tokens[edge.start()]])
leaf = tree_class((edge.token(), "Leaf"), [word])
memo[edge] = [leaf]
return [leaf]
memo[edge] = []
trees = []
for cpl in self.child_pointer_lists(edge):
child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl]
for children in itertools.product(*child_choices):
lhs = (
Token(
self._tokens[edge.start() : edge.end()],
edge.lhs(),
compute_semantics(children, edge),
),
str(edge.rule()),
)
trees.append(tree_class(lhs, children))
memo[edge] = trees
return trees
def compute_semantics(children, edge):
if children[0].label()[0].semantics() is None:
return None
if len(children) == 2:
if isinstance(edge.rule(), BackwardCombinator):
children = [children[1], children[0]]
combinator = edge.rule()._combinator
function = children[0].label()[0].semantics()
argument = children[1].label()[0].semantics()
if isinstance(combinator, UndirectedFunctionApplication):
return compute_function_semantics(function, argument)
elif isinstance(combinator, UndirectedComposition):
return compute_composition_semantics(function, argument)
elif isinstance(combinator, UndirectedSubstitution):
return compute_substitution_semantics(function, argument)
else:
raise AssertionError("Unsupported combinator '" + combinator + "'")
else:
return compute_type_raised_semantics(children[0].label()[0].semantics())
# --------
# Displaying derivations
# --------
def printCCGDerivation(tree):
# Get the leaves and initial categories
leafcats = tree.pos()
leafstr = ""
catstr = ""
# Construct a string with both the leaf word and corresponding
# category aligned.
for leaf, cat in leafcats:
str_cat = "%s" % cat
nextlen = 2 + max(len(leaf), len(str_cat))
lcatlen = (nextlen - len(str_cat)) // 2
rcatlen = lcatlen + (nextlen - len(str_cat)) % 2
catstr += " " * lcatlen + str_cat + " " * rcatlen
lleaflen = (nextlen - len(leaf)) // 2
rleaflen = lleaflen + (nextlen - len(leaf)) % 2
leafstr += " " * lleaflen + leaf + " " * rleaflen
print(leafstr.rstrip())
print(catstr.rstrip())
# Display the derivation steps
printCCGTree(0, tree)
# Prints the sequence of derivation steps.
def printCCGTree(lwidth, tree):
rwidth = lwidth
# Is a leaf (word).
# Increment the span by the space occupied by the leaf.
if not isinstance(tree, Tree):
return 2 + lwidth + len(tree)
# Find the width of the current derivation step
for child in tree:
rwidth = max(rwidth, printCCGTree(rwidth, child))
# Is a leaf node.
# Don't print anything, but account for the space occupied.
if not isinstance(tree.label(), tuple):
return max(
rwidth, 2 + lwidth + len("%s" % tree.label()), 2 + lwidth + len(tree[0])
)
(token, op) = tree.label()
if op == "Leaf":
return rwidth
# Pad to the left with spaces, followed by a sequence of '-'
# and the derivation rule.
print(lwidth * " " + (rwidth - lwidth) * "-" + "%s" % op)
# Print the resulting category on a new line.
str_res = "%s" % (token.categ())
if token.semantics() is not None:
str_res += " {" + str(token.semantics()) + "}"
respadlen = (rwidth - lwidth - len(str_res)) // 2 + lwidth
print(respadlen * " " + str_res)
return rwidth
### Demonstration code
# Construct the lexicon
lex = fromstring(
"""
:- S, NP, N, VP # Primitive categories, S is the target primitive
Det :: NP/N # Family of words
Pro :: NP
TV :: VP/NP
Modal :: (S\\NP)/VP # Backslashes need to be escaped
I => Pro # Word -> Category mapping
you => Pro
the => Det
# Variables have the special keyword 'var'
# '.' prevents permutation
# ',' prevents composition
and => var\\.,var/.,var
which => (N\\N)/(S/NP)
will => Modal # Categories can be either explicit, or families.
might => Modal
cook => TV
eat => TV
mushrooms => N
parsnips => N
bacon => N
"""
)
def demo():
parser = CCGChartParser(lex, DefaultRuleSet)
for parse in parser.parse("I might cook and eat the bacon".split()):
printCCGDerivation(parse)
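# A minimal sketch (not part of the original demo) showing the same pipeline
# with semantic annotations; it assumes the ``{...}`` semantics syntax accepted
# by ``fromstring(..., include_semantics=True)``, as in the lexicon module.
def demo_semantics():
    sem_lex = fromstring(
        """
        :- S, NP
        She => NP {she}
        has => (S\\NP)/NP {\\x y.have(y, x)}
        books => NP {books}
        """,
        include_semantics=True,
    )
    parser = CCGChartParser(sem_lex, ApplicationRuleSet)
    for parse in parser.parse("She has books".split()):
        printCCGDerivation(parse)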
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,340 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CCG Combinators
"""
from abc import ABCMeta, abstractmethod
from nltk.ccg.api import FunctionalCategory
class UndirectedBinaryCombinator(metaclass=ABCMeta):
"""
Abstract class for representing a binary combinator.
Merely defines functions for checking if the function and argument
are able to be combined, and what the resulting category is.
Note that as no assumptions are made as to direction, the unrestricted
combinators can perform all backward, forward and crossed variations
of the combinators; these restrictions must be added in the rule
class.
"""
@abstractmethod
def can_combine(self, function, argument):
pass
@abstractmethod
def combine(self, function, argument):
pass
class DirectedBinaryCombinator(metaclass=ABCMeta):
"""
Wrapper for the undirected binary combinator.
It takes left and right categories, and decides which is to be
the function, and which the argument.
It then decides whether or not they can be combined.
"""
@abstractmethod
def can_combine(self, left, right):
pass
@abstractmethod
def combine(self, left, right):
pass
class ForwardCombinator(DirectedBinaryCombinator):
"""
Class representing combinators where the primary functor is on the left.
Takes an undirected combinator, and a predicate which adds constraints
restricting the cases in which it may apply.
"""
def __init__(self, combinator, predicate, suffix=""):
self._combinator = combinator
self._predicate = predicate
self._suffix = suffix
def can_combine(self, left, right):
return self._combinator.can_combine(left, right) and self._predicate(
left, right
)
def combine(self, left, right):
yield from self._combinator.combine(left, right)
def __str__(self):
return f">{self._combinator}{self._suffix}"
class BackwardCombinator(DirectedBinaryCombinator):
"""
The backward equivalent of the ForwardCombinator class.
"""
def __init__(self, combinator, predicate, suffix=""):
self._combinator = combinator
self._predicate = predicate
self._suffix = suffix
def can_combine(self, left, right):
return self._combinator.can_combine(right, left) and self._predicate(
left, right
)
def combine(self, left, right):
yield from self._combinator.combine(right, left)
def __str__(self):
return f"<{self._combinator}{self._suffix}"
class UndirectedFunctionApplication(UndirectedBinaryCombinator):
"""
Class representing function application.
Implements rules of the form:
X/Y Y -> X (>)
And the corresponding backwards application rule
"""
def can_combine(self, function, argument):
if not function.is_function():
return False
return function.arg().can_unify(argument) is not None
def combine(self, function, argument):
if not function.is_function():
return
subs = function.arg().can_unify(argument)
if subs is None:
return
yield function.res().substitute(subs)
def __str__(self):
return ""
# Predicates for function application.
# Ensures the left functor takes an argument on the right
def forwardOnly(left, right):
return left.dir().is_forward()
# Ensures the right functor takes an argument on the left
def backwardOnly(left, right):
return right.dir().is_backward()
# Application combinator instances
ForwardApplication = ForwardCombinator(UndirectedFunctionApplication(), forwardOnly)
BackwardApplication = BackwardCombinator(UndirectedFunctionApplication(), backwardOnly)
class UndirectedComposition(UndirectedBinaryCombinator):
"""
Functional composition (harmonic) combinator.
Implements rules of the form
X/Y Y/Z -> X/Z (B>)
And the corresponding backwards and crossed variations.
"""
def can_combine(self, function, argument):
# Can only combine two functions, and both functions must
# allow composition.
if not (function.is_function() and argument.is_function()):
return False
if function.dir().can_compose() and argument.dir().can_compose():
return function.arg().can_unify(argument.res()) is not None
return False
def combine(self, function, argument):
if not (function.is_function() and argument.is_function()):
return
if function.dir().can_compose() and argument.dir().can_compose():
subs = function.arg().can_unify(argument.res())
if subs is not None:
yield FunctionalCategory(
function.res().substitute(subs),
argument.arg().substitute(subs),
argument.dir(),
)
def __str__(self):
return "B"
# Predicates for restricting application of straight composition.
def bothForward(left, right):
return left.dir().is_forward() and right.dir().is_forward()
def bothBackward(left, right):
return left.dir().is_backward() and right.dir().is_backward()
# Predicates for crossed composition
def crossedDirs(left, right):
return left.dir().is_forward() and right.dir().is_backward()
def backwardBxConstraint(left, right):
# The functors must be crossed inwards
if not crossedDirs(left, right):
return False
# Permuting combinators must be allowed
if not (left.dir().can_cross() and right.dir().can_cross()):
return False
# The resulting argument category is restricted to be primitive
return left.arg().is_primitive()
# Straight composition combinators
ForwardComposition = ForwardCombinator(UndirectedComposition(), forwardOnly)
BackwardComposition = BackwardCombinator(UndirectedComposition(), backwardOnly)
# Backward crossed composition
BackwardBx = BackwardCombinator(
UndirectedComposition(), backwardBxConstraint, suffix="x"
)
class UndirectedSubstitution(UndirectedBinaryCombinator):
r"""
Substitution (permutation) combinator.
Implements rules of the form
Y/Z (X\Y)/Z -> X/Z (<Sx)
And other variations.
"""
def can_combine(self, function, argument):
if function.is_primitive() or argument.is_primitive():
return False
# These could potentially be moved to the predicates, as the
# constraints may not be general to all languages.
if function.res().is_primitive():
return False
if not function.arg().is_primitive():
return False
if not (function.dir().can_compose() and argument.dir().can_compose()):
return False
return (function.res().arg() == argument.res()) and (
function.arg() == argument.arg()
)
def combine(self, function, argument):
if self.can_combine(function, argument):
yield FunctionalCategory(
function.res().res(), argument.arg(), argument.dir()
)
def __str__(self):
return "S"
# Predicate for forward substitution
def forwardSConstraint(left, right):
if not bothForward(left, right):
return False
return left.res().dir().is_forward() and left.arg().is_primitive()
# Predicate for backward crossed substitution
def backwardSxConstraint(left, right):
if not (left.dir().can_cross() and right.dir().can_cross()):
return False
if not bothForward(left, right):
return False
return right.res().dir().is_backward() and right.arg().is_primitive()
# Instances of substitution combinators
ForwardSubstitution = ForwardCombinator(UndirectedSubstitution(), forwardSConstraint)
BackwardSx = BackwardCombinator(UndirectedSubstitution(), backwardSxConstraint, "x")
# Retrieves the left-most functional category.
# ie, (N\N)/(S/NP) => N\N
def innermostFunction(categ):
while categ.res().is_function():
categ = categ.res()
return categ
class UndirectedTypeRaise(UndirectedBinaryCombinator):
"""
Undirected combinator for type raising.
"""
def can_combine(self, function, arg):
# The argument must be a function.
# The restriction that arg.res() must be a function
# merely reduces redundant type-raising; if arg.res() is
# primitive, we have:
# X Y\X =>(<T) Y/(Y\X) Y\X =>(>) Y
# which is equivalent to
# X Y\X =>(<) Y
if not (arg.is_function() and arg.res().is_function()):
return False
arg = innermostFunction(arg)
# Unify the function category with the innermost argument,
# mirroring combine() below.
subs = function.can_unify(arg.arg())
if subs is not None:
return True
return False
def combine(self, function, arg):
if not (
function.is_primitive() and arg.is_function() and arg.res().is_function()
):
return
# Type-raising matches only the innermost application.
arg = innermostFunction(arg)
subs = function.can_unify(arg.arg())
if subs is not None:
xcat = arg.res().substitute(subs)
yield FunctionalCategory(
xcat, FunctionalCategory(xcat, function, arg.dir()), -(arg.dir())
)
def __str__(self):
return "T"
# Predicates for type-raising
# The direction of the innermost category must be towards
# the primary functor.
# The restriction that the variable must be primitive is not
# common to all versions of CCGs; some authors have other restrictions.
def forwardTConstraint(left, right):
arg = innermostFunction(right)
return arg.dir().is_backward() and arg.res().is_primitive()
def backwardTConstraint(left, right):
arg = innermostFunction(left)
return arg.dir().is_forward() and arg.res().is_primitive()
# Instances of type-raising combinators
ForwardT = ForwardCombinator(UndirectedTypeRaise(), forwardTConstraint)
BackwardT = BackwardCombinator(UndirectedTypeRaise(), backwardTConstraint)

View File

@@ -0,0 +1,338 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CCG Lexicons
"""
import re
from collections import defaultdict
from nltk.ccg.api import CCGVar, Direction, FunctionalCategory, PrimitiveCategory
from nltk.internals import deprecated
from nltk.sem.logic import Expression
# ------------
# Regular expressions used for parsing components of the lexicon
# ------------
# Parses a primitive category and subscripts
PRIM_RE = re.compile(r"""([A-Za-z]+)(\[[A-Za-z,]+\])?""")
# Separates the next primitive category from the remainder of the
# string
NEXTPRIM_RE = re.compile(r"""([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)""")
# Separates the next application operator from the remainder
APP_RE = re.compile(r"""([\\/])([.,]?)([.,]?)(.*)""")
# Parses the definition of the right-hand side (rhs) of either a word or a family
LEX_RE = re.compile(r"""([\S_]+)\s*(::|[-=]+>)\s*(.+)""", re.UNICODE)
# Parses the right hand side that contains category and maybe semantic predicate
RHS_RE = re.compile(r"""([^{}]*[^ {}])\s*(\{[^}]+\})?""", re.UNICODE)
# Parses the semantic predicate
SEMANTICS_RE = re.compile(r"""\{([^}]+)\}""", re.UNICODE)
# Strips comments from a line
COMMENTS_RE = re.compile("""([^#]*)(?:#.*)?""")
class Token:
"""
Class representing a token.
token => category {semantics}
e.g. eat => S\\var[pl]/var {\\x y.eat(x,y)}
* `token` (string)
* `categ` (string)
* `semantics` (Expression)
"""
def __init__(self, token, categ, semantics=None):
self._token = token
self._categ = categ
self._semantics = semantics
def categ(self):
return self._categ
def semantics(self):
return self._semantics
def __str__(self):
semantics_str = ""
if self._semantics is not None:
semantics_str = " {" + str(self._semantics) + "}"
return "" + str(self._categ) + semantics_str
def __cmp__(self, other):
if not isinstance(other, Token):
return -1
return cmp((self._categ, self._semantics), (other.categ(), other.semantics()))
class CCGLexicon:
"""
Class representing a lexicon for CCG grammars.
* `primitives`: The list of primitive categories for the lexicon
* `families`: Families of categories
* `entries`: A mapping of words to possible categories
"""
def __init__(self, start, primitives, families, entries):
self._start = PrimitiveCategory(start)
self._primitives = primitives
self._families = families
self._entries = entries
def categories(self, word):
"""
Returns all the possible categories for a word
"""
return self._entries[word]
def start(self):
"""
Return the target category for the parser
"""
return self._start
def __str__(self):
"""
String representation of the lexicon. Used for debugging.
"""
string = ""
first = True
for ident in sorted(self._entries):
if not first:
string = string + "\n"
string = string + ident + " => "
first = True
for cat in self._entries[ident]:
if not first:
string = string + " | "
else:
first = False
string = string + "%s" % cat
return string
# -----------
# Parsing lexicons
# -----------
def matchBrackets(string):
"""
Separate the contents matching the first set of brackets from the rest of
the input.
"""
rest = string[1:]
inside = "("
while rest != "" and not rest.startswith(")"):
if rest.startswith("("):
(part, rest) = matchBrackets(rest)
inside = inside + part
else:
inside = inside + rest[0]
rest = rest[1:]
if rest.startswith(")"):
return (inside + ")", rest[1:])
raise AssertionError("Unmatched bracket in string '" + string + "'")
def nextCategory(string):
"""
Separate the string for the next portion of the category from the rest
of the string
"""
if string.startswith("("):
return matchBrackets(string)
return NEXTPRIM_RE.match(string).groups()
def parseApplication(app):
"""
Parse an application operator
"""
return Direction(app[0], app[1:])
def parseSubscripts(subscr):
"""
Parse the subscripts for a primitive category
"""
if subscr:
return subscr[1:-1].split(",")
return []
def parsePrimitiveCategory(chunks, primitives, families, var):
"""
Parse a primitive category
If the primitive is the special category 'var', replace it with the
correct `CCGVar`.
"""
if chunks[0] == "var":
if chunks[1] is None:
if var is None:
var = CCGVar()
return (var, var)
catstr = chunks[0]
if catstr in families:
(cat, cvar) = families[catstr]
if var is None:
var = cvar
else:
cat = cat.substitute([(cvar, var)])
return (cat, var)
if catstr in primitives:
subscrs = parseSubscripts(chunks[1])
return (PrimitiveCategory(catstr, subscrs), var)
raise AssertionError(
"String '" + catstr + "' is neither a family nor primitive category."
)
def augParseCategory(line, primitives, families, var=None):
"""
Parse a string representing a category, and return a tuple of the
parsed category and (possibly) the CCG variable for the category.
"""
(cat_string, rest) = nextCategory(line)
if cat_string.startswith("("):
(res, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
else:
(res, var) = parsePrimitiveCategory(
PRIM_RE.match(cat_string).groups(), primitives, families, var
)
while rest != "":
app = APP_RE.match(rest).groups()
direction = parseApplication(app[0:3])
rest = app[3]
(cat_string, rest) = nextCategory(rest)
if cat_string.startswith("("):
(arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
else:
(arg, var) = parsePrimitiveCategory(
PRIM_RE.match(cat_string).groups(), primitives, families, var
)
res = FunctionalCategory(res, arg, direction)
return (res, var)
def fromstring(lex_str, include_semantics=False):
"""
Convert string representation into a lexicon for CCGs.
"""
CCGVar.reset_id()
primitives = []
families = {}
entries = defaultdict(list)
for line in lex_str.splitlines():
# Strip comments and leading/trailing whitespace.
line = COMMENTS_RE.match(line).groups()[0].strip()
if line == "":
continue
if line.startswith(":-"):
# A line of primitive categories.
# The first one is the target category
# ie, :- S, N, NP, VP
primitives = primitives + [
prim.strip() for prim in line[2:].strip().split(",")
]
else:
# Either a family definition, or a word definition
(ident, sep, rhs) = LEX_RE.match(line).groups()
(catstr, semantics_str) = RHS_RE.match(rhs).groups()
(cat, var) = augParseCategory(catstr, primitives, families)
if sep == "::":
# Family definition
# ie, Det :: NP/N
families[ident] = (cat, var)
else:
semantics = None
if include_semantics is True:
if semantics_str is None:
raise AssertionError(
line
+ " must contain semantics because include_semantics is set to True"
)
else:
semantics = Expression.fromstring(
SEMANTICS_RE.match(semantics_str).groups()[0]
)
# Word definition
# ie, which => (N\N)/(S/NP)
entries[ident].append(Token(ident, cat, semantics))
return CCGLexicon(primitives[0], primitives, families, entries)
@deprecated("Use fromstring() instead.")
def parseLexicon(lex_str):
return fromstring(lex_str)
openccg_tinytiny = fromstring(
"""
# Rather minimal lexicon based on the openccg `tinytiny' grammar.
# Only incorporates a subset of the morphological subcategories, however.
:- S,NP,N # Primitive categories
Det :: NP/N # Determiners
Pro :: NP
IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
IntransVpl :: S\\NP[pl] # Plural
TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
TransVpl :: S\\NP[pl]/NP # Plural
the => NP[sg]/N[sg]
the => NP[pl]/N[pl]
I => Pro
me => Pro
we => Pro
us => Pro
book => N[sg]
books => N[pl]
peach => N[sg]
peaches => N[pl]
policeman => N[sg]
policemen => N[pl]
boy => N[sg]
boys => N[pl]
sleep => IntransVsg
sleep => IntransVpl
eat => IntransVpl
eat => TransVpl
eats => IntransVsg
eats => TransVsg
see => TransVpl
sees => TransVsg
"""
)
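# Illustrative usage sketch (not part of the file above). It assumes this
# module is importable as nltk.ccg.lexicon, which is where NLTK ships it,
# and simply exercises the fromstring() defined above.
from nltk.ccg.lexicon import fromstring

tiny_lex = fromstring(
    """
    :- S, NP, N        # primitive categories; the first one (S) is the target
    Det :: NP/N        # a family definition
    the => Det
    dog => N
    barks => S\\NP
    """
)
print(tiny_lex.start())              # the target category, S
print(tiny_lex.categories("barks"))  # the lexicon entries (Tokens) for 'barks'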

View File

@@ -0,0 +1,63 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Tanin Na Nakorn (@tanin)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Helper functions for CCG semantics computation
"""
import copy
from nltk.sem.logic import *
def compute_type_raised_semantics(semantics):
semantics_copy = copy.deepcopy(semantics)
core = semantics_copy
parent = None
while isinstance(core, LambdaExpression):
parent = core
core = core.term
var = Variable("F")
while var in core.free():
var = unique_variable(pattern=var)
core = ApplicationExpression(FunctionVariableExpression(var), core)
if parent is not None:
parent.term = core
else:
semantics_copy = core
return LambdaExpression(var, semantics_copy)
def compute_function_semantics(function, argument):
return ApplicationExpression(function, argument).simplify()
def compute_composition_semantics(function, argument):
assert isinstance(argument, LambdaExpression), (
"`" + str(argument) + "` must be a lambda expression"
)
return LambdaExpression(
argument.variable, ApplicationExpression(function, argument.term).simplify()
)
def compute_substitution_semantics(function, argument):
assert isinstance(function, LambdaExpression) and isinstance(
function.term, LambdaExpression
), ("`" + str(function) + "` must be a lambda expression with 2 arguments")
assert isinstance(argument, LambdaExpression), (
"`" + str(argument) + "` must be a lambda expression"
)
new_argument = ApplicationExpression(
argument, VariableExpression(function.variable)
).simplify()
new_term = ApplicationExpression(function.term, new_argument).simplify()
return LambdaExpression(function.variable, new_term)
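# Illustrative sketch (not part of the file above): applying the composition
# helper defined here to two one-place lambda terms. Assumes this module is
# importable in the usual way alongside nltk.sem.logic.
from nltk.sem.logic import Expression

_f = Expression.fromstring(r"\x.eat(x)")
_g = Expression.fromstring(r"\y.cook(y)")
print(compute_composition_semantics(_f, _g))  # expected: \y.eat(cook(y))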

View File

@@ -0,0 +1,48 @@
# Natural Language Toolkit: Chatbots
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.
"""
A class for simple chatbots. These perform simple pattern matching on sentences
typed by users, and respond with automatically generated sentences.
These chatbots may not work using the Windows command line or the
Windows IDLE GUI.
"""
from nltk.chat.eliza import eliza_chat
from nltk.chat.iesha import iesha_chat
from nltk.chat.rude import rude_chat
from nltk.chat.suntsu import suntsu_chat
from nltk.chat.util import Chat
from nltk.chat.zen import zen_chat
bots = [
(eliza_chat, "Eliza (psycho-babble)"),
(iesha_chat, "Iesha (teen anime junky)"),
(rude_chat, "Rude (abusive bot)"),
(suntsu_chat, "Suntsu (Chinese sayings)"),
(zen_chat, "Zen (gems of wisdom)"),
]
def chatbots():
print("Which chatbot would you like to talk to?")
botcount = len(bots)
for i in range(botcount):
print(" %d: %s" % (i + 1, bots[i][1]))
while True:
choice = input(f"\nEnter a number in the range 1-{botcount}: ").strip()
if choice.isdigit() and (int(choice) - 1) in range(botcount):
break
else:
print(" Error: bad chatbot number")
chatbot = bots[int(choice) - 1][0]
chatbot()

View File

@@ -0,0 +1,337 @@
# Natural Language Toolkit: Eliza
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <mailto:jez@jezuk.co.uk>.
# a translation table used to convert things you say into things the
# computer says back, e.g. "I am" --> "you are"
from nltk.chat.util import Chat, reflections
# a table of response pairs, where each pair consists of a
# regular expression, and a list of possible responses,
# with group-macros labelled as %1, %2.
pairs = (
(
r"I need (.*)",
(
"Why do you need %1?",
"Would it really help you to get %1?",
"Are you sure you need %1?",
),
),
(
r"Why don\'t you (.*)",
(
"Do you really think I don't %1?",
"Perhaps eventually I will %1.",
"Do you really want me to %1?",
),
),
(
r"Why can\'t I (.*)",
(
"Do you think you should be able to %1?",
"If you could %1, what would you do?",
"I don't know -- why can't you %1?",
"Have you really tried?",
),
),
(
r"I can\'t (.*)",
(
"How do you know you can't %1?",
"Perhaps you could %1 if you tried.",
"What would it take for you to %1?",
),
),
(
r"I am (.*)",
(
"Did you come to me because you are %1?",
"How long have you been %1?",
"How do you feel about being %1?",
),
),
(
r"I\'m (.*)",
(
"How does being %1 make you feel?",
"Do you enjoy being %1?",
"Why do you tell me you're %1?",
"Why do you think you're %1?",
),
),
(
r"Are you (.*)",
(
"Why does it matter whether I am %1?",
"Would you prefer it if I were not %1?",
"Perhaps you believe I am %1.",
"I may be %1 -- what do you think?",
),
),
(
r"What (.*)",
(
"Why do you ask?",
"How would an answer to that help you?",
"What do you think?",
),
),
(
r"How (.*)",
(
"How do you suppose?",
"Perhaps you can answer your own question.",
"What is it you're really asking?",
),
),
(
r"Because (.*)",
(
"Is that the real reason?",
"What other reasons come to mind?",
"Does that reason apply to anything else?",
"If %1, what else must be true?",
),
),
(
r"(.*) sorry (.*)",
(
"There are many times when no apology is needed.",
"What feelings do you have when you apologize?",
),
),
(
r"Hello(.*)",
(
"Hello... I'm glad you could drop by today.",
"Hi there... how are you today?",
"Hello, how are you feeling today?",
),
),
(
r"I think (.*)",
("Do you doubt %1?", "Do you really think so?", "But you're not sure %1?"),
),
(
r"(.*) friend (.*)",
(
"Tell me more about your friends.",
"When you think of a friend, what comes to mind?",
"Why don't you tell me about a childhood friend?",
),
),
(r"Yes", ("You seem quite sure.", "OK, but can you elaborate a bit?")),
(
r"(.*) computer(.*)",
(
"Are you really talking about me?",
"Does it seem strange to talk to a computer?",
"How do computers make you feel?",
"Do you feel threatened by computers?",
),
),
(
r"Is it (.*)",
(
"Do you think it is %1?",
"Perhaps it's %1 -- what do you think?",
"If it were %1, what would you do?",
"It could well be that %1.",
),
),
(
r"It is (.*)",
(
"You seem very certain.",
"If I told you that it probably isn't %1, what would you feel?",
),
),
(
r"Can you (.*)",
(
"What makes you think I can't %1?",
"If I could %1, then what?",
"Why do you ask if I can %1?",
),
),
(
r"Can I (.*)",
(
"Perhaps you don't want to %1.",
"Do you want to be able to %1?",
"If you could %1, would you?",
),
),
(
r"You are (.*)",
(
"Why do you think I am %1?",
"Does it please you to think that I'm %1?",
"Perhaps you would like me to be %1.",
"Perhaps you're really talking about yourself?",
),
),
(
r"You\'re (.*)",
(
"Why do you say I am %1?",
"Why do you think I am %1?",
"Are we talking about you, or me?",
),
),
(
r"I don\'t (.*)",
("Don't you really %1?", "Why don't you %1?", "Do you want to %1?"),
),
(
r"I feel (.*)",
(
"Good, tell me more about these feelings.",
"Do you often feel %1?",
"When do you usually feel %1?",
"When you feel %1, what do you do?",
),
),
(
r"I have (.*)",
(
"Why do you tell me that you've %1?",
"Have you really %1?",
"Now that you have %1, what will you do next?",
),
),
(
r"I would (.*)",
(
"Could you explain why you would %1?",
"Why would you %1?",
"Who else knows that you would %1?",
),
),
(
r"Is there (.*)",
(
"Do you think there is %1?",
"It's likely that there is %1.",
"Would you like there to be %1?",
),
),
(
r"My (.*)",
(
"I see, your %1.",
"Why do you say that your %1?",
"When your %1, how do you feel?",
),
),
(
r"You (.*)",
(
"We should be discussing you, not me.",
"Why do you say that about me?",
"Why do you care whether I %1?",
),
),
(r"Why (.*)", ("Why don't you tell me the reason why %1?", "Why do you think %1?")),
(
r"I want (.*)",
(
"What would it mean to you if you got %1?",
"Why do you want %1?",
"What would you do if you got %1?",
"If you got %1, then what would you do?",
),
),
(
r"(.*) mother(.*)",
(
"Tell me more about your mother.",
"What was your relationship with your mother like?",
"How do you feel about your mother?",
"How does this relate to your feelings today?",
"Good family relations are important.",
),
),
(
r"(.*) father(.*)",
(
"Tell me more about your father.",
"How did your father make you feel?",
"How do you feel about your father?",
"Does your relationship with your father relate to your feelings today?",
"Do you have trouble showing affection with your family?",
),
),
(
r"(.*) child(.*)",
(
"Did you have close friends as a child?",
"What is your favorite childhood memory?",
"Do you remember any dreams or nightmares from childhood?",
"Did the other children sometimes tease you?",
"How do you think your childhood experiences relate to your feelings today?",
),
),
(
r"(.*)\?",
(
"Why do you ask that?",
"Please consider whether you can answer your own question.",
"Perhaps the answer lies within yourself?",
"Why don't you tell me?",
),
),
(
r"quit",
(
"Thank you for talking with me.",
"Good-bye.",
"Thank you, that will be $150. Have a good day!",
),
),
(
r"(.*)",
(
"Please tell me more.",
"Let's change focus a bit... Tell me about your family.",
"Can you elaborate on that?",
"Why do you say that %1?",
"I see.",
"Very interesting.",
"%1.",
"I see. And what does that tell you?",
"How does that make you feel?",
"How do you feel when you say that?",
),
),
)
eliza_chatbot = Chat(pairs, reflections)
def eliza_chat():
print("Therapist\n---------")
print("Talk to the program by typing in plain English, using normal upper-")
print('and lower-case letters and punctuation. Enter "quit" when done.')
print("=" * 72)
print("Hello. How are you feeling today?")
eliza_chatbot.converse()
def demo():
eliza_chat()
if __name__ == "__main__":
eliza_chat()
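# Illustrative sketch (not part of the file above): querying the Eliza rules
# programmatically instead of through the interactive converse() loop.
from nltk.chat.eliza import eliza_chatbot

print(eliza_chatbot.respond("I need a holiday"))
# one of the "I need (.*)" responses, e.g. "Why do you need a holiday?"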

View File

@@ -0,0 +1,160 @@
# Natural Language Toolkit: Teen Chatbot
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Selina Dennis <sjmd@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
This chatbot is a tongue-in-cheek take on the average teen
anime junky that frequents YahooMessenger or MSNM.
All spelling mistakes and flawed grammar are intentional.
"""
from nltk.chat.util import Chat
reflections = {
"am": "r",
"was": "were",
"i": "u",
"i'd": "u'd",
"i've": "u'v",
"ive": "u'v",
"i'll": "u'll",
"my": "ur",
"are": "am",
"you're": "im",
"you've": "ive",
"you'll": "i'll",
"your": "my",
"yours": "mine",
"you": "me",
"u": "me",
"ur": "my",
"urs": "mine",
"me": "u",
}
# Note: %1/2/etc are used without spaces prior as the chat bot seems
# to add a superfluous space when matching.
pairs = (
(
r"I\'m (.*)",
(
"ur%1?? that's so cool! kekekekeke ^_^ tell me more!",
"ur%1? neat!! kekeke >_<",
),
),
(
r"(.*) don\'t you (.*)",
(
r"u think I can%2??! really?? kekeke \<_\<",
"what do u mean%2??!",
"i could if i wanted, don't you think!! kekeke",
),
),
(r"ye[as] [iI] (.*)", ("u%1? cool!! how?", "how come u%1??", "u%1? so do i!!")),
(
r"do (you|u) (.*)\??",
("do i%2? only on tuesdays! kekeke *_*", "i dunno! do u%2??"),
),
(
r"(.*)\?",
(
"man u ask lots of questions!",
"booooring! how old r u??",
"boooooring!! ur not very fun",
),
),
(
r"(cos|because) (.*)",
("hee! i don't believe u! >_<", "nuh-uh! >_<", "ooooh i agree!"),
),
(
r"why can\'t [iI] (.*)",
(
"i dunno! y u askin me for!",
"try harder, silly! hee! ^_^",
"i dunno! but when i can't%1 i jump up and down!",
),
),
(
r"I can\'t (.*)",
(
"u can't what??! >_<",
"that's ok! i can't%1 either! kekekekeke ^_^",
"try harder, silly! hee! ^&^",
),
),
(
r"(.*) (like|love|watch) anime",
(
"omg i love anime!! do u like sailor moon??! ^&^",
"anime yay! anime rocks sooooo much!",
"oooh anime! i love anime more than anything!",
"anime is the bestest evar! evangelion is the best!",
"hee anime is the best! do you have ur fav??",
),
),
(
r"I (like|love|watch|play) (.*)",
("yay! %2 rocks!", "yay! %2 is neat!", "cool! do u like other stuff?? ^_^"),
),
(
r"anime sucks|(.*) (hate|detest) anime",
(
"ur a liar! i'm not gonna talk to u nemore if u h8 anime *;*",
"no way! anime is the best ever!",
"nuh-uh, anime is the best!",
),
),
(
r"(are|r) (you|u) (.*)",
("am i%1??! how come u ask that!", "maybe! y shud i tell u?? kekeke >_>"),
),
(
r"what (.*)",
("hee u think im gonna tell u? .v.", "booooooooring! ask me somethin else!"),
),
(r"how (.*)", ("not tellin!! kekekekekeke ^_^",)),
(r"(hi|hello|hey) (.*)", ("hi!!! how r u!!",)),
(
r"quit",
(
"mom says i have to go eat dinner now :,( bye!!",
"awww u have to go?? see u next time!!",
"how to see u again soon! ^_^",
),
),
(
r"(.*)",
(
"ur funny! kekeke",
"boooooring! talk about something else! tell me wat u like!",
"do u like anime??",
"do u watch anime? i like sailor moon! ^_^",
"i wish i was a kitty!! kekekeke ^_^",
),
),
)
iesha_chatbot = Chat(pairs, reflections)
def iesha_chat():
print("Iesha the TeenBoT\n---------")
print("Talk to the program by typing in plain English, using normal upper-")
print('and lower-case letters and punctuation. Enter "quit" when done.')
print("=" * 72)
print("hi!! i'm iesha! who r u??!")
iesha_chatbot.converse()
def demo():
iesha_chat()
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,125 @@
# Natural Language Toolkit: Rude Chatbot
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Peter Spiller <pspiller@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.chat.util import Chat, reflections
pairs = (
(
r"We (.*)",
(
"What do you mean, 'we'?",
"Don't include me in that!",
"I wouldn't be so sure about that.",
),
),
(
r"You should (.*)",
("Don't tell me what to do, buddy.", "Really? I should, should I?"),
),
(
r"You\'re(.*)",
(
"More like YOU'RE %1!",
"Hah! Look who's talking.",
"Come over here and tell me I'm %1.",
),
),
(
r"You are(.*)",
(
"More like YOU'RE %1!",
"Hah! Look who's talking.",
"Come over here and tell me I'm %1.",
),
),
(
r"I can\'t(.*)",
(
"You do sound like the type who can't %1.",
"Hear that splashing sound? That's my heart bleeding for you.",
"Tell somebody who might actually care.",
),
),
(
r"I think (.*)",
(
"I wouldn't think too hard if I were you.",
"You actually think? I'd never have guessed...",
),
),
(
r"I (.*)",
(
"I'm getting a bit tired of hearing about you.",
"How about we talk about me instead?",
"Me, me, me... Frankly, I don't care.",
),
),
(
r"How (.*)",
(
"How do you think?",
"Take a wild guess.",
"I'm not even going to dignify that with an answer.",
),
),
(r"What (.*)", ("Do I look like an encyclopedia?", "Figure it out yourself.")),
(
r"Why (.*)",
(
"Why not?",
"That's so obvious I thought even you'd have already figured it out.",
),
),
(
r"(.*)shut up(.*)",
(
"Make me.",
"Getting angry at a feeble NLP assignment? Somebody's losing it.",
"Say that again, I dare you.",
),
),
(
r"Shut up(.*)",
(
"Make me.",
"Getting angry at a feeble NLP assignment? Somebody's losing it.",
"Say that again, I dare you.",
),
),
(
r"Hello(.*)",
("Oh good, somebody else to talk to. Joy.", "'Hello'? How original..."),
),
(
r"(.*)",
(
"I'm getting bored here. Become more interesting.",
"Either become more thrilling or get lost, buddy.",
"Change the subject before I die of fatal boredom.",
),
),
)
rude_chatbot = Chat(pairs, reflections)
def rude_chat():
print("Talk to the program by typing in plain English, using normal upper-")
print('and lower-case letters and punctuation. Enter "quit" when done.')
print("=" * 72)
print("I suppose I should say hello.")
rude_chatbot.converse()
def demo():
rude_chat()
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,140 @@
# Natural Language Toolkit: Sun Tsu-Bot
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Sam Huston 2007
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Tsu bot responds to all queries with Sun Tsu sayings
Quoted from Sun Tsu's The Art of War
Translated by LIONEL GILES, M.A. 1910
Hosted by the Gutenberg Project
https://www.gutenberg.org/
"""
from nltk.chat.util import Chat, reflections
pairs = (
(r"quit", ("Good-bye.", "Plan well", "May victory be your future")),
(
r"[^\?]*\?",
(
"Please consider whether you can answer your own question.",
"Ask me no questions!",
),
),
(
r"[0-9]+(.*)",
(
"It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
"There are five essentials for victory",
),
),
(
r"[A-Ca-c](.*)",
(
"The art of war is of vital importance to the State.",
"All warfare is based on deception.",
"If your opponent is secure at all points, be prepared for him. If he is in superior strength, evade him.",
"If the campaign is protracted, the resources of the State will not be equal to the strain.",
"Attack him where he is unprepared, appear where you are not expected.",
"There is no instance of a country having benefited from prolonged warfare.",
),
),
(
r"[D-Fd-f](.*)",
(
"The skillful soldier does not raise a second levy, neither are his supply-wagons loaded more than twice.",
"Bring war material with you from home, but forage on the enemy.",
"In war, then, let your great object be victory, not lengthy campaigns.",
"To fight and conquer in all your battles is not supreme excellence; supreme excellence consists in breaking the enemy's resistance without fighting.",
),
),
(
r"[G-Ig-i](.*)",
(
"Heaven signifies night and day, cold and heat, times and seasons.",
"It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
"The good fighters of old first put themselves beyond the possibility of defeat, and then waited for an opportunity of defeating the enemy.",
"One may know how to conquer without being able to do it.",
),
),
(
r"[J-Lj-l](.*)",
(
"There are three ways in which a ruler can bring misfortune upon his army.",
"By commanding the army to advance or to retreat, being ignorant of the fact that it cannot obey. This is called hobbling the army.",
"By attempting to govern an army in the same way as he administers a kingdom, being ignorant of the conditions which obtain in an army. This causes restlessness in the soldier's minds.",
"By employing the officers of his army without discrimination, through ignorance of the military principle of adaptation to circumstances. This shakes the confidence of the soldiers.",
"There are five essentials for victory",
"He will win who knows when to fight and when not to fight.",
"He will win who knows how to handle both superior and inferior forces.",
"He will win whose army is animated by the same spirit throughout all its ranks.",
"He will win who, prepared himself, waits to take the enemy unprepared.",
"He will win who has military capacity and is not interfered with by the sovereign.",
),
),
(
r"[M-Om-o](.*)",
(
"If you know the enemy and know yourself, you need not fear the result of a hundred battles.",
"If you know yourself but not the enemy, for every victory gained you will also suffer a defeat.",
"If you know neither the enemy nor yourself, you will succumb in every battle.",
"The control of a large force is the same principle as the control of a few men: it is merely a question of dividing up their numbers.",
),
),
(
r"[P-Rp-r](.*)",
(
"Security against defeat implies defensive tactics; ability to defeat the enemy means taking the offensive.",
"Standing on the defensive indicates insufficient strength; attacking, a superabundance of strength.",
"He wins his battles by making no mistakes. Making no mistakes is what establishes the certainty of victory, for it means conquering an enemy that is already defeated.",
"A victorious army opposed to a routed one, is as a pound's weight placed in the scale against a single grain.",
"The onrush of a conquering force is like the bursting of pent-up waters into a chasm a thousand fathoms deep.",
),
),
(
r"[S-Us-u](.*)",
(
"What the ancients called a clever fighter is one who not only wins, but excels in winning with ease.",
"Hence his victories bring him neither reputation for wisdom nor credit for courage.",
"Hence the skillful fighter puts himself into a position which makes defeat impossible, and does not miss the moment for defeating the enemy.",
"In war the victorious strategist only seeks battle after the victory has been won, whereas he who is destined to defeat first fights and afterwards looks for victory.",
"There are not more than five musical notes, yet the combinations of these five give rise to more melodies than can ever be heard.",
"Appear at points which the enemy must hasten to defend; march swiftly to places where you are not expected.",
),
),
(
r"[V-Zv-z](.*)",
(
"It is a matter of life and death, a road either to safety or to ruin.",
"Hold out baits to entice the enemy. Feign disorder, and crush him.",
"All men can see the tactics whereby I conquer, but what none can see is the strategy out of which victory is evolved.",
"Do not repeat the tactics which have gained you one victory, but let your methods be regulated by the infinite variety of circumstances.",
"So in war, the way is to avoid what is strong and to strike at what is weak.",
"Just as water retains no constant shape, so in warfare there are no constant conditions.",
),
),
(r"(.*)", ("Your statement insults me.", "")),
)
suntsu_chatbot = Chat(pairs, reflections)
def suntsu_chat():
print("Talk to the program by typing in plain English, using normal upper-")
print('and lower-case letters and punctuation. Enter "quit" when done.')
print("=" * 72)
print("You seek enlightenment?")
suntsu_chatbot.converse()
def demo():
suntsu_chat()
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,124 @@
# Natural Language Toolkit: Chatbot Utilities
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.
import random
import re
reflections = {
"i am": "you are",
"i was": "you were",
"i": "you",
"i'm": "you are",
"i'd": "you would",
"i've": "you have",
"i'll": "you will",
"my": "your",
"you are": "I am",
"you were": "I was",
"you've": "I have",
"you'll": "I will",
"your": "my",
"yours": "mine",
"you": "me",
"me": "you",
}
class Chat:
def __init__(self, pairs, reflections={}):
"""
Initialize the chatbot. Pairs is a list of patterns and responses. Each
pattern is a regular expression matching the user's statement or question,
e.g. r'I like (.*)'. For each such pattern a list of possible responses
is given, e.g. ['Why do you like %1', 'Did you ever dislike %1']. Material
which is matched by parenthesized sections of the patterns (e.g. .*) is mapped to
the numbered positions in the responses, e.g. %1.
:type pairs: list of tuple
:param pairs: The patterns and responses
:type reflections: dict
:param reflections: A mapping between first and second person expressions
:rtype: None
"""
self._pairs = [(re.compile(x, re.IGNORECASE), y) for (x, y) in pairs]
self._reflections = reflections
self._regex = self._compile_reflections()
def _compile_reflections(self):
sorted_refl = sorted(self._reflections, key=len, reverse=True)
return re.compile(
r"\b({})\b".format("|".join(map(re.escape, sorted_refl))), re.IGNORECASE
)
def _substitute(self, str):
"""
Substitute words in the string, according to the specified reflections,
e.g. "I'm" -> "you are"
:type str: str
:param str: The string to be mapped
:rtype: str
"""
return self._regex.sub(
lambda mo: self._reflections[mo.string[mo.start() : mo.end()]], str.lower()
)
def _wildcards(self, response, match):
pos = response.find("%")
while pos >= 0:
num = int(response[pos + 1 : pos + 2])
response = (
response[:pos]
+ self._substitute(match.group(num))
+ response[pos + 2 :]
)
pos = response.find("%")
return response
def respond(self, str):
"""
Generate a response to the user input.
:type str: str
:param str: The string to be mapped
:rtype: str
"""
# check each pattern
for pattern, response in self._pairs:
match = pattern.match(str)
# did the pattern match?
if match:
resp = random.choice(response) # pick a random response
resp = self._wildcards(resp, match) # process wildcards
# fix munged punctuation at the end
if resp[-2:] == "?.":
resp = resp[:-2] + "."
if resp[-2:] == "??":
resp = resp[:-2] + "?"
return resp
# Hold a conversation with a chatbot
def converse(self, quit="quit"):
user_input = ""
while user_input != quit:
user_input = quit
try:
user_input = input(">")
except EOFError:
print(user_input)
if user_input:
while user_input[-1] in "!.":
user_input = user_input[:-1]
print(self.respond(user_input))
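# Illustrative sketch (not part of the file above): a tiny Chat instance with
# one %1 wildcard pattern, using the default reflections table defined here.
from nltk.chat.util import Chat, reflections

demo_pairs = (
    (r"I like (.*)", ("Why do you like %1?", "Since when do you like %1?")),
    (r"quit", ("Goodbye!",)),
    (r"(.*)", ("Tell me more.",)),
)
demo_bot = Chat(demo_pairs, reflections)
print(demo_bot.respond("I like my dog"))  # e.g. "Why do you like your dog?"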

View File

@@ -0,0 +1,329 @@
# Natural Language Toolkit: Zen Chatbot
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Amy Holland <amyrh@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Zen Chatbot talks in gems of Zen wisdom.
This is a sample conversation with Zen Chatbot:
ZC: Welcome, my child.
me: Good afternoon.
ZC: Ask the question you have come to ask.
me: How can I achieve enlightenment?
ZC: How do you suppose?
me: Through meditation.
ZC: Form is emptiness, and emptiness form.
me: How can I empty my mind of worldly troubles?
ZC: Will an answer to that really help in your search for enlightenment?
me: Yes.
ZC: It is better to be right than to be certain.
me: I seek truth and wisdom.
ZC: The search for truth is a long journey.
me: Are you sure?
ZC: Maybe sure, maybe not sure.
The chatbot structure is based on that of chat.eliza. Thus, it uses
a translation table to convert from question to response
i.e. "I am" --> "you are"
Of course, since Zen Chatbot does not understand the meaning of any words,
responses are very limited. Zen Chatbot will usually answer very vaguely, or
respond to a question by asking a different question, in much the same way
as Eliza.
"""
from nltk.chat.util import Chat, reflections
# responses are matched top to bottom, so non-specific matches occur later
# for each match, a list of possible responses is provided
responses = (
# Zen Chatbot opens with the line "Welcome, my child." The usual
# response will be a greeting. Problem: 'good' matches "good morning",
# "good day" etc., but also "good grief!" and other sentences starting
# with the word 'good' that may not be a greeting.
(
r"(hello(.*))|(good [a-zA-Z]+)",
(
"The path to enlightenment is often difficult to see.",
"Greetings. I sense your mind is troubled. Tell me of your troubles.",
"Ask the question you have come to ask.",
"Hello. Do you seek englightenment?",
),
),
# "I need" and "I want" can be followed by a thing (eg 'help')
# or an action (eg 'to see you')
#
# This is a problem with this style of response -
# person: "I need you"
# chatbot: "me can be achieved by hard work and dedication of the mind"
# i.e. 'you' is not really a thing that can be mapped this way, so this
# interpretation only makes sense for some inputs
#
(
r"i need (.*)",
(
"%1 can be achieved by hard work and dedication of the mind.",
"%1 is not a need, but a desire of the mind. Clear your mind of such concerns.",
"Focus your mind on%1, and you will find what you need.",
),
),
(
r"i want (.*)",
(
"Desires of the heart will distract you from the path to enlightenment.",
"Will%1 help you attain enlightenment?",
"Is%1 a desire of the mind, or of the heart?",
),
),
# why questions are separated into three types:
# "why..I" e.g. "why am I here?" "Why do I like cake?"
# "why..you" e.g. "why are you here?" "Why won't you tell me?"
# "why..." e.g. "Why is the sky blue?"
# problems:
# person: "Why can't you tell me?"
# chatbot: "Are you sure I tell you?"
# - this style works for positives (e.g. "why do you like cake?")
# but does not work for negatives (e.g. "why don't you like cake?")
(r"why (.*) i (.*)\?", ("You%1%2?", "Perhaps you only think you%1%2")),
(r"why (.*) you(.*)\?", ("Why%1 you%2?", "%2 I%1", "Are you sure I%2?")),
(r"why (.*)\?", ("I cannot tell you why%1.", "Why do you think %1?")),
# e.g. "are you listening?", "are you a duck"
(
r"are you (.*)\?",
("Maybe%1, maybe not%1.", "Whether I am%1 or not is God's business."),
),
# e.g. "am I a duck?", "am I going to die?"
(
r"am i (.*)\?",
("Perhaps%1, perhaps not%1.", "Whether you are%1 or not is not for me to say."),
),
# what questions, e.g. "what time is it?"
# problems:
# person: "What do you want?"
# chatbot: "Seek truth, not what do me want."
(r"what (.*)\?", ("Seek truth, not what%1.", "What%1 should not concern you.")),
# how questions, e.g. "how do you do?"
(
r"how (.*)\?",
(
"How do you suppose?",
"Will an answer to that really help in your search for enlightenment?",
"Ask yourself not how, but why.",
),
),
# can questions, e.g. "can you run?", "can you come over here please?"
(
r"can you (.*)\?",
(
"I probably can, but I may not.",
"Maybe I can%1, and maybe I cannot.",
"I can do all, and I can do nothing.",
),
),
# can questions, e.g. "can I have some cake?", "can I know truth?"
(
r"can i (.*)\?",
(
"You can%1 if you believe you can%1, and have a pure spirit.",
"Seek truth and you will know if you can%1.",
),
),
# e.g. "It is raining" - implies the speaker is certain of a fact
(
r"it is (.*)",
(
"How can you be certain that%1, when you do not even know yourself?",
"Whether it is%1 or not does not change the way the world is.",
),
),
# e.g. "is there a doctor in the house?"
(
r"is there (.*)\?",
("There is%1 if you believe there is.", "It is possible that there is%1."),
),
# e.g. "is it possible?", "is this true?"
(r"is(.*)\?", ("%1 is not relevant.", "Does this matter?")),
# non-specific question
(
r"(.*)\?",
(
"Do you think %1?",
"You seek the truth. Does the truth seek you?",
"If you intentionally pursue the answers to your questions, the answers become hard to see.",
"The answer to your question cannot be told. It must be experienced.",
),
),
# expression of hate of form "I hate you" or "Kelly hates cheese"
(
r"(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)",
(
"Perhaps it is not about hating %2, but about hate from within.",
"Weeds only grow when we dislike them",
"Hate is a very strong emotion.",
),
),
# statement containing the word 'truth'
(
r"(.*) truth(.*)",
(
"Seek truth, and truth will seek you.",
"Remember, it is not the spoon which bends - only yourself.",
"The search for truth is a long journey.",
),
),
# desire to do an action
# e.g. "I want to go shopping"
(
r"i want to (.*)",
("You may %1 if your heart truly desires to.", "You may have to %1."),
),
# desire for an object
# e.g. "I want a pony"
(
r"i want (.*)",
(
"Does your heart truly desire %1?",
"Is this a desire of the heart, or of the mind?",
),
),
# e.g. "I can't wait" or "I can't do this"
(
r"i can\'t (.*)",
(
"What we can and can't do is a limitation of the mind.",
"There are limitations of the body, and limitations of the mind.",
"Have you tried to%1 with a clear mind?",
),
),
# "I think.." indicates uncertainty. e.g. "I think so."
# problem: exceptions...
# e.g. "I think, therefore I am"
(
r"i think (.*)",
(
"Uncertainty in an uncertain world.",
"Indeed, how can we be certain of anything in such uncertain times.",
"Are you not, in fact, certain that%1?",
),
),
# "I feel...emotions/sick/light-headed..."
(
r"i feel (.*)",
(
"Your body and your emotions are both symptoms of your mind."
"What do you believe is the root of such feelings?",
"Feeling%1 can be a sign of your state-of-mind.",
),
),
# exclamation mark indicating emotion
# e.g. "Wow!" or "No!"
(
r"(.*)!",
(
"I sense that you are feeling emotional today.",
"You need to calm your emotions.",
),
),
# because [statement]
# e.g. "because I said so"
(
r"because (.*)",
(
"Does knowning the reasons behind things help you to understand"
" the things themselves?",
"If%1, what else must be true?",
),
),
# yes or no - raise an issue of certainty/correctness
(
r"(yes)|(no)",
(
"Is there certainty in an uncertain world?",
"It is better to be right than to be certain.",
),
),
# sentence containing word 'love'
(
r"(.*)love(.*)",
(
"Think of the trees: they let the birds perch and fly with no intention to call them when they come, and no longing for their return when they fly away. Let your heart be like the trees.",
"Free love!",
),
),
# sentence containing word 'understand'
(
r"(.*)understand(.*)",
(
"If you understand, things are just as they are;"
" if you do not understand, things are just as they are.",
"Imagination is more important than knowledge.",
),
),
# 'I', 'me', 'my' - person is talking about themself.
# this breaks down when words contain these - eg 'Thyme', 'Irish'
(
r"(.*)(me )|( me)|(my)|(mine)|(i)(.*)",
(
"'I', 'me', 'my'... these are selfish expressions.",
"Have you ever considered that you might be a selfish person?",
"Try to consider others, not just yourself.",
"Think not just of yourself, but of others.",
),
),
# 'you' starting a sentence
# e.g. "you stink!"
(
r"you (.*)",
("My path is not of concern to you.", "I am but one, and you but one more."),
),
# say goodbye with some extra Zen wisdom.
(
r"exit",
(
"Farewell. The obstacle is the path.",
"Farewell. Life is a journey, not a destination.",
"Good bye. We are cups, constantly and quietly being filled."
"\nThe trick is knowning how to tip ourselves over and let the beautiful stuff out.",
),
),
# fall through case -
# when stumped, respond with generic zen wisdom
#
(
r"(.*)",
(
"When you're enlightened, every word is wisdom.",
"Random talk is useless.",
"The reverse side also has a reverse side.",
"Form is emptiness, and emptiness is form.",
"I pour out a cup of water. Is the cup empty?",
),
),
)
zen_chatbot = Chat(responses, reflections)
def zen_chat():
print("*" * 75)
print("Zen Chatbot!".center(75))
print("*" * 75)
print('"Look beyond mere words and letters - look into your mind"'.center(75))
print("* Talk your way to truth with Zen Chatbot.")
print("* Type 'quit' when you have had enough.")
print("*" * 75)
print("Welcome, my child.")
zen_chatbot.converse()
def demo():
zen_chat()
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,205 @@
# Natural Language Toolkit: Chunkers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
"""
Classes and interfaces for identifying non-overlapping linguistic
groups (such as base noun phrases) in unrestricted text. This task is
called "chunk parsing" or "chunking", and the identified groups are
called "chunks". The chunked text is represented using a shallow
tree called a "chunk structure." A chunk structure is a tree
containing tokens and chunks, where each chunk is a subtree containing
only tokens. For example, the chunk structure for base noun phrase
chunks in the sentence "I saw the big dog on the hill" is::
(SENTENCE:
(NP: <I>)
<saw>
(NP: <the> <big> <dog>)
<on>
(NP: <the> <hill>))
To convert a chunk structure back to a list of tokens, simply use the
chunk structure's ``leaves()`` method.
This module defines ``ChunkParserI``, a standard interface for
chunking texts; and ``RegexpChunkParser``, a regular-expression based
implementation of that interface. It also defines ``ChunkScore``, a
utility class for scoring chunk parsers.
RegexpChunkParser
=================
``RegexpChunkParser`` is an implementation of the chunk parser interface
that uses regular-expressions over tags to chunk a text. Its
``parse()`` method first constructs a ``ChunkString``, which encodes a
particular chunking of the input text. Initially, nothing is
chunked. ``RegexpChunkParser.parse()`` then applies a sequence of
``RegexpChunkRule`` rules to the ``ChunkString``, each of which modifies
the chunking that it encodes. Finally, the ``ChunkString`` is
transformed back into a chunk structure, which is returned.
``RegexpChunkParser`` can only be used to chunk a single kind of phrase.
For example, you can use a ``RegexpChunkParser`` to chunk the noun
phrases in a text, or the verb phrases in a text; but you cannot
use it to simultaneously chunk both noun phrases and verb phrases in
the same text. (This is a limitation of ``RegexpChunkParser``, not of
chunk parsers in general.)
RegexpChunkRules
----------------
A ``RegexpChunkRule`` is a transformational rule that updates the
chunking of a text by modifying its ``ChunkString``. Each
``RegexpChunkRule`` defines the ``apply()`` method, which modifies
the chunking encoded by a ``ChunkString``. The
``RegexpChunkRule`` class itself can be used to implement any
transformational rule based on regular expressions. There are
also a number of subclasses, which can be used to implement
simpler types of rules:
- ``ChunkRule`` chunks anything that matches a given regular
expression.
- ``StripRule`` strips anything that matches a given regular
expression.
- ``UnChunkRule`` will un-chunk any chunk that matches a given
regular expression.
- ``MergeRule`` can be used to merge two contiguous chunks.
- ``SplitRule`` can be used to split a single chunk into two
smaller chunks.
- ``ExpandLeftRule`` will expand a chunk to incorporate new
unchunked material on the left.
- ``ExpandRightRule`` will expand a chunk to incorporate new
unchunked material on the right.
Tag Patterns
~~~~~~~~~~~~
A ``RegexpChunkRule`` uses a modified version of regular
expression patterns, called "tag patterns". Tag patterns are
used to match sequences of tags. Examples of tag patterns are::
r'(<DT>|<JJ>|<NN>)+'
r'<NN>+'
r'<NN.*>'
The differences between regular expression patterns and tag
patterns are:
- In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so
``'<NN>+'`` matches one or more repetitions of ``'<NN>'``, not
``'<NN'`` followed by one or more repetitions of ``'>'``.
- Whitespace in tag patterns is ignored. So
``'<DT> | <NN>'`` is equivalent to ``'<DT>|<NN>'``
- In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so
``'<NN.*>'`` matches any single tag starting with ``'NN'``.
The function ``tag_pattern2re_pattern`` can be used to transform
a tag pattern to an equivalent regular expression pattern.
Efficiency
----------
Preliminary tests indicate that ``RegexpChunkParser`` can chunk at a
rate of about 300 tokens/second, with a moderately complex rule set.
There may be problems if ``RegexpChunkParser`` is used with more than
5,000 tokens at a time. In particular, evaluation of some regular
expressions may cause the Python regular expression engine to
exceed its maximum recursion depth. We have attempted to minimize
these problems, but it is impossible to avoid them completely. We
therefore recommend that you apply the chunk parser to a single
sentence at a time.
Emacs Tip
---------
If you evaluate the following elisp expression in emacs, it will
colorize a ``ChunkString`` when you use an interactive python shell
with emacs or xemacs ("C-c !")::
(let ()
(defconst comint-mode-font-lock-keywords
'(("<[^>]+>" 0 'font-lock-reference-face)
("[{}]" 0 'font-lock-function-name-face)))
(add-hook 'comint-mode-hook (lambda () (turn-on-font-lock))))
You can evaluate this code by copying it to a temporary buffer,
placing the cursor after the last close parenthesis, and typing
"``C-x C-e``". You should evaluate it before running the interactive
session. The change will last until you close emacs.
Unresolved Issues
-----------------
If we use the ``re`` module for regular expressions, Python's
regular expression engine generates "maximum recursion depth
exceeded" errors when processing very large texts, even for
regular expressions that should not require any recursion. We
therefore use the ``pre`` module instead. But note that ``pre``
does not include Unicode support, so this module will not work
with unicode strings. Note also that ``pre`` regular expressions
are not quite as advanced as ``re`` ones (e.g., no leftward
zero-length assertions).
:type CHUNK_TAG_PATTERN: regexp
:var CHUNK_TAG_PATTERN: A regular expression to test whether a tag
pattern is valid.
"""
from nltk.chunk.api import ChunkParserI
from nltk.chunk.named_entity import Maxent_NE_Chunker
from nltk.chunk.regexp import RegexpChunkParser, RegexpParser
from nltk.chunk.util import (
ChunkScore,
accuracy,
conllstr2tree,
conlltags2tree,
ieerstr2tree,
tagstr2tree,
tree2conllstr,
tree2conlltags,
)
def ne_chunker(fmt="multiclass"):
"""
Load NLTK's currently recommended named entity chunker.
"""
return Maxent_NE_Chunker(fmt)
def ne_chunk(tagged_tokens, binary=False):
"""
Use NLTK's currently recommended named entity chunker to
chunk the given list of tagged tokens.
>>> from nltk.chunk import ne_chunk
>>> from nltk.corpus import treebank
>>> from pprint import pprint
>>> pprint(ne_chunk(treebank.tagged_sents()[2][8:14])) # doctest: +NORMALIZE_WHITESPACE
Tree('S', [('chairman', 'NN'), ('of', 'IN'), Tree('ORGANIZATION', [('Consolidated', 'NNP'), ('Gold', 'NNP'), ('Fields', 'NNP')]), ('PLC', 'NNP')])
"""
if binary:
chunker = ne_chunker(fmt="binary")
else:
chunker = ne_chunker()
return chunker.parse(tagged_tokens)
def ne_chunk_sents(tagged_sentences, binary=False):
"""
Use NLTK's currently recommended named entity chunker to chunk the
given list of tagged sentences, each consisting of a list of tagged tokens.
"""
if binary:
chunker = ne_chunker(fmt="binary")
else:
chunker = ne_chunker()
return chunker.parse_sents(tagged_sentences)
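# Illustrative sketch (not part of the file above) of the tag-pattern chunking
# described in the module docstring. The sentence is already pos-tagged, so no
# tagger model is needed.
from nltk.chunk import RegexpParser

cp = RegexpParser("NP: {<DT>?<JJ>*<NN.*>}")  # one rule: chunk DT? JJ* NN.* as NP
tagged_sent = [("the", "DT"), ("big", "JJ"), ("dog", "NN"),
               ("on", "IN"), ("the", "DT"), ("hill", "NN")]
print(cp.parse(tagged_sent))
# roughly: (S (NP the/DT big/JJ dog/NN) on/IN (NP the/DT hill/NN))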

View File

@@ -0,0 +1,56 @@
# Natural Language Toolkit: Chunk parsing API
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
##//////////////////////////////////////////////////////
## Chunk Parser Interface
##//////////////////////////////////////////////////////
from nltk.chunk.util import ChunkScore
from nltk.internals import deprecated
from nltk.parse import ParserI
class ChunkParserI(ParserI):
"""
A processing interface for identifying non-overlapping groups in
unrestricted text. Typically, chunk parsers are used to find base
syntactic constituents, such as base noun phrases. Unlike
``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method
will always generate a parse.
"""
def parse(self, tokens):
"""
Return the best chunk structure for the given tokens, as a tree.
:param tokens: The list of (word, tag) tokens to be chunked.
:type tokens: list(tuple)
:rtype: Tree
"""
raise NotImplementedError()
@deprecated("Use accuracy(gold) instead.")
def evaluate(self, gold):
return self.accuracy(gold)
def accuracy(self, gold):
"""
Score the accuracy of the chunker against the gold standard.
Remove the chunking from the gold standard text, rechunk it using
the chunker, and return a ``ChunkScore`` object
reflecting the performance of this chunk parser.
:type gold: list(Tree)
:param gold: The list of chunked sentences to score the chunker on.
:rtype: ChunkScore
"""
chunkscore = ChunkScore()
for correct in gold:
chunkscore.score(correct, self.parse(correct.leaves()))
return chunkscore
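# Hypothetical minimal ChunkParserI implementation (illustration only, not
# part of the file above): wrap every token whose tag starts with "NN" in its
# own NP chunk.
from nltk.chunk.api import ChunkParserI
from nltk.tree import Tree

class NounChunker(ChunkParserI):
    def parse(self, tokens):
        return Tree(
            "S",
            [Tree("NP", [tok]) if tok[1].startswith("NN") else tok for tok in tokens],
        )

print(NounChunker().parse([("dogs", "NNS"), ("bark", "VBP")]))
# (S (NP dogs/NNS) bark/VBP)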

View File

@@ -0,0 +1,407 @@
# Natural Language Toolkit: Chunk parsing API
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Eric Kafe <kafe.eric@gmail.com> (tab-format models)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Named entity chunker
"""
import os
import re
from xml.etree import ElementTree as ET
from nltk.tag import ClassifierBasedTagger, pos_tag
try:
from nltk.classify import MaxentClassifier
except ImportError:
pass
from nltk.chunk.api import ChunkParserI
from nltk.chunk.util import ChunkScore
from nltk.data import find
from nltk.tokenize import word_tokenize
from nltk.tree import Tree
class NEChunkParserTagger(ClassifierBasedTagger):
"""
The IOB tagger used by the chunk parser.
"""
def __init__(self, train=None, classifier=None):
ClassifierBasedTagger.__init__(
self,
train=train,
classifier_builder=self._classifier_builder,
classifier=classifier,
)
def _classifier_builder(self, train):
return MaxentClassifier.train(
# "megam" cannot be the default algorithm since it requires compiling with ocaml
train,
algorithm="iis",
gaussian_prior_sigma=1,
trace=2,
)
def _english_wordlist(self):
try:
wl = self._en_wordlist
except AttributeError:
from nltk.corpus import words
self._en_wordlist = set(words.words("en-basic"))
wl = self._en_wordlist
return wl
def _feature_detector(self, tokens, index, history):
word = tokens[index][0]
pos = simplify_pos(tokens[index][1])
if index == 0:
prevword = prevprevword = None
prevpos = prevprevpos = None
prevshape = prevtag = prevprevtag = None
elif index == 1:
prevword = tokens[index - 1][0].lower()
prevprevword = None
prevpos = simplify_pos(tokens[index - 1][1])
prevprevpos = None
prevtag = history[index - 1][0]
prevshape = prevprevtag = None
else:
prevword = tokens[index - 1][0].lower()
prevprevword = tokens[index - 2][0].lower()
prevpos = simplify_pos(tokens[index - 1][1])
prevprevpos = simplify_pos(tokens[index - 2][1])
prevtag = history[index - 1]
prevprevtag = history[index - 2]
prevshape = shape(prevword)
if index == len(tokens) - 1:
nextword = nextnextword = None
nextpos = nextnextpos = None
elif index == len(tokens) - 2:
nextword = tokens[index + 1][0].lower()
nextpos = tokens[index + 1][1].lower()
nextnextword = None
nextnextpos = None
else:
nextword = tokens[index + 1][0].lower()
nextpos = tokens[index + 1][1].lower()
nextnextword = tokens[index + 2][0].lower()
nextnextpos = tokens[index + 2][1].lower()
# 89.6
features = {
"bias": True,
"shape": shape(word),
"wordlen": len(word),
"prefix3": word[:3].lower(),
"suffix3": word[-3:].lower(),
"pos": pos,
"word": word,
"en-wordlist": (word in self._english_wordlist()),
"prevtag": prevtag,
"prevpos": prevpos,
"nextpos": nextpos,
"prevword": prevword,
"nextword": nextword,
"word+nextpos": f"{word.lower()}+{nextpos}",
"pos+prevtag": f"{pos}+{prevtag}",
"shape+prevtag": f"{prevshape}+{prevtag}",
}
return features
class NEChunkParser(ChunkParserI):
"""
Expected input: list of pos-tagged words
"""
def __init__(self, train):
self._train(train)
def parse(self, tokens):
"""
Each token should be a pos-tagged word
"""
tagged = self._tagger.tag(tokens)
tree = self._tagged_to_parse(tagged)
return tree
def _train(self, corpus):
# Convert to tagged sequence
corpus = [self._parse_to_tagged(s) for s in corpus]
self._tagger = NEChunkParserTagger(train=corpus)
def _tagged_to_parse(self, tagged_tokens):
"""
Convert a list of tagged tokens to a chunk-parse tree.
"""
sent = Tree("S", [])
for tok, tag in tagged_tokens:
if tag == "O":
sent.append(tok)
elif tag.startswith("B-"):
sent.append(Tree(tag[2:], [tok]))
elif tag.startswith("I-"):
if sent and isinstance(sent[-1], Tree) and sent[-1].label() == tag[2:]:
sent[-1].append(tok)
else:
sent.append(Tree(tag[2:], [tok]))
return sent
@staticmethod
def _parse_to_tagged(sent):
"""
Convert a chunk-parse tree to a list of tagged tokens.
"""
toks = []
for child in sent:
if isinstance(child, Tree):
if len(child) == 0:
print("Warning -- empty chunk in sentence")
continue
toks.append((child[0], f"B-{child.label()}"))
for tok in child[1:]:
toks.append((tok, f"I-{child.label()}"))
else:
toks.append((child, "O"))
return toks
def shape(word):
if re.match(r"[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word, re.UNICODE):
return "number"
elif re.match(r"\W+$", word, re.UNICODE):
return "punct"
elif re.match(r"\w+$", word, re.UNICODE):
if word.istitle():
return "upcase"
elif word.islower():
return "downcase"
else:
return "mixedcase"
else:
return "other"
def simplify_pos(s):
if s.startswith("V"):
return "V"
else:
return s.split("-")[0]
def postag_tree(tree):
# Part-of-speech tagging.
words = tree.leaves()
tag_iter = (pos for (word, pos) in pos_tag(words))
newtree = Tree("S", [])
for child in tree:
if isinstance(child, Tree):
newtree.append(Tree(child.label(), []))
for subchild in child:
newtree[-1].append((subchild, next(tag_iter)))
else:
newtree.append((child, next(tag_iter)))
return newtree
def load_ace_data(roots, fmt="binary", skip_bnews=True):
for root in roots:
for root, dirs, files in os.walk(root):
if root.endswith("bnews") and skip_bnews:
continue
for f in files:
if f.endswith(".sgm"):
yield from load_ace_file(os.path.join(root, f), fmt)
def load_ace_file(textfile, fmt):
print(f" - {os.path.split(textfile)[1]}")
annfile = textfile + ".tmx.rdc.xml"
# Read the xml file, and get a list of entities
entities = []
with open(annfile) as infile:
xml = ET.parse(infile).getroot()
for entity in xml.findall("document/entity"):
typ = entity.find("entity_type").text
for mention in entity.findall("entity_mention"):
if mention.get("TYPE") != "NAME":
continue # only NEs
s = int(mention.find("head/charseq/start").text)
e = int(mention.find("head/charseq/end").text) + 1
entities.append((s, e, typ))
# Read the text file, and mark the entities.
with open(textfile) as infile:
text = infile.read()
# Strip XML tags, since they don't count towards the indices
text = re.sub("<(?!/?TEXT)[^>]+>", "", text)
# Blank out anything before/after <TEXT>
def subfunc(m):
return " " * (m.end() - m.start() - 6)
text = re.sub(r"[\s\S]*<TEXT>", subfunc, text)
text = re.sub(r"</TEXT>[\s\S]*", "", text)
# Simplify quotes
text = re.sub("``", ' "', text)
text = re.sub("''", '" ', text)
entity_types = {typ for (s, e, typ) in entities}
# Binary distinction (NE or not NE)
if fmt == "binary":
i = 0
toks = Tree("S", [])
for s, e, typ in sorted(entities):
if s < i:
s = i # Overlapping! Deal with this better?
if e <= s:
continue
toks.extend(word_tokenize(text[i:s]))
toks.append(Tree("NE", text[s:e].split()))
i = e
toks.extend(word_tokenize(text[i:]))
yield toks
# Multiclass distinction (NE type)
elif fmt == "multiclass":
i = 0
toks = Tree("S", [])
for s, e, typ in sorted(entities):
if s < i:
s = i # Overlapping! Deal with this better?
if e <= s:
continue
toks.extend(word_tokenize(text[i:s]))
toks.append(Tree(typ, text[s:e].split()))
i = e
toks.extend(word_tokenize(text[i:]))
yield toks
else:
raise ValueError("bad fmt value")
# This probably belongs in a more general-purpose location (as does
# the parse_to_tagged function).
def cmp_chunks(correct, guessed):
correct = NEChunkParser._parse_to_tagged(correct)
guessed = NEChunkParser._parse_to_tagged(guessed)
ellipsis = False
for (w, ct), (w, gt) in zip(correct, guessed):
if ct == gt == "O":
if not ellipsis:
print(f" {ct:15} {gt:15} {w}")
print(" {:15} {:15} {2}".format("...", "...", "..."))
ellipsis = True
else:
ellipsis = False
print(f" {ct:15} {gt:15} {w}")
# ======================================================================================
class Maxent_NE_Chunker(NEChunkParser):
"""
Expected input: list of pos-tagged words
"""
def __init__(self, fmt="multiclass"):
from nltk.data import find
self._fmt = fmt
self._tab_dir = find(f"chunkers/maxent_ne_chunker_tab/english_ace_{fmt}/")
self.load_params()
def load_params(self):
from nltk.classify.maxent import BinaryMaxentFeatureEncoding, load_maxent_params
wgt, mpg, lab, aon = load_maxent_params(self._tab_dir)
mc = MaxentClassifier(
BinaryMaxentFeatureEncoding(lab, mpg, alwayson_features=aon), wgt
)
self._tagger = NEChunkParserTagger(classifier=mc)
def save_params(self):
from nltk.classify.maxent import save_maxent_params
classif = self._tagger._classifier
ecg = classif._encoding
wgt = classif._weights
mpg = ecg._mapping
lab = ecg._labels
aon = ecg._alwayson
fmt = self._fmt
save_maxent_params(wgt, mpg, lab, aon, tab_dir=f"/tmp/english_ace_{fmt}/")
def build_model(fmt="multiclass"):
chunker = Maxent_NE_Chunker(fmt)
chunker.save_params()
return chunker
# ======================================================================================
"""
2024 update: pickles are not supported anymore.
Deprecated:
def build_model(fmt="binary"):
print("Loading training data...")
train_paths = [
find("corpora/ace_data/ace.dev"),
find("corpora/ace_data/ace.heldout"),
find("corpora/ace_data/bbn.dev"),
find("corpora/ace_data/muc.dev"),
]
train_trees = load_ace_data(train_paths, fmt)
train_data = [postag_tree(t) for t in train_trees]
print("Training...")
cp = NEChunkParser(train_data)
del train_data
print("Loading eval data...")
eval_paths = [find("corpora/ace_data/ace.eval")]
eval_trees = load_ace_data(eval_paths, fmt)
eval_data = [postag_tree(t) for t in eval_trees]
print("Evaluating...")
chunkscore = ChunkScore()
for i, correct in enumerate(eval_data):
guess = cp.parse(correct.leaves())
chunkscore.score(correct, guess)
if i < 3:
cmp_chunks(correct, guess)
print(chunkscore)
outfilename = f"/tmp/ne_chunker_{fmt}.pickle"
print(f"Saving chunker to {outfilename}...")
with open(outfilename, "wb") as outfile:
pickle.dump(cp, outfile, -1)
return cp
"""
if __name__ == "__main__":
# Make sure that the object has the right class name:
build_model("binary")
build_model("multiclass")

File diff suppressed because it is too large

View File

@@ -0,0 +1,642 @@
# Natural Language Toolkit: Chunk format conversions
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from nltk.metrics import accuracy as _accuracy
from nltk.tag.mapping import map_tag
from nltk.tag.util import str2tuple
from nltk.tree import Tree
##//////////////////////////////////////////////////////
## EVALUATION
##//////////////////////////////////////////////////////
def accuracy(chunker, gold):
"""
Score the accuracy of the chunker against the gold standard.
Strip the chunk information from the gold standard and rechunk it using
the chunker, then compute the accuracy score.
:type chunker: ChunkParserI
:param chunker: The chunker being evaluated.
:type gold: tree
:param gold: The chunk structures to score the chunker on.
:rtype: float
"""
gold_tags = []
test_tags = []
for gold_tree in gold:
test_tree = chunker.parse(gold_tree.flatten())
gold_tags += tree2conlltags(gold_tree)
test_tags += tree2conlltags(test_tree)
# print 'GOLD:', gold_tags[:50]
# print 'TEST:', test_tags[:50]
return _accuracy(gold_tags, test_tags)
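# Illustrative sketch (not part of the file above): scoring a simple
# RegexpParser with the accuracy() helper defined here. Assumes the
# "conll2000" corpus has been downloaded.
from nltk.corpus import conll2000
from nltk.chunk import RegexpParser
from nltk.chunk.util import accuracy as chunk_accuracy

np_chunker = RegexpParser("NP: {<DT>?<JJ>*<NN.*>}")
gold_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])[:50]
print(chunk_accuracy(np_chunker, gold_sents))  # a float between 0 and 1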
# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
# -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
#
class ChunkScore:
"""
A utility class for scoring chunk parsers. ``ChunkScore`` can
evaluate a chunk parser's output, based on a number of statistics
(precision, recall, f-measure, missed chunks, incorrect chunks).
It can also combine the scores from the parsing of multiple texts;
this makes it significantly easier to evaluate a chunk parser that
operates one sentence at a time.
Texts are evaluated with the ``score`` method. The results of
evaluation can be accessed via a number of accessor methods, such
as ``precision`` and ``f_measure``. A typical use of the
``ChunkScore`` class is::
>>> chunkscore = ChunkScore() # doctest: +SKIP
>>> for correct in correct_sentences: # doctest: +SKIP
... guess = chunkparser.parse(correct.leaves()) # doctest: +SKIP
... chunkscore.score(correct, guess) # doctest: +SKIP
>>> print('F Measure:', chunkscore.f_measure()) # doctest: +SKIP
F Measure: 0.823
:ivar kwargs: Keyword arguments:
        - max_tp_examples: The maximum number of actual examples of true
positives to record. This affects the ``correct`` member
function: ``correct`` will not return more than this number
of true positive examples. This does *not* affect any of
the numerical metrics (precision, recall, or f-measure)
        - max_fp_examples: The maximum number of actual examples of false
positives to record. This affects the ``incorrect`` member
function and the ``guessed`` member function: ``incorrect``
will not return more than this number of examples, and
``guessed`` will not return more than this number of true
positive examples. This does *not* affect any of the
numerical metrics (precision, recall, or f-measure)
        - max_fn_examples: The maximum number of actual examples of false
negatives to record. This affects the ``missed`` member
function and the ``correct`` member function: ``missed``
will not return more than this number of examples, and
``correct`` will not return more than this number of true
negative examples. This does *not* affect any of the
numerical metrics (precision, recall, or f-measure)
- chunk_label: A regular expression indicating which chunks
should be compared. Defaults to ``'.*'`` (i.e., all chunks).
:type _tp: list(Token)
:ivar _tp: List of true positives
:type _fp: list(Token)
:ivar _fp: List of false positives
:type _fn: list(Token)
:ivar _fn: List of false negatives
:type _tp_num: int
:ivar _tp_num: Number of true positives
:type _fp_num: int
:ivar _fp_num: Number of false positives
:type _fn_num: int
:ivar _fn_num: Number of false negatives.
"""
def __init__(self, **kwargs):
self._correct = set()
self._guessed = set()
self._tp = set()
self._fp = set()
self._fn = set()
self._max_tp = kwargs.get("max_tp_examples", 100)
self._max_fp = kwargs.get("max_fp_examples", 100)
self._max_fn = kwargs.get("max_fn_examples", 100)
self._chunk_label = kwargs.get("chunk_label", ".*")
self._tp_num = 0
self._fp_num = 0
self._fn_num = 0
self._count = 0
self._tags_correct = 0.0
self._tags_total = 0.0
self._measuresNeedUpdate = False
def _updateMeasures(self):
if self._measuresNeedUpdate:
self._tp = self._guessed & self._correct
self._fn = self._correct - self._guessed
self._fp = self._guessed - self._correct
self._tp_num = len(self._tp)
self._fp_num = len(self._fp)
self._fn_num = len(self._fn)
self._measuresNeedUpdate = False
def score(self, correct, guessed):
"""
Given a correctly chunked sentence, score another chunked
version of the same sentence.
:type correct: chunk structure
:param correct: The known-correct ("gold standard") chunked
sentence.
:type guessed: chunk structure
:param guessed: The chunked sentence to be scored.
"""
self._correct |= _chunksets(correct, self._count, self._chunk_label)
self._guessed |= _chunksets(guessed, self._count, self._chunk_label)
self._count += 1
self._measuresNeedUpdate = True
# Keep track of per-tag accuracy (if possible)
try:
correct_tags = tree2conlltags(correct)
guessed_tags = tree2conlltags(guessed)
except ValueError:
# This exception case is for nested chunk structures,
# where tree2conlltags will fail with a ValueError: "Tree
# is too deeply nested to be printed in CoNLL format."
correct_tags = guessed_tags = ()
self._tags_total += len(correct_tags)
self._tags_correct += sum(
1 for (t, g) in zip(guessed_tags, correct_tags) if t == g
)
def accuracy(self):
"""
        Return the overall tag-based accuracy for all texts that have
been scored by this ``ChunkScore``, using the IOB (conll2000)
tag encoding.
:rtype: float
"""
if self._tags_total == 0:
return 1
return self._tags_correct / self._tags_total
def precision(self):
"""
Return the overall precision for all texts that have been
scored by this ``ChunkScore``.
:rtype: float
"""
self._updateMeasures()
div = self._tp_num + self._fp_num
if div == 0:
return 0
else:
return self._tp_num / div
def recall(self):
"""
Return the overall recall for all texts that have been
scored by this ``ChunkScore``.
:rtype: float
"""
self._updateMeasures()
div = self._tp_num + self._fn_num
if div == 0:
return 0
else:
return self._tp_num / div
def f_measure(self, alpha=0.5):
"""
Return the overall F measure for all texts that have been
scored by this ``ChunkScore``.
:param alpha: the relative weighting of precision and recall.
Larger alpha biases the score towards the precision value,
while smaller alpha biases the score towards the recall
value. ``alpha`` should have a value in the range [0,1].
:type alpha: float
:rtype: float
"""
self._updateMeasures()
p = self.precision()
r = self.recall()
if p == 0 or r == 0: # what if alpha is 0 or 1?
return 0
return 1 / (alpha / p + (1 - alpha) / r)
def missed(self):
"""
Return the chunks which were included in the
correct chunk structures, but not in the guessed chunk
structures, listed in input order.
:rtype: list of chunks
"""
self._updateMeasures()
chunks = list(self._fn)
return [c[1] for c in chunks] # discard position information
def incorrect(self):
"""
Return the chunks which were included in the guessed chunk structures,
but not in the correct chunk structures, listed in input order.
:rtype: list of chunks
"""
self._updateMeasures()
chunks = list(self._fp)
return [c[1] for c in chunks] # discard position information
def correct(self):
"""
Return the chunks which were included in the correct
chunk structures, listed in input order.
:rtype: list of chunks
"""
chunks = list(self._correct)
return [c[1] for c in chunks] # discard position information
def guessed(self):
"""
Return the chunks which were included in the guessed
chunk structures, listed in input order.
:rtype: list of chunks
"""
chunks = list(self._guessed)
return [c[1] for c in chunks] # discard position information
def __len__(self):
self._updateMeasures()
return self._tp_num + self._fn_num
def __repr__(self):
"""
        Return a concise representation of this ``ChunkScore``.
:rtype: str
"""
return "<ChunkScoring of " + repr(len(self)) + " chunks>"
def __str__(self):
"""
        Return a verbose representation of this ``ChunkScore``.
This representation includes the precision, recall, and
f-measure scores. For other information about the score,
use the accessor methods (e.g., ``missed()`` and ``incorrect()``).
:rtype: str
"""
return (
"ChunkParse score:\n"
+ f" IOB Accuracy: {self.accuracy() * 100:5.1f}%\n"
+ f" Precision: {self.precision() * 100:5.1f}%\n"
+ f" Recall: {self.recall() * 100:5.1f}%\n"
+ f" F-Measure: {self.f_measure() * 100:5.1f}%"
)
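# Worked sketch (illustrative only): with one gold chunk missed and one
# spurious chunk guessed, precision and recall are both 0.5, so the
# default F-measure (alpha=0.5, i.e. the harmonic mean) is 0.5 as well.
# The toy sentences below are assumptions made up for this example.
#
#   >>> gold = tagstr2tree("[ the/DT cat/NN ] sat/VBD [ down/RB ]")
#   >>> guess = tagstr2tree("[ the/DT cat/NN ] [ sat/VBD ] down/RB")
#   >>> cs = ChunkScore()
#   >>> cs.score(gold, guess)
#   >>> (cs.precision(), cs.recall(), cs.f_measure())
#   (0.5, 0.5, 0.5)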
# extract chunks, and assign unique id, the absolute position of
# the first word of the chunk
def _chunksets(t, count, chunk_label):
pos = 0
chunks = []
for child in t:
if isinstance(child, Tree):
if re.match(chunk_label, child.label()):
chunks.append(((count, pos), child.freeze()))
pos += len(child.leaves())
else:
pos += 1
return set(chunks)
def tagstr2tree(
s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None
):
"""
    Divide a string of bracketed tagged text into
chunks and unchunked tokens, and produce a Tree.
Chunks are marked by square brackets (``[...]``). Words are
delimited by whitespace, and each word should have the form
``text/tag``. Words that do not contain a slash are
assigned a ``tag`` of None.
:param s: The string to be converted
:type s: str
:param chunk_label: The label to use for chunk nodes
:type chunk_label: str
:param root_label: The label to use for the root of the tree
:type root_label: str
:rtype: Tree
"""
WORD_OR_BRACKET = re.compile(r"\[|\]|[^\[\]\s]+")
stack = [Tree(root_label, [])]
for match in WORD_OR_BRACKET.finditer(s):
text = match.group()
if text[0] == "[":
if len(stack) != 1:
raise ValueError(f"Unexpected [ at char {match.start():d}")
chunk = Tree(chunk_label, [])
stack[-1].append(chunk)
stack.append(chunk)
elif text[0] == "]":
if len(stack) != 2:
raise ValueError(f"Unexpected ] at char {match.start():d}")
stack.pop()
else:
if sep is None:
stack[-1].append(text)
else:
word, tag = str2tuple(text, sep)
if source_tagset and target_tagset:
tag = map_tag(source_tagset, target_tagset, tag)
stack[-1].append((word, tag))
if len(stack) != 1:
raise ValueError(f"Expected ] at char {len(s):d}")
return stack[0]
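# Example of the bracketed input format (an assumed sentence; see also
# demo() at the end of this module):
#
#   >>> tagstr2tree("[ the/DT dog/NN ] barked/VBD")
#   Tree('S', [Tree('NP', [('the', 'DT'), ('dog', 'NN')]), ('barked', 'VBD')])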
### CONLL
_LINE_RE = re.compile(r"(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")
def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
"""
Return a chunk structure for a single sentence
encoded in the given CONLL 2000 style string.
This function converts a CoNLL IOB string into a tree.
It uses the specified chunk types
(defaults to NP, PP and VP), and creates a tree rooted at a node
labeled S (by default).
:param s: The CoNLL string to be converted.
:type s: str
:param chunk_types: The chunk types to be converted.
:type chunk_types: tuple
:param root_label: The node label to use for the root.
:type root_label: str
:rtype: Tree
"""
stack = [Tree(root_label, [])]
for lineno, line in enumerate(s.split("\n")):
if not line.strip():
continue
# Decode the line.
match = _LINE_RE.match(line)
if match is None:
raise ValueError(f"Error on line {lineno:d}")
(word, tag, state, chunk_type) = match.groups()
# If it's a chunk type we don't care about, treat it as O.
if chunk_types is not None and chunk_type not in chunk_types:
state = "O"
# For "Begin"/"Outside", finish any completed chunks -
# also do so for "Inside" which don't match the previous token.
mismatch_I = state == "I" and chunk_type != stack[-1].label()
if state in "BO" or mismatch_I:
if len(stack) == 2:
stack.pop()
# For "Begin", start a new chunk.
if state == "B" or mismatch_I:
chunk = Tree(chunk_type, [])
stack[-1].append(chunk)
stack.append(chunk)
# Add the new word token.
stack[-1].append((word, tag))
return stack[0]
def tree2conlltags(t):
"""
Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
Convert a tree to the CoNLL IOB tag format.
:param t: The tree to be converted.
:type t: Tree
:rtype: list(tuple)
"""
tags = []
for child in t:
try:
category = child.label()
prefix = "B-"
for contents in child:
if isinstance(contents, Tree):
raise ValueError(
"Tree is too deeply nested to be printed in CoNLL format"
)
tags.append((contents[0], contents[1], prefix + category))
prefix = "I-"
except AttributeError:
tags.append((child[0], child[1], "O"))
return tags
def conlltags2tree(
sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False
):
"""
Convert the CoNLL IOB format to a tree.
"""
tree = Tree(root_label, [])
for word, postag, chunktag in sentence:
if chunktag is None:
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as O
tree.append((word, postag))
elif chunktag.startswith("B-"):
tree.append(Tree(chunktag[2:], [(word, postag)]))
elif chunktag.startswith("I-"):
if (
len(tree) == 0
or not isinstance(tree[-1], Tree)
or tree[-1].label() != chunktag[2:]
):
if strict:
raise ValueError("Bad conll tag sequence")
else:
# Treat as B-*
tree.append(Tree(chunktag[2:], [(word, postag)]))
else:
tree[-1].append((word, postag))
elif chunktag == "O":
tree.append((word, postag))
else:
raise ValueError(f"Bad conll tag {chunktag!r}")
return tree
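# Round-trip sketch (illustrative): for flat chunk structures,
# tree2conlltags and conlltags2tree are inverses of each other.
# The tree below is an assumption made up for this example.
#
#   >>> t = tagstr2tree("[ a/DT dog/NN ] barked/VBD")
#   >>> tree2conlltags(t)
#   [('a', 'DT', 'B-NP'), ('dog', 'NN', 'I-NP'), ('barked', 'VBD', 'O')]
#   >>> conlltags2tree(tree2conlltags(t)) == t
#   True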
def tree2conllstr(t):
"""
Return a multiline string where each line contains a word, tag and IOB tag.
Convert a tree to the CoNLL IOB string format
:param t: The tree to be converted.
:type t: Tree
:rtype: str
"""
lines = [" ".join(token) for token in tree2conlltags(t)]
return "\n".join(lines)
### IEER
_IEER_DOC_RE = re.compile(
r"<DOC>\s*"
r"(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?"
r"(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?"
r"(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?"
r"<BODY>\s*"
r"(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?"
r"<TEXT>(?P<text>.*?)</TEXT>\s*"
r"</BODY>\s*</DOC>\s*",
re.DOTALL,
)
_IEER_TYPE_RE = re.compile(r'<b_\w+\s+[^>]*?type="(?P<type>\w+)"')
def _ieer_read_text(s, root_label):
stack = [Tree(root_label, [])]
# s will be None if there is no headline in the text
# return the empty list in place of a Tree
if s is None:
return []
for piece_m in re.finditer(r"<[^>]+>|[^\s<]+", s):
piece = piece_m.group()
try:
if piece.startswith("<b_"):
m = _IEER_TYPE_RE.match(piece)
                if m is None:
                    # Malformed <b_...> tag: raise here rather than failing
                    # below with an AttributeError on m.group().
                    raise ValueError(f"unrecognized IEER tag {piece!r}")
                chunk = Tree(m.group("type"), [])
stack[-1].append(chunk)
stack.append(chunk)
elif piece.startswith("<e_"):
stack.pop()
# elif piece.startswith('<'):
# print "ERROR:", piece
# raise ValueError # Unexpected HTML
else:
stack[-1].append(piece)
except (IndexError, ValueError) as e:
raise ValueError(
f"Bad IEER string (error at character {piece_m.start():d})"
) from e
if len(stack) != 1:
raise ValueError("Bad IEER string")
return stack[0]
def ieerstr2tree(
s,
chunk_types=[
"LOCATION",
"ORGANIZATION",
"PERSON",
"DURATION",
"DATE",
"CARDINAL",
"PERCENT",
"MONEY",
"MEASURE",
],
root_label="S",
):
"""
Return a chunk structure containing the chunked tagged text that is
encoded in the given IEER style string.
Convert a string of chunked tagged text in the IEER named
entity format into a chunk structure. Chunks are of several
types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
PERCENT, MONEY, and MEASURE.
:rtype: Tree
"""
# Try looking for a single document. If that doesn't work, then just
# treat everything as if it was within the <TEXT>...</TEXT>.
m = _IEER_DOC_RE.match(s)
if m:
return {
"text": _ieer_read_text(m.group("text"), root_label),
"docno": m.group("docno"),
"doctype": m.group("doctype"),
"date_time": m.group("date_time"),
#'headline': m.group('headline')
# we want to capture NEs in the headline too!
"headline": _ieer_read_text(m.group("headline"), root_label),
}
else:
return _ieer_read_text(s, root_label)
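# Minimal sketch (illustrative only): a fragment without a <DOC> wrapper is
# parsed directly as body text.  The lowercase tag spelling below is an
# assumption modelled on the regexes above; real IEER files may differ.
#
#   >>> ieerstr2tree('<b_enamex type="PERSON">Arthur<e_enamex> lives in '
#   ...              '<b_enamex type="LOCATION">Camelot<e_enamex>')
#   Tree('S', [Tree('PERSON', ['Arthur']), 'lives', 'in', Tree('LOCATION', ['Camelot'])])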
def demo():
s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
import nltk
t = nltk.chunk.tagstr2tree(s, chunk_label="NP")
t.pprint()
print()
s = """
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
"""
conll_tree = conllstr2tree(s, chunk_types=("NP", "PP"))
conll_tree.pprint()
# Demonstrate CoNLL output
print("CoNLL output:")
print(nltk.chunk.tree2conllstr(conll_tree))
print()
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,101 @@
# Natural Language Toolkit: Classifiers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Classes and interfaces for labeling tokens with category labels (or
"class labels"). Typically, labels are represented with strings
(such as ``'health'`` or ``'sports'``). Classifiers can be used to
perform a wide range of classification tasks. For example,
classifiers can be used...
- to classify documents by topic
- to classify ambiguous words by which word sense is intended
- to classify acoustic signals by which phoneme they represent
- to classify sentences by their author
Features
========
In order to decide which category label is appropriate for a given
token, classifiers examine one or more 'features' of the token. These
"features" are typically chosen by hand, and indicate which aspects
of the token are relevant to the classification decision. For
example, a document classifier might use a separate feature for each
word, recording how often that word occurred in the document.
Featuresets
===========
The features describing a token are encoded using a "featureset",
which is a dictionary that maps from "feature names" to "feature
values". Feature names are unique strings that indicate what aspect
of the token is encoded by the feature. Examples include
``'prevword'``, for a feature whose value is the previous word; and
``'contains-word(library)'`` for a feature that is true when a document
contains the word ``'library'``. Feature values are typically
booleans, numbers, or strings, depending on which feature they
describe.
Featuresets are typically constructed using a "feature detector"
(also known as a "feature extractor"). A feature detector is a
function that takes a token (and sometimes information about its
context) as its input, and returns a featureset describing that token.
For example, the following feature detector converts a document
(stored as a list of words) to a featureset describing the set of
words included in the document:
>>> # Define a feature detector function.
>>> def document_features(document):
... return dict([('contains-word(%s)' % w, True) for w in document])
Feature detectors are typically applied to each token before it is fed
to the classifier:
>>> # Classify each Gutenberg document.
>>> from nltk.corpus import gutenberg
>>> for fileid in gutenberg.fileids(): # doctest: +SKIP
... doc = gutenberg.words(fileid) # doctest: +SKIP
... print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP
The parameters that a feature detector expects will vary, depending on
the task and the needs of the feature detector. For example, a
feature detector for word sense disambiguation (WSD) might take as its
input a sentence, and the index of a word that should be classified,
and return a featureset for that word. The following feature detector
for WSD includes features describing the left and right contexts of
the target word:
>>> def wsd_features(sentence, index):
... featureset = {}
... for i in range(max(0, index-3), index):
... featureset['left-context(%s)' % sentence[i]] = True
...     for i in range(index, min(index+3, len(sentence))):
... featureset['right-context(%s)' % sentence[i]] = True
... return featureset
Training Classifiers
====================
Most classifiers are built by training them on a list of hand-labeled
examples, known as the "training set". Training sets are represented
as lists of ``(featuredict, label)`` tuples.
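For example, a classifier might be trained on a toy training set and then
used to classify new featuresets as follows (an illustrative sketch; the
feature names and category labels are invented for this example):
>>> from nltk.classify import NaiveBayesClassifier
>>> train_set = [
...     ({'contains-word(python)': True}, 'tech'),
...     ({'contains-word(football)': True}, 'sports'),
... ]
>>> classifier = NaiveBayesClassifier.train(train_set)
>>> classifier.classify({'contains-word(python)': True})
'tech'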
"""
from nltk.classify.api import ClassifierI, MultiClassifierI
from nltk.classify.decisiontree import DecisionTreeClassifier
from nltk.classify.maxent import (
BinaryMaxentFeatureEncoding,
ConditionalExponentialClassifier,
MaxentClassifier,
TypedMaxentFeatureEncoding,
)
from nltk.classify.megam import call_megam, config_megam
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier
from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify.senna import Senna
from nltk.classify.textcat import TextCat
from nltk.classify.util import accuracy, apply_features, log_likelihood
from nltk.classify.weka import WekaClassifier, config_weka

Some files were not shown because too many files have changed in this diff