updates
1
Backend/venv/lib/python3.12/site-packages/nltk/VERSION
Normal file
@@ -0,0 +1 @@
3.9.2
208
Backend/venv/lib/python3.12/site-packages/nltk/__init__.py
Normal file
@@ -0,0 +1,208 @@
# Natural Language Toolkit (NLTK)
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
#          Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
The Natural Language Toolkit (NLTK) is an open source Python library
for Natural Language Processing.  A free online book is available.
(If you use the library for academic research, please cite the book.)

Steven Bird, Ewan Klein, and Edward Loper (2009).
Natural Language Processing with Python.  O'Reilly Media Inc.
https://www.nltk.org/book/

isort:skip_file
"""

import os
import importlib

# //////////////////////////////////////////////////////
# Metadata
# //////////////////////////////////////////////////////

# Version.  For each new release, the version number should be updated
# in the file VERSION.
try:
    # If a VERSION file exists, use it!
    version_file = os.path.join(os.path.dirname(__file__), "VERSION")
    with open(version_file) as infile:
        __version__ = infile.read().strip()
except NameError:
    __version__ = "unknown (running code interactively?)"
except OSError as ex:
    __version__ = "unknown (%s)" % ex

if __doc__ is not None:  # fix for the ``python -OO``
    __doc__ += "\n@version: " + __version__


# Copyright notice
__copyright__ = """\
Copyright (C) 2001-2025 NLTK Project.

Distributed and Licensed under the Apache License, Version 2.0,
which is included by reference.
"""

__license__ = "Apache License, Version 2.0"
# Description of the toolkit, keywords, and the project's primary URL.
__longdescr__ = """\
The Natural Language Toolkit (NLTK) is a Python package for
natural language processing.  NLTK requires Python 3.9, 3.10, 3.11, 3.12 or 3.13."""
__keywords__ = [
    "NLP",
    "CL",
    "natural language processing",
    "computational linguistics",
    "parsing",
    "tagging",
    "tokenizing",
    "syntax",
    "linguistics",
    "language",
    "natural language",
    "text analytics",
]
__url__ = "https://www.nltk.org/"

# Maintainer, contributors, etc.
__maintainer__ = "NLTK Team"
__maintainer_email__ = "nltk.team@gmail.com"
__author__ = __maintainer__
__author_email__ = __maintainer_email__

# "Trove" classifiers for Python Package Index.
__classifiers__ = [
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Education",
    "Intended Audience :: Information Technology",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Scientific/Engineering",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Scientific/Engineering :: Human Machine Interfaces",
    "Topic :: Scientific/Engineering :: Information Analysis",
    "Topic :: Text Processing",
    "Topic :: Text Processing :: Filters",
    "Topic :: Text Processing :: General",
    "Topic :: Text Processing :: Indexing",
    "Topic :: Text Processing :: Linguistic",
]

from nltk.internals import config_java

# support numpy from pypy
try:
    import numpypy
except ImportError:
    pass

# Override missing methods on environments where it cannot be used like GAE.
import subprocess

if not hasattr(subprocess, "PIPE"):

    def _fake_PIPE(*args, **kwargs):
        raise NotImplementedError("subprocess.PIPE is not supported.")

    subprocess.PIPE = _fake_PIPE
if not hasattr(subprocess, "Popen"):

    def _fake_Popen(*args, **kwargs):
        raise NotImplementedError("subprocess.Popen is not supported.")

    subprocess.Popen = _fake_Popen

###########################################################
# TOP-LEVEL MODULES
###########################################################

# Import top-level functionality into top-level namespace

from nltk.collocations import *
from nltk.decorators import decorator, memoize
from nltk.featstruct import *
from nltk.grammar import *
from nltk.probability import *
from nltk.text import *
from nltk.util import *
from nltk.jsontags import *

###########################################################
# PACKAGES
###########################################################

from nltk.chunk import *
from nltk.classify import *
from nltk.inference import *
from nltk.metrics import *
from nltk.parse import *
from nltk.tag import *
from nltk.tokenize import *
from nltk.translate import *
from nltk.tree import *
from nltk.sem import *
from nltk.stem import *

# Packages which can be lazily imported
# (a) we don't import *
# (b) they're slow to import or have run-time dependencies
#     that can safely fail at run time

from nltk import lazyimport

app = lazyimport.LazyModule("app", locals(), globals())
chat = lazyimport.LazyModule("chat", locals(), globals())
corpus = lazyimport.LazyModule("corpus", locals(), globals())
draw = lazyimport.LazyModule("draw", locals(), globals())
toolbox = lazyimport.LazyModule("toolbox", locals(), globals())

# Optional loading
try:
    import numpy
except ImportError:
    pass
else:
    from nltk import cluster

from nltk.downloader import download, download_shell

# Check if tkinter exists without importing it to avoid crashes after
# forks on macOS. Only nltk.app, nltk.draw, and demo modules should
# have top-level tkinter imports. See #2949 for more details.
if importlib.util.find_spec("tkinter"):
    try:
        from nltk.downloader import download_gui
    except RuntimeError as e:
        import warnings

        warnings.warn(
            "Corpus downloader GUI not loaded "
            "(RuntimeError during import: %s)" % str(e)
        )

# explicitly import all top-level modules (ensuring
# they override the same names inadvertently imported
# from a subpackage)

from nltk import ccg, chunk, classify, collocations
from nltk import data, featstruct, grammar, help, inference, metrics
from nltk import misc, parse, probability, sem, stem, wsd
from nltk import tag, tbl, text, tokenize, translate, tree, util


# FIXME: override any accidentally imported demo, see https://github.com/nltk/nltk/issues/2116
def demo():
    print("To run the demo code for a module, type nltk.module.demo()")
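A minimal sketch of how the version lookup and lazy imports above behave from user code (assumes NLTK is installed and the Brown corpus data has been fetched with nltk.download("brown")):

import nltk

print(nltk.__version__)        # read from the VERSION file at import time

# nltk.corpus is a LazyModule: the heavy corpus package is only imported on
# first attribute access, so "import nltk" itself stays fast.
words = nltk.corpus.brown.words()   # first access triggers the real import
print(words[:5])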
Binary file not shown.
47
Backend/venv/lib/python3.12/site-packages/nltk/app/__init__.py
Normal file
@@ -0,0 +1,47 @@
# Natural Language Toolkit: Applications package
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Interactive NLTK Applications:

chartparser:  Chart Parser
chunkparser:  Regular-Expression Chunk Parser
collocations: Find collocations in text
concordance:  Part-of-speech concordancer
nemo:         Finding (and Replacing) Nemo regular expression tool
rdparser:     Recursive Descent Parser
srparser:     Shift-Reduce Parser
wordnet:      WordNet Browser
"""


# Import Tkinter-based modules if Tkinter is installed
try:
    import tkinter
except ImportError:
    import warnings

    warnings.warn("nltk.app package not loaded (please install Tkinter library).")
else:
    from nltk.app.chartparser_app import app as chartparser
    from nltk.app.chunkparser_app import app as chunkparser
    from nltk.app.collocations_app import app as collocations
    from nltk.app.concordance_app import app as concordance
    from nltk.app.nemo_app import app as nemo
    from nltk.app.rdparser_app import app as rdparser
    from nltk.app.srparser_app import app as srparser
    from nltk.app.wordnet_app import app as wordnet

try:
    from matplotlib import pylab
except ImportError:
    import warnings

    warnings.warn("nltk.app.wordfreq not loaded (requires the matplotlib library).")
else:
    from nltk.app.wordfreq_app import app as wordfreq
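Each name exported above is a zero-argument launcher; a minimal usage sketch (assumes a working Tkinter install and a display):

import nltk.app

# Each launcher opens its own Tk window and blocks in mainloop() until closed.
nltk.app.collocations()    # the collocations browser added below
# nltk.app.nemo()          # the regular-expression tool added below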
Binary file not shown.
File diff suppressed because it is too large
438
Backend/venv/lib/python3.12/site-packages/nltk/app/collocations_app.py
Normal file
@@ -0,0 +1,438 @@
# Natural Language Toolkit: Collocations Application
# Much of the GUI code is imported from concordance.py; We intend to merge these tools together
# Copyright (C) 2001-2025 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#


import queue as q
import threading
from tkinter import (
    END,
    LEFT,
    SUNKEN,
    Button,
    Frame,
    IntVar,
    Label,
    Menu,
    OptionMenu,
    Scrollbar,
    StringVar,
    Text,
    Tk,
)
from tkinter.font import Font

from nltk.corpus import (
    alpino,
    brown,
    cess_cat,
    cess_esp,
    floresta,
    indian,
    mac_morpho,
    machado,
    nps_chat,
    sinica_treebank,
    treebank,
)
from nltk.probability import FreqDist
from nltk.util import in_idle

CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
POLL_INTERVAL = 100

_DEFAULT = "English: Brown Corpus (Humor)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(
        categories=["news", "editorial", "reviews"]
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        self.load_corpus(self.model.DEFAULT_CORPUS)
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def _init_top(self, top):
        top.geometry("550x650+50+50")
        top.title("NLTK Collocations List")
        top.bind("<Control-q>", self.destroy)
        top.protocol("WM_DELETE_WINDOW", self.destroy)
        top.minsize(550, 650)

    def _init_widgets(self, parent):
        self.main_frame = Frame(
            parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
        )
        self._init_corpus_select(self.main_frame)
        self._init_results_box(self.main_frame)
        self._init_paging(self.main_frame)
        self._init_status(self.main_frame)
        self.main_frame.pack(fill="both", expand=True)

    def _init_corpus_select(self, parent):
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.var = StringVar(innerframe)
        self.var.set(self.model.DEFAULT_CORPUS)
        Label(
            innerframe,
            justify=LEFT,
            text=" Corpus: ",
            background=self._BACKGROUND_COLOUR,
            padx=2,
            pady=1,
            border=0,
        ).pack(side="left")

        other_corpora = list(self.model.CORPORA.keys()).remove(
            self.model.DEFAULT_CORPUS
        )
        om = OptionMenu(
            innerframe,
            self.var,
            self.model.DEFAULT_CORPUS,
            command=self.corpus_selected,
            *self.model.non_default_corpora()
        )
        om["borderwidth"] = 0
        om["highlightthickness"] = 1
        om.pack(side="left")
        innerframe.pack(side="top", fill="x", anchor="n")

    def _init_status(self, parent):
        self.status = Label(
            parent,
            justify=LEFT,
            relief=SUNKEN,
            background=self._BACKGROUND_COLOUR,
            border=0,
            padx=1,
            pady=0,
        )
        self.status.pack(side="top", anchor="sw")

    def _init_menubar(self):
        self._result_size = IntVar(self.top)
        menubar = Menu(self.top)

        filemenu = Menu(menubar, tearoff=0, borderwidth=0)
        filemenu.add_command(
            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
        )
        menubar.add_cascade(label="File", underline=0, menu=filemenu)

        editmenu = Menu(menubar, tearoff=0)
        rescntmenu = Menu(editmenu, tearoff=0)
        rescntmenu.add_radiobutton(
            label="20",
            variable=self._result_size,
            underline=0,
            value=20,
            command=self.set_result_size,
        )
        rescntmenu.add_radiobutton(
            label="50",
            variable=self._result_size,
            underline=0,
            value=50,
            command=self.set_result_size,
        )
        rescntmenu.add_radiobutton(
            label="100",
            variable=self._result_size,
            underline=0,
            value=100,
            command=self.set_result_size,
        )
        rescntmenu.invoke(1)
        editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)

        menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
        self.top.config(menu=menubar)

    def set_result_size(self, **kwargs):
        self.model.result_count = self._result_size.get()

    def _init_results_box(self, parent):
        innerframe = Frame(parent)
        i1 = Frame(innerframe)
        i2 = Frame(innerframe)
        vscrollbar = Scrollbar(i1, borderwidth=1)
        hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
        self.results_box = Text(
            i1,
            font=Font(family="courier", size="16"),
            state="disabled",
            borderwidth=1,
            yscrollcommand=vscrollbar.set,
            xscrollcommand=hscrollbar.set,
            wrap="none",
            width="40",
            height="20",
            exportselection=1,
        )
        self.results_box.pack(side="left", fill="both", expand=True)
        vscrollbar.pack(side="left", fill="y", anchor="e")
        vscrollbar.config(command=self.results_box.yview)
        hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
        hscrollbar.config(command=self.results_box.xview)
        # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
        Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack(
            side="left", anchor="e"
        )
        i1.pack(side="top", fill="both", expand=True, anchor="n")
        i2.pack(side="bottom", fill="x", anchor="s")
        innerframe.pack(side="top", fill="both", expand=True)

    def _init_paging(self, parent):
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.prev = prev = Button(
            innerframe,
            text="Previous",
            command=self.previous,
            width="10",
            borderwidth=1,
            highlightthickness=1,
            state="disabled",
        )
        prev.pack(side="left", anchor="center")
        self.next = next = Button(
            innerframe,
            text="Next",
            command=self.__next__,
            width="10",
            borderwidth=1,
            highlightthickness=1,
            state="disabled",
        )
        next.pack(side="right", anchor="center")
        innerframe.pack(side="top", fill="y")
        self.reset_current_page()

    def reset_current_page(self):
        self.current_page = -1

    def _poll(self):
        try:
            event = self.queue.get(block=False)
        except q.Empty:
            pass
        else:
            if event == CORPUS_LOADED_EVENT:
                self.handle_corpus_loaded(event)
            elif event == ERROR_LOADING_CORPUS_EVENT:
                self.handle_error_loading_corpus(event)
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def handle_error_loading_corpus(self, event):
        self.status["text"] = "Error in loading " + self.var.get()
        self.unfreeze_editable()
        self.clear_results_box()
        self.freeze_editable()
        self.reset_current_page()

    def handle_corpus_loaded(self, event):
        self.status["text"] = self.var.get() + " is loaded"
        self.unfreeze_editable()
        self.clear_results_box()
        self.reset_current_page()
        # self.next()
        collocations = self.model.next(self.current_page + 1)
        self.write_results(collocations)
        self.current_page += 1

    def corpus_selected(self, *args):
        new_selection = self.var.get()
        self.load_corpus(new_selection)

    def previous(self):
        self.freeze_editable()
        collocations = self.model.prev(self.current_page - 1)
        self.current_page = self.current_page - 1
        self.clear_results_box()
        self.write_results(collocations)
        self.unfreeze_editable()

    def __next__(self):
        self.freeze_editable()
        collocations = self.model.next(self.current_page + 1)
        self.clear_results_box()
        self.write_results(collocations)
        self.current_page += 1
        self.unfreeze_editable()

    def load_corpus(self, selection):
        if self.model.selected_corpus != selection:
            self.status["text"] = "Loading " + selection + "..."
            self.freeze_editable()
            self.model.load_corpus(selection)

    def freeze_editable(self):
        self.prev["state"] = "disabled"
        self.next["state"] = "disabled"

    def clear_results_box(self):
        self.results_box["state"] = "normal"
        self.results_box.delete("1.0", END)
        self.results_box["state"] = "disabled"

    def fire_event(self, event):
        # Firing an event so that rendering of widgets happen in the mainloop thread
        self.top.event_generate(event, when="tail")

    def destroy(self, *e):
        if self.top is None:
            return
        self.top.after_cancel(self.after)
        self.top.destroy()
        self.top = None

    def mainloop(self, *args, **kwargs):
        if in_idle():
            return
        self.top.mainloop(*args, **kwargs)

    def unfreeze_editable(self):
        self.set_paging_button_states()

    def set_paging_button_states(self):
        if self.current_page == -1 or self.current_page == 0:
            self.prev["state"] = "disabled"
        else:
            self.prev["state"] = "normal"
        if self.model.is_last_page(self.current_page):
            self.next["state"] = "disabled"
        else:
            self.next["state"] = "normal"

    def write_results(self, results):
        self.results_box["state"] = "normal"
        row = 1
        for each in results:
            self.results_box.insert(str(row) + ".0", each[0] + " " + each[1] + "\n")
            row += 1
        self.results_box["state"] = "disabled"


class CollocationsModel:
    def __init__(self, queue):
        self.result_count = None
        self.selected_corpus = None
        self.collocations = None
        self.CORPORA = _CORPORA
        self.DEFAULT_CORPUS = _DEFAULT
        self.queue = queue
        self.reset_results()

    def reset_results(self):
        self.result_pages = []
        self.results_returned = 0

    def load_corpus(self, name):
        self.selected_corpus = name
        self.collocations = None
        runner_thread = self.LoadCorpus(name, self)
        runner_thread.start()
        self.reset_results()

    def non_default_corpora(self):
        copy = []
        copy.extend(list(self.CORPORA.keys()))
        copy.remove(self.DEFAULT_CORPUS)
        copy.sort()
        return copy

    def is_last_page(self, number):
        if number < len(self.result_pages):
            return False
        return self.results_returned + (
            number - len(self.result_pages)
        ) * self.result_count >= len(self.collocations)

    def next(self, page):
        if (len(self.result_pages) - 1) < page:
            for i in range(page - (len(self.result_pages) - 1)):
                self.result_pages.append(
                    self.collocations[
                        self.results_returned : self.results_returned
                        + self.result_count
                    ]
                )
                self.results_returned += self.result_count
        return self.result_pages[page]

    def prev(self, page):
        if page == -1:
            return []
        return self.result_pages[page]

    class LoadCorpus(threading.Thread):
        def __init__(self, name, model):
            threading.Thread.__init__(self)
            self.model, self.name = model, name

        def run(self):
            try:
                words = self.model.CORPORA[self.name]()
                from operator import itemgetter

                text = [w for w in words if len(w) > 2]
                fd = FreqDist(tuple(text[i : i + 2]) for i in range(len(text) - 1))
                vocab = FreqDist(text)
                scored = [
                    ((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2]))
                    for w1, w2 in fd
                ]
                scored.sort(key=itemgetter(1), reverse=True)
                self.model.collocations = list(map(itemgetter(0), scored))
                self.model.queue.put(CORPUS_LOADED_EVENT)
            except Exception as e:
                print(e)
                self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)


# def collocations():
#     colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]]


def app():
    c = CollocationsView()
    c.mainloop()


if __name__ == "__main__":
    app()

__all__ = ["app"]
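The LoadCorpus thread above ranks adjacent word pairs by fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2]), i.e. pair frequency cubed over the product of the two word frequencies. The same scoring stripped of the GUI, as a sketch with a toy word list standing in for a real corpus:

from operator import itemgetter

from nltk.probability import FreqDist

words = ["the", "natural", "language", "toolkit", "natural", "language"]
text = [w for w in words if len(w) > 2]          # drop short tokens, as above
fd = FreqDist(tuple(text[i : i + 2]) for i in range(len(text) - 1))
vocab = FreqDist(text)
scored = [((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2])) for w1, w2 in fd]
scored.sort(key=itemgetter(1), reverse=True)
print(scored[0])  # (('natural', 'language'), 2.0): 2**3 / (2 * 2)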
709
Backend/venv/lib/python3.12/site-packages/nltk/app/concordance_app.py
Normal file
@@ -0,0 +1,709 @@
# Natural Language Toolkit: Concordance Application
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import queue as q
import re
import threading
from tkinter import (
    END,
    LEFT,
    SUNKEN,
    Button,
    Entry,
    Frame,
    IntVar,
    Label,
    Menu,
    OptionMenu,
    Scrollbar,
    StringVar,
    Text,
    Tk,
)
from tkinter.font import Font

from nltk.corpus import (
    alpino,
    brown,
    cess_cat,
    cess_esp,
    floresta,
    indian,
    mac_morpho,
    nps_chat,
    sinica_treebank,
    treebank,
)
from nltk.draw.util import ShowText
from nltk.util import in_idle

WORD_OR_TAG = "[^/ ]+"
BOUNDARY = r"\b"

CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>"
SEARCH_ERROR_EVENT = "<<SE_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"

POLL_INTERVAL = 50

# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.

_DEFAULT = "English: Brown Corpus (Humor, simplified)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(
        tagset="universal"
    ),
    "English: Brown Corpus": lambda: brown.tagged_sents(),
    "English: Brown Corpus (simplified)": lambda: brown.tagged_sents(
        tagset="universal"
    ),
    "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(
        categories=["news", "editorial", "reviews"], tagset="universal"
    ),
    "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(
        categories="religion", tagset="universal"
    ),
    "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(
        categories="learned", tagset="universal"
    ),
    "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
        categories="science_fiction", tagset="universal"
    ),
    "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(
        categories="romance", tagset="universal"
    ),
    "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(
        categories="humor", tagset="universal"
    ),
    "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
    "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(
        tagset="universal"
    ),
    "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
    "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(
        tagset="universal"
    ),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
    "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
        tagset="universal"
    ),
    "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
        tagset="universal"
    ),
    "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(
        files="hindi.pos", tagset="universal"
    ),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(
        tagset="universal"
    ),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(
        tagset="universal"
    ),
    "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(
        tagset="universal"
    ),
}


class ConcordanceSearchView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    # Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = "#F00"  # red
    _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"

    _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0"  # dark grey
    _HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG"

    # Percentage of text left of the scrollbar position
    _FRACTION_LEFT_TEXT = 0.30

    def __init__(self):
        self.queue = q.Queue()
        self.model = ConcordanceSearchModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        self.load_corpus(self.model.DEFAULT_CORPUS)
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def _init_top(self, top):
        top.geometry("950x680+50+50")
        top.title("NLTK Concordance Search")
        top.bind("<Control-q>", self.destroy)
        top.protocol("WM_DELETE_WINDOW", self.destroy)
        top.minsize(950, 680)

    def _init_widgets(self, parent):
        self.main_frame = Frame(
            parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
        )
        self._init_corpus_select(self.main_frame)
        self._init_query_box(self.main_frame)
        self._init_results_box(self.main_frame)
        self._init_paging(self.main_frame)
        self._init_status(self.main_frame)
        self.main_frame.pack(fill="both", expand=True)

    def _init_menubar(self):
        self._result_size = IntVar(self.top)
        self._cntx_bf_len = IntVar(self.top)
        self._cntx_af_len = IntVar(self.top)
        menubar = Menu(self.top)

        filemenu = Menu(menubar, tearoff=0, borderwidth=0)
        filemenu.add_command(
            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
        )
        menubar.add_cascade(label="File", underline=0, menu=filemenu)

        editmenu = Menu(menubar, tearoff=0)
        rescntmenu = Menu(editmenu, tearoff=0)
        rescntmenu.add_radiobutton(
            label="20",
            variable=self._result_size,
            underline=0,
            value=20,
            command=self.set_result_size,
        )
        rescntmenu.add_radiobutton(
            label="50",
            variable=self._result_size,
            underline=0,
            value=50,
            command=self.set_result_size,
        )
        rescntmenu.add_radiobutton(
            label="100",
            variable=self._result_size,
            underline=0,
            value=100,
            command=self.set_result_size,
        )
        rescntmenu.invoke(1)
        editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)

        cntxmenu = Menu(editmenu, tearoff=0)
        cntxbfmenu = Menu(cntxmenu, tearoff=0)
        cntxbfmenu.add_radiobutton(
            label="60 characters",
            variable=self._cntx_bf_len,
            underline=0,
            value=60,
            command=self.set_cntx_bf_len,
        )
        cntxbfmenu.add_radiobutton(
            label="80 characters",
            variable=self._cntx_bf_len,
            underline=0,
            value=80,
            command=self.set_cntx_bf_len,
        )
        cntxbfmenu.add_radiobutton(
            label="100 characters",
            variable=self._cntx_bf_len,
            underline=0,
            value=100,
            command=self.set_cntx_bf_len,
        )
        cntxbfmenu.invoke(1)
        cntxmenu.add_cascade(label="Before", underline=0, menu=cntxbfmenu)

        cntxafmenu = Menu(cntxmenu, tearoff=0)
        cntxafmenu.add_radiobutton(
            label="70 characters",
            variable=self._cntx_af_len,
            underline=0,
            value=70,
            command=self.set_cntx_af_len,
        )
        cntxafmenu.add_radiobutton(
            label="90 characters",
            variable=self._cntx_af_len,
            underline=0,
            value=90,
            command=self.set_cntx_af_len,
        )
        cntxafmenu.add_radiobutton(
            label="110 characters",
            variable=self._cntx_af_len,
            underline=0,
            value=110,
            command=self.set_cntx_af_len,
        )
        cntxafmenu.invoke(1)
        cntxmenu.add_cascade(label="After", underline=0, menu=cntxafmenu)

        editmenu.add_cascade(label="Context", underline=0, menu=cntxmenu)

        menubar.add_cascade(label="Edit", underline=0, menu=editmenu)

        self.top.config(menu=menubar)

    def set_result_size(self, **kwargs):
        self.model.result_count = self._result_size.get()

    def set_cntx_af_len(self, **kwargs):
        self._char_after = self._cntx_af_len.get()

    def set_cntx_bf_len(self, **kwargs):
        self._char_before = self._cntx_bf_len.get()

    def _init_corpus_select(self, parent):
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.var = StringVar(innerframe)
        self.var.set(self.model.DEFAULT_CORPUS)
        Label(
            innerframe,
            justify=LEFT,
            text=" Corpus: ",
            background=self._BACKGROUND_COLOUR,
            padx=2,
            pady=1,
            border=0,
        ).pack(side="left")

        other_corpora = list(self.model.CORPORA.keys()).remove(
            self.model.DEFAULT_CORPUS
        )
        om = OptionMenu(
            innerframe,
            self.var,
            self.model.DEFAULT_CORPUS,
            command=self.corpus_selected,
            *self.model.non_default_corpora()
        )
        om["borderwidth"] = 0
        om["highlightthickness"] = 1
        om.pack(side="left")
        innerframe.pack(side="top", fill="x", anchor="n")

    def _init_status(self, parent):
        self.status = Label(
            parent,
            justify=LEFT,
            relief=SUNKEN,
            background=self._BACKGROUND_COLOUR,
            border=0,
            padx=1,
            pady=0,
        )
        self.status.pack(side="top", anchor="sw")

    def _init_query_box(self, parent):
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        another = Frame(innerframe, background=self._BACKGROUND_COLOUR)
        self.query_box = Entry(another, width=60)
        self.query_box.pack(side="left", fill="x", pady=25, anchor="center")
        self.search_button = Button(
            another,
            text="Search",
            command=self.search,
            borderwidth=1,
            highlightthickness=1,
        )
        self.search_button.pack(side="left", fill="x", pady=25, anchor="center")
        self.query_box.bind("<KeyPress-Return>", self.search_enter_keypress_handler)
        another.pack()
        innerframe.pack(side="top", fill="x", anchor="n")

    def search_enter_keypress_handler(self, *event):
        self.search()

    def _init_results_box(self, parent):
        innerframe = Frame(parent)
        i1 = Frame(innerframe)
        i2 = Frame(innerframe)
        vscrollbar = Scrollbar(i1, borderwidth=1)
        hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
        self.results_box = Text(
            i1,
            font=Font(family="courier", size="16"),
            state="disabled",
            borderwidth=1,
            yscrollcommand=vscrollbar.set,
            xscrollcommand=hscrollbar.set,
            wrap="none",
            width="40",
            height="20",
            exportselection=1,
        )
        self.results_box.pack(side="left", fill="both", expand=True)
        self.results_box.tag_config(
            self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR
        )
        self.results_box.tag_config(
            self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR
        )
        vscrollbar.pack(side="left", fill="y", anchor="e")
        vscrollbar.config(command=self.results_box.yview)
        hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
        hscrollbar.config(command=self.results_box.xview)
        # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
        Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack(
            side="left", anchor="e"
        )
        i1.pack(side="top", fill="both", expand=True, anchor="n")
        i2.pack(side="bottom", fill="x", anchor="s")
        innerframe.pack(side="top", fill="both", expand=True)

    def _init_paging(self, parent):
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.prev = prev = Button(
            innerframe,
            text="Previous",
            command=self.previous,
            width="10",
            borderwidth=1,
            highlightthickness=1,
            state="disabled",
        )
        prev.pack(side="left", anchor="center")
        self.next = next = Button(
            innerframe,
            text="Next",
            command=self.__next__,
            width="10",
            borderwidth=1,
            highlightthickness=1,
            state="disabled",
        )
        next.pack(side="right", anchor="center")
        innerframe.pack(side="top", fill="y")
        self.current_page = 0

    def previous(self):
        self.clear_results_box()
        self.freeze_editable()
        self.model.prev(self.current_page - 1)

    def __next__(self):
        self.clear_results_box()
        self.freeze_editable()
        self.model.next(self.current_page + 1)

    def about(self, *e):
        ABOUT = "NLTK Concordance Search Demo\n"
        TITLE = "About: NLTK Concordance Search Demo"
        try:
            from tkinter.messagebox import Message

            Message(message=ABOUT, title=TITLE, parent=self.main_frame).show()
        except:
            ShowText(self.top, TITLE, ABOUT)

    def _bind_event_handlers(self):
        self.top.bind(CORPUS_LOADED_EVENT, self.handle_corpus_loaded)
        self.top.bind(SEARCH_TERMINATED_EVENT, self.handle_search_terminated)
        self.top.bind(SEARCH_ERROR_EVENT, self.handle_search_error)
        self.top.bind(ERROR_LOADING_CORPUS_EVENT, self.handle_error_loading_corpus)

    def _poll(self):
        try:
            event = self.queue.get(block=False)
        except q.Empty:
            pass
        else:
            if event == CORPUS_LOADED_EVENT:
                self.handle_corpus_loaded(event)
            elif event == SEARCH_TERMINATED_EVENT:
                self.handle_search_terminated(event)
            elif event == SEARCH_ERROR_EVENT:
                self.handle_search_error(event)
            elif event == ERROR_LOADING_CORPUS_EVENT:
                self.handle_error_loading_corpus(event)
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def handle_error_loading_corpus(self, event):
        self.status["text"] = "Error in loading " + self.var.get()
        self.unfreeze_editable()
        self.clear_all()
        self.freeze_editable()

    def handle_corpus_loaded(self, event):
        self.status["text"] = self.var.get() + " is loaded"
        self.unfreeze_editable()
        self.clear_all()
        self.query_box.focus_set()

    def handle_search_terminated(self, event):
        # todo: refactor the model such that it is less state sensitive
        results = self.model.get_results()
        self.write_results(results)
        self.status["text"] = ""
        if len(results) == 0:
            self.status["text"] = "No results found for " + self.model.query
        else:
            self.current_page = self.model.last_requested_page
        self.unfreeze_editable()
        self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT)

    def handle_search_error(self, event):
        self.status["text"] = "Error in query " + self.model.query
        self.unfreeze_editable()

    def corpus_selected(self, *args):
        new_selection = self.var.get()
        self.load_corpus(new_selection)

    def load_corpus(self, selection):
        if self.model.selected_corpus != selection:
            self.status["text"] = "Loading " + selection + "..."
            self.freeze_editable()
            self.model.load_corpus(selection)

    def search(self):
        self.current_page = 0
        self.clear_results_box()
        self.model.reset_results()
        query = self.query_box.get()
        if len(query.strip()) == 0:
            return
        self.status["text"] = "Searching for " + query
        self.freeze_editable()
        self.model.search(query, self.current_page + 1)

    def write_results(self, results):
        self.results_box["state"] = "normal"
        row = 1
        for each in results:
            sent, pos1, pos2 = each[0].strip(), each[1], each[2]
            if len(sent) != 0:
                if pos1 < self._char_before:
                    sent, pos1, pos2 = self.pad(sent, pos1, pos2)
                sentence = sent[pos1 - self._char_before : pos1 + self._char_after]
                if not row == len(results):
                    sentence += "\n"
                self.results_box.insert(str(row) + ".0", sentence)
                word_markers, label_markers = self.words_and_labels(sent, pos1, pos2)
                for marker in word_markers:
                    self.results_box.tag_add(
                        self._HIGHLIGHT_WORD_TAG,
                        str(row) + "." + str(marker[0]),
                        str(row) + "." + str(marker[1]),
                    )
                for marker in label_markers:
                    self.results_box.tag_add(
                        self._HIGHLIGHT_LABEL_TAG,
                        str(row) + "." + str(marker[0]),
                        str(row) + "." + str(marker[1]),
                    )
                row += 1
        self.results_box["state"] = "disabled"

    def words_and_labels(self, sentence, pos1, pos2):
        search_exp = sentence[pos1:pos2]
        words, labels = [], []
        labeled_words = search_exp.split(" ")
        index = 0
        for each in labeled_words:
            if each == "":
                index += 1
            else:
                word, label = each.split("/")
                words.append(
                    (self._char_before + index, self._char_before + index + len(word))
                )
                index += len(word) + 1
                labels.append(
                    (self._char_before + index, self._char_before + index + len(label))
                )
                index += len(label)
                index += 1
        return words, labels

    def pad(self, sent, hstart, hend):
        if hstart >= self._char_before:
            return sent, hstart, hend
        d = self._char_before - hstart
        sent = "".join([" "] * d) + sent
        return sent, hstart + d, hend + d

    def destroy(self, *e):
        if self.top is None:
            return
        self.top.after_cancel(self.after)
        self.top.destroy()
        self.top = None

    def clear_all(self):
        self.query_box.delete(0, END)
        self.model.reset_query()
        self.clear_results_box()

    def clear_results_box(self):
        self.results_box["state"] = "normal"
        self.results_box.delete("1.0", END)
        self.results_box["state"] = "disabled"

    def freeze_editable(self):
        self.query_box["state"] = "disabled"
        self.search_button["state"] = "disabled"
        self.prev["state"] = "disabled"
        self.next["state"] = "disabled"

    def unfreeze_editable(self):
        self.query_box["state"] = "normal"
        self.search_button["state"] = "normal"
        self.set_paging_button_states()

    def set_paging_button_states(self):
        if self.current_page == 0 or self.current_page == 1:
            self.prev["state"] = "disabled"
        else:
            self.prev["state"] = "normal"
        if self.model.has_more_pages(self.current_page):
            self.next["state"] = "normal"
        else:
            self.next["state"] = "disabled"

    def fire_event(self, event):
        # Firing an event so that rendering of widgets happen in the mainloop thread
        self.top.event_generate(event, when="tail")

    def mainloop(self, *args, **kwargs):
        if in_idle():
            return
        self.top.mainloop(*args, **kwargs)


class ConcordanceSearchModel:
    def __init__(self, queue):
        self.queue = queue
        self.CORPORA = _CORPORA
        self.DEFAULT_CORPUS = _DEFAULT
        self.selected_corpus = None
        self.reset_query()
        self.reset_results()
        self.result_count = None
        self.last_sent_searched = 0

    def non_default_corpora(self):
        copy = []
        copy.extend(list(self.CORPORA.keys()))
        copy.remove(self.DEFAULT_CORPUS)
        copy.sort()
        return copy

    def load_corpus(self, name):
        self.selected_corpus = name
        self.tagged_sents = []
        runner_thread = self.LoadCorpus(name, self)
        runner_thread.start()

    def search(self, query, page):
        self.query = query
        self.last_requested_page = page
        self.SearchCorpus(self, page, self.result_count).start()

    def next(self, page):
        self.last_requested_page = page
        if len(self.results) < page:
            self.search(self.query, page)
        else:
            self.queue.put(SEARCH_TERMINATED_EVENT)

    def prev(self, page):
        self.last_requested_page = page
        self.queue.put(SEARCH_TERMINATED_EVENT)

    def reset_results(self):
        self.last_sent_searched = 0
        self.results = []
        self.last_page = None

    def reset_query(self):
        self.query = None

    def set_results(self, page, resultset):
        self.results.insert(page - 1, resultset)

    def get_results(self):
        return self.results[self.last_requested_page - 1]

    def has_more_pages(self, page):
        if self.results == [] or self.results[0] == []:
            return False
        if self.last_page is None:
            return True
        return page < self.last_page

    class LoadCorpus(threading.Thread):
        def __init__(self, name, model):
            threading.Thread.__init__(self)
            self.model, self.name = model, name

        def run(self):
            try:
                ts = self.model.CORPORA[self.name]()
                self.model.tagged_sents = [
                    " ".join(w + "/" + t for (w, t) in sent) for sent in ts
                ]
                self.model.queue.put(CORPUS_LOADED_EVENT)
            except Exception as e:
                print(e)
                self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)

    class SearchCorpus(threading.Thread):
        def __init__(self, model, page, count):
            self.model, self.count, self.page = model, count, page
            threading.Thread.__init__(self)

        def run(self):
            q = self.processed_query()
            sent_pos, i, sent_count = [], 0, 0
            for sent in self.model.tagged_sents[self.model.last_sent_searched :]:
                try:
                    m = re.search(q, sent)
                except re.error:
                    self.model.reset_results()
                    self.model.queue.put(SEARCH_ERROR_EVENT)
                    return
                if m:
                    sent_pos.append((sent, m.start(), m.end()))
                    i += 1
                    if i > self.count:
                        self.model.last_sent_searched += sent_count - 1
                        break
                sent_count += 1
            if self.count >= len(sent_pos):
                self.model.last_sent_searched += sent_count - 1
                self.model.last_page = self.page
                self.model.set_results(self.page, sent_pos)
            else:
                self.model.set_results(self.page, sent_pos[:-1])
            self.model.queue.put(SEARCH_TERMINATED_EVENT)

        def processed_query(self):
            new = []
            for term in self.model.query.split():
                term = re.sub(r"\.", r"[^/ ]", term)
                if re.match("[A-Z]+$", term):
                    new.append(BOUNDARY + WORD_OR_TAG + "/" + term + BOUNDARY)
                elif "/" in term:
                    new.append(BOUNDARY + term + BOUNDARY)
                else:
                    new.append(BOUNDARY + term + "/" + WORD_OR_TAG + BOUNDARY)
            return " ".join(new)


def app():
    d = ConcordanceSearchView()
    d.mainloop()


if __name__ == "__main__":
    app()

__all__ = ["app"]
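processed_query above turns a plain query into a regex over "word/TAG" tokens: an all-caps term matches any word with that tag, a term containing "/" matches an explicit word/tag pair, and anything else matches the word under any tag. A standalone sketch of the three cases (a hypothetical rewrite of that method outside the class, for illustration only):

import re

WORD_OR_TAG = "[^/ ]+"
BOUNDARY = r"\b"

def processed_query(query):
    new = []
    for term in query.split():
        term = re.sub(r"\.", r"[^/ ]", term)   # '.' must not cross a token
        if re.match("[A-Z]+$", term):          # all-caps: match a tag
            new.append(BOUNDARY + WORD_OR_TAG + "/" + term + BOUNDARY)
        elif "/" in term:                      # explicit word/tag pair
            new.append(BOUNDARY + term + BOUNDARY)
        else:                                  # plain word: any tag
            new.append(BOUNDARY + term + "/" + WORD_OR_TAG + BOUNDARY)
    return " ".join(new)

print(processed_query("NOUN"))       # \b[^/ ]+/NOUN\b
print(processed_query("dog"))        # \bdog/[^/ ]+\b
print(processed_query("dog/NOUN"))   # \bdog/NOUN\b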
163
Backend/venv/lib/python3.12/site-packages/nltk/app/nemo_app.py
Normal file
@@ -0,0 +1,163 @@
# Finding (and Replacing) Nemo, Version 1.1, Aristide Grange 2006/06/06
# https://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496783

"""
Finding (and Replacing) Nemo

Instant Regular Expressions
Created by Aristide Grange
"""
import itertools
import re
from tkinter import SEL_FIRST, SEL_LAST, Frame, Label, PhotoImage, Scrollbar, Text, Tk

windowTitle = "Finding (and Replacing) Nemo"
initialFind = r"n(.*?)e(.*?)m(.*?)o"
initialRepl = r"M\1A\2K\3I"
initialText = """\
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
"""
images = {
"FIND": "R0lGODlhMAAiAPcAMf/////37//35//n1v97Off///f/9/f37/fexvfOvfeEQvd7QvdrQvdrKfdaKfdSMfdSIe/v9+/v7+/v5+/n3u/e1u/Wxu/Gre+1lO+tnO+thO+Ua+97Y+97Oe97Me9rOe9rMe9jOe9jMe9jIe9aMefe5+fe3ufezuece+eEWudzQudaIedSIedKMedKIedCKedCId7e1t7Wzt7Oxt7Gvd69vd69rd61pd6ljN6UjN6Ue96EY95zY95rUt5rQt5jMd5SId5KIdbn59be3tbGztbGvda1rdaEa9Z7a9Z7WtZzQtZzOdZzMdZjMdZaQtZSOdZSMdZKMdZCKdZCGNY5Ic7W1s7Oxs7Gtc69xs69tc69rc6tpc6llM6clM6cjM6Ue86EY85zWs5rSs5SKc5KKc5KGMa1tcatrcalvcalnMaUpcZ7c8ZzMcZrUsZrOcZrMcZaQsZSOcZSMcZKMcZCKcZCGMYxIcYxGL3Gxr21tb21rb2lpb2crb2cjL2UnL2UlL2UhL2Ec717Wr17Ur1zWr1rMb1jUr1KMb1KIb1CIb0xGLWlrbWlpbWcnLWEe7V7c7VzY7VzUrVSKbVKMbVCMbVCIbU5KbUxIbUxEK2lta2lpa2clK2UjK2MnK2MlK2Ea617e61za61rY61rMa1jSq1aUq1aSq1SQq1KKa0xEKWlnKWcnKWUnKWUhKWMjKWEa6Vza6VrWqVjMaVaUqVaKaVSMaVCMaU5KaUxIaUxGJyclJyMe5yElJyEhJx7e5x7c5xrOZxaQpxSOZxKQpw5IZSMhJSEjJR7c5Rre5RrY5RrUpRSQpRSKZRCOZRCKZQxKZQxIYyEhIx7hIxza4xzY4xrc4xjUoxaa4xaUoxSSoxKQoxCMYw5GIR7c4Rzc4Rre4RjY4RjWoRaa4RSWoRSUoRSMYRKQoRCOYQ5KYQxIXtra3taY3taSntKOXtCMXtCKXNCMXM5MXMxIWtSUmtKSmtKQmtCOWs5MWs5KWs5IWNCKWMxIVIxKUIQCDkhGAAAACH+AS4ALAAAAAAwACIAAAj/AAEIHEiwoMGDCBMqXMiwoUOHMqxIeEiRoZVp7cpZ29WrF4WKIAd208dGAQEVbiTVChUjZMU9+pYQmPmBZpxgvVw+nDdKwQICNVcIXQEkTgKdDdUJ+/nggVAXK1xI3TEA6UIr2uJ8iBqka1cXXTlkqGoVYRZ7iLyqBSs0iiEtZQVKiDGxBI1u3NR6lUpGDKg8MSgEQCphU7Z22vhg0dILXRCpYLuSCcYJT4wqXASBQaBzU7klHxC127OHD7ZDJFpERqRt0x5OnwQpmZmCLEhrbgg4WIHO1RY+nbQ9WRGEDJlmnXwJ+9FBgXMCIzYMVijBBgYMFxIMqJBMSc0Ht7qh/+Gjpte2rnYsYeNlasWIBgQ6yCewIoPCCp/cyP/wgUGbXVu0QcADZNBDnh98gHMLGXYQUw02w61QU3wdbNWDbQVVIIhMMwFF1DaZiPLBAy7E04kafrjSizaK3LFNNc0AAYRQDsAHHQlJ2IDQJ2zE1+EKDjiAijShkECCC8Qgw4cr7ZgyzC2WaHPNLWWoNeNWPiRAw0QFWQFMhz8C+QQ20yAiVSrY+MGOJCsccsst2GCzoHFxxEGGC+8hgs0MB2kyCpgzrUDCbs1Es41UdtATHFFkWELMOtsoQsYcgvRRQw5RSDgGOjZMR1AvPQIq6KCo9AKOJWDd48owQlHR4DXEKP9iyRrK+DNNBTu4RwIPFeTAGUG7hAomkA84gEg1m6ADljy9PBKGGJY4ig0xlsTBRSn98FOFDUC8pwQOPkgHbCGAzhTkA850s0c7j6Hjix9+gBIrMXLeAccWXUCyiRBcBEECdEJ98KtAqtBCYQc/OvDENnl4gYpUxISCIjjzylkGGV9okYUVNogRhAOBuuAEhjG08wOgDYzAgA5bCjIoCe5uwUk80RKTTSppPREGGGCIISOQ9AXBg6cC6WIywvCpoMHAocRBwhP4bHLFLujYkV42xNxBRhAyGrc113EgYtRBerDDDHMoDCyQEL5sE083EkgwQyBhxGFHMM206DUixGxmE0wssbQjCQ4JCaFKFwgQTVAVVhQUwAVPIFJKrHfYYRwi6OCDzzuIJIFhXAD0EccPsYRiSyqKSDpFcWSMIcZRoBMkQyA2BGZDIKSYcggih8TRRg4VxM5QABVYYLxgwiev/PLMCxQQADs=",
"find": "R0lGODlhMAAiAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OSkpKRgYGAAAAAAAAAAAAAAAAAAAACH+AS4ALAAAAAAwACIAAAX/ICCOZGmeaKquY2AGLiuvMCAUBuHWc48Kh0iFInEYCb4kSQCxPBiMxkMigRQEgJiSFVBYHNGG0RiZOHjblWAiiY4fkDhEYoBp06dAWfyAQyKAgAwDaHgnB0RwgYASgQ0IhDuGJDAIFhMRVFSLEX8QCJJ4AQM5AgQHTZqqjBAOCQQEkWkCDRMUFQsICQ4Vm5maEwwHOAsPDTpKMAsUDlO4CssTcb+2DAp8YGCyNFoCEsZwFQ3QDRTTVBRS0g1QbgsCd5QAAwgIBwYFAwStzQ8UEdCKVchky0yVBw7YuXkAKt4IAg74vXHVagqFBRgXSCAyYWAVCH0SNhDTitCJfSL5/4RbAPKPhQYYjVCYYAvCP0BxEDaD8CheAAHNwqh8MMGPSwgLeJWhwHSjqkYI+xg4MMCEgQjtRvZ7UAYCpghMF7CxONOWJkYR+rCpY4JlVpVxKDwYWEactKW9mhYRtqCTgwgWEMArERSK1j5q//6T8KXonFsShpiJkAECgQYVjykooCVA0JGHEWNiYCHThTFeb3UkoiCCBgwGEKQ1kuAJlhFwhA71h5SukwUM5qqeCSGBgicEWkfNiWSERtBad4JNIBaQBaQah1ToyGZBAnsIuIJs1qnqiAIVjIE2gnAB1T5x0icgzXT79ipgMOOEH6HBbREBMJCeGEY08IoLAkzB1YYFwjxwSUGSNULQJnNUwRYlCcyEkALIxECAP9cNMMABYpRhy3ZsSLDaR70oUAiABGCkAxowCGCAAfDYIQACXoElGRsdXWDBdg2Y90IWktDYGYAB9PWHP0PMdFZaF07SQgAFNDAMAQg0QA1UC8xoZQl22JGFPgWkOUCOL1pZQyhjxinnnCWEAAA7",
"REPL": "R0lGODlhMAAjAPcAMf/////3//+lOf+UKf+MEPf///f39/f35/fv7/ecQvecOfecKfeUIfeUGPeUEPeUCPeMAO/37+/v9+/v3u/n3u/n1u+9jO+9c++1hO+ta++tY++tWu+tUu+tSu+lUu+lQu+lMe+UMe+UKe+UGO+UEO+UAO+MCOfv5+fvxufn7+fn5+fnzue9lOe9c+e1jOe1e+e1c+e1a+etWuetUuelQuecOeeUUueUCN7e597e3t7e1t7ezt7evd7Wzt7Oxt7Ovd7Otd7Opd7OnN7Gtd7Gpd69lN61hN6ta96lStbextberdbW3tbWztbWxtbOvdbOrda1hNalUtaECM7W1s7Ozs7Oxs7Otc7Gxs7Gvc69tc69rc69pc61jM6lc8bWlMbOvcbGxsbGpca9tca9pca1nMaMAL3OhL3Gtb21vb21tb2tpb2tnL2tlLW9tbW9pbW9e7W1pbWtjLWcKa21nK2tra2tnK2tlK2lpa2llK2ljK2le6WlnKWljKWUe6WUc6WUY5y1QpyclJycjJychJyUc5yMY5StY5SUe5SMhJSMe5SMc5SMWpSEa5SESoyUe4yMhIyEY4SlKYScWoSMe4SEe4SEa4R7c4R7Y3uMY3uEe3t7e3t7c3tza3tzY3trKXtjIXOcAHOUMXOEY3Nzc3NzWnNrSmulCGuUMWuMGGtzWmtrY2taMWtaGGOUOWOMAGNzUmNjWmNjSmNaUmNaQmNaOWNaIWNSCFqcAFpjUlpSMVpSIVpSEFpKKVKMAFJSUlJSSlJSMVJKMVJKGFJKAFI5CEqUAEqEAEpzQkpKIUpCQkpCGEpCAEo5EEoxAEJjOUJCOUJCAEI5IUIxADl7ADlaITlCOTkxMTkxKTkxEDkhADFzADFrGDE5OTExADEpEClrCCkxKSkpKSkpISkpACkhCCkhACkYACFzACFrACEhCCEYGBhjEBhjABghABgYCBgYABgQEBgQABAQABAIAAhjAAhSAAhKAAgIEAgICABaAABCAAAhAAAQAAAIAAAAAAAAACH+AS4ALAAAAAAwACMAAAj/AAEIHEiwoMGDCBMqXMiwocOHAA4cgEixIIIJO3JMmAjADIqKFU/8MHIkg5EgYXx4iaTkI0iHE6wE2TCggYILQayEAgXIy8uGCKz8sDCAQAMRG3iEcXULlJkJPwli3OFjh9UdYYLE6NBhA04UXHoVA2XoTZgfPKBWlOBDphAWOdfMcfMDLloeO3hIMjbWVCQ5Fn6E2UFxgpsgFjYIEBADrZU6luqEEfqjTqpt54z1uuWqTIcgWAk7PECGzIUQDRosDmxlUrVJkwQJkqVuX71v06YZcyUlROAdbnLAJKPFyAYFAhoMwFlnEh0rWkpz8raPHm7dqKKc/KFFkBUrVn1M/ziBcEIeLUEQI8/AYk0i9Be4sqjsrN66c9/OnbobhpR3HkIUoZ0WVnBE0AGLFKKFD0HAFUQe77HQgQI1hRBDEHMcY0899bBzihZuCPILJD8EccEGGzwAQhFaUHHQH82sUkgeNHISDBk8WCCCcsqFUEQWmOyzjz3sUGNNOO5Y48YOEgowAAQhnBScQV00k82V47jzjy9CXZBcjziFoco//4CDiSOyhPMPLkJZkEBqJmRQxA9uZGEQD8Ncmc044/zzDF2IZQBCCDYE8QMZz/iiCSx0neHGI7BIhhhNn+1gxRpokEcQAp7seWU7/PwTyxqG/iCEEVzQmUombnDRxRExzP9nBR2PCKLFD3UJwcMPa/SRqUGNWJmNOVn+M44ukMRB4KGcWDNLVhuUMEIJAlzwA3DJBHMJIXm4sQYhqyxCRQQGLSIsn1qac2UzysQSyzX/hLMGD0F0IMCODYAQBA9W/PKPOcRiw0wzwxTiokF9dLMnuv/Mo+fCZF7jBr0xbDDCACWEYKgb1vzjDp/jZNOMLX0IZxAKq2TZTjtaOjwOsXyG+s8sZJTIQsUdIGHoJPf8w487QI/TDSt5mGwQFZxc406o8HiDJchk/ltLHpSlJwSvz5DpTjvmuGNOM57koelBOaAhiCaaPBLL0wwbm003peRBnBZqJMJL1ECz/HXYYx/NdAIOOVCxQyLorswymU93o0wuwfAiTDNR/xz0MLXU0XdCE+UwSTRZAq2lsSATu+4wkGvt+TjNzPLrQyegAUku2Hij5cd8LhxyM8QIg4w18HgcdC6BTBFSDmfQqsovttveDcG7lFLHI75cE841sARCxeWsnxC4G9HADPK6ywzDCRqBo0EHHWhMgT1IJzziNci1N7PMKnSYfML96/90AiJKey/0KtbLX1QK0rrNnQ541xugQ7SHhkXBghN0SKACWRc4KlAhBwKcIOYymJCAAAA7",
"repl": "R0lGODlhMAAjAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OTExMSkpKSEhIRgYGBAQEAgICAAAACH+AS4ALAAAAAAwACMAAAX/ICCOZGmeaKqubOu+gCDANBkIQ1EMQhAghFptYEAkEgjEwXBo7ISvweGgWCwUysPjwTgEoCafTySYIhYMxgLBjEQgCULvCw0QdAZdoVhUIJUFChISEAxYeQM1N1OMTAp+UwZ5eA4TEhFbDWYFdC4ECVMJjwl5BwsQa0umEhUVlhESDgqlBp0rAn5nVpBMDxeZDRQbHBgWFBSWDgtLBnFjKwRYCI9VqQsPs0YKEcMXFq0UEalFDWx4BAO2IwPjppAKDkrTWKYUGd7fEJJFEZpM00cOzCgh4EE8SaoWxKNixQooBRMyZMBwAYIRBhUgLDGS4MoBJeoANMhAgQsaCRZm/5lqaCUJhA4cNHjDoKEDBlJUHqkBlYBTiQUZNGjYMMxDhY3VWk6R4MEDBoMUak5AqoYBqANIBo4wcGGDUKIeLlzVZmWJggsVIkwAZaQSA3kdZzlKkIiEAAlDvW5oOkEBs488JTw44oeUIwdvVTFTUK7uiAAPgubt8GFDhQepqETAQCFU1UMGzlqAgFhUsAcCS0AO6lUDhw8xNRSbENGDhgWSHjWUe6ACbKITizmopZoBa6KvOwj9uuHDhwxyj3xekgDDhw5EvWKo0IB4iQLCOCC/njc7ZQ8UeGvza+ABZZgcxJNc4FO1gc0cOsCUrHevc8tdIMTIAhc4F198G2Qwwd8CBIQUAwEINABBBJUwR9R5wElgVRLwWODBBx4cGB8GEzDQIAo33CGJA8gh+JoH/clUgQU0YvDhdfmJdwEFC6Sjgg8yEPAABsPkh2F22cl2AQbn6QdTghTQ5eAJAQyQAAQV0MSBB9gRVZ4GE1mw5JZOAmiAVi1UWcAZDrDyZXYTeaOhA/bIVuIBPtKQ4h7ViYekUPdcEAEbzTzCRp5CADmAAwj+ORGPBcgwAAHo9ABGCYtm0ChwFHShlRiXhmHlkAcCiOeUodqQw5W0oXLAiamy4MOkjOyAaqxUymApDCEAADs=",
}
colors = ["#FF7B39", "#80F121"]
emphColors = ["#DAFC33", "#F42548"]
fieldParams = {
    "height": 3,
    "width": 70,
    "font": ("monaco", 14),
    "highlightthickness": 0,
    "borderwidth": 0,
    "background": "white",
}
textParams = {
    "bg": "#F7E0D4",
    "fg": "#2321F1",
    "highlightthickness": 0,
    "width": 1,
    "height": 10,
    "font": ("verdana", 16),
    "wrap": "word",
}


class Zone:
    def __init__(self, image, initialField, initialText):
        frm = Frame(root)
        frm.config(background="white")
        self.image = PhotoImage(format="gif", data=images[image.upper()])
        self.imageDimmed = PhotoImage(format="gif", data=images[image])
        self.img = Label(frm)
        self.img.config(borderwidth=0)
        self.img.pack(side="left")
        self.fld = Text(frm, **fieldParams)
        self.initScrollText(frm, self.fld, initialField)
        frm = Frame(root)
        self.txt = Text(frm, **textParams)
        self.initScrollText(frm, self.txt, initialText)
        for i in range(2):
            self.txt.tag_config(colors[i], background=colors[i])
            self.txt.tag_config("emph" + colors[i], foreground=emphColors[i])

    def initScrollText(self, frm, txt, contents):
        scl = Scrollbar(frm)
        scl.config(command=txt.yview)
        scl.pack(side="right", fill="y")
        txt.pack(side="left", expand=True, fill="x")
        txt.config(yscrollcommand=scl.set)
        txt.insert("1.0", contents)
        frm.pack(fill="x")
        Frame(height=2, bd=1, relief="ridge").pack(fill="x")

    def refresh(self):
        self.colorCycle = itertools.cycle(colors)
        try:
            self.substitute()
            self.img.config(image=self.image)
        except re.error:
            self.img.config(image=self.imageDimmed)


class FindZone(Zone):
    def addTags(self, m):
        color = next(self.colorCycle)
        self.txt.tag_add(color, "1.0+%sc" % m.start(), "1.0+%sc" % m.end())
        try:
            self.txt.tag_add(
                "emph" + color, "1.0+%sc" % m.start("emph"), "1.0+%sc" % m.end("emph")
            )
        except:
            pass

    def substitute(self, *args):
        for color in colors:
            self.txt.tag_remove(color, "1.0", "end")
            self.txt.tag_remove("emph" + color, "1.0", "end")
        self.rex = re.compile("")  # default value in case of malformed regexp
        self.rex = re.compile(self.fld.get("1.0", "end")[:-1], re.MULTILINE)
        try:
            re.compile("(?P<emph>%s)" % self.fld.get(SEL_FIRST, SEL_LAST))
            self.rexSel = re.compile(
                "%s(?P<emph>%s)%s"
                % (
                    self.fld.get("1.0", SEL_FIRST),
                    self.fld.get(SEL_FIRST, SEL_LAST),
                    self.fld.get(SEL_LAST, "end")[:-1],
                ),
                re.MULTILINE,
            )
        except:
            self.rexSel = self.rex
        self.rexSel.sub(self.addTags, self.txt.get("1.0", "end"))


class ReplaceZone(Zone):
    def addTags(self, m):
        s = sz.rex.sub(self.repl, m.group())
        self.txt.delete(
            "1.0+%sc" % (m.start() + self.diff), "1.0+%sc" % (m.end() + self.diff)
        )
        self.txt.insert("1.0+%sc" % (m.start() + self.diff), s, next(self.colorCycle))
        self.diff += len(s) - (m.end() - m.start())

    def substitute(self):
        self.txt.delete("1.0", "end")
        self.txt.insert("1.0", sz.txt.get("1.0", "end")[:-1])
        self.diff = 0
        self.repl = rex0.sub(r"\\g<\1>", self.fld.get("1.0", "end")[:-1])
        sz.rex.sub(self.addTags, sz.txt.get("1.0", "end")[:-1])


def launchRefresh(_):
    sz.fld.after_idle(sz.refresh)
    rz.fld.after_idle(rz.refresh)


def app():
    global root, sz, rz, rex0
    root = Tk()
    root.resizable(height=False, width=True)
    root.title(windowTitle)
    root.minsize(width=250, height=0)
    sz = FindZone("find", initialFind, initialText)
|
||||
sz.fld.bind("<Button-1>", launchRefresh)
|
||||
sz.fld.bind("<ButtonRelease-1>", launchRefresh)
|
||||
sz.fld.bind("<B1-Motion>", launchRefresh)
|
||||
sz.rexSel = re.compile("")
|
||||
rz = ReplaceZone("repl", initialRepl, "")
|
||||
rex0 = re.compile(r"(?<!\\)\\([0-9]+)")
|
||||
root.bind_all("<Key>", launchRefresh)
|
||||
launchRefresh(None)
|
||||
root.mainloop()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app()
|
||||
|
||||
__all__ = ["app"]
1052
Backend/venv/lib/python3.12/site-packages/nltk/app/rdparser_app.py
Normal file
File diff suppressed because it is too large
@@ -0,0 +1,937 @@
# Natural Language Toolkit: Shift-Reduce Parser Application
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
A graphical tool for exploring the shift-reduce parser.

The shift-reduce parser maintains a stack, which records the structure
of the portion of the text that has been parsed. The stack is
initially empty. Its contents are shown on the left side of the main
canvas.

On the right side of the main canvas is the remaining text. This is
the portion of the text which has not yet been considered by the
parser.

The parser builds up a tree structure for the text using two
operations:

  - "shift" moves the first token from the remaining text to the top
    of the stack. In the demo, the top of the stack is its right-hand
    side.
  - "reduce" uses a grammar production to combine the rightmost stack
    elements into a single tree token.

You can control the parser's operation by using the "shift" and
"reduce" buttons; or you can use the "step" button to let the parser
automatically decide which operation to apply. The parser uses the
following rules to decide which operation to apply:

  - Only shift if no reductions are available.
  - If multiple reductions are available, then apply the reduction
    whose CFG production is listed earliest in the grammar.

The "reduce" button applies the reduction whose CFG production is
listed earliest in the grammar. There are two ways to manually choose
which reduction to apply:

  - Click on a CFG production from the list of available reductions,
    on the left side of the main window. The reduction based on that
    production will be applied to the top of the stack.
  - Click on one of the stack elements. A popup window will appear,
    containing all available reductions. Select one, and it will be
    applied to the top of the stack.

Note that reductions can only be applied to the top of the stack.

Keyboard Shortcuts::
      [Space]\t Perform the next shift or reduce operation
      [s]\t Perform a shift operation
      [r]\t Perform a reduction operation
      [Ctrl-z]\t Undo most recent operation
      [Delete]\t Reset the parser
      [g]\t Show/hide available production list
      [Ctrl-a]\t Toggle animations
      [h]\t Help
      [Ctrl-p]\t Print
      [q]\t Quit
"""

from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk
from tkinter.font import Font

from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment
from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget
from nltk.parse import SteppingShiftReduceParser
from nltk.tree import Tree
from nltk.util import in_idle

"""
Possible future improvements:
  - button/window to change and/or select text. Just pop up a window
    with an entry, and let them modify the text; and then retokenize
    it? Maybe give a warning if it contains tokens whose types are
    not in the grammar.
  - button/window to change and/or select grammar. Select from
    several alternative grammars? Or actually change the grammar? If
    the latter, then I'd want to define nltk.draw.cfg, which would be
    responsible for that.
"""


class ShiftReduceApp:
    """
    A graphical tool for exploring the shift-reduce parser. The tool
    displays the parser's stack and the remaining text, and allows the
    user to control the parser's operation. In particular, the user
    can shift tokens onto the stack, and can perform reductions on the
    top elements of the stack. A "step" button simply steps through
    the parsing process, performing the operations that
    ``nltk.parse.ShiftReduceParser`` would use.
    """

    def __init__(self, grammar, sent, trace=0):
        self._sent = sent
        self._parser = SteppingShiftReduceParser(grammar, trace)

        # Set up the main window.
        self._top = Tk()
        self._top.title("Shift Reduce Parser Application")

        # Animations. animating_lock is a lock to prevent the demo
        # from performing new operations while it's animating.
        self._animating_lock = 0
        self._animate = IntVar(self._top)
        self._animate.set(10)  # = medium

        # The user can hide the grammar.
        self._show_grammar = IntVar(self._top)
        self._show_grammar.set(1)

        # Initialize fonts.
        self._init_fonts(self._top)

        # Set up key bindings.
        self._init_bindings()

        # Create the basic frames.
        self._init_menubar(self._top)
        self._init_buttons(self._top)
        self._init_feedback(self._top)
        self._init_grammar(self._top)
        self._init_canvas(self._top)

        # A popup menu for reducing.
        self._reduce_menu = Menu(self._canvas, tearoff=0)

        # Reset the demo, and set the feedback frame to empty.
        self.reset()
        self._lastoper1["text"] = ""

    #########################################
    ## Initialization Helpers
    #########################################

    def _init_fonts(self, root):
        # See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
        self._sysfont = Font(font=Button()["font"])
        root.option_add("*Font", self._sysfont)

        # What's our font size (default=same as sysfont)
        self._size = IntVar(root)
        self._size.set(self._sysfont.cget("size"))

        self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
        self._font = Font(family="helvetica", size=self._size.get())

    def _init_grammar(self, parent):
        # Grammar view.
        self._prodframe = listframe = Frame(parent)
        self._prodframe.pack(fill="both", side="left", padx=2)
        self._prodlist_label = Label(
            self._prodframe, font=self._boldfont, text="Available Reductions"
        )
        self._prodlist_label.pack()
        self._prodlist = Listbox(
            self._prodframe,
            selectmode="single",
            relief="groove",
            background="white",
            foreground="#909090",
            font=self._font,
            selectforeground="#004040",
            selectbackground="#c0f0c0",
        )

        self._prodlist.pack(side="right", fill="both", expand=1)

        self._productions = list(self._parser.grammar().productions())
        for production in self._productions:
            self._prodlist.insert("end", (" %s" % production))
        self._prodlist.config(height=min(len(self._productions), 25))

        # Add a scrollbar if there are more than 25 productions.
        if 1:  # len(self._productions) > 25:
            listscroll = Scrollbar(self._prodframe, orient="vertical")
            self._prodlist.config(yscrollcommand=listscroll.set)
            listscroll.config(command=self._prodlist.yview)
            listscroll.pack(side="left", fill="y")

        # If they select a production, apply it.
        self._prodlist.bind("<<ListboxSelect>>", self._prodlist_select)

        # When they hover over a production, highlight it.
        self._hover = -1
        self._prodlist.bind("<Motion>", self._highlight_hover)
        self._prodlist.bind("<Leave>", self._clear_hover)

    def _init_bindings(self):
        # Quit
        self._top.bind("<Control-q>", self.destroy)
        self._top.bind("<Control-x>", self.destroy)
        self._top.bind("<Alt-q>", self.destroy)
        self._top.bind("<Alt-x>", self.destroy)

        # Ops (step, shift, reduce, undo)
        self._top.bind("<space>", self.step)
        self._top.bind("<s>", self.shift)
        self._top.bind("<Alt-s>", self.shift)
        self._top.bind("<Control-s>", self.shift)
        self._top.bind("<r>", self.reduce)
        self._top.bind("<Alt-r>", self.reduce)
        self._top.bind("<Control-r>", self.reduce)
        self._top.bind("<Delete>", self.reset)
        self._top.bind("<u>", self.undo)
        self._top.bind("<Alt-u>", self.undo)
        self._top.bind("<Control-u>", self.undo)
        self._top.bind("<Control-z>", self.undo)
        self._top.bind("<BackSpace>", self.undo)

        # Misc
        self._top.bind("<Control-p>", self.postscript)
        self._top.bind("<Control-h>", self.help)
        self._top.bind("<F1>", self.help)
        self._top.bind("<Control-g>", self.edit_grammar)
        self._top.bind("<Control-t>", self.edit_sentence)

        # Animation speed control
        self._top.bind("-", lambda e, a=self._animate: a.set(20))
        self._top.bind("=", lambda e, a=self._animate: a.set(10))
        self._top.bind("+", lambda e, a=self._animate: a.set(4))

    def _init_buttons(self, parent):
        # Set up the frames.
        self._buttonframe = buttonframe = Frame(parent)
        buttonframe.pack(fill="none", side="bottom")
        Button(
            buttonframe,
            text="Step",
            background="#90c0d0",
            foreground="black",
            command=self.step,
        ).pack(side="left")
        Button(
            buttonframe,
            text="Shift",
            underline=0,
            background="#90f090",
            foreground="black",
            command=self.shift,
        ).pack(side="left")
        Button(
            buttonframe,
            text="Reduce",
            underline=0,
            background="#90f090",
            foreground="black",
            command=self.reduce,
        ).pack(side="left")
        Button(
            buttonframe,
            text="Undo",
            underline=0,
            background="#f0a0a0",
            foreground="black",
            command=self.undo,
        ).pack(side="left")

    def _init_menubar(self, parent):
        menubar = Menu(parent)

        filemenu = Menu(menubar, tearoff=0)
        filemenu.add_command(
            label="Reset Parser", underline=0, command=self.reset, accelerator="Del"
        )
        filemenu.add_command(
            label="Print to Postscript",
            underline=0,
            command=self.postscript,
            accelerator="Ctrl-p",
        )
        filemenu.add_command(
            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
        )
        menubar.add_cascade(label="File", underline=0, menu=filemenu)

        editmenu = Menu(menubar, tearoff=0)
        editmenu.add_command(
            label="Edit Grammar",
            underline=5,
            command=self.edit_grammar,
            accelerator="Ctrl-g",
        )
        editmenu.add_command(
            label="Edit Text",
            underline=5,
            command=self.edit_sentence,
            accelerator="Ctrl-t",
        )
        menubar.add_cascade(label="Edit", underline=0, menu=editmenu)

        rulemenu = Menu(menubar, tearoff=0)
        rulemenu.add_command(
            label="Step", underline=1, command=self.step, accelerator="Space"
        )
        rulemenu.add_separator()
        rulemenu.add_command(
            label="Shift", underline=0, command=self.shift, accelerator="Ctrl-s"
        )
        rulemenu.add_command(
            label="Reduce", underline=0, command=self.reduce, accelerator="Ctrl-r"
        )
        rulemenu.add_separator()
        rulemenu.add_command(
            label="Undo", underline=0, command=self.undo, accelerator="Ctrl-u"
        )
        menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)

        viewmenu = Menu(menubar, tearoff=0)
        viewmenu.add_checkbutton(
            label="Show Grammar",
            underline=0,
            variable=self._show_grammar,
            command=self._toggle_grammar,
        )
        viewmenu.add_separator()
        viewmenu.add_radiobutton(
            label="Tiny",
            variable=self._size,
            underline=0,
            value=10,
            command=self.resize,
        )
        viewmenu.add_radiobutton(
            label="Small",
            variable=self._size,
            underline=0,
            value=12,
            command=self.resize,
        )
        viewmenu.add_radiobutton(
            label="Medium",
            variable=self._size,
            underline=0,
            value=14,
            command=self.resize,
        )
        viewmenu.add_radiobutton(
            label="Large",
            variable=self._size,
            underline=0,
            value=18,
            command=self.resize,
        )
        viewmenu.add_radiobutton(
            label="Huge",
            variable=self._size,
            underline=0,
            value=24,
            command=self.resize,
        )
        menubar.add_cascade(label="View", underline=0, menu=viewmenu)

        animatemenu = Menu(menubar, tearoff=0)
        animatemenu.add_radiobutton(
            label="No Animation", underline=0, variable=self._animate, value=0
        )
        animatemenu.add_radiobutton(
            label="Slow Animation",
            underline=0,
            variable=self._animate,
            value=20,
            accelerator="-",
        )
        animatemenu.add_radiobutton(
            label="Normal Animation",
            underline=0,
            variable=self._animate,
            value=10,
            accelerator="=",
        )
        animatemenu.add_radiobutton(
            label="Fast Animation",
            underline=0,
            variable=self._animate,
            value=4,
            accelerator="+",
        )
        menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)

        helpmenu = Menu(menubar, tearoff=0)
        helpmenu.add_command(label="About", underline=0, command=self.about)
        helpmenu.add_command(
            label="Instructions", underline=0, command=self.help, accelerator="F1"
        )
        menubar.add_cascade(label="Help", underline=0, menu=helpmenu)

        parent.config(menu=menubar)

    def _init_feedback(self, parent):
        self._feedbackframe = feedbackframe = Frame(parent)
        feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3)
        self._lastoper_label = Label(
            feedbackframe, text="Last Operation:", font=self._font
        )
        self._lastoper_label.pack(side="left")
        lastoperframe = Frame(feedbackframe, relief="sunken", border=1)
        lastoperframe.pack(fill="x", side="right", expand=1, padx=5)
        self._lastoper1 = Label(
            lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font
        )
        self._lastoper2 = Label(
            lastoperframe,
            anchor="w",
            width=30,
            foreground="#004040",
            background="#f0f0f0",
            font=self._font,
        )
        self._lastoper1.pack(side="left")
        self._lastoper2.pack(side="left", fill="x", expand=1)

    def _init_canvas(self, parent):
        self._cframe = CanvasFrame(
            parent,
            background="white",
            width=525,
            closeenough=10,
            border=2,
            relief="sunken",
        )
        self._cframe.pack(expand=1, fill="both", side="top", pady=2)
        canvas = self._canvas = self._cframe.canvas()

        self._stackwidgets = []
        self._rtextwidgets = []
        self._titlebar = canvas.create_rectangle(
            0, 0, 0, 0, fill="#c0f0f0", outline="black"
        )
        self._exprline = canvas.create_line(0, 0, 0, 0, dash=".")
        self._stacktop = canvas.create_line(0, 0, 0, 0, fill="#408080")
        size = self._size.get() + 4
        self._stacklabel = TextWidget(
            canvas, "Stack", color="#004040", font=self._boldfont
        )
        self._rtextlabel = TextWidget(
            canvas, "Remaining Text", color="#004040", font=self._boldfont
        )
        self._cframe.add_widget(self._stacklabel)
        self._cframe.add_widget(self._rtextlabel)

    #########################################
    ## Main draw procedure
    #########################################

    def _redraw(self):
        scrollregion = self._canvas["scrollregion"].split()
        (cx1, cy1, cx2, cy2) = (int(c) for c in scrollregion)

        # Delete the old stack & rtext widgets.
        for stackwidget in self._stackwidgets:
            self._cframe.destroy_widget(stackwidget)
        self._stackwidgets = []
        for rtextwidget in self._rtextwidgets:
            self._cframe.destroy_widget(rtextwidget)
        self._rtextwidgets = []

        # Position the titlebar & exprline
        (x1, y1, x2, y2) = self._stacklabel.bbox()
        y = y2 - y1 + 10
        self._canvas.coords(self._titlebar, -5000, 0, 5000, y - 4)
        self._canvas.coords(self._exprline, 0, y * 2 - 10, 5000, y * 2 - 10)

        # Position the titlebar labels..
        (x1, y1, x2, y2) = self._stacklabel.bbox()
        self._stacklabel.move(5 - x1, 3 - y1)
        (x1, y1, x2, y2) = self._rtextlabel.bbox()
        self._rtextlabel.move(cx2 - x2 - 5, 3 - y1)

        # Draw the stack.
        stackx = 5
        for tok in self._parser.stack():
            if isinstance(tok, Tree):
                attribs = {
                    "tree_color": "#4080a0",
                    "tree_width": 2,
                    "node_font": self._boldfont,
                    "node_color": "#006060",
                    "leaf_color": "#006060",
                    "leaf_font": self._font,
                }
                widget = tree_to_treesegment(self._canvas, tok, **attribs)
                widget.label()["color"] = "#000000"
            else:
                widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
            widget.bind_click(self._popup_reduce)
            self._stackwidgets.append(widget)
            self._cframe.add_widget(widget, stackx, y)
            stackx = widget.bbox()[2] + 10

        # Draw the remaining text.
        rtextwidth = 0
        for tok in self._parser.remaining_text():
            widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
            self._rtextwidgets.append(widget)
            self._cframe.add_widget(widget, rtextwidth, y)
            rtextwidth = widget.bbox()[2] + 4

        # Allow enough room to shift the next token (for animations)
        if len(self._rtextwidgets) > 0:
            stackx += self._rtextwidgets[0].width()

        # Move the remaining text to the correct location (keep it
        # right-justified, when possible); and move the remaining text
        # label, if necessary.
        stackx = max(stackx, self._stacklabel.width() + 25)
        rlabelwidth = self._rtextlabel.width() + 10
        if stackx >= cx2 - max(rtextwidth, rlabelwidth):
            cx2 = stackx + max(rtextwidth, rlabelwidth)
        for rtextwidget in self._rtextwidgets:
            rtextwidget.move(4 + cx2 - rtextwidth, 0)
        self._rtextlabel.move(cx2 - self._rtextlabel.bbox()[2] - 5, 0)

        midx = (stackx + cx2 - max(rtextwidth, rlabelwidth)) / 2
        self._canvas.coords(self._stacktop, midx, 0, midx, 5000)
        (x1, y1, x2, y2) = self._stacklabel.bbox()

        # Set up binding to allow them to shift a token by dragging it.
        if len(self._rtextwidgets) > 0:

            def drag_shift(widget, midx=midx, self=self):
                if widget.bbox()[0] < midx:
                    self.shift()
                else:
                    self._redraw()

            self._rtextwidgets[0].bind_drag(drag_shift)
            self._rtextwidgets[0].bind_click(self.shift)

        # Draw the stack top.
        self._highlight_productions()

    def _draw_stack_top(self, widget):
        # hack..
        midx = widget.bbox()[2] + 50
        self._canvas.coords(self._stacktop, midx, 0, midx, 5000)

    def _highlight_productions(self):
        # Highlight the productions that can be reduced.
        self._prodlist.selection_clear(0, "end")
        for prod in self._parser.reducible_productions():
            index = self._productions.index(prod)
            self._prodlist.selection_set(index)

    #########################################
    ## Button Callbacks
    #########################################

    def destroy(self, *e):
        if self._top is None:
            return
        self._top.destroy()
        self._top = None

    def reset(self, *e):
        self._parser.initialize(self._sent)
        self._lastoper1["text"] = "Reset App"
        self._lastoper2["text"] = ""
        self._redraw()

    def step(self, *e):
        if self.reduce():
            return True
        elif self.shift():
            return True
        else:
            if list(self._parser.parses()):
                self._lastoper1["text"] = "Finished:"
                self._lastoper2["text"] = "Success"
            else:
                self._lastoper1["text"] = "Finished:"
                self._lastoper2["text"] = "Failure"

    def shift(self, *e):
        if self._animating_lock:
            return
        if self._parser.shift():
            tok = self._parser.stack()[-1]
            self._lastoper1["text"] = "Shift:"
            self._lastoper2["text"] = "%r" % tok
            if self._animate.get():
                self._animate_shift()
            else:
                self._redraw()
            return True
        return False

    def reduce(self, *e):
        if self._animating_lock:
            return
        production = self._parser.reduce()
        if production:
            self._lastoper1["text"] = "Reduce:"
            self._lastoper2["text"] = "%s" % production
            if self._animate.get():
                self._animate_reduce()
            else:
                self._redraw()
        return production

    def undo(self, *e):
        if self._animating_lock:
            return
        if self._parser.undo():
            self._redraw()

    def postscript(self, *e):
        self._cframe.print_to_file()

    def mainloop(self, *args, **kwargs):
        """
        Enter the Tkinter mainloop. This function must be called if
        this demo is created from a non-interactive program (e.g.
        from a script); otherwise, the demo will close as soon as
        the script completes.
        """
        if in_idle():
            return
        self._top.mainloop(*args, **kwargs)

    #########################################
    ## Menubar callbacks
    #########################################

    def resize(self, size=None):
        if size is not None:
            self._size.set(size)
        size = self._size.get()
        self._font.configure(size=-(abs(size)))
        self._boldfont.configure(size=-(abs(size)))
        self._sysfont.configure(size=-(abs(size)))

        # self._stacklabel['font'] = ('helvetica', -size-4, 'bold')
        # self._rtextlabel['font'] = ('helvetica', -size-4, 'bold')
        # self._lastoper_label['font'] = ('helvetica', -size)
        # self._lastoper1['font'] = ('helvetica', -size)
        # self._lastoper2['font'] = ('helvetica', -size)
        # self._prodlist['font'] = ('helvetica', -size)
        # self._prodlist_label['font'] = ('helvetica', -size-2, 'bold')
        self._redraw()

    def help(self, *e):
        # The default font's not very legible; try using 'fixed' instead.
        try:
            ShowText(
                self._top,
                "Help: Shift-Reduce Parser Application",
                (__doc__ or "").strip(),
                width=75,
                font="fixed",
            )
        except:
            ShowText(
                self._top,
                "Help: Shift-Reduce Parser Application",
                (__doc__ or "").strip(),
                width=75,
            )

    def about(self, *e):
        ABOUT = "NLTK Shift-Reduce Parser Application\n" + "Written by Edward Loper"
        TITLE = "About: Shift-Reduce Parser Application"
        try:
            from tkinter.messagebox import Message

            Message(message=ABOUT, title=TITLE).show()
        except:
            ShowText(self._top, TITLE, ABOUT)

    def edit_grammar(self, *e):
        CFGEditor(self._top, self._parser.grammar(), self.set_grammar)

    def set_grammar(self, grammar):
        self._parser.set_grammar(grammar)
        self._productions = list(grammar.productions())
        self._prodlist.delete(0, "end")
        for production in self._productions:
            self._prodlist.insert("end", (" %s" % production))

    def edit_sentence(self, *e):
        sentence = " ".join(self._sent)
        title = "Edit Text"
        instr = "Enter a new sentence to parse."
        EntryDialog(self._top, sentence, instr, self.set_sentence, title)

    def set_sentence(self, sent):
        self._sent = sent.split()  # [XX] use tagged?
        self.reset()

    #########################################
    ## Reduce Production Selection
    #########################################

    def _toggle_grammar(self, *e):
        if self._show_grammar.get():
            self._prodframe.pack(
                fill="both", side="left", padx=2, after=self._feedbackframe
            )
            self._lastoper1["text"] = "Show Grammar"
        else:
            self._prodframe.pack_forget()
            self._lastoper1["text"] = "Hide Grammar"
        self._lastoper2["text"] = ""

    def _prodlist_select(self, event):
        selection = self._prodlist.curselection()
        if len(selection) != 1:
            return
        index = int(selection[0])
        production = self._parser.reduce(self._productions[index])
        if production:
            self._lastoper1["text"] = "Reduce:"
            self._lastoper2["text"] = "%s" % production
            if self._animate.get():
                self._animate_reduce()
            else:
                self._redraw()
        else:
            # Reset the production selections.
            self._prodlist.selection_clear(0, "end")
            for prod in self._parser.reducible_productions():
                index = self._productions.index(prod)
                self._prodlist.selection_set(index)

    def _popup_reduce(self, widget):
        # Remove old commands.
        productions = self._parser.reducible_productions()
        if len(productions) == 0:
            return

        self._reduce_menu.delete(0, "end")
        for production in productions:
            self._reduce_menu.add_command(label=str(production), command=self.reduce)
        self._reduce_menu.post(
            self._canvas.winfo_pointerx(), self._canvas.winfo_pointery()
        )

    #########################################
    ## Animations
    #########################################

    def _animate_shift(self):
        # What widget are we shifting?
        widget = self._rtextwidgets[0]

        # Where are we shifting from & to?
        right = widget.bbox()[0]
        if len(self._stackwidgets) == 0:
            left = 5
        else:
            left = self._stackwidgets[-1].bbox()[2] + 10

        # Start animating.
        dt = self._animate.get()
        dx = (left - right) * 1.0 / dt
        self._animate_shift_frame(dt, widget, dx)

    def _animate_shift_frame(self, frame, widget, dx):
        if frame > 0:
            self._animating_lock = 1
            widget.move(dx, 0)
            self._top.after(10, self._animate_shift_frame, frame - 1, widget, dx)
        else:
            # but: stacktop??

            # Shift the widget to the stack.
            del self._rtextwidgets[0]
            self._stackwidgets.append(widget)
            self._animating_lock = 0

            # Display the available productions.
            self._draw_stack_top(widget)
            self._highlight_productions()

    def _animate_reduce(self):
        # What widgets are we shifting?
        numwidgets = len(self._parser.stack()[-1])  # number of children
        widgets = self._stackwidgets[-numwidgets:]

        # How far are we moving?
        if isinstance(widgets[0], TreeSegmentWidget):
            ydist = 15 + widgets[0].label().height()
        else:
            ydist = 15 + widgets[0].height()

        # Start animating.
        dt = self._animate.get()
        dy = ydist * 2.0 / dt
        self._animate_reduce_frame(dt / 2, widgets, dy)

    def _animate_reduce_frame(self, frame, widgets, dy):
        if frame > 0:
            self._animating_lock = 1
            for widget in widgets:
                widget.move(0, dy)
            self._top.after(10, self._animate_reduce_frame, frame - 1, widgets, dy)
        else:
            del self._stackwidgets[-len(widgets) :]
            for widget in widgets:
                self._cframe.remove_widget(widget)
            tok = self._parser.stack()[-1]
            if not isinstance(tok, Tree):
                raise ValueError()
            label = TextWidget(
                self._canvas, str(tok.label()), color="#006060", font=self._boldfont
            )
            widget = TreeSegmentWidget(self._canvas, label, widgets, width=2)
            (x1, y1, x2, y2) = self._stacklabel.bbox()
            y = y2 - y1 + 10
            if not self._stackwidgets:
                x = 5
            else:
                x = self._stackwidgets[-1].bbox()[2] + 10
            self._cframe.add_widget(widget, x, y)
            self._stackwidgets.append(widget)

            # Display the available productions.
            self._draw_stack_top(widget)
            self._highlight_productions()

            # # Delete the old widgets..
            # del self._stackwidgets[-len(widgets):]
            # for widget in widgets:
            #     self._cframe.destroy_widget(widget)
            #
            # # Make a new one.
            # tok = self._parser.stack()[-1]
            # if isinstance(tok, Tree):
            #     attribs = {'tree_color': '#4080a0', 'tree_width': 2,
            #                'node_font': bold, 'node_color': '#006060',
            #                'leaf_color': '#006060', 'leaf_font':self._font}
            #     widget = tree_to_treesegment(self._canvas, tok.type(),
            #                                  **attribs)
            #     widget.node()['color'] = '#000000'
            # else:
            #     widget = TextWidget(self._canvas, tok.type(),
            #                         color='#000000', font=self._font)
            # widget.bind_click(self._popup_reduce)
            # (x1, y1, x2, y2) = self._stacklabel.bbox()
            # y = y2-y1+10
            # if not self._stackwidgets: x = 5
            # else: x = self._stackwidgets[-1].bbox()[2] + 10
            # self._cframe.add_widget(widget, x, y)
            # self._stackwidgets.append(widget)

            # self._redraw()
            self._animating_lock = 0

    #########################################
    ## Hovering.
    #########################################

    def _highlight_hover(self, event):
        # What production are we hovering over?
        index = self._prodlist.nearest(event.y)
        if self._hover == index:
            return

        # Clear any previous hover highlighting.
        self._clear_hover()

        # If the production corresponds to an available reduction,
        # highlight the stack.
        selection = [int(s) for s in self._prodlist.curselection()]
        if index in selection:
            rhslen = len(self._productions[index].rhs())
            for stackwidget in self._stackwidgets[-rhslen:]:
                if isinstance(stackwidget, TreeSegmentWidget):
                    stackwidget.label()["color"] = "#00a000"
                else:
                    stackwidget["color"] = "#00a000"

        # Remember what production we're hovering over.
        self._hover = index

    def _clear_hover(self, *event):
        # Clear any previous hover highlighting.
        if self._hover == -1:
            return
        self._hover = -1
        for stackwidget in self._stackwidgets:
            if isinstance(stackwidget, TreeSegmentWidget):
                stackwidget.label()["color"] = "black"
            else:
                stackwidget["color"] = "black"


def app():
    """
    Create a shift reduce parser app, using a simple grammar and
    text.
    """

    from nltk.grammar import CFG, Nonterminal, Production

    nonterminals = "S VP NP PP P N Name V Det"
    (S, VP, NP, PP, P, N, Name, V, Det) = (Nonterminal(s) for s in nonterminals.split())

    productions = (
        # Syntactic Productions
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        # Lexical Productions
        Production(NP, ["I"]),
        Production(Det, ["the"]),
        Production(Det, ["a"]),
        Production(N, ["man"]),
        Production(V, ["saw"]),
        Production(P, ["in"]),
        Production(P, ["with"]),
        Production(N, ["park"]),
        Production(N, ["dog"]),
        Production(N, ["statue"]),
        Production(Det, ["my"]),
    )

    grammar = CFG(S, productions)

    # tokenize the sentence
    sent = "my dog saw a man in the park with a statue".split()

    ShiftReduceApp(grammar, sent).mainloop()


if __name__ == "__main__":
    app()

__all__ = ["app"]
@@ -0,0 +1,36 @@
# Natural Language Toolkit: Wordfreq Application
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from matplotlib import pylab

from nltk.corpus import gutenberg
from nltk.text import Text


def plot_word_freq_dist(text):
    fd = text.vocab()

    samples = [item for item, _ in fd.most_common(50)]
    values = [fd[sample] for sample in samples]
    values = [sum(values[: i + 1]) * 100.0 / fd.N() for i in range(len(values))]
    pylab.title(text.name)
    pylab.xlabel("Samples")
    pylab.ylabel("Cumulative Percentage")
    pylab.plot(values)
    pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90)
    pylab.show()


def app():
    t1 = Text(gutenberg.words("melville-moby_dick.txt"))
    plot_word_freq_dist(t1)


if __name__ == "__main__":
    app()

__all__ = ["app"]
1006
Backend/venv/lib/python3.12/site-packages/nltk/app/wordnet_app.py
Normal file
File diff suppressed because it is too large
213
Backend/venv/lib/python3.12/site-packages/nltk/book.py
Normal file
@@ -0,0 +1,213 @@
# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.corpus import (
    genesis,
    gutenberg,
    inaugural,
    nps_chat,
    treebank,
    webtext,
    wordnet,
)
from nltk.probability import FreqDist
from nltk.text import Text
from nltk.util import bigrams

print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

text1 = Text(gutenberg.words("melville-moby_dick.txt"))
print("text1:", text1.name)

text2 = Text(gutenberg.words("austen-sense.txt"))
print("text2:", text2.name)

text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
print("text8:", text8.name)

text9 = Text(gutenberg.words("chesterton-thursday.txt"))
print("text9:", text9.name)


def texts():
    print("text1:", text1.name)
    print("text2:", text2.name)
    print("text3:", text3.name)
    print("text4:", text4.name)
    print("text5:", text5.name)
    print("text6:", text6.name)
    print("text7:", text7.name)
    print("text8:", text8.name)
    print("text9:", text9.name)


sent1 = ["Call", "me", "Ishmael", "."]
sent2 = [
    "The",
    "family",
    "of",
    "Dashwood",
    "had",
    "long",
    "been",
    "settled",
    "in",
    "Sussex",
    ".",
]
sent3 = [
    "In",
    "the",
    "beginning",
    "God",
    "created",
    "the",
    "heaven",
    "and",
    "the",
    "earth",
    ".",
]
sent4 = [
    "Fellow",
    "-",
    "Citizens",
    "of",
    "the",
    "Senate",
    "and",
    "of",
    "the",
    "House",
    "of",
    "Representatives",
    ":",
]
sent5 = [
    "I",
    "have",
    "a",
    "problem",
    "with",
    "people",
    "PMing",
    "me",
    "to",
    "lol",
    "JOIN",
]
sent6 = [
    "SCENE",
    "1",
    ":",
    "[",
    "wind",
    "]",
    "[",
    "clop",
    "clop",
    "clop",
    "]",
    "KING",
    "ARTHUR",
    ":",
    "Whoa",
    "there",
    "!",
]
sent7 = [
    "Pierre",
    "Vinken",
    ",",
    "61",
    "years",
    "old",
    ",",
    "will",
    "join",
    "the",
    "board",
    "as",
    "a",
    "nonexecutive",
    "director",
    "Nov.",
    "29",
    ".",
]
sent8 = [
    "25",
    "SEXY",
    "MALE",
    ",",
    "seeks",
    "attrac",
    "older",
    "single",
    "lady",
    ",",
    "for",
    "discreet",
    "encounters",
    ".",
]
sent9 = [
    "THE",
    "suburb",
    "of",
    "Saffron",
    "Park",
    "lay",
    "on",
    "the",
    "sunset",
    "side",
    "of",
    "London",
    ",",
    "as",
    "red",
    "and",
    "ragged",
    "as",
    "a",
    "cloud",
    "of",
    "sunset",
    ".",
]


def sents():
    print("sent1:", " ".join(sent1))
    print("sent2:", " ".join(sent2))
    print("sent3:", " ".join(sent3))
    print("sent4:", " ".join(sent4))
    print("sent5:", " ".join(sent5))
    print("sent6:", " ".join(sent6))
    print("sent7:", " ".join(sent7))
    print("sent8:", " ".join(sent8))
    print("sent9:", " ".join(sent9))
@@ -0,0 +1,34 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Combinatory Categorial Grammar.

For more information see nltk/doc/contrib/ccg/ccg.pdf
"""

from nltk.ccg.chart import CCGChart, CCGChartParser, CCGEdge, CCGLeafEdge
from nltk.ccg.combinator import (
    BackwardApplication,
    BackwardBx,
    BackwardCombinator,
    BackwardComposition,
    BackwardSx,
    BackwardT,
    DirectedBinaryCombinator,
    ForwardApplication,
    ForwardCombinator,
    ForwardComposition,
    ForwardSubstitution,
    ForwardT,
    UndirectedBinaryCombinator,
    UndirectedComposition,
    UndirectedFunctionApplication,
    UndirectedSubstitution,
    UndirectedTypeRaise,
)
from nltk.ccg.lexicon import CCGLexicon
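
The exported combinators can also be applied directly to categories, which is what the chart parser does internally. A small sketch with forward application, building the categories by hand via nltk.ccg.api (the category names are illustrative):

from nltk.ccg.api import Direction, FunctionalCategory, PrimitiveCategory
from nltk.ccg.combinator import ForwardApplication

S, NP = PrimitiveCategory("S"), PrimitiveCategory("NP")
fn = FunctionalCategory(S, NP, Direction("/", []))  # the category S/NP
if ForwardApplication.can_combine(fn, NP):
    for result in ForwardApplication.combine(fn, NP):
        print(result)  # -> S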
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
358
Backend/venv/lib/python3.12/site-packages/nltk/ccg/api.py
Normal file
@@ -0,0 +1,358 @@
# Natural Language Toolkit: CCG Categories
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from abc import ABCMeta, abstractmethod
from functools import total_ordering

from nltk.internals import raise_unorderable_types


@total_ordering
class AbstractCCGCategory(metaclass=ABCMeta):
    """
    Interface for categories in combinatory grammars.
    """

    @abstractmethod
    def is_primitive(self):
        """
        Returns true if the category is primitive.
        """

    @abstractmethod
    def is_function(self):
        """
        Returns true if the category is a function application.
        """

    @abstractmethod
    def is_var(self):
        """
        Returns true if the category is a variable.
        """

    @abstractmethod
    def substitute(self, substitutions):
        """
        Takes a set of (var, category) substitutions, and replaces every
        occurrence of the variable with the corresponding category.
        """

    @abstractmethod
    def can_unify(self, other):
        """
        Determines whether two categories can be unified.
          - Returns None if they cannot be unified
          - Returns a list of necessary substitutions if they can.
        """

    # Utility functions: comparison, strings and hashing.
    @abstractmethod
    def __str__(self):
        pass

    def __eq__(self, other):
        return (
            self.__class__ is other.__class__
            and self._comparison_key == other._comparison_key
        )

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        if not isinstance(other, AbstractCCGCategory):
            raise_unorderable_types("<", self, other)
        if self.__class__ is other.__class__:
            return self._comparison_key < other._comparison_key
        else:
            return self.__class__.__name__ < other.__class__.__name__

    def __hash__(self):
        try:
            return self._hash
        except AttributeError:
            self._hash = hash(self._comparison_key)
            return self._hash


class CCGVar(AbstractCCGCategory):
    """
    Class representing a variable CCG category.
    Used for conjunctions (and possibly type-raising, if implemented as a
    unary rule).
    """

    _maxID = 0

    def __init__(self, prim_only=False):
        """Initialize a variable (selects a new identifier)

        :param prim_only: a boolean that determines whether the variable is
                          restricted to primitives
        :type prim_only: bool
        """
        self._id = self.new_id()
        self._prim_only = prim_only
        self._comparison_key = self._id

    @classmethod
    def new_id(cls):
        """
        A class method allowing generation of unique variable identifiers.
        """
        cls._maxID = cls._maxID + 1
        return cls._maxID - 1

    @classmethod
    def reset_id(cls):
        cls._maxID = 0

    def is_primitive(self):
        return False

    def is_function(self):
        return False

    def is_var(self):
        return True

    def substitute(self, substitutions):
        """If there is a substitution corresponding to this variable,
        return the substituted category.
        """
        for var, cat in substitutions:
            if var == self:
                return cat
        return self

    def can_unify(self, other):
        """If the variable can be replaced with other,
        a substitution is returned.
        """
        if other.is_primitive() or not self._prim_only:
            return [(self, other)]
        return None

    def id(self):
        return self._id

    def __str__(self):
        return "_var" + str(self._id)


@total_ordering
class Direction:
    """
    Class representing the direction of a function application.
    Also maintains information as to which combinators
    may be used with the category.
    """

    def __init__(self, dir, restrictions):
        self._dir = dir
        self._restrs = restrictions
        self._comparison_key = (dir, tuple(restrictions))

    # Testing the application direction
    def is_forward(self):
        return self._dir == "/"

    def is_backward(self):
        return self._dir == "\\"

    def dir(self):
        return self._dir

    def restrs(self):
        """A list of restrictions on the combinators.
        '.' denotes that permuting operations are disallowed
        ',' denotes that function composition is disallowed
        '_' denotes that the direction has variable restrictions.
        (This is redundant in the current implementation of type-raising)
        """
        return self._restrs

    def is_variable(self):
        return self._restrs == "_"

    # Unification and substitution of variable directions.
    # Used only if type-raising is implemented as a unary rule, as it
    # must inherit restrictions from the argument category.
    def can_unify(self, other):
        if other.is_variable():
            return [("_", self.restrs())]
        elif self.is_variable():
            return [("_", other.restrs())]
        else:
            if self.restrs() == other.restrs():
                return []
        return None

    def substitute(self, subs):
        if not self.is_variable():
            return self

        for var, restrs in subs:
            if var == "_":
                return Direction(self._dir, restrs)
        return self

    # Testing permitted combinators
    def can_compose(self):
        return "," not in self._restrs

    def can_cross(self):
        return "." not in self._restrs

    def __eq__(self, other):
        return (
            self.__class__ is other.__class__
            and self._comparison_key == other._comparison_key
        )

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        if not isinstance(other, Direction):
            raise_unorderable_types("<", self, other)
        if self.__class__ is other.__class__:
            return self._comparison_key < other._comparison_key
        else:
            return self.__class__.__name__ < other.__class__.__name__

    def __hash__(self):
        try:
            return self._hash
        except AttributeError:
            self._hash = hash(self._comparison_key)
            return self._hash

    def __str__(self):
        r_str = ""
        for r in self._restrs:
            r_str = r_str + "%s" % r
        return f"{self._dir}{r_str}"

    # The negation operator reverses the direction of the application
    def __neg__(self):
        if self._dir == "/":
            return Direction("\\", self._restrs)
        else:
            return Direction("/", self._restrs)


class PrimitiveCategory(AbstractCCGCategory):
    """
    Class representing primitive categories.
    Takes a string representation of the category, and a
    list of strings specifying the morphological subcategories.
    """

    def __init__(self, categ, restrictions=[]):
        self._categ = categ
        self._restrs = restrictions
        self._comparison_key = (categ, tuple(restrictions))

    def is_primitive(self):
        return True

    def is_function(self):
        return False

    def is_var(self):
        return False

    def restrs(self):
        return self._restrs

    def categ(self):
        return self._categ

    # Substitution does nothing to a primitive category
    def substitute(self, subs):
        return self

    # A primitive can be unified with a class of the same
    # base category, given that the other category shares all
    # of its subclasses, or with a variable.
    def can_unify(self, other):
        if not other.is_primitive():
            return None
        if other.is_var():
            return [(other, self)]
        if other.categ() == self.categ():
            for restr in self._restrs:
                if restr not in other.restrs():
                    return None
            return []
        return None

    def __str__(self):
        if self._restrs == []:
            return "%s" % self._categ
        restrictions = "[%s]" % ",".join(repr(r) for r in self._restrs)
        return f"{self._categ}{restrictions}"


class FunctionalCategory(AbstractCCGCategory):
    """
    Class that represents a function application category.
    Consists of argument and result categories, together with
    an application direction.
    """

    def __init__(self, res, arg, dir):
        self._res = res
        self._arg = arg
        self._dir = dir
        self._comparison_key = (arg, dir, res)

    def is_primitive(self):
        return False

    def is_function(self):
        return True

    def is_var(self):
        return False

    # Substitution returns the category consisting of the
    # substitution applied to each of its constituents.
    def substitute(self, subs):
        sub_res = self._res.substitute(subs)
        sub_dir = self._dir.substitute(subs)
        sub_arg = self._arg.substitute(subs)
        return FunctionalCategory(sub_res, sub_arg, self._dir)

    # A function can unify with another function, so long as its
    # constituents can unify, or with an unrestricted variable.
    def can_unify(self, other):
        if other.is_var():
            return [(other, self)]
        if other.is_function():
            sa = self._res.can_unify(other.res())
            sd = self._dir.can_unify(other.dir())
            if sa is not None and sd is not None:
                sb = self._arg.substitute(sa).can_unify(other.arg().substitute(sa))
                if sb is not None:
                    return sa + sb
        return None

    # Constituent accessors
    def arg(self):
        return self._arg

    def res(self):
        return self._res

    def dir(self):
        return self._dir

    def __str__(self):
        return f"({self._res}{self._dir}{self._arg})"
480
Backend/venv/lib/python3.12/site-packages/nltk/ccg/chart.py
Normal file
@@ -0,0 +1,480 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
The lexicon is constructed by calling
``lexicon.fromstring(<lexicon string>)``.

In order to construct a parser, you also need a rule set.
The standard English rules are provided in chart as
``chart.DefaultRuleSet``.

The parser can then be constructed by calling, for example:
``parser = chart.CCGChartParser(<lexicon>, <ruleset>)``

Parsing is then performed by running
``parser.parse(<sentence>.split())``.

While this returns a list of trees, the default representation
of the produced trees is not very enlightening, particularly
given that it uses the same tree class as the CFG parsers.
It is probably better to call:
``chart.printCCGDerivation(<parse tree extracted from list>)``
which should print a nice representation of the derivation.

This entire process is shown far more clearly in the demonstration:
python chart.py
"""
|
||||
|
||||
import itertools
|
||||
|
||||
from nltk.ccg.combinator import *
|
||||
from nltk.ccg.combinator import (
|
||||
BackwardApplication,
|
||||
BackwardBx,
|
||||
BackwardComposition,
|
||||
BackwardSx,
|
||||
BackwardT,
|
||||
ForwardApplication,
|
||||
ForwardComposition,
|
||||
ForwardSubstitution,
|
||||
ForwardT,
|
||||
)
|
||||
from nltk.ccg.lexicon import Token, fromstring
|
||||
from nltk.ccg.logic import *
|
||||
from nltk.parse import ParserI
|
||||
from nltk.parse.chart import AbstractChartRule, Chart, EdgeI
|
||||
from nltk.sem.logic import *
|
||||
from nltk.tree import Tree
|
||||
|
||||
|
||||
# Based on the EdgeI class from NLTK.
# A number of the properties of the EdgeI interface don't
# transfer well to CCGs, however.
class CCGEdge(EdgeI):
    def __init__(self, span, categ, rule):
        self._span = span
        self._categ = categ
        self._rule = rule
        self._comparison_key = (span, categ, rule)

    # Accessors
    def lhs(self):
        return self._categ

    def span(self):
        return self._span

    def start(self):
        return self._span[0]

    def end(self):
        return self._span[1]

    def length(self):
        return self._span[1] - self._span[0]

    def rhs(self):
        return ()

    def dot(self):
        return 0

    def is_complete(self):
        return True

    def is_incomplete(self):
        return False

    def nextsym(self):
        return None

    def categ(self):
        return self._categ

    def rule(self):
        return self._rule

class CCGLeafEdge(EdgeI):
    """
    Class representing leaf edges in a CCG derivation.
    """

    def __init__(self, pos, token, leaf):
        self._pos = pos
        self._token = token
        self._leaf = leaf
        self._comparison_key = (pos, token.categ(), leaf)

    # Accessors
    def lhs(self):
        return self._token.categ()

    def span(self):
        return (self._pos, self._pos + 1)

    def start(self):
        return self._pos

    def end(self):
        return self._pos + 1

    def length(self):
        return 1

    def rhs(self):
        return self._leaf

    def dot(self):
        return 0

    def is_complete(self):
        return True

    def is_incomplete(self):
        return False

    def nextsym(self):
        return None

    def token(self):
        return self._token

    def categ(self):
        return self._token.categ()

    def leaf(self):
        return self._leaf

class BinaryCombinatorRule(AbstractChartRule):
    """
    Class implementing application of a binary combinator to a chart.
    Takes the directed combinator to apply.
    """

    NUMEDGES = 2

    def __init__(self, combinator):
        self._combinator = combinator

    # Apply a combinator
    def apply(self, chart, grammar, left_edge, right_edge):
        # The left & right edges must be touching.
        if left_edge.end() != right_edge.start():
            return

        # Check if the two edges are permitted to combine.
        # If so, generate the corresponding edge.
        if self._combinator.can_combine(left_edge.categ(), right_edge.categ()):
            for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
                new_edge = CCGEdge(
                    span=(left_edge.start(), right_edge.end()),
                    categ=res,
                    rule=self._combinator,
                )
                if chart.insert(new_edge, (left_edge, right_edge)):
                    yield new_edge

    # The representation of the combinator (for printing derivations)
    def __str__(self):
        return "%s" % self._combinator


# Type-raising must be handled slightly differently to the other rules, as the
# resulting rules only span a single edge, rather than both edges.


class ForwardTypeRaiseRule(AbstractChartRule):
    """
    Class for applying forward type raising.
    """

    NUMEDGES = 2

    def __init__(self):
        self._combinator = ForwardT

    def apply(self, chart, grammar, left_edge, right_edge):
        if left_edge.end() != right_edge.start():
            return

        for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
            new_edge = CCGEdge(span=left_edge.span(), categ=res, rule=self._combinator)
            if chart.insert(new_edge, (left_edge,)):
                yield new_edge

    def __str__(self):
        return "%s" % self._combinator


class BackwardTypeRaiseRule(AbstractChartRule):
    """
    Class for applying backward type raising.
    """

    NUMEDGES = 2

    def __init__(self):
        self._combinator = BackwardT

    def apply(self, chart, grammar, left_edge, right_edge):
        if left_edge.end() != right_edge.start():
            return

        for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
            new_edge = CCGEdge(span=right_edge.span(), categ=res, rule=self._combinator)
            if chart.insert(new_edge, (right_edge,)):
                yield new_edge

    def __str__(self):
        return "%s" % self._combinator

# Common sets of combinators used for English derivations.
ApplicationRuleSet = [
    BinaryCombinatorRule(ForwardApplication),
    BinaryCombinatorRule(BackwardApplication),
]
CompositionRuleSet = [
    BinaryCombinatorRule(ForwardComposition),
    BinaryCombinatorRule(BackwardComposition),
    BinaryCombinatorRule(BackwardBx),
]
SubstitutionRuleSet = [
    BinaryCombinatorRule(ForwardSubstitution),
    BinaryCombinatorRule(BackwardSx),
]
TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()]

# The standard English rule set.
DefaultRuleSet = (
    ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet + TypeRaiseRuleSet
)

class CCGChartParser(ParserI):
    """
    Chart parser for CCGs.
    Based largely on the ChartParser class from NLTK.
    """

    def __init__(self, lexicon, rules, trace=0):
        self._lexicon = lexicon
        self._rules = rules
        self._trace = trace

    def lexicon(self):
        return self._lexicon

    # Implements the CYK algorithm
    def parse(self, tokens):
        tokens = list(tokens)
        chart = CCGChart(list(tokens))
        lex = self._lexicon

        # Initialize leaf edges.
        for index in range(chart.num_leaves()):
            for token in lex.categories(chart.leaf(index)):
                new_edge = CCGLeafEdge(index, token, chart.leaf(index))
                chart.insert(new_edge, ())

        # Select a span for the new edges
        for span in range(2, chart.num_leaves() + 1):
            for start in range(0, chart.num_leaves() - span + 1):
                # Try all possible pairs of edges that could generate
                # an edge for that span
                for part in range(1, span):
                    lstart = start
                    mid = start + part
                    rend = start + span

                    for left in chart.select(span=(lstart, mid)):
                        for right in chart.select(span=(mid, rend)):
                            # Generate all possible combinations of the two edges
                            for rule in self._rules:
                                edges_added_by_rule = 0
                                for newedge in rule.apply(chart, lex, left, right):
                                    edges_added_by_rule += 1

        # Output the resulting parses
        return chart.parses(lex.start())

class CCGChart(Chart):
    def __init__(self, tokens):
        Chart.__init__(self, tokens)

    # Constructs the trees for a given parse. Unfortunately, the parse trees need to be
    # constructed slightly differently to those in the default Chart class, so it has to
    # be reimplemented.
    def _trees(self, edge, complete, memo, tree_class):
        assert complete, "CCGChart cannot build incomplete trees"

        if edge in memo:
            return memo[edge]

        if isinstance(edge, CCGLeafEdge):
            word = tree_class(edge.token(), [self._tokens[edge.start()]])
            leaf = tree_class((edge.token(), "Leaf"), [word])
            memo[edge] = [leaf]
            return [leaf]

        memo[edge] = []
        trees = []

        for cpl in self.child_pointer_lists(edge):
            child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl]
            for children in itertools.product(*child_choices):
                lhs = (
                    Token(
                        self._tokens[edge.start() : edge.end()],
                        edge.lhs(),
                        compute_semantics(children, edge),
                    ),
                    str(edge.rule()),
                )
                trees.append(tree_class(lhs, children))

        memo[edge] = trees
        return trees

def compute_semantics(children, edge):
    if children[0].label()[0].semantics() is None:
        return None

    if len(children) == 2:
        if isinstance(edge.rule(), BackwardCombinator):
            children = [children[1], children[0]]

        combinator = edge.rule()._combinator
        function = children[0].label()[0].semantics()
        argument = children[1].label()[0].semantics()

        if isinstance(combinator, UndirectedFunctionApplication):
            return compute_function_semantics(function, argument)
        elif isinstance(combinator, UndirectedComposition):
            return compute_composition_semantics(function, argument)
        elif isinstance(combinator, UndirectedSubstitution):
            return compute_substitution_semantics(function, argument)
        else:
            raise AssertionError("Unsupported combinator '%s'" % combinator)
    else:
        return compute_type_raised_semantics(children[0].label()[0].semantics())

# --------
# Displaying derivations
# --------
def printCCGDerivation(tree):
    # Get the leaves and initial categories
    leafcats = tree.pos()
    leafstr = ""
    catstr = ""

    # Construct a string with both the leaf word and corresponding
    # category aligned.
    for leaf, cat in leafcats:
        str_cat = "%s" % cat
        nextlen = 2 + max(len(leaf), len(str_cat))
        lcatlen = (nextlen - len(str_cat)) // 2
        rcatlen = lcatlen + (nextlen - len(str_cat)) % 2
        catstr += " " * lcatlen + str_cat + " " * rcatlen
        lleaflen = (nextlen - len(leaf)) // 2
        rleaflen = lleaflen + (nextlen - len(leaf)) % 2
        leafstr += " " * lleaflen + leaf + " " * rleaflen
    print(leafstr.rstrip())
    print(catstr.rstrip())

    # Display the derivation steps
    printCCGTree(0, tree)


# Prints the sequence of derivation steps.
def printCCGTree(lwidth, tree):
    rwidth = lwidth

    # Is a leaf (word).
    # Increment the span by the space occupied by the leaf.
    if not isinstance(tree, Tree):
        return 2 + lwidth + len(tree)

    # Find the width of the current derivation step
    for child in tree:
        rwidth = max(rwidth, printCCGTree(rwidth, child))

    # Is a leaf node.
    # Don't print anything, but account for the space occupied.
    if not isinstance(tree.label(), tuple):
        return max(
            rwidth, 2 + lwidth + len("%s" % tree.label()), 2 + lwidth + len(tree[0])
        )

    (token, op) = tree.label()

    if op == "Leaf":
        return rwidth

    # Pad to the left with spaces, followed by a sequence of '-'
    # and the derivation rule.
    print(lwidth * " " + (rwidth - lwidth) * "-" + "%s" % op)
    # Print the resulting category on a new line.
    str_res = "%s" % (token.categ())
    if token.semantics() is not None:
        str_res += " {" + str(token.semantics()) + "}"
    respadlen = (rwidth - lwidth - len(str_res)) // 2 + lwidth
    print(respadlen * " " + str_res)
    return rwidth

### Demonstration code

# Construct the lexicon
lex = fromstring(
    """
    :- S, NP, N, VP    # Primitive categories, S is the target primitive

    Det :: NP/N         # Family of words
    Pro :: NP
    TV :: VP/NP
    Modal :: (S\\NP)/VP # Backslashes need to be escaped

    I => Pro             # Word -> Category mapping
    you => Pro

    the => Det

    # Variables have the special keyword 'var'
    # '.' prevents permutation
    # ',' prevents composition
    and => var\\.,var/.,var

    which => (N\\N)/(S/NP)

    will => Modal # Categories can be either explicit, or families.
    might => Modal

    cook => TV
    eat => TV

    mushrooms => N
    parsnips => N
    bacon => N
    """
)


def demo():
    parser = CCGChartParser(lex, DefaultRuleSet)
    for parse in parser.parse("I might cook and eat the bacon".split()):
        printCCGDerivation(parse)


if __name__ == "__main__":
    demo()
340
Backend/venv/lib/python3.12/site-packages/nltk/ccg/combinator.py
Normal file
@@ -0,0 +1,340 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CCG Combinators
"""

from abc import ABCMeta, abstractmethod

from nltk.ccg.api import FunctionalCategory


class UndirectedBinaryCombinator(metaclass=ABCMeta):
    """
    Abstract class for representing a binary combinator.
    Merely defines functions for checking if the function and argument
    are able to be combined, and what the resulting category is.

    Note that as no assumptions are made as to direction, the unrestricted
    combinators can perform all backward, forward and crossed variations
    of the combinators; these restrictions must be added in the rule
    class.
    """

    @abstractmethod
    def can_combine(self, function, argument):
        pass

    @abstractmethod
    def combine(self, function, argument):
        pass


class DirectedBinaryCombinator(metaclass=ABCMeta):
    """
    Wrapper for the undirected binary combinator.
    It takes left and right categories, and decides which is to be
    the function, and which the argument.
    It then decides whether or not they can be combined.
    """

    @abstractmethod
    def can_combine(self, left, right):
        pass

    @abstractmethod
    def combine(self, left, right):
        pass


class ForwardCombinator(DirectedBinaryCombinator):
    """
    Class representing combinators where the primary functor is on the left.

    Takes an undirected combinator, and a predicate which adds constraints
    restricting the cases in which it may apply.
    """

    def __init__(self, combinator, predicate, suffix=""):
        self._combinator = combinator
        self._predicate = predicate
        self._suffix = suffix

    def can_combine(self, left, right):
        return self._combinator.can_combine(left, right) and self._predicate(
            left, right
        )

    def combine(self, left, right):
        yield from self._combinator.combine(left, right)

    def __str__(self):
        return f">{self._combinator}{self._suffix}"


class BackwardCombinator(DirectedBinaryCombinator):
    """
    The backward equivalent of the ForwardCombinator class.
    """

    def __init__(self, combinator, predicate, suffix=""):
        self._combinator = combinator
        self._predicate = predicate
        self._suffix = suffix

    def can_combine(self, left, right):
        return self._combinator.can_combine(right, left) and self._predicate(
            left, right
        )

    def combine(self, left, right):
        yield from self._combinator.combine(right, left)

    def __str__(self):
        return f"<{self._combinator}{self._suffix}"


class UndirectedFunctionApplication(UndirectedBinaryCombinator):
    """
    Class representing function application.
    Implements rules of the form:
    X/Y Y -> X (>)
    And the corresponding backwards application rule
    """

    def can_combine(self, function, argument):
        if not function.is_function():
            return False

        return function.arg().can_unify(argument) is not None

    def combine(self, function, argument):
        if not function.is_function():
            return

        subs = function.arg().can_unify(argument)
        if subs is None:
            return

        yield function.res().substitute(subs)

    def __str__(self):
        return ""


# Predicates for function application.


# Ensures the left functor takes an argument on the right
def forwardOnly(left, right):
    return left.dir().is_forward()


# Ensures the right functor takes an argument on the left
def backwardOnly(left, right):
    return right.dir().is_backward()


# Application combinator instances
ForwardApplication = ForwardCombinator(UndirectedFunctionApplication(), forwardOnly)
BackwardApplication = BackwardCombinator(UndirectedFunctionApplication(), backwardOnly)
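
# Illustrative doctest-style sketch (not in the upstream file): forward
# application combining NP/N with N to yield NP. Categories are built here
# with augParseCategory from nltk.ccg.lexicon, whose signature appears in
# that module below.
#
#   from nltk.ccg.lexicon import augParseCategory
#   np_n, _ = augParseCategory("NP/N", ["NP", "N"], {})
#   n, _ = augParseCategory("N", ["NP", "N"], {})
#   ForwardApplication.can_combine(np_n, n)                 # True
#   [str(c) for c in ForwardApplication.combine(np_n, n)]   # ['NP']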


class UndirectedComposition(UndirectedBinaryCombinator):
    """
    Functional composition (harmonic) combinator.
    Implements rules of the form
    X/Y Y/Z -> X/Z (B>)
    And the corresponding backwards and crossed variations.
    """

    def can_combine(self, function, argument):
        # Can only combine two functions, and both functions must
        # allow composition.
        if not (function.is_function() and argument.is_function()):
            return False
        if function.dir().can_compose() and argument.dir().can_compose():
            return function.arg().can_unify(argument.res()) is not None
        return False

    def combine(self, function, argument):
        if not (function.is_function() and argument.is_function()):
            return
        if function.dir().can_compose() and argument.dir().can_compose():
            subs = function.arg().can_unify(argument.res())
            if subs is not None:
                yield FunctionalCategory(
                    function.res().substitute(subs),
                    argument.arg().substitute(subs),
                    argument.dir(),
                )

    def __str__(self):
        return "B"


# Predicates for restricting application of straight composition.
def bothForward(left, right):
    return left.dir().is_forward() and right.dir().is_forward()


def bothBackward(left, right):
    return left.dir().is_backward() and right.dir().is_backward()


# Predicates for crossed composition
def crossedDirs(left, right):
    return left.dir().is_forward() and right.dir().is_backward()


def backwardBxConstraint(left, right):
    # The functors must be crossed inwards
    if not crossedDirs(left, right):
        return False
    # Permuting combinators must be allowed on both sides
    if not (left.dir().can_cross() and right.dir().can_cross()):
        return False
    # The resulting argument category is restricted to be primitive
    return left.arg().is_primitive()


# Straight composition combinators
ForwardComposition = ForwardCombinator(UndirectedComposition(), forwardOnly)
BackwardComposition = BackwardCombinator(UndirectedComposition(), backwardOnly)

# Backward crossed composition
BackwardBx = BackwardCombinator(
    UndirectedComposition(), backwardBxConstraint, suffix="x"
)


class UndirectedSubstitution(UndirectedBinaryCombinator):
    r"""
    Substitution (permutation) combinator.
    Implements rules of the form
    Y/Z (X\Y)/Z -> X/Z (<Sx)
    And other variations.
    """

    def can_combine(self, function, argument):
        if function.is_primitive() or argument.is_primitive():
            return False

        # These could potentially be moved to the predicates, as the
        # constraints may not be general to all languages.
        if function.res().is_primitive():
            return False
        if not function.arg().is_primitive():
            return False

        if not (function.dir().can_compose() and argument.dir().can_compose()):
            return False
        return (function.res().arg() == argument.res()) and (
            function.arg() == argument.arg()
        )

    def combine(self, function, argument):
        if self.can_combine(function, argument):
            yield FunctionalCategory(
                function.res().res(), argument.arg(), argument.dir()
            )

    def __str__(self):
        return "S"


# Predicate for forward substitution
def forwardSConstraint(left, right):
    if not bothForward(left, right):
        return False
    return left.res().dir().is_forward() and left.arg().is_primitive()


# Predicate for backward crossed substitution
def backwardSxConstraint(left, right):
    if not (left.dir().can_cross() and right.dir().can_cross()):
        return False
    if not bothForward(left, right):
        return False
    return right.res().dir().is_backward() and right.arg().is_primitive()


# Instances of substitution combinators
ForwardSubstitution = ForwardCombinator(UndirectedSubstitution(), forwardSConstraint)
BackwardSx = BackwardCombinator(UndirectedSubstitution(), backwardSxConstraint, "x")


# Retrieves the left-most functional category.
# ie, (N\N)/(S/NP) => N\N
def innermostFunction(categ):
    while categ.res().is_function():
        categ = categ.res()
    return categ
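
# Worked example (comment only, not in the upstream file), matching the
# comment above: for (N\N)/(S/NP), res() is N\N, whose own res() is the
# primitive N, so the loop stops and N\N is returned.
#
#   from nltk.ccg.lexicon import augParseCategory
#   cat, _ = augParseCategory("(N\\N)/(S/NP)", ["N", "S", "NP"], {})
#   str(innermostFunction(cat))   # '(N\\N)'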


class UndirectedTypeRaise(UndirectedBinaryCombinator):
    """
    Undirected combinator for type raising.
    """

    def can_combine(self, function, arg):
        # The argument must be a function.
        # The restriction that arg.res() must be a function
        # merely reduces redundant type-raising; if arg.res() is
        # primitive, we have:
        # X Y\X =>(<T) Y/(Y\X) Y\X =>(>) Y
        # which is equivalent to
        # X Y\X =>(<) Y
        if not (arg.is_function() and arg.res().is_function()):
            return False

        # Type-raising matches only the innermost application.
        arg = innermostFunction(arg)

        subs = function.can_unify(arg.arg())
        if subs is not None:
            return True
        return False

    def combine(self, function, arg):
        if not (
            function.is_primitive() and arg.is_function() and arg.res().is_function()
        ):
            return

        # Type-raising matches only the innermost application.
        arg = innermostFunction(arg)

        subs = function.can_unify(arg.arg())
        if subs is not None:
            xcat = arg.res().substitute(subs)
            yield FunctionalCategory(
                xcat, FunctionalCategory(xcat, function, arg.dir()), -(arg.dir())
            )

    def __str__(self):
        return "T"


# Predicates for type-raising
# The direction of the innermost category must be towards
# the primary functor.
# The restriction that the variable must be primitive is not
# common to all versions of CCGs; some authors have other restrictions.
def forwardTConstraint(left, right):
    arg = innermostFunction(right)
    return arg.dir().is_backward() and arg.res().is_primitive()


def backwardTConstraint(left, right):
    arg = innermostFunction(left)
    return arg.dir().is_forward() and arg.res().is_primitive()


# Instances of type-raising combinators
ForwardT = ForwardCombinator(UndirectedTypeRaise(), forwardTConstraint)
BackwardT = BackwardCombinator(UndirectedTypeRaise(), backwardTConstraint)
338
Backend/venv/lib/python3.12/site-packages/nltk/ccg/lexicon.py
Normal file
@@ -0,0 +1,338 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CCG Lexicons
"""

import re
from collections import defaultdict

from nltk.ccg.api import CCGVar, Direction, FunctionalCategory, PrimitiveCategory
from nltk.internals import deprecated
from nltk.sem.logic import Expression

# ------------
# Regular expressions used for parsing components of the lexicon
# ------------

# Parses a primitive category and subscripts
PRIM_RE = re.compile(r"""([A-Za-z]+)(\[[A-Za-z,]+\])?""")

# Separates the next primitive category from the remainder of the
# string
NEXTPRIM_RE = re.compile(r"""([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)""")

# Separates the next application operator from the remainder
APP_RE = re.compile(r"""([\\/])([.,]?)([.,]?)(.*)""")

# Parses the definition of the right-hand side (rhs) of either a word or a family
LEX_RE = re.compile(r"""([\S_]+)\s*(::|[-=]+>)\s*(.+)""", re.UNICODE)

# Parses the right hand side that contains category and maybe semantic predicate
RHS_RE = re.compile(r"""([^{}]*[^ {}])\s*(\{[^}]+\})?""", re.UNICODE)

# Parses the semantic predicate
SEMANTICS_RE = re.compile(r"""\{([^}]+)\}""", re.UNICODE)

# Strips comments from a line
COMMENTS_RE = re.compile("""([^#]*)(?:#.*)?""")


class Token:
    """
    Class representing a token.

    token => category {semantics}
    e.g. eat => S\\var[pl]/var {\\x y.eat(x,y)}

    * `token` (string)
    * `categ` (string)
    * `semantics` (Expression)
    """

    def __init__(self, token, categ, semantics=None):
        self._token = token
        self._categ = categ
        self._semantics = semantics

    def categ(self):
        return self._categ

    def semantics(self):
        return self._semantics

    def __str__(self):
        semantics_str = ""
        if self._semantics is not None:
            semantics_str = " {" + str(self._semantics) + "}"
        return str(self._categ) + semantics_str

    def __cmp__(self, other):
        if not isinstance(other, Token):
            return -1
        return cmp(
            (self._categ, self._semantics), (other.categ(), other.semantics())
        )

class CCGLexicon:
    """
    Class representing a lexicon for CCG grammars.

    * `primitives`: The list of primitive categories for the lexicon
    * `families`: Families of categories
    * `entries`: A mapping of words to possible categories
    """

    def __init__(self, start, primitives, families, entries):
        self._start = PrimitiveCategory(start)
        self._primitives = primitives
        self._families = families
        self._entries = entries

    def categories(self, word):
        """
        Returns all the possible categories for a word
        """
        return self._entries[word]

    def start(self):
        """
        Return the target category for the parser
        """
        return self._start

    def __str__(self):
        """
        String representation of the lexicon. Used for debugging.
        """
        string = ""
        first = True
        for ident in sorted(self._entries):
            if not first:
                string = string + "\n"
            string = string + ident + " => "

            first = True
            for cat in self._entries[ident]:
                if not first:
                    string = string + " | "
                else:
                    first = False
                string = string + "%s" % cat
        return string


# -----------
# Parsing lexicons
# -----------


def matchBrackets(string):
    """
    Separate the contents matching the first set of brackets from the rest of
    the input.
    """
    rest = string[1:]
    inside = "("

    while rest != "" and not rest.startswith(")"):
        if rest.startswith("("):
            (part, rest) = matchBrackets(rest)
            inside = inside + part
        else:
            inside = inside + rest[0]
            rest = rest[1:]
    if rest.startswith(")"):
        return (inside + ")", rest[1:])
    raise AssertionError("Unmatched bracket in string '" + string + "'")
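
# Worked example (comment only, not in the upstream file):
#
#   matchBrackets("(N\\N)/(S/NP)")
#   # returns ('(N\\N)', '/(S/NP)') -- the first bracketed chunk, then the rest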


def nextCategory(string):
    """
    Separate the string for the next portion of the category from the rest
    of the string
    """
    if string.startswith("("):
        return matchBrackets(string)
    return NEXTPRIM_RE.match(string).groups()


def parseApplication(app):
    """
    Parse an application operator
    """
    return Direction(app[0], app[1:])


def parseSubscripts(subscr):
    """
    Parse the subscripts for a primitive category
    """
    if subscr:
        return subscr[1:-1].split(",")
    return []


def parsePrimitiveCategory(chunks, primitives, families, var):
    """
    Parse a primitive category

    If the primitive is the special category 'var', replace it with the
    correct `CCGVar`.
    """
    if chunks[0] == "var":
        if chunks[1] is None:
            if var is None:
                var = CCGVar()
            return (var, var)

    catstr = chunks[0]
    if catstr in families:
        (cat, cvar) = families[catstr]
        if var is None:
            var = cvar
        else:
            cat = cat.substitute([(cvar, var)])
        return (cat, var)

    if catstr in primitives:
        subscrs = parseSubscripts(chunks[1])
        return (PrimitiveCategory(catstr, subscrs), var)
    raise AssertionError(
        "String '" + catstr + "' is neither a family nor primitive category."
    )

def augParseCategory(line, primitives, families, var=None):
    """
    Parse a string representing a category, returning a tuple of the
    parsed category and (possibly) the CCG variable for the category.
    """
    (cat_string, rest) = nextCategory(line)

    if cat_string.startswith("("):
        (res, var) = augParseCategory(cat_string[1:-1], primitives, families, var)

    else:
        (res, var) = parsePrimitiveCategory(
            PRIM_RE.match(cat_string).groups(), primitives, families, var
        )

    while rest != "":
        app = APP_RE.match(rest).groups()
        direction = parseApplication(app[0:3])
        rest = app[3]

        (cat_string, rest) = nextCategory(rest)
        if cat_string.startswith("("):
            (arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
        else:
            (arg, var) = parsePrimitiveCategory(
                PRIM_RE.match(cat_string).groups(), primitives, families, var
            )
        res = FunctionalCategory(res, arg, direction)

    return (res, var)


def fromstring(lex_str, include_semantics=False):
    """
    Convert string representation into a lexicon for CCGs.
    """
    CCGVar.reset_id()
    primitives = []
    families = {}
    entries = defaultdict(list)
    for line in lex_str.splitlines():
        # Strip comments and leading/trailing whitespace.
        line = COMMENTS_RE.match(line).groups()[0].strip()
        if line == "":
            continue

        if line.startswith(":-"):
            # A line of primitive categories.
            # The first one is the target category
            # ie, :- S, N, NP, VP
            primitives = primitives + [
                prim.strip() for prim in line[2:].strip().split(",")
            ]
        else:
            # Either a family definition, or a word definition
            (ident, sep, rhs) = LEX_RE.match(line).groups()
            (catstr, semantics_str) = RHS_RE.match(rhs).groups()
            (cat, var) = augParseCategory(catstr, primitives, families)

            if sep == "::":
                # Family definition
                # ie, Det :: NP/N
                families[ident] = (cat, var)
            else:
                semantics = None
                if include_semantics is True:
                    if semantics_str is None:
                        raise AssertionError(
                            line
                            + " must contain semantics because include_semantics is set to True"
                        )
                    else:
                        semantics = Expression.fromstring(
                            SEMANTICS_RE.match(semantics_str).groups()[0]
                        )
                # Word definition
                # ie, which => (N\N)/(S/NP)
                entries[ident].append(Token(ident, cat, semantics))
    return CCGLexicon(primitives[0], primitives, families, entries)


@deprecated("Use fromstring() instead.")
def parseLexicon(lex_str):
    return fromstring(lex_str)

openccg_tinytiny = fromstring(
    """
    # Rather minimal lexicon based on the openccg `tinytiny' grammar.
    # Only incorporates a subset of the morphological subcategories, however.
    :- S,NP,N                    # Primitive categories
    Det :: NP/N                  # Determiners
    Pro :: NP
    IntransVsg :: S\\NP[sg]     # Tensed intransitive verbs (singular)
    IntransVpl :: S\\NP[pl]     # Plural
    TransVsg :: S\\NP[sg]/NP    # Tensed transitive verbs (singular)
    TransVpl :: S\\NP[pl]/NP    # Plural

    the => NP[sg]/N[sg]
    the => NP[pl]/N[pl]

    I => Pro
    me => Pro
    we => Pro
    us => Pro

    book => N[sg]
    books => N[pl]

    peach => N[sg]
    peaches => N[pl]

    policeman => N[sg]
    policemen => N[pl]

    boy => N[sg]
    boys => N[pl]

    sleep => IntransVsg
    sleep => IntransVpl

    eat => IntransVpl
    eat => TransVpl
    eats => IntransVsg
    eats => TransVsg

    see => TransVpl
    sees => TransVsg
    """
)
63
Backend/venv/lib/python3.12/site-packages/nltk/ccg/logic.py
Normal file
@@ -0,0 +1,63 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Tanin Na Nakorn (@tanin)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Helper functions for CCG semantics computation
"""

import copy

from nltk.sem.logic import *


def compute_type_raised_semantics(semantics):
    semantics_copy = copy.deepcopy(semantics)
    core = semantics_copy
    parent = None
    while isinstance(core, LambdaExpression):
        parent = core
        core = core.term

    var = Variable("F")
    while var in core.free():
        var = unique_variable(pattern=var)
    core = ApplicationExpression(FunctionVariableExpression(var), core)

    if parent is not None:
        parent.term = core
    else:
        semantics_copy = core

    return LambdaExpression(var, semantics_copy)
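
# Worked example (comment only, not in the upstream file): type-raising the
# semantics \x.eat(x) wraps its body in a fresh function variable F,
# giving \F x.F(eat(x)).
#
#   from nltk.sem.logic import Expression
#   sem = Expression.fromstring(r"\x.eat(x)")
#   print(compute_type_raised_semantics(sem))   # \F x.F(eat(x))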


def compute_function_semantics(function, argument):
    return ApplicationExpression(function, argument).simplify()


def compute_composition_semantics(function, argument):
    assert isinstance(argument, LambdaExpression), (
        "`" + str(argument) + "` must be a lambda expression"
    )
    return LambdaExpression(
        argument.variable, ApplicationExpression(function, argument.term).simplify()
    )


def compute_substitution_semantics(function, argument):
    assert isinstance(function, LambdaExpression) and isinstance(
        function.term, LambdaExpression
    ), ("`" + str(function) + "` must be a lambda expression with 2 arguments")
    assert isinstance(argument, LambdaExpression), (
        "`" + str(argument) + "` must be a lambda expression"
    )

    new_argument = ApplicationExpression(
        argument, VariableExpression(function.variable)
    ).simplify()
    new_term = ApplicationExpression(function.term, new_argument).simplify()

    return LambdaExpression(function.variable, new_term)
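
# Worked example (comment only, not in the upstream file): composing
# \x.P(x) with \x.Q(x) yields \x.P(Q(x)).
#
#   from nltk.sem.logic import Expression
#   f = Expression.fromstring(r"\x.P(x)")
#   g = Expression.fromstring(r"\x.Q(x)")
#   print(compute_composition_semantics(f, g))   # \x.P(Q(x))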
48
Backend/venv/lib/python3.12/site-packages/nltk/chat/__init__.py
Normal file
@@ -0,0 +1,48 @@
# Natural Language Toolkit: Chatbots
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.

"""
A class for simple chatbots. These perform simple pattern matching on sentences
typed by users, and respond with automatically generated sentences.

These chatbots may not work using the Windows command line or the
Windows IDLE GUI.
"""
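
# Minimal sketch of the underlying Chat class (not in the upstream file; the
# example pair is an assumption for illustration). %1 is replaced by the first
# regex group, filtered through the `reflections` pronoun-swap table:
#
#   from nltk.chat.util import Chat, reflections
#   bot = Chat(((r"I like (.*)", ("Why do you like %1?",)),), reflections)
#   bot.respond("I like my cat")   # 'Why do you like your cat?'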

from nltk.chat.eliza import eliza_chat
from nltk.chat.iesha import iesha_chat
from nltk.chat.rude import rude_chat
from nltk.chat.suntsu import suntsu_chat
from nltk.chat.util import Chat
from nltk.chat.zen import zen_chat

bots = [
    (eliza_chat, "Eliza (psycho-babble)"),
    (iesha_chat, "Iesha (teen anime junky)"),
    (rude_chat, "Rude (abusive bot)"),
    (suntsu_chat, "Suntsu (Chinese sayings)"),
    (zen_chat, "Zen (gems of wisdom)"),
]


def chatbots():
    print("Which chatbot would you like to talk to?")
    botcount = len(bots)
    for i in range(botcount):
        print("  %d: %s" % (i + 1, bots[i][1]))
    while True:
        choice = input(f"\nEnter a number in the range 1-{botcount}: ").strip()
        if choice.isdigit() and (int(choice) - 1) in range(botcount):
            break
        else:
            print("   Error: bad chatbot number")

    chatbot = bots[int(choice) - 1][0]
    chatbot()
Binary file not shown.
337
Backend/venv/lib/python3.12/site-packages/nltk/chat/eliza.py
Normal file
@@ -0,0 +1,337 @@
# Natural Language Toolkit: Eliza
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
#          Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <mailto:jez@jezuk.co.uk>.

# a translation table used to convert things you say into things the
# computer says back, e.g. "I am" --> "you are"

from nltk.chat.util import Chat, reflections

# a table of response pairs, where each pair consists of a
# regular expression, and a list of possible responses,
# with group-macros labelled as %1, %2.

pairs = (
    (r"I need (.*)",
     ("Why do you need %1?",
      "Would it really help you to get %1?",
      "Are you sure you need %1?")),
    (r"Why don\'t you (.*)",
     ("Do you really think I don't %1?",
      "Perhaps eventually I will %1.",
      "Do you really want me to %1?")),
    (r"Why can\'t I (.*)",
     ("Do you think you should be able to %1?",
      "If you could %1, what would you do?",
      "I don't know -- why can't you %1?",
      "Have you really tried?")),
    (r"I can\'t (.*)",
     ("How do you know you can't %1?",
      "Perhaps you could %1 if you tried.",
      "What would it take for you to %1?")),
    (r"I am (.*)",
     ("Did you come to me because you are %1?",
      "How long have you been %1?",
      "How do you feel about being %1?")),
    (r"I\'m (.*)",
     ("How does being %1 make you feel?",
      "Do you enjoy being %1?",
      "Why do you tell me you're %1?",
      "Why do you think you're %1?")),
    (r"Are you (.*)",
     ("Why does it matter whether I am %1?",
      "Would you prefer it if I were not %1?",
      "Perhaps you believe I am %1.",
      "I may be %1 -- what do you think?")),
    (r"What (.*)",
     ("Why do you ask?",
      "How would an answer to that help you?",
      "What do you think?")),
    (r"How (.*)",
     ("How do you suppose?",
      "Perhaps you can answer your own question.",
      "What is it you're really asking?")),
    (r"Because (.*)",
     ("Is that the real reason?",
      "What other reasons come to mind?",
      "Does that reason apply to anything else?",
      "If %1, what else must be true?")),
    (r"(.*) sorry (.*)",
     ("There are many times when no apology is needed.",
      "What feelings do you have when you apologize?")),
    (r"Hello(.*)",
     ("Hello... I'm glad you could drop by today.",
      "Hi there... how are you today?",
      "Hello, how are you feeling today?")),
    (r"I think (.*)",
     ("Do you doubt %1?", "Do you really think so?", "But you're not sure %1?")),
    (r"(.*) friend (.*)",
     ("Tell me more about your friends.",
      "When you think of a friend, what comes to mind?",
      "Why don't you tell me about a childhood friend?")),
    (r"Yes", ("You seem quite sure.", "OK, but can you elaborate a bit?")),
    (r"(.*) computer(.*)",
     ("Are you really talking about me?",
      "Does it seem strange to talk to a computer?",
      "How do computers make you feel?",
      "Do you feel threatened by computers?")),
    (r"Is it (.*)",
     ("Do you think it is %1?",
      "Perhaps it's %1 -- what do you think?",
      "If it were %1, what would you do?",
      "It could well be that %1.")),
    (r"It is (.*)",
     ("You seem very certain.",
      "If I told you that it probably isn't %1, what would you feel?")),
    (r"Can you (.*)",
     ("What makes you think I can't %1?",
      "If I could %1, then what?",
      "Why do you ask if I can %1?")),
    (r"Can I (.*)",
     ("Perhaps you don't want to %1.",
      "Do you want to be able to %1?",
      "If you could %1, would you?")),
    (r"You are (.*)",
     ("Why do you think I am %1?",
      "Does it please you to think that I'm %1?",
      "Perhaps you would like me to be %1.",
      "Perhaps you're really talking about yourself?")),
    (r"You\'re (.*)",
     ("Why do you say I am %1?",
      "Why do you think I am %1?",
      "Are we talking about you, or me?")),
    (r"I don\'t (.*)",
     ("Don't you really %1?", "Why don't you %1?", "Do you want to %1?")),
    (r"I feel (.*)",
     ("Good, tell me more about these feelings.",
      "Do you often feel %1?",
      "When do you usually feel %1?",
      "When you feel %1, what do you do?")),
    (r"I have (.*)",
     ("Why do you tell me that you've %1?",
      "Have you really %1?",
      "Now that you have %1, what will you do next?")),
    (r"I would (.*)",
     ("Could you explain why you would %1?",
      "Why would you %1?",
      "Who else knows that you would %1?")),
    (r"Is there (.*)",
     ("Do you think there is %1?",
      "It's likely that there is %1.",
      "Would you like there to be %1?")),
    (r"My (.*)",
     ("I see, your %1.",
      "Why do you say that your %1?",
      "When your %1, how do you feel?")),
    (r"You (.*)",
     ("We should be discussing you, not me.",
      "Why do you say that about me?",
      "Why do you care whether I %1?")),
    (r"Why (.*)",
     ("Why don't you tell me the reason why %1?", "Why do you think %1?")),
    (r"I want (.*)",
     ("What would it mean to you if you got %1?",
      "Why do you want %1?",
      "What would you do if you got %1?",
      "If you got %1, then what would you do?")),
    (r"(.*) mother(.*)",
     ("Tell me more about your mother.",
      "What was your relationship with your mother like?",
      "How do you feel about your mother?",
      "How does this relate to your feelings today?",
      "Good family relations are important.")),
    (r"(.*) father(.*)",
     ("Tell me more about your father.",
      "How did your father make you feel?",
      "How do you feel about your father?",
      "Does your relationship with your father relate to your feelings today?",
      "Do you have trouble showing affection with your family?")),
    (r"(.*) child(.*)",
     ("Did you have close friends as a child?",
      "What is your favorite childhood memory?",
      "Do you remember any dreams or nightmares from childhood?",
      "Did the other children sometimes tease you?",
      "How do you think your childhood experiences relate to your feelings today?")),
    (r"(.*)\?",
     ("Why do you ask that?",
      "Please consider whether you can answer your own question.",
      "Perhaps the answer lies within yourself?",
      "Why don't you tell me?")),
    (r"quit",
     ("Thank you for talking with me.",
      "Good-bye.",
      "Thank you, that will be $150. Have a good day!")),
    (r"(.*)",
     ("Please tell me more.",
      "Let's change focus a bit... Tell me about your family.",
      "Can you elaborate on that?",
      "Why do you say that %1?",
      "I see.",
      "Very interesting.",
      "%1.",
      "I see. And what does that tell you?",
      "How does that make you feel?",
      "How do you feel when you say that?")),
)

eliza_chatbot = Chat(pairs, reflections)
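
# Illustrative respond() call (comment only, not in the upstream file);
# which response is chosen from a pair is random, so output may vary:
#
#   eliza_chatbot.respond("I am feeling anxious")
#   # e.g. 'How long have you been feeling anxious?'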


def eliza_chat():
    print("Therapist\n---------")
    print("Talk to the program by typing in plain English, using normal upper-")
    print('and lower-case letters and punctuation. Enter "quit" when done.')
    print("=" * 72)
    print("Hello. How are you feeling today?")

    eliza_chatbot.converse()


def demo():
    eliza_chat()


if __name__ == "__main__":
    eliza_chat()
160
Backend/venv/lib/python3.12/site-packages/nltk/chat/iesha.py
Normal file
@@ -0,0 +1,160 @@
# Natural Language Toolkit: Teen Chatbot
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Selina Dennis <sjmd@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
This chatbot is a tongue-in-cheek take on the average teen
anime junky that frequents YahooMessenger or MSNM.
All spelling mistakes and flawed grammar are intentional.
"""

from nltk.chat.util import Chat

reflections = {
    "am": "r",
    "was": "were",
    "i": "u",
    "i'd": "u'd",
    "i've": "u'v",
    "ive": "u'v",
    "i'll": "u'll",
    "my": "ur",
    "are": "am",
    "you're": "im",
    "you've": "ive",
    "you'll": "i'll",
    "your": "my",
    "yours": "mine",
    "you": "me",
    "u": "me",
    "ur": "my",
    "urs": "mine",
    "me": "u",
}

# Note: %1/2/etc are used without spaces prior as the chat bot seems
# to add a superfluous space when matching.

pairs = (
    (r"I\'m (.*)",
     ("ur%1?? that's so cool! kekekekeke ^_^ tell me more!",
      "ur%1? neat!! kekeke >_<")),
    (r"(.*) don\'t you (.*)",
     (r"u think I can%2??! really?? kekeke \<_\<",
      "what do u mean%2??!",
      "i could if i wanted, don't you think!! kekeke")),
    (r"ye[as] [iI] (.*)",
     ("u%1? cool!! how?", "how come u%1??", "u%1? so do i!!")),
    (r"do (you|u) (.*)\??",
     ("do i%2? only on tuesdays! kekeke *_*", "i dunno! do u%2??")),
    (r"(.*)\?",
     ("man u ask lots of questions!",
      "booooring! how old r u??",
      "boooooring!! ur not very fun")),
    (r"(cos|because) (.*)",
     ("hee! i don't believe u! >_<", "nuh-uh! >_<", "ooooh i agree!")),
    (r"why can\'t [iI] (.*)",
     ("i dunno! y u askin me for!",
      "try harder, silly! hee! ^_^",
      "i dunno! but when i can't%1 i jump up and down!")),
    (r"I can\'t (.*)",
     ("u can't what??! >_<",
      "that's ok! i can't%1 either! kekekekeke ^_^",
      "try harder, silly! hee! ^&^")),
    (r"(.*) (like|love|watch) anime",
     ("omg i love anime!! do u like sailor moon??! ^&^",
      "anime yay! anime rocks sooooo much!",
      "oooh anime! i love anime more than anything!",
      "anime is the bestest evar! evangelion is the best!",
      "hee anime is the best! do you have ur fav??")),
    (r"I (like|love|watch|play) (.*)",
     ("yay! %2 rocks!", "yay! %2 is neat!", "cool! do u like other stuff?? ^_^")),
    (r"anime sucks|(.*) (hate|detest) anime",
     ("ur a liar! i'm not gonna talk to u nemore if u h8 anime *;*",
      "no way! anime is the best ever!",
      "nuh-uh, anime is the best!")),
    (r"(are|r) (you|u) (.*)",
     ("am i%1??! how come u ask that!", "maybe! y shud i tell u?? kekeke >_>")),
    (r"what (.*)",
     ("hee u think im gonna tell u? .v.", "booooooooring! ask me somethin else!")),
    (r"how (.*)",
     ("not tellin!! kekekekekeke ^_^",)),
    (r"(hi|hello|hey) (.*)",
     ("hi!!! how r u!!",)),
    (r"quit",
     ("mom says i have to go eat dinner now :,( bye!!",
      "awww u have to go?? see u next time!!",
      "how to see u again soon! ^_^")),
    (r"(.*)",
     ("ur funny! kekeke",
      "boooooring! talk about something else! tell me wat u like!",
      "do u like anime??",
      "do u watch anime? i like sailor moon! ^_^",
      "i wish i was a kitty!! kekekeke ^_^")),
)

iesha_chatbot = Chat(pairs, reflections)


def iesha_chat():
    print("Iesha the TeenBoT\n---------")
    print("Talk to the program by typing in plain English, using normal upper-")
    print('and lower-case letters and punctuation. Enter "quit" when done.')
    print("=" * 72)
    print("hi!! i'm iesha! who r u??!")

    iesha_chatbot.converse()


def demo():
    iesha_chat()


if __name__ == "__main__":
    demo()
125
Backend/venv/lib/python3.12/site-packages/nltk/chat/rude.py
Normal file
@@ -0,0 +1,125 @@
# Natural Language Toolkit: Rude Chatbot
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Peter Spiller <pspiller@csse.unimelb.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.chat.util import Chat, reflections

pairs = (
    (r"We (.*)",
     ("What do you mean, 'we'?",
      "Don't include me in that!",
      "I wouldn't be so sure about that.")),
    (r"You should (.*)",
     ("Don't tell me what to do, buddy.",
      "Really? I should, should I?")),
    (r"You\'re(.*)",
     ("More like YOU'RE %1!",
      "Hah! Look who's talking.",
      "Come over here and tell me I'm %1.")),
    (r"You are(.*)",
     ("More like YOU'RE %1!",
      "Hah! Look who's talking.",
      "Come over here and tell me I'm %1.")),
    (r"I can\'t(.*)",
     ("You do sound like the type who can't %1.",
      "Hear that splashing sound? That's my heart bleeding for you.",
      "Tell somebody who might actually care.")),
    (r"I think (.*)",
     ("I wouldn't think too hard if I were you.",
      "You actually think? I'd never have guessed...")),
    (r"I (.*)",
     ("I'm getting a bit tired of hearing about you.",
      "How about we talk about me instead?",
      "Me, me, me... Frankly, I don't care.")),
    (r"How (.*)",
     ("How do you think?",
      "Take a wild guess.",
      "I'm not even going to dignify that with an answer.")),
    (r"What (.*)",
     ("Do I look like an encyclopedia?", "Figure it out yourself.")),
    (r"Why (.*)",
     ("Why not?",
      "That's so obvious I thought even you'd have already figured it out.")),
    (r"(.*)shut up(.*)",
     ("Make me.",
      "Getting angry at a feeble NLP assignment? Somebody's losing it.",
      "Say that again, I dare you.")),
    (r"Shut up(.*)",
     ("Make me.",
      "Getting angry at a feeble NLP assignment? Somebody's losing it.",
      "Say that again, I dare you.")),
    (r"Hello(.*)",
     ("Oh good, somebody else to talk to. Joy.", "'Hello'? How original...")),
    (r"(.*)",
     ("I'm getting bored here. Become more interesting.",
      "Either become more thrilling or get lost, buddy.",
      "Change the subject before I die of fatal boredom.")),
)

rude_chatbot = Chat(pairs, reflections)


def rude_chat():
    print("Talk to the program by typing in plain English, using normal upper-")
    print('and lower-case letters and punctuation. Enter "quit" when done.')
    print("=" * 72)
    print("I suppose I should say hello.")

    rude_chatbot.converse()


def demo():
    rude_chat()


if __name__ == "__main__":
    demo()
140
Backend/venv/lib/python3.12/site-packages/nltk/chat/suntsu.py
Normal file
@@ -0,0 +1,140 @@
# Natural Language Toolkit: Sun Tsu-Bot
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Sam Huston 2007
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Tsu bot responds to all queries with Sun Tsu sayings

Quoted from Sun Tsu's The Art of War
Translated by LIONEL GILES, M.A. 1910
Hosted by the Gutenberg Project
https://www.gutenberg.org/
"""

from nltk.chat.util import Chat, reflections

pairs = (
    (r"quit",
     ("Good-bye.", "Plan well", "May victory be your future")),
    (r"[^\?]*\?",
     ("Please consider whether you can answer your own question.",
      "Ask me no questions!")),
    (r"[0-9]+(.*)",
     ("It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
      "There are five essentials for victory")),
    (r"[A-Ca-c](.*)",
     ("The art of war is of vital importance to the State.",
      "All warfare is based on deception.",
      "If your opponent is secure at all points, be prepared for him. If he is in superior strength, evade him.",
      "If the campaign is protracted, the resources of the State will not be equal to the strain.",
      "Attack him where he is unprepared, appear where you are not expected.",
      "There is no instance of a country having benefited from prolonged warfare.")),
    (r"[D-Fd-f](.*)",
     ("The skillful soldier does not raise a second levy, neither are his supply-wagons loaded more than twice.",
      "Bring war material with you from home, but forage on the enemy.",
      "In war, then, let your great object be victory, not lengthy campaigns.",
      "To fight and conquer in all your battles is not supreme excellence; supreme excellence consists in breaking the enemy's resistance without fighting.")),
    (r"[G-Ig-i](.*)",
     ("Heaven signifies night and day, cold and heat, times and seasons.",
      "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
      "The good fighters of old first put themselves beyond the possibility of defeat, and then waited for an opportunity of defeating the enemy.",
      "One may know how to conquer without being able to do it.")),
    (r"[J-Lj-l](.*)",
     ("There are three ways in which a ruler can bring misfortune upon his army.",
      "By commanding the army to advance or to retreat, being ignorant of the fact that it cannot obey. This is called hobbling the army.",
      "By attempting to govern an army in the same way as he administers a kingdom, being ignorant of the conditions which obtain in an army. This causes restlessness in the soldier's minds.",
      "By employing the officers of his army without discrimination, through ignorance of the military principle of adaptation to circumstances. This shakes the confidence of the soldiers.",
      "There are five essentials for victory",
      "He will win who knows when to fight and when not to fight.",
      "He will win who knows how to handle both superior and inferior forces.",
      "He will win whose army is animated by the same spirit throughout all its ranks.",
      "He will win who, prepared himself, waits to take the enemy unprepared.",
      "He will win who has military capacity and is not interfered with by the sovereign.")),
    (r"[M-Om-o](.*)",
     ("If you know the enemy and know yourself, you need not fear the result of a hundred battles.",
      "If you know yourself but not the enemy, for every victory gained you will also suffer a defeat.",
      "If you know neither the enemy nor yourself, you will succumb in every battle.",
      "The control of a large force is the same principle as the control of a few men: it is merely a question of dividing up their numbers.")),
    (r"[P-Rp-r](.*)",
     ("Security against defeat implies defensive tactics; ability to defeat the enemy means taking the offensive.",
      "Standing on the defensive indicates insufficient strength; attacking, a superabundance of strength.",
      "He wins his battles by making no mistakes. Making no mistakes is what establishes the certainty of victory, for it means conquering an enemy that is already defeated.",
"A victorious army opposed to a routed one, is as a pound's weight placed in the scale against a single grain.",
|
||||
"The onrush of a conquering force is like the bursting of pent-up waters into a chasm a thousand fathoms deep.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"[S-Us-u](.*)",
|
||||
(
|
||||
"What the ancients called a clever fighter is one who not only wins, but excels in winning with ease.",
|
||||
"Hence his victories bring him neither reputation for wisdom nor credit for courage.",
|
||||
"Hence the skillful fighter puts himself into a position which makes defeat impossible, and does not miss the moment for defeating the enemy.",
|
||||
"In war the victorious strategist only seeks battle after the victory has been won, whereas he who is destined to defeat first fights and afterwards looks for victory.",
|
||||
"There are not more than five musical notes, yet the combinations of these five give rise to more melodies than can ever be heard.",
|
||||
"Appear at points which the enemy must hasten to defend; march swiftly to places where you are not expected.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"[V-Zv-z](.*)",
|
||||
(
|
||||
"It is a matter of life and death, a road either to safety or to ruin.",
|
||||
"Hold out baits to entice the enemy. Feign disorder, and crush him.",
|
||||
"All men can see the tactics whereby I conquer, but what none can see is the strategy out of which victory is evolved.",
|
||||
"Do not repeat the tactics which have gained you one victory, but let your methods be regulated by the infinite variety of circumstances.",
|
||||
"So in war, the way is to avoid what is strong and to strike at what is weak.",
|
||||
"Just as water retains no constant shape, so in warfare there are no constant conditions.",
|
||||
),
|
||||
),
|
||||
(r"(.*)", ("Your statement insults me.", "")),
|
||||
)
|
||||
|
||||
suntsu_chatbot = Chat(pairs, reflections)
|
||||
|
||||
|
||||
def suntsu_chat():
|
||||
print("Talk to the program by typing in plain English, using normal upper-")
|
||||
print('and lower-case letters and punctuation. Enter "quit" when done.')
|
||||
print("=" * 72)
|
||||
print("You seek enlightenment?")
|
||||
|
||||
suntsu_chatbot.converse()
|
||||
|
||||
|
||||
def demo():
|
||||
suntsu_chat()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
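The pattern table above dispatches on the first letter of the input: each character-class pattern such as [G-Ig-i](.*) buckets statements alphabetically and answers from the matching group of quotations. A minimal sketch of that behaviour (assuming nltk is importable):

from nltk.chat.suntsu import suntsu_chatbot

print(suntsu_chatbot.respond("How should I deploy my forces"))  # 'H' falls in the [G-Ig-i] bucket
print(suntsu_chatbot.respond("Victory seems distant"))          # 'V' falls in the [V-Zv-z] bucket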
124
Backend/venv/lib/python3.12/site-packages/nltk/chat/util.py
Normal file
@@ -0,0 +1,124 @@
|
||||
# Natural Language Toolkit: Chatbot Utilities
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Authors: Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
|
||||
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.
|
||||
|
||||
import random
|
||||
import re
|
||||
|
||||
reflections = {
|
||||
"i am": "you are",
|
||||
"i was": "you were",
|
||||
"i": "you",
|
||||
"i'm": "you are",
|
||||
"i'd": "you would",
|
||||
"i've": "you have",
|
||||
"i'll": "you will",
|
||||
"my": "your",
|
||||
"you are": "I am",
|
||||
"you were": "I was",
|
||||
"you've": "I have",
|
||||
"you'll": "I will",
|
||||
"your": "my",
|
||||
"yours": "mine",
|
||||
"you": "me",
|
||||
"me": "you",
|
||||
}
|
||||
|
||||
|
||||
class Chat:
|
||||
def __init__(self, pairs, reflections={}):
|
||||
"""
|
||||
Initialize the chatbot. Pairs is a list of patterns and responses. Each
|
||||
pattern is a regular expression matching the user's statement or question,
|
||||
e.g. r'I like (.*)'. For each such pattern a list of possible responses
|
||||
is given, e.g. ['Why do you like %1', 'Did you ever dislike %1']. Material
|
||||
which is matched by parenthesized sections of the patterns (e.g. .*) is mapped to
|
||||
the numbered positions in the responses, e.g. %1.
|
||||
|
||||
:type pairs: list of tuple
|
||||
:param pairs: The patterns and responses
|
||||
:type reflections: dict
|
||||
:param reflections: A mapping between first and second person expressions
|
||||
:rtype: None
|
||||
"""
|
||||
|
||||
self._pairs = [(re.compile(x, re.IGNORECASE), y) for (x, y) in pairs]
|
||||
self._reflections = reflections
|
||||
self._regex = self._compile_reflections()
|
||||
|
||||
def _compile_reflections(self):
|
||||
sorted_refl = sorted(self._reflections, key=len, reverse=True)
|
||||
return re.compile(
|
||||
r"\b({})\b".format("|".join(map(re.escape, sorted_refl))), re.IGNORECASE
|
||||
)
|
||||
|
||||
def _substitute(self, str):
|
||||
"""
|
||||
Substitute words in the string, according to the specified reflections,
|
||||
e.g. "I'm" -> "you are"
|
||||
|
||||
:type str: str
|
||||
:param str: The string to be mapped
|
||||
:rtype: str
|
||||
"""
|
||||
|
||||
return self._regex.sub(
|
||||
lambda mo: self._reflections[mo.string[mo.start() : mo.end()]], str.lower()
|
||||
)
|
||||
|
||||
def _wildcards(self, response, match):
|
||||
pos = response.find("%")
|
||||
while pos >= 0:
|
||||
num = int(response[pos + 1 : pos + 2])
|
||||
response = (
|
||||
response[:pos]
|
||||
+ self._substitute(match.group(num))
|
||||
+ response[pos + 2 :]
|
||||
)
|
||||
pos = response.find("%")
|
||||
return response
|
||||
|
||||
def respond(self, str):
|
||||
"""
|
||||
Generate a response to the user input.
|
||||
|
||||
:type str: str
|
||||
:param str: The string to be mapped
|
||||
:rtype: str
|
||||
"""
|
||||
|
||||
# check each pattern
|
||||
for pattern, response in self._pairs:
|
||||
match = pattern.match(str)
|
||||
|
||||
# did the pattern match?
|
||||
if match:
|
||||
resp = random.choice(response) # pick a random response
|
||||
resp = self._wildcards(resp, match) # process wildcards
|
||||
|
||||
# fix munged punctuation at the end
|
||||
if resp[-2:] == "?.":
|
||||
resp = resp[:-2] + "."
|
||||
if resp[-2:] == "??":
|
||||
resp = resp[:-2] + "?"
|
||||
return resp
|
||||
|
||||
# Hold a conversation with a chatbot
|
||||
def converse(self, quit="quit"):
|
||||
user_input = ""
|
||||
while user_input != quit:
|
||||
user_input = quit
|
||||
try:
|
||||
user_input = input(">")
|
||||
except EOFError:
|
||||
print(user_input)
|
||||
if user_input:
|
||||
while user_input[-1] in "!.":
|
||||
user_input = user_input[:-1]
|
||||
print(self.respond(user_input))
|
||||
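To make the wildcard and reflection machinery above concrete, here is a minimal sketch (the pair and the input are invented for illustration). Text captured by a parenthesized group is substituted for %1 in the chosen response, after being run through the reflections table so that first-person input comes back in the second person:

from nltk.chat.util import Chat, reflections

pairs = ((r"I like (.*)", ("Why do you like %1?",)),)
bot = Chat(pairs, reflections)
print(bot.respond("I like my dog"))  # -> Why do you like your dog?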
329
Backend/venv/lib/python3.12/site-packages/nltk/chat/zen.py
Normal file
@@ -0,0 +1,329 @@
|
||||
# Natural Language Toolkit: Zen Chatbot
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Amy Holland <amyrh@csse.unimelb.edu.au>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Zen Chatbot talks in gems of Zen wisdom.
|
||||
|
||||
This is a sample conversation with Zen Chatbot:
|
||||
ZC: Welcome, my child.
|
||||
me: Good afternoon.
|
||||
ZC: Ask the question you have come to ask.
|
||||
me: How can I achieve enlightenment?
|
||||
ZC: How do you suppose?
|
||||
me: Through meditation.
|
||||
ZC: Form is emptiness, and emptiness form.
|
||||
me: How can I empty my mind of worldly troubles?
|
||||
ZC: Will an answer to that really help in your search for enlightenment?
|
||||
me: Yes.
|
||||
ZC: It is better to be right than to be certain.
|
||||
me: I seek truth and wisdom.
|
||||
ZC: The search for truth is a long journey.
|
||||
me: Are you sure?
|
||||
ZC: Maybe sure, maybe not sure.
|
||||
|
||||
|
||||
The chatbot structure is based on that of chat.eliza. Thus, it uses
|
||||
a translation table to convert from question to response
|
||||
i.e. "I am" --> "you are"
|
||||
|
||||
Of course, since Zen Chatbot does not understand the meaning of any words,
|
||||
responses are very limited. Zen Chatbot will usually answer very vaguely, or
|
||||
respond to a question by asking a different question, in much the same way
|
||||
as Eliza.
|
||||
"""
|
||||
|
||||
from nltk.chat.util import Chat, reflections
|
||||
|
||||
# responses are matched top to bottom, so non-specific matches occur later
|
||||
# for each match, a list of possible responses is provided
|
||||
responses = (
|
||||
# Zen Chatbot opens with the line "Welcome, my child." The usual
|
||||
# response will be a greeting problem: 'good' matches "good morning",
|
||||
# "good day" etc, but also "good grief!" and other sentences starting
|
||||
# with the word 'good' that may not be a greeting
|
||||
(
|
||||
r"(hello(.*))|(good [a-zA-Z]+)",
|
||||
(
|
||||
"The path to enlightenment is often difficult to see.",
|
||||
"Greetings. I sense your mind is troubled. Tell me of your troubles.",
|
||||
"Ask the question you have come to ask.",
|
||||
"Hello. Do you seek englightenment?",
|
||||
),
|
||||
),
|
||||
# "I need" and "I want" can be followed by a thing (eg 'help')
|
||||
# or an action (eg 'to see you')
|
||||
#
|
||||
# This is a problem with this style of response -
|
||||
# person: "I need you"
|
||||
# chatbot: "me can be achieved by hard work and dedication of the mind"
|
||||
# i.e. 'you' is not really a thing that can be mapped this way, so this
|
||||
# interpretation only makes sense for some inputs
|
||||
#
|
||||
(
|
||||
r"i need (.*)",
|
||||
(
|
||||
"%1 can be achieved by hard work and dedication of the mind.",
|
||||
"%1 is not a need, but a desire of the mind. Clear your mind of such concerns.",
|
||||
"Focus your mind on%1, and you will find what you need.",
|
||||
),
|
||||
),
|
||||
(
|
||||
r"i want (.*)",
|
||||
(
|
||||
"Desires of the heart will distract you from the path to enlightenment.",
|
||||
"Will%1 help you attain enlightenment?",
|
||||
"Is%1 a desire of the mind, or of the heart?",
|
||||
),
|
||||
),
|
||||
# why questions are separated into three types:
|
||||
# "why..I" e.g. "why am I here?" "Why do I like cake?"
|
||||
# "why..you" e.g. "why are you here?" "Why won't you tell me?"
|
||||
# "why..." e.g. "Why is the sky blue?"
|
||||
# problems:
|
||||
# person: "Why can't you tell me?"
|
||||
# chatbot: "Are you sure I tell you?"
|
||||
# - this style works for positives (e.g. "why do you like cake?")
|
||||
# but does not work for negatives (e.g. "why don't you like cake?")
|
||||
(r"why (.*) i (.*)\?", ("You%1%2?", "Perhaps you only think you%1%2")),
|
||||
(r"why (.*) you(.*)\?", ("Why%1 you%2?", "%2 I%1", "Are you sure I%2?")),
|
||||
(r"why (.*)\?", ("I cannot tell you why%1.", "Why do you think %1?")),
|
||||
# e.g. "are you listening?", "are you a duck"
|
||||
(
|
||||
r"are you (.*)\?",
|
||||
("Maybe%1, maybe not%1.", "Whether I am%1 or not is God's business."),
|
||||
),
|
||||
# e.g. "am I a duck?", "am I going to die?"
|
||||
(
|
||||
r"am i (.*)\?",
|
||||
("Perhaps%1, perhaps not%1.", "Whether you are%1 or not is not for me to say."),
|
||||
),
|
||||
# what questions, e.g. "what time is it?"
|
||||
# problems:
|
||||
# person: "What do you want?"
|
||||
# chatbot: "Seek truth, not what do me want."
|
||||
(r"what (.*)\?", ("Seek truth, not what%1.", "What%1 should not concern you.")),
|
||||
# how questions, e.g. "how do you do?"
|
||||
(
|
||||
r"how (.*)\?",
|
||||
(
|
||||
"How do you suppose?",
|
||||
"Will an answer to that really help in your search for enlightenment?",
|
||||
"Ask yourself not how, but why.",
|
||||
),
|
||||
),
|
||||
# can questions, e.g. "can you run?", "can you come over here please?"
|
||||
(
|
||||
r"can you (.*)\?",
|
||||
(
|
||||
"I probably can, but I may not.",
|
||||
"Maybe I can%1, and maybe I cannot.",
|
||||
"I can do all, and I can do nothing.",
|
||||
),
|
||||
),
|
||||
# can questions, e.g. "can I have some cake?", "can I know truth?"
|
||||
(
|
||||
r"can i (.*)\?",
|
||||
(
|
||||
"You can%1 if you believe you can%1, and have a pure spirit.",
|
||||
"Seek truth and you will know if you can%1.",
|
||||
),
|
||||
),
|
||||
# e.g. "It is raining" - implies the speaker is certain of a fact
|
||||
(
|
||||
r"it is (.*)",
|
||||
(
|
||||
"How can you be certain that%1, when you do not even know yourself?",
|
||||
"Whether it is%1 or not does not change the way the world is.",
|
||||
),
|
||||
),
|
||||
# e.g. "is there a doctor in the house?"
|
||||
(
|
||||
r"is there (.*)\?",
|
||||
("There is%1 if you believe there is.", "It is possible that there is%1."),
|
||||
),
|
||||
# e.g. "is it possible?", "is this true?"
|
||||
(r"is(.*)\?", ("%1 is not relevant.", "Does this matter?")),
|
||||
# non-specific question
|
||||
(
|
||||
r"(.*)\?",
|
||||
(
|
||||
"Do you think %1?",
|
||||
"You seek the truth. Does the truth seek you?",
|
||||
"If you intentionally pursue the answers to your questions, the answers become hard to see.",
|
||||
"The answer to your question cannot be told. It must be experienced.",
|
||||
),
|
||||
),
|
||||
# expression of hate of form "I hate you" or "Kelly hates cheese"
|
||||
(
|
||||
r"(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)",
|
||||
(
|
||||
"Perhaps it is not about hating %2, but about hate from within.",
|
||||
"Weeds only grow when we dislike them",
|
||||
"Hate is a very strong emotion.",
|
||||
),
|
||||
),
|
||||
# statement containing the word 'truth'
|
||||
(
|
||||
r"(.*) truth(.*)",
|
||||
(
|
||||
"Seek truth, and truth will seek you.",
|
||||
"Remember, it is not the spoon which bends - only yourself.",
|
||||
"The search for truth is a long journey.",
|
||||
),
|
||||
),
|
||||
# desire to do an action
|
||||
# e.g. "I want to go shopping"
|
||||
(
|
||||
r"i want to (.*)",
|
||||
("You may %1 if your heart truly desires to.", "You may have to %1."),
|
||||
),
|
||||
# desire for an object
|
||||
# e.g. "I want a pony"
|
||||
(
|
||||
r"i want (.*)",
|
||||
(
|
||||
"Does your heart truly desire %1?",
|
||||
"Is this a desire of the heart, or of the mind?",
|
||||
),
|
||||
),
|
||||
# e.g. "I can't wait" or "I can't do this"
|
||||
(
|
||||
r"i can\'t (.*)",
|
||||
(
|
||||
"What we can and can't do is a limitation of the mind.",
|
||||
"There are limitations of the body, and limitations of the mind.",
|
||||
"Have you tried to%1 with a clear mind?",
|
||||
),
|
||||
),
|
||||
# "I think.." indicates uncertainty. e.g. "I think so."
|
||||
# problem: exceptions...
|
||||
# e.g. "I think, therefore I am"
|
||||
(
|
||||
r"i think (.*)",
|
||||
(
|
||||
"Uncertainty in an uncertain world.",
|
||||
"Indeed, how can we be certain of anything in such uncertain times.",
|
||||
"Are you not, in fact, certain that%1?",
|
||||
),
|
||||
),
|
||||
# "I feel...emotions/sick/light-headed..."
|
||||
(
|
||||
r"i feel (.*)",
|
||||
(
|
||||
"Your body and your emotions are both symptoms of your mind."
|
||||
"What do you believe is the root of such feelings?",
|
||||
"Feeling%1 can be a sign of your state-of-mind.",
|
||||
),
|
||||
),
|
||||
# exclamation mark indicating emotion
|
||||
# e.g. "Wow!" or "No!"
|
||||
(
|
||||
r"(.*)!",
|
||||
(
|
||||
"I sense that you are feeling emotional today.",
|
||||
"You need to calm your emotions.",
|
||||
),
|
||||
),
|
||||
# because [statement]
|
||||
# e.g. "because I said so"
|
||||
(
|
||||
r"because (.*)",
|
||||
(
|
||||
"Does knowning the reasons behind things help you to understand"
|
||||
" the things themselves?",
|
||||
"If%1, what else must be true?",
|
||||
),
|
||||
),
|
||||
# yes or no - raise an issue of certainty/correctness
|
||||
(
|
||||
r"(yes)|(no)",
|
||||
(
|
||||
"Is there certainty in an uncertain world?",
|
||||
"It is better to be right than to be certain.",
|
||||
),
|
||||
),
|
||||
# sentence containing word 'love'
|
||||
(
|
||||
r"(.*)love(.*)",
|
||||
(
|
||||
"Think of the trees: they let the birds perch and fly with no intention to call them when they come, and no longing for their return when they fly away. Let your heart be like the trees.",
|
||||
"Free love!",
|
||||
),
|
||||
),
|
||||
# sentence containing word 'understand'
|
||||
(
|
||||
r"(.*)understand(.*)",
|
||||
(
|
||||
"If you understand, things are just as they are;"
|
||||
" if you do not understand, things are just as they are.",
|
||||
"Imagination is more important than knowledge.",
|
||||
),
|
||||
),
|
||||
# 'I', 'me', 'my' - person is talking about themselves.
|
||||
# this breaks down when words contain these - eg 'Thyme', 'Irish'
|
||||
(
|
||||
r"(.*)(me )|( me)|(my)|(mine)|(i)(.*)",
|
||||
(
|
||||
"'I', 'me', 'my'... these are selfish expressions.",
|
||||
"Have you ever considered that you might be a selfish person?",
|
||||
"Try to consider others, not just yourself.",
|
||||
"Think not just of yourself, but of others.",
|
||||
),
|
||||
),
|
||||
# 'you' starting a sentence
|
||||
# e.g. "you stink!"
|
||||
(
|
||||
r"you (.*)",
|
||||
("My path is not of concern to you.", "I am but one, and you but one more."),
|
||||
),
|
||||
# say goodbye with some extra Zen wisdom.
|
||||
(
|
||||
r"exit",
|
||||
(
|
||||
"Farewell. The obstacle is the path.",
|
||||
"Farewell. Life is a journey, not a destination.",
|
||||
"Good bye. We are cups, constantly and quietly being filled."
|
||||
"\nThe trick is knowning how to tip ourselves over and let the beautiful stuff out.",
|
||||
),
|
||||
),
|
||||
# fall through case -
|
||||
# when stumped, respond with generic zen wisdom
|
||||
#
|
||||
(
|
||||
r"(.*)",
|
||||
(
|
||||
"When you're enlightened, every word is wisdom.",
|
||||
"Random talk is useless.",
|
||||
"The reverse side also has a reverse side.",
|
||||
"Form is emptiness, and emptiness is form.",
|
||||
"I pour out a cup of water. Is the cup empty?",
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
zen_chatbot = Chat(responses, reflections)
|
||||
|
||||
|
||||
def zen_chat():
|
||||
print("*" * 75)
|
||||
print("Zen Chatbot!".center(75))
|
||||
print("*" * 75)
|
||||
print('"Look beyond mere words and letters - look into your mind"'.center(75))
|
||||
print("* Talk your way to truth with Zen Chatbot.")
|
||||
print("* Type 'quit' when you have had enough.")
|
||||
print("*" * 75)
|
||||
print("Welcome, my child.")
|
||||
|
||||
zen_chatbot.converse()
|
||||
|
||||
|
||||
def demo():
|
||||
zen_chat()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo()
|
||||
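Since the table above is scanned top to bottom, ordering is significant: a specific rule must appear before the generic (.*) fallback or it will never fire. A quick sketch (assuming nltk is importable):

from nltk.chat.zen import zen_chatbot

print(zen_chatbot.respond("can i know truth?"))  # matched by the earlier 'can i (.*)\?' rule
print(zen_chatbot.respond("random talk"))        # falls through to the final (.*) rule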
205
Backend/venv/lib/python3.12/site-packages/nltk/chunk/__init__.py
Normal file
@@ -0,0 +1,205 @@
|
||||
# Natural Language Toolkit: Chunkers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
|
||||
"""
|
||||
Classes and interfaces for identifying non-overlapping linguistic
|
||||
groups (such as base noun phrases) in unrestricted text. This task is
|
||||
called "chunk parsing" or "chunking", and the identified groups are
|
||||
called "chunks". The chunked text is represented using a shallow
|
||||
tree called a "chunk structure." A chunk structure is a tree
|
||||
containing tokens and chunks, where each chunk is a subtree containing
|
||||
only tokens. For example, the chunk structure for base noun phrase
|
||||
chunks in the sentence "I saw the big dog on the hill" is::
|
||||
|
||||
(SENTENCE:
|
||||
(NP: <I>)
|
||||
<saw>
|
||||
(NP: <the> <big> <dog>)
|
||||
<on>
|
||||
(NP: <the> <hill>))
|
||||
|
||||
To convert a chunk structure back to a list of tokens, simply use the
|
||||
chunk structure's ``leaves()`` method.
|
||||
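As a hedged sketch of that representation, the docstring's example structure can be built by hand with nltk.tree.Tree and flattened with leaves():

from nltk.tree import Tree

# chunks are subtrees containing only tokens; unchunked tokens sit at the top level
s = Tree("SENTENCE", [Tree("NP", ["I"]), "saw",
                      Tree("NP", ["the", "big", "dog"]),
                      "on", Tree("NP", ["the", "hill"])])
print(s.leaves())  # ['I', 'saw', 'the', 'big', 'dog', 'on', 'the', 'hill']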
|
||||
This module defines ``ChunkParserI``, a standard interface for
|
||||
chunking texts; and ``RegexpChunkParser``, a regular-expression based
|
||||
implementation of that interface. It also defines ``ChunkScore``, a
|
||||
utility class for scoring chunk parsers.
|
||||
|
||||
RegexpChunkParser
|
||||
=================
|
||||
|
||||
``RegexpChunkParser`` is an implementation of the chunk parser interface
|
||||
that uses regular-expressions over tags to chunk a text. Its
|
||||
``parse()`` method first constructs a ``ChunkString``, which encodes a
|
||||
particular chunking of the input text. Initially, nothing is
|
||||
chunked. ``parse.RegexpChunkParser`` then applies a sequence of
|
||||
``RegexpChunkRule`` rules to the ``ChunkString``, each of which modifies
|
||||
the chunking that it encodes. Finally, the ``ChunkString`` is
|
||||
transformed back into a chunk structure, which is returned.
|
||||
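A minimal sketch of that workflow through the public RegexpParser wrapper (the grammar and sentence are invented for illustration):

import nltk

grammar = r"NP: {<DT>?<JJ>*<NN.*>}"  # one chunk rule, written as a tag pattern
cp = nltk.RegexpParser(grammar)
tagged = [("the", "DT"), ("big", "JJ"), ("dog", "NN"),
          ("on", "IN"), ("the", "DT"), ("hill", "NN")]
print(cp.parse(tagged))
# (S (NP the/DT big/JJ dog/NN) on/IN (NP the/DT hill/NN))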
|
||||
``RegexpChunkParser`` can only be used to chunk a single kind of phrase.
|
||||
For example, you can use a ``RegexpChunkParser`` to chunk the noun
|
||||
phrases in a text, or the verb phrases in a text; but you cannot
|
||||
use it to simultaneously chunk both noun phrases and verb phrases in
|
||||
the same text. (This is a limitation of ``RegexpChunkParser``, not of
|
||||
chunk parsers in general.)
|
||||
|
||||
RegexpChunkRules
|
||||
----------------
|
||||
|
||||
A ``RegexpChunkRule`` is a transformational rule that updates the
|
||||
chunking of a text by modifying its ``ChunkString``. Each
|
||||
``RegexpChunkRule`` defines the ``apply()`` method, which modifies
|
||||
the chunking encoded by a ``ChunkString``. The
|
||||
``RegexpChunkRule`` class itself can be used to implement any
|
||||
transformational rule based on regular expressions. There are
|
||||
also a number of subclasses, which can be used to implement
|
||||
simpler types of rules:
|
||||
|
||||
- ``ChunkRule`` chunks anything that matches a given regular
|
||||
expression.
|
||||
- ``StripRule`` strips anything that matches a given regular
|
||||
expression.
|
||||
- ``UnChunkRule`` will un-chunk any chunk that matches a given
|
||||
regular expression.
|
||||
- ``MergeRule`` can be used to merge two contiguous chunks.
|
||||
- ``SplitRule`` can be used to split a single chunk into two
|
||||
smaller chunks.
|
||||
- ``ExpandLeftRule`` will expand a chunk to incorporate new
|
||||
unchunked material on the left.
|
||||
- ``ExpandRightRule`` will expand a chunk to incorporate new
|
||||
unchunked material on the right.
|
||||
|
||||
Tag Patterns
|
||||
~~~~~~~~~~~~
|
||||
|
||||
A ``RegexpChunkRule`` uses a modified version of regular
|
||||
expression patterns, called "tag patterns". Tag patterns are
|
||||
used to match sequences of tags. Examples of tag patterns are::
|
||||
|
||||
r'(<DT>|<JJ>|<NN>)+'
|
||||
r'<NN>+'
|
||||
r'<NN.*>'
|
||||
|
||||
The differences between regular expression patterns and tag
|
||||
patterns are:
|
||||
|
||||
- In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so
|
||||
``'<NN>+'`` matches one or more repetitions of ``'<NN>'``, not
|
||||
``'<NN'`` followed by one or more repetitions of ``'>'``.
|
||||
- Whitespace in tag patterns is ignored. So
|
||||
``'<DT> | <NN>'`` is equivalent to ``'<DT>|<NN>'``
|
||||
- In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so
|
||||
``'<NN.*>'`` matches any single tag starting with ``'NN'``.
|
||||
|
||||
The function ``tag_pattern2re_pattern`` can be used to transform
|
||||
a tag pattern to an equivalent regular expression pattern.
|
||||
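For instance (a sketch; the exact spelling of the returned regexp is an implementation detail):

from nltk.chunk.regexp import tag_pattern2re_pattern

# whitespace is ignored, and '<'/'>' act as grouping parentheses
print(tag_pattern2re_pattern("<DT> | <NN.*>"))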
|
||||
Efficiency
|
||||
----------
|
||||
|
||||
Preliminary tests indicate that ``RegexpChunkParser`` can chunk at a
|
||||
rate of about 300 tokens/second, with a moderately complex rule set.
|
||||
|
||||
There may be problems if ``RegexpChunkParser`` is used with more than
|
||||
5,000 tokens at a time. In particular, evaluation of some regular
|
||||
expressions may cause the Python regular expression engine to
|
||||
exceed its maximum recursion depth. We have attempted to minimize
|
||||
these problems, but it is impossible to avoid them completely. We
|
||||
therefore recommend that you apply the chunk parser to a single
|
||||
sentence at a time.
|
||||
|
||||
Emacs Tip
|
||||
---------
|
||||
|
||||
If you evaluate the following elisp expression in emacs, it will
|
||||
colorize a ``ChunkString`` when you use an interactive python shell
|
||||
with emacs or xemacs ("C-c !")::
|
||||
|
||||
(let ()
|
||||
(defconst comint-mode-font-lock-keywords
|
||||
'(("<[^>]+>" 0 'font-lock-reference-face)
|
||||
("[{}]" 0 'font-lock-function-name-face)))
|
||||
(add-hook 'comint-mode-hook (lambda () (turn-on-font-lock))))
|
||||
|
||||
You can evaluate this code by copying it to a temporary buffer,
|
||||
placing the cursor after the last close parenthesis, and typing
|
||||
"``C-x C-e``". You should evaluate it before running the interactive
|
||||
session. The change will last until you close emacs.
|
||||
|
||||
Unresolved Issues
|
||||
-----------------
|
||||
|
||||
If we use the ``re`` module for regular expressions, Python's
|
||||
regular expression engine generates "maximum recursion depth
|
||||
exceeded" errors when processing very large texts, even for
|
||||
regular expressions that should not require any recursion. We
|
||||
therefore use the ``pre`` module instead. But note that ``pre``
|
||||
does not include Unicode support, so this module will not work
|
||||
with unicode strings. Note also that ``pre`` regular expressions
|
||||
are not quite as advanced as ``re`` ones (e.g., no leftward
|
||||
zero-length assertions).
|
||||
|
||||
:type CHUNK_TAG_PATTERN: regexp
|
||||
:var CHUNK_TAG_PATTERN: A regular expression to test whether a tag
|
||||
pattern is valid.
|
||||
"""
|
||||
|
||||
from nltk.chunk.api import ChunkParserI
|
||||
from nltk.chunk.named_entity import Maxent_NE_Chunker
|
||||
from nltk.chunk.regexp import RegexpChunkParser, RegexpParser
|
||||
from nltk.chunk.util import (
|
||||
ChunkScore,
|
||||
accuracy,
|
||||
conllstr2tree,
|
||||
conlltags2tree,
|
||||
ieerstr2tree,
|
||||
tagstr2tree,
|
||||
tree2conllstr,
|
||||
tree2conlltags,
|
||||
)
|
||||
|
||||
|
||||
def ne_chunker(fmt="multiclass"):
|
||||
"""
|
||||
Load NLTK's currently recommended named entity chunker.
|
||||
"""
|
||||
return Maxent_NE_Chunker(fmt)
|
||||
|
||||
|
||||
def ne_chunk(tagged_tokens, binary=False):
|
||||
"""
|
||||
Use NLTK's currently recommended named entity chunker to
|
||||
chunk the given list of tagged tokens.
|
||||
|
||||
>>> from nltk.chunk import ne_chunk
|
||||
>>> from nltk.corpus import treebank
|
||||
>>> from pprint import pprint
|
||||
>>> pprint(ne_chunk(treebank.tagged_sents()[2][8:14])) # doctest: +NORMALIZE_WHITESPACE
|
||||
Tree('S', [('chairman', 'NN'), ('of', 'IN'), Tree('ORGANIZATION', [('Consolidated', 'NNP'), ('Gold', 'NNP'), ('Fields', 'NNP')]), ('PLC', 'NNP')])
|
||||
|
||||
"""
|
||||
if binary:
|
||||
chunker = ne_chunker(fmt="binary")
|
||||
else:
|
||||
chunker = ne_chunker()
|
||||
return chunker.parse(tagged_tokens)
|
||||
|
||||
|
||||
def ne_chunk_sents(tagged_sentences, binary=False):
|
||||
"""
|
||||
Use NLTK's currently recommended named entity chunker to chunk the
|
||||
given list of tagged sentences, each consisting of a list of tagged tokens.
|
||||
"""
|
||||
if binary:
|
||||
chunker = ne_chunker(fmt="binary")
|
||||
else:
|
||||
chunker = ne_chunker()
|
||||
return chunker.parse_sents(tagged_sentences)
|
||||
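A sketch of the usual end-to-end pipeline for these helpers; the resource names are assumptions about the standard NLTK data packages and must be fetched with nltk.download() first:

import nltk

# assumes 'punkt_tab', 'averaged_perceptron_tagger_eng', 'words' and
# 'maxent_ne_chunker_tab' have already been downloaded
tokens = nltk.word_tokenize("Steven Bird founded the NLTK Project.")
tagged = nltk.pos_tag(tokens)
print(nltk.ne_chunk(tagged))               # multiclass labels such as PERSON
print(nltk.ne_chunk(tagged, binary=True))  # a single generic NE label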
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
56
Backend/venv/lib/python3.12/site-packages/nltk/chunk/api.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# Natural Language Toolkit: Chunk parsing API
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## Chunk Parser Interface
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
from nltk.chunk.util import ChunkScore
|
||||
from nltk.internals import deprecated
|
||||
from nltk.parse import ParserI
|
||||
|
||||
|
||||
class ChunkParserI(ParserI):
|
||||
"""
|
||||
A processing interface for identifying non-overlapping groups in
|
||||
unrestricted text. Typically, chunk parsers are used to find base
|
||||
syntactic constituents, such as base noun phrases. Unlike
|
||||
``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method
|
||||
will always generate a parse.
|
||||
"""
|
||||
|
||||
def parse(self, tokens):
|
||||
"""
|
||||
Return the best chunk structure for the given tokens,
|
||||
as a tree.
|
||||
|
||||
:param tokens: The list of (word, tag) tokens to be chunked.
|
||||
:type tokens: list(tuple)
|
||||
:rtype: Tree
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
@deprecated("Use accuracy(gold) instead.")
|
||||
def evaluate(self, gold):
|
||||
return self.accuracy(gold)
|
||||
|
||||
def accuracy(self, gold):
|
||||
"""
|
||||
Score the accuracy of the chunker against the gold standard.
|
||||
Remove the chunking from the gold standard text, rechunk it using
|
||||
the chunker, and return a ``ChunkScore`` object
|
||||
reflecting the performance of this chunk parser.
|
||||
|
||||
:type gold: list(Tree)
|
||||
:param gold: The list of chunked sentences to score the chunker on.
|
||||
:rtype: ChunkScore
|
||||
"""
|
||||
chunkscore = ChunkScore()
|
||||
for correct in gold:
|
||||
chunkscore.score(correct, self.parse(correct.leaves()))
|
||||
return chunkscore
|
||||
407
Backend/venv/lib/python3.12/site-packages/nltk/chunk/named_entity.py
Normal file
@@ -0,0 +1,407 @@
|
||||
# Natural Language Toolkit: Chunk parsing API
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Eric Kafe <kafe.eric@gmail.com> (tab-format models)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Named entity chunker
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
from nltk.tag import ClassifierBasedTagger, pos_tag
|
||||
|
||||
try:
|
||||
from nltk.classify import MaxentClassifier
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
from nltk.chunk.api import ChunkParserI
|
||||
from nltk.chunk.util import ChunkScore
|
||||
from nltk.data import find
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.tree import Tree
|
||||
|
||||
|
||||
class NEChunkParserTagger(ClassifierBasedTagger):
|
||||
"""
|
||||
The IOB tagger used by the chunk parser.
|
||||
"""
|
||||
|
||||
def __init__(self, train=None, classifier=None):
|
||||
ClassifierBasedTagger.__init__(
|
||||
self,
|
||||
train=train,
|
||||
classifier_builder=self._classifier_builder,
|
||||
classifier=classifier,
|
||||
)
|
||||
|
||||
def _classifier_builder(self, train):
|
||||
return MaxentClassifier.train(
|
||||
# "megam" cannot be the default algorithm since it requires compiling with ocaml
|
||||
train,
|
||||
algorithm="iis",
|
||||
gaussian_prior_sigma=1,
|
||||
trace=2,
|
||||
)
|
||||
|
||||
def _english_wordlist(self):
|
||||
try:
|
||||
wl = self._en_wordlist
|
||||
except AttributeError:
|
||||
from nltk.corpus import words
|
||||
|
||||
self._en_wordlist = set(words.words("en-basic"))
|
||||
wl = self._en_wordlist
|
||||
return wl
|
||||
|
||||
def _feature_detector(self, tokens, index, history):
|
||||
word = tokens[index][0]
|
||||
pos = simplify_pos(tokens[index][1])
|
||||
if index == 0:
|
||||
prevword = prevprevword = None
|
||||
prevpos = prevprevpos = None
|
||||
prevshape = prevtag = prevprevtag = None
|
||||
elif index == 1:
|
||||
prevword = tokens[index - 1][0].lower()
|
||||
prevprevword = None
|
||||
prevpos = simplify_pos(tokens[index - 1][1])
|
||||
prevprevpos = None
|
||||
prevtag = history[index - 1][0]
|
||||
prevshape = prevprevtag = None
|
||||
else:
|
||||
prevword = tokens[index - 1][0].lower()
|
||||
prevprevword = tokens[index - 2][0].lower()
|
||||
prevpos = simplify_pos(tokens[index - 1][1])
|
||||
prevprevpos = simplify_pos(tokens[index - 2][1])
|
||||
prevtag = history[index - 1]
|
||||
prevprevtag = history[index - 2]
|
||||
prevshape = shape(prevword)
|
||||
if index == len(tokens) - 1:
|
||||
nextword = nextnextword = None
|
||||
nextpos = nextnextpos = None
|
||||
elif index == len(tokens) - 2:
|
||||
nextword = tokens[index + 1][0].lower()
|
||||
nextpos = tokens[index + 1][1].lower()
|
||||
nextnextword = None
|
||||
nextnextpos = None
|
||||
else:
|
||||
nextword = tokens[index + 1][0].lower()
|
||||
nextpos = tokens[index + 1][1].lower()
|
||||
nextnextword = tokens[index + 2][0].lower()
|
||||
nextnextpos = tokens[index + 2][1].lower()
|
||||
|
||||
# 89.6
|
||||
features = {
|
||||
"bias": True,
|
||||
"shape": shape(word),
|
||||
"wordlen": len(word),
|
||||
"prefix3": word[:3].lower(),
|
||||
"suffix3": word[-3:].lower(),
|
||||
"pos": pos,
|
||||
"word": word,
|
||||
"en-wordlist": (word in self._english_wordlist()),
|
||||
"prevtag": prevtag,
|
||||
"prevpos": prevpos,
|
||||
"nextpos": nextpos,
|
||||
"prevword": prevword,
|
||||
"nextword": nextword,
|
||||
"word+nextpos": f"{word.lower()}+{nextpos}",
|
||||
"pos+prevtag": f"{pos}+{prevtag}",
|
||||
"shape+prevtag": f"{prevshape}+{prevtag}",
|
||||
}
|
||||
|
||||
return features
|
||||
|
||||
|
||||
class NEChunkParser(ChunkParserI):
|
||||
"""
|
||||
Expected input: list of pos-tagged words
|
||||
"""
|
||||
|
||||
def __init__(self, train):
|
||||
self._train(train)
|
||||
|
||||
def parse(self, tokens):
|
||||
"""
|
||||
Each token should be a pos-tagged word
|
||||
"""
|
||||
tagged = self._tagger.tag(tokens)
|
||||
tree = self._tagged_to_parse(tagged)
|
||||
return tree
|
||||
|
||||
def _train(self, corpus):
|
||||
# Convert to tagged sequence
|
||||
corpus = [self._parse_to_tagged(s) for s in corpus]
|
||||
|
||||
self._tagger = NEChunkParserTagger(train=corpus)
|
||||
|
||||
def _tagged_to_parse(self, tagged_tokens):
|
||||
"""
|
||||
Convert a list of tagged tokens to a chunk-parse tree.
|
||||
"""
|
||||
sent = Tree("S", [])
|
||||
|
||||
for tok, tag in tagged_tokens:
|
||||
if tag == "O":
|
||||
sent.append(tok)
|
||||
elif tag.startswith("B-"):
|
||||
sent.append(Tree(tag[2:], [tok]))
|
||||
elif tag.startswith("I-"):
|
||||
if sent and isinstance(sent[-1], Tree) and sent[-1].label() == tag[2:]:
|
||||
sent[-1].append(tok)
|
||||
else:
|
||||
sent.append(Tree(tag[2:], [tok]))
|
||||
return sent
|
||||
|
||||
@staticmethod
|
||||
def _parse_to_tagged(sent):
|
||||
"""
|
||||
Convert a chunk-parse tree to a list of tagged tokens.
|
||||
"""
|
||||
toks = []
|
||||
for child in sent:
|
||||
if isinstance(child, Tree):
|
||||
if len(child) == 0:
|
||||
print("Warning -- empty chunk in sentence")
|
||||
continue
|
||||
toks.append((child[0], f"B-{child.label()}"))
|
||||
for tok in child[1:]:
|
||||
toks.append((tok, f"I-{child.label()}"))
|
||||
else:
|
||||
toks.append((child, "O"))
|
||||
return toks
|
||||
|
||||
|
||||
def shape(word):
|
||||
if re.match(r"[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word, re.UNICODE):
|
||||
return "number"
|
||||
elif re.match(r"\W+$", word, re.UNICODE):
|
||||
return "punct"
|
||||
elif re.match(r"\w+$", word, re.UNICODE):
|
||||
if word.istitle():
|
||||
return "upcase"
|
||||
elif word.islower():
|
||||
return "downcase"
|
||||
else:
|
||||
return "mixedcase"
|
||||
else:
|
||||
return "other"
|
||||
|
||||
|
||||
def simplify_pos(s):
|
||||
if s.startswith("V"):
|
||||
return "V"
|
||||
else:
|
||||
return s.split("-")[0]
|
||||
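A few illustrative calls for the two feature helpers above (a sketch):

from nltk.chunk.named_entity import shape, simplify_pos

print(shape("42.5"), shape("Dog"), shape("dog"), shape("NLTK"), shape("!?"))
# -> number upcase downcase mixedcase punct
print(simplify_pos("VBD"), simplify_pos("NN-TL"))
# -> V NN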
|
||||
|
||||
def postag_tree(tree):
|
||||
# Part-of-speech tagging.
|
||||
words = tree.leaves()
|
||||
tag_iter = (pos for (word, pos) in pos_tag(words))
|
||||
newtree = Tree("S", [])
|
||||
for child in tree:
|
||||
if isinstance(child, Tree):
|
||||
newtree.append(Tree(child.label(), []))
|
||||
for subchild in child:
|
||||
newtree[-1].append((subchild, next(tag_iter)))
|
||||
else:
|
||||
newtree.append((child, next(tag_iter)))
|
||||
return newtree
|
||||
|
||||
|
||||
def load_ace_data(roots, fmt="binary", skip_bnews=True):
|
||||
for root in roots:
|
||||
for root, dirs, files in os.walk(root):
|
||||
if root.endswith("bnews") and skip_bnews:
|
||||
continue
|
||||
for f in files:
|
||||
if f.endswith(".sgm"):
|
||||
yield from load_ace_file(os.path.join(root, f), fmt)
|
||||
|
||||
|
||||
def load_ace_file(textfile, fmt):
|
||||
print(f" - {os.path.split(textfile)[1]}")
|
||||
annfile = textfile + ".tmx.rdc.xml"
|
||||
|
||||
# Read the xml file, and get a list of entities
|
||||
entities = []
|
||||
with open(annfile) as infile:
|
||||
xml = ET.parse(infile).getroot()
|
||||
for entity in xml.findall("document/entity"):
|
||||
typ = entity.find("entity_type").text
|
||||
for mention in entity.findall("entity_mention"):
|
||||
if mention.get("TYPE") != "NAME":
|
||||
continue # only NEs
|
||||
s = int(mention.find("head/charseq/start").text)
|
||||
e = int(mention.find("head/charseq/end").text) + 1
|
||||
entities.append((s, e, typ))
|
||||
|
||||
# Read the text file, and mark the entities.
|
||||
with open(textfile) as infile:
|
||||
text = infile.read()
|
||||
|
||||
# Strip XML tags, since they don't count towards the indices
|
||||
text = re.sub("<(?!/?TEXT)[^>]+>", "", text)
|
||||
|
||||
# Blank out anything before/after <TEXT>
|
||||
def subfunc(m):
|
||||
return " " * (m.end() - m.start() - 6)
|
||||
|
||||
text = re.sub(r"[\s\S]*<TEXT>", subfunc, text)
|
||||
text = re.sub(r"</TEXT>[\s\S]*", "", text)
|
||||
|
||||
# Simplify quotes
|
||||
text = re.sub("``", ' "', text)
|
||||
text = re.sub("''", '" ', text)
|
||||
|
||||
entity_types = {typ for (s, e, typ) in entities}
|
||||
|
||||
# Binary distinction (NE or not NE)
|
||||
if fmt == "binary":
|
||||
i = 0
|
||||
toks = Tree("S", [])
|
||||
for s, e, typ in sorted(entities):
|
||||
if s < i:
|
||||
s = i # Overlapping! Deal with this better?
|
||||
if e <= s:
|
||||
continue
|
||||
toks.extend(word_tokenize(text[i:s]))
|
||||
toks.append(Tree("NE", text[s:e].split()))
|
||||
i = e
|
||||
toks.extend(word_tokenize(text[i:]))
|
||||
yield toks
|
||||
|
||||
# Multiclass distinction (NE type)
|
||||
elif fmt == "multiclass":
|
||||
i = 0
|
||||
toks = Tree("S", [])
|
||||
for s, e, typ in sorted(entities):
|
||||
if s < i:
|
||||
s = i # Overlapping! Deal with this better?
|
||||
if e <= s:
|
||||
continue
|
||||
toks.extend(word_tokenize(text[i:s]))
|
||||
toks.append(Tree(typ, text[s:e].split()))
|
||||
i = e
|
||||
toks.extend(word_tokenize(text[i:]))
|
||||
yield toks
|
||||
|
||||
else:
|
||||
raise ValueError("bad fmt value")
|
||||
|
||||
|
||||
# This probably belongs in a more general-purpose location (as does
|
||||
# the parse_to_tagged function).
|
||||
def cmp_chunks(correct, guessed):
|
||||
correct = NEChunkParser._parse_to_tagged(correct)
|
||||
guessed = NEChunkParser._parse_to_tagged(guessed)
|
||||
ellipsis = False
|
||||
for (w, ct), (w, gt) in zip(correct, guessed):
|
||||
if ct == gt == "O":
|
||||
if not ellipsis:
|
||||
print(f" {ct:15} {gt:15} {w}")
|
||||
print(" {:15} {:15} {2}".format("...", "...", "..."))
|
||||
ellipsis = True
|
||||
else:
|
||||
ellipsis = False
|
||||
print(f" {ct:15} {gt:15} {w}")
|
||||
|
||||
|
||||
# ======================================================================================
|
||||
|
||||
|
||||
class Maxent_NE_Chunker(NEChunkParser):
|
||||
"""
|
||||
Expected input: list of pos-tagged words
|
||||
"""
|
||||
|
||||
def __init__(self, fmt="multiclass"):
|
||||
from nltk.data import find
|
||||
|
||||
self._fmt = fmt
|
||||
self._tab_dir = find(f"chunkers/maxent_ne_chunker_tab/english_ace_{fmt}/")
|
||||
self.load_params()
|
||||
|
||||
def load_params(self):
|
||||
from nltk.classify.maxent import BinaryMaxentFeatureEncoding, load_maxent_params
|
||||
|
||||
wgt, mpg, lab, aon = load_maxent_params(self._tab_dir)
|
||||
mc = MaxentClassifier(
|
||||
BinaryMaxentFeatureEncoding(lab, mpg, alwayson_features=aon), wgt
|
||||
)
|
||||
self._tagger = NEChunkParserTagger(classifier=mc)
|
||||
|
||||
def save_params(self):
|
||||
from nltk.classify.maxent import save_maxent_params
|
||||
|
||||
classif = self._tagger._classifier
|
||||
ecg = classif._encoding
|
||||
wgt = classif._weights
|
||||
mpg = ecg._mapping
|
||||
lab = ecg._labels
|
||||
aon = ecg._alwayson
|
||||
fmt = self._fmt
|
||||
save_maxent_params(wgt, mpg, lab, aon, tab_dir=f"/tmp/english_ace_{fmt}/")
|
||||
|
||||
|
||||
def build_model(fmt="multiclass"):
|
||||
chunker = Maxent_NE_Chunker(fmt)
|
||||
chunker.save_params()
|
||||
return chunker
|
||||
|
||||
|
||||
# ======================================================================================
|
||||
|
||||
"""
|
||||
2024 update: pickles are not supported anymore.
|
||||
|
||||
Deprecated:
|
||||
|
||||
def build_model(fmt="binary"):
|
||||
print("Loading training data...")
|
||||
train_paths = [
|
||||
find("corpora/ace_data/ace.dev"),
|
||||
find("corpora/ace_data/ace.heldout"),
|
||||
find("corpora/ace_data/bbn.dev"),
|
||||
find("corpora/ace_data/muc.dev"),
|
||||
]
|
||||
train_trees = load_ace_data(train_paths, fmt)
|
||||
train_data = [postag_tree(t) for t in train_trees]
|
||||
print("Training...")
|
||||
cp = NEChunkParser(train_data)
|
||||
del train_data
|
||||
|
||||
print("Loading eval data...")
|
||||
eval_paths = [find("corpora/ace_data/ace.eval")]
|
||||
eval_trees = load_ace_data(eval_paths, fmt)
|
||||
eval_data = [postag_tree(t) for t in eval_trees]
|
||||
|
||||
print("Evaluating...")
|
||||
chunkscore = ChunkScore()
|
||||
for i, correct in enumerate(eval_data):
|
||||
guess = cp.parse(correct.leaves())
|
||||
chunkscore.score(correct, guess)
|
||||
if i < 3:
|
||||
cmp_chunks(correct, guess)
|
||||
print(chunkscore)
|
||||
|
||||
outfilename = f"/tmp/ne_chunker_{fmt}.pickle"
|
||||
print(f"Saving chunker to {outfilename}...")
|
||||
|
||||
with open(outfilename, "wb") as outfile:
|
||||
pickle.dump(cp, outfile, -1)
|
||||
|
||||
return cp
|
||||
"""
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Make sure that the object has the right class name:
|
||||
build_model("binary")
|
||||
build_model("multiclass")
|
||||
1474
Backend/venv/lib/python3.12/site-packages/nltk/chunk/regexp.py
Normal file
File diff suppressed because it is too large
642
Backend/venv/lib/python3.12/site-packages/nltk/chunk/util.py
Normal file
@@ -0,0 +1,642 @@
|
||||
# Natural Language Toolkit: Chunk format conversions
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import re
|
||||
|
||||
from nltk.metrics import accuracy as _accuracy
|
||||
from nltk.tag.mapping import map_tag
|
||||
from nltk.tag.util import str2tuple
|
||||
from nltk.tree import Tree
|
||||
|
||||
##//////////////////////////////////////////////////////
|
||||
## EVALUATION
|
||||
##//////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def accuracy(chunker, gold):
|
||||
"""
|
||||
Score the accuracy of the chunker against the gold standard.
|
||||
Strip the chunk information from the gold standard and rechunk it using
|
||||
the chunker, then compute the accuracy score.
|
||||
|
||||
:type chunker: ChunkParserI
|
||||
:param chunker: The chunker being evaluated.
|
||||
:type gold: tree
|
||||
:param gold: The chunk structures to score the chunker on.
|
||||
:rtype: float
|
||||
"""
|
||||
|
||||
gold_tags = []
|
||||
test_tags = []
|
||||
for gold_tree in gold:
|
||||
test_tree = chunker.parse(gold_tree.flatten())
|
||||
gold_tags += tree2conlltags(gold_tree)
|
||||
test_tags += tree2conlltags(test_tree)
|
||||
|
||||
# print 'GOLD:', gold_tags[:50]
|
||||
# print 'TEST:', test_tags[:50]
|
||||
return _accuracy(gold_tags, test_tags)
|
||||
|
||||
|
||||
# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
|
||||
# -- statistics are evaluated only on demand, instead of at every sentence evaluation
|
||||
#
|
||||
# SB: use nltk.metrics for precision/recall scoring?
|
||||
#
|
||||
class ChunkScore:
|
||||
"""
|
||||
A utility class for scoring chunk parsers. ``ChunkScore`` can
|
||||
evaluate a chunk parser's output, based on a number of statistics
|
||||
(precision, recall, f-measure, missed chunks, incorrect chunks).
|
||||
It can also combine the scores from the parsing of multiple texts;
|
||||
this makes it significantly easier to evaluate a chunk parser that
|
||||
operates one sentence at a time.
|
||||
|
||||
Texts are evaluated with the ``score`` method. The results of
|
||||
evaluation can be accessed via a number of accessor methods, such
|
||||
as ``precision`` and ``f_measure``. A typical use of the
|
||||
``ChunkScore`` class is::
|
||||
|
||||
>>> chunkscore = ChunkScore() # doctest: +SKIP
|
||||
>>> for correct in correct_sentences: # doctest: +SKIP
|
||||
... guess = chunkparser.parse(correct.leaves()) # doctest: +SKIP
|
||||
... chunkscore.score(correct, guess) # doctest: +SKIP
|
||||
>>> print('F Measure:', chunkscore.f_measure()) # doctest: +SKIP
|
||||
F Measure: 0.823
|
||||
|
||||
:ivar kwargs: Keyword arguments:
|
||||
|
||||
- max_tp_examples: The maximum number of actual examples of true
|
||||
positives to record. This affects the ``correct`` member
|
||||
function: ``correct`` will not return more than this number
|
||||
of true positive examples. This does *not* affect any of
|
||||
the numerical metrics (precision, recall, or f-measure)
|
||||
|
||||
- max_fp_examples: The maximum number of actual examples of false
|
||||
positives to record. This affects the ``incorrect`` member
|
||||
function and the ``guessed`` member function: ``incorrect``
|
||||
will not return more than this number of examples, and
|
||||
``guessed`` will not return more than this number of true
|
||||
positive examples. This does *not* affect any of the
|
||||
numerical metrics (precision, recall, or f-measure)
|
||||
|
||||
- max_fn_examples: The maximum number of actual examples of false
|
||||
negatives to record. This affects the ``missed`` member
|
||||
function and the ``correct`` member function: ``missed``
|
||||
will not return more than this number of examples, and
|
||||
``correct`` will not return more than this number of true
|
||||
negative examples. This does *not* affect any of the
|
||||
numerical metrics (precision, recall, or f-measure)
|
||||
|
||||
- chunk_label: A regular expression indicating which chunks
|
||||
should be compared. Defaults to ``'.*'`` (i.e., all chunks).
|
||||
|
||||
:type _tp: list(Token)
|
||||
:ivar _tp: List of true positives
|
||||
:type _fp: list(Token)
|
||||
:ivar _fp: List of false positives
|
||||
:type _fn: list(Token)
|
||||
:ivar _fn: List of false negatives
|
||||
|
||||
:type _tp_num: int
|
||||
:ivar _tp_num: Number of true positives
|
||||
:type _fp_num: int
|
||||
:ivar _fp_num: Number of false positives
|
||||
:type _fn_num: int
|
||||
:ivar _fn_num: Number of false negatives.
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self._correct = set()
|
||||
self._guessed = set()
|
||||
self._tp = set()
|
||||
self._fp = set()
|
||||
self._fn = set()
|
||||
self._max_tp = kwargs.get("max_tp_examples", 100)
|
||||
self._max_fp = kwargs.get("max_fp_examples", 100)
|
||||
self._max_fn = kwargs.get("max_fn_examples", 100)
|
||||
self._chunk_label = kwargs.get("chunk_label", ".*")
|
||||
self._tp_num = 0
|
||||
self._fp_num = 0
|
||||
self._fn_num = 0
|
||||
self._count = 0
|
||||
self._tags_correct = 0.0
|
||||
self._tags_total = 0.0
|
||||
|
||||
self._measuresNeedUpdate = False
|
||||
|
||||
def _updateMeasures(self):
|
||||
if self._measuresNeedUpdate:
|
||||
self._tp = self._guessed & self._correct
|
||||
self._fn = self._correct - self._guessed
|
||||
self._fp = self._guessed - self._correct
|
||||
self._tp_num = len(self._tp)
|
||||
self._fp_num = len(self._fp)
|
||||
self._fn_num = len(self._fn)
|
||||
self._measuresNeedUpdate = False
|
||||
|
||||
def score(self, correct, guessed):
|
||||
"""
|
||||
Given a correctly chunked sentence, score another chunked
|
||||
version of the same sentence.
|
||||
|
||||
:type correct: chunk structure
|
||||
:param correct: The known-correct ("gold standard") chunked
|
||||
sentence.
|
||||
:type guessed: chunk structure
|
||||
:param guessed: The chunked sentence to be scored.
|
||||
"""
|
||||
self._correct |= _chunksets(correct, self._count, self._chunk_label)
|
||||
self._guessed |= _chunksets(guessed, self._count, self._chunk_label)
|
||||
self._count += 1
|
||||
self._measuresNeedUpdate = True
|
||||
# Keep track of per-tag accuracy (if possible)
|
||||
try:
|
||||
correct_tags = tree2conlltags(correct)
|
||||
guessed_tags = tree2conlltags(guessed)
|
||||
except ValueError:
|
||||
# This exception case is for nested chunk structures,
|
||||
# where tree2conlltags will fail with a ValueError: "Tree
|
||||
# is too deeply nested to be printed in CoNLL format."
|
||||
correct_tags = guessed_tags = ()
|
||||
self._tags_total += len(correct_tags)
|
||||
self._tags_correct += sum(
|
||||
1 for (t, g) in zip(guessed_tags, correct_tags) if t == g
|
||||
)
|
||||
|
||||
def accuracy(self):
|
||||
"""
|
||||
Return the overall tag-based accuracy for all texts that have
|
||||
been scored by this ``ChunkScore``, using the IOB (conll2000)
|
||||
tag encoding.
|
||||
|
||||
:rtype: float
|
||||
"""
|
||||
if self._tags_total == 0:
|
||||
return 1
|
||||
return self._tags_correct / self._tags_total
|
||||
|
||||
def precision(self):
|
||||
"""
|
||||
Return the overall precision for all texts that have been
|
||||
scored by this ``ChunkScore``.
|
||||
|
||||
:rtype: float
|
||||
"""
|
||||
self._updateMeasures()
|
||||
div = self._tp_num + self._fp_num
|
||||
if div == 0:
|
||||
return 0
|
||||
else:
|
||||
return self._tp_num / div
|
||||
|
||||
def recall(self):
|
||||
"""
|
||||
Return the overall recall for all texts that have been
|
||||
scored by this ``ChunkScore``.
|
||||
|
||||
:rtype: float
|
||||
"""
|
||||
self._updateMeasures()
|
||||
div = self._tp_num + self._fn_num
|
||||
if div == 0:
|
||||
return 0
|
||||
else:
|
||||
return self._tp_num / div
|
||||
|
||||
def f_measure(self, alpha=0.5):
|
||||
"""
|
||||
Return the overall F measure for all texts that have been
|
||||
scored by this ``ChunkScore``.
|
||||
|
||||
:param alpha: the relative weighting of precision and recall.
|
||||
Larger alpha biases the score towards the precision value,
|
||||
while smaller alpha biases the score towards the recall
|
||||
value. ``alpha`` should have a value in the range [0,1].
|
||||
:type alpha: float
|
||||
:rtype: float
|
||||
"""
|
||||
self._updateMeasures()
|
||||
p = self.precision()
|
||||
r = self.recall()
|
||||
if p == 0 or r == 0: # what if alpha is 0 or 1?
|
||||
return 0
|
||||
return 1 / (alpha / p + (1 - alpha) / r)
|
||||
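A quick numeric check of the weighted harmonic mean above (values invented): with p = 0.8, r = 0.6 and the default alpha = 0.5, the formula reduces to the ordinary F1:

1 / (0.5 / 0.8 + 0.5 / 0.6)  # = 1 / (0.625 + 0.8333...) ≈ 0.6857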
|
||||
def missed(self):
|
||||
"""
|
||||
Return the chunks which were included in the
|
||||
correct chunk structures, but not in the guessed chunk
|
||||
structures, listed in input order.
|
||||
|
||||
:rtype: list of chunks
|
||||
"""
|
||||
self._updateMeasures()
|
||||
chunks = list(self._fn)
|
||||
return [c[1] for c in chunks] # discard position information
|
||||
|
||||
def incorrect(self):
|
||||
"""
|
||||
Return the chunks which were included in the guessed chunk structures,
|
||||
but not in the correct chunk structures, listed in input order.
|
||||
|
||||
:rtype: list of chunks
|
||||
"""
|
||||
self._updateMeasures()
|
||||
chunks = list(self._fp)
|
||||
return [c[1] for c in chunks] # discard position information
|
||||
|
||||
def correct(self):
|
||||
"""
|
||||
Return the chunks which were included in the correct
|
||||
chunk structures, listed in input order.
|
||||
|
||||
:rtype: list of chunks
|
||||
"""
|
||||
chunks = list(self._correct)
|
||||
return [c[1] for c in chunks] # discard position information
|
||||
|
||||
def guessed(self):
|
||||
"""
|
||||
Return the chunks which were included in the guessed
|
||||
chunk structures, listed in input order.
|
||||
|
||||
:rtype: list of chunks
|
||||
"""
|
||||
chunks = list(self._guessed)
|
||||
return [c[1] for c in chunks] # discard position information
|
||||
|
||||
def __len__(self):
|
||||
self._updateMeasures()
|
||||
return self._tp_num + self._fn_num
|
||||
|
||||
def __repr__(self):
|
||||
"""
|
||||
Return a concise representation of this ``ChunkScoring``.
|
||||
|
||||
:rtype: str
|
||||
"""
|
||||
return "<ChunkScoring of " + repr(len(self)) + " chunks>"
|
||||
|
||||
def __str__(self):
|
||||
"""
|
||||
Return a verbose representation of this ``ChunkScoring``.
|
||||
This representation includes the precision, recall, and
|
||||
f-measure scores. For other information about the score,
|
||||
use the accessor methods (e.g., ``missed()`` and ``incorrect()``).
|
||||
|
||||
:rtype: str
|
||||
"""
|
||||
return (
|
||||
"ChunkParse score:\n"
|
||||
+ f" IOB Accuracy: {self.accuracy() * 100:5.1f}%\n"
|
||||
+ f" Precision: {self.precision() * 100:5.1f}%\n"
|
||||
+ f" Recall: {self.recall() * 100:5.1f}%\n"
|
||||
+ f" F-Measure: {self.f_measure() * 100:5.1f}%"
|
||||
)
|
||||
|
||||
|
||||
# extract chunks, and assign unique id, the absolute position of
|
||||
# the first word of the chunk
|
||||
def _chunksets(t, count, chunk_label):
|
||||
pos = 0
|
||||
chunks = []
|
||||
for child in t:
|
||||
if isinstance(child, Tree):
|
||||
if re.match(chunk_label, child.label()):
|
||||
chunks.append(((count, pos), child.freeze()))
|
||||
pos += len(child.leaves())
|
||||
else:
|
||||
pos += 1
|
||||
return set(chunks)
|
||||
|
||||
|
||||
def tagstr2tree(
|
||||
s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None
|
||||
):
|
||||
"""
|
||||
Divide a string of bracketted tagged text into
|
||||
chunks and unchunked tokens, and produce a Tree.
|
||||
Chunks are marked by square brackets (``[...]``). Words are
|
||||
delimited by whitespace, and each word should have the form
|
||||
``text/tag``. Words that do not contain a slash are
|
||||
assigned a ``tag`` of None.
|
||||
|
||||
:param s: The string to be converted
|
||||
:type s: str
|
||||
:param chunk_label: The label to use for chunk nodes
|
||||
:type chunk_label: str
|
||||
:param root_label: The label to use for the root of the tree
|
||||
:type root_label: str
|
||||
:rtype: Tree
|
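
    A minimal sketch (bracketed NP chunks, slash-separated tags):

    >>> tagstr2tree("[ the/DT dog/NN ] barked/VBD")
    Tree('S', [Tree('NP', [('the', 'DT'), ('dog', 'NN')]), ('barked', 'VBD')])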
    """

    WORD_OR_BRACKET = re.compile(r"\[|\]|[^\[\]\s]+")

    stack = [Tree(root_label, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == "[":
            if len(stack) != 1:
                raise ValueError(f"Unexpected [ at char {match.start():d}")
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == "]":
            if len(stack) != 2:
                raise ValueError(f"Unexpected ] at char {match.start():d}")
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                word, tag = str2tuple(text, sep)
                if source_tagset and target_tagset:
                    tag = map_tag(source_tagset, target_tagset, tag)
                stack[-1].append((word, tag))

    if len(stack) != 1:
        raise ValueError(f"Expected ] at char {len(s):d}")
    return stack[0]


### CoNLL

_LINE_RE = re.compile(r"(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")


def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CoNLL 2000 style string.
    This function converts a CoNLL IOB string into a tree,
    using the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
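
    A tiny illustrative input (one token per line, in IOB format):

    >>> lines = ["the DT B-NP", "dog NN I-NP", "barked VBD O"]
    >>> print(conllstr2tree("\\n".join(lines)))
    (S (NP the/DT dog/NN) barked/VBD)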
    """

    stack = [Tree(root_label, [])]

    for lineno, line in enumerate(s.split("\n")):
        if not line.strip():
            continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError(f"Error on line {lineno:d}")
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if chunk_types is not None and chunk_type not in chunk_types:
            state = "O"

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" tokens that don't match the previous chunk.
        mismatch_I = state == "I" and chunk_type != stack[-1].label()
        if state in "BO" or mismatch_I:
            if len(stack) == 2:
                stack.pop()

        # For "Begin", start a new chunk.
        if state == "B" or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0]


def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
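
    A round-trip sketch using a small hand-built tree:

    >>> from nltk.tree import Tree
    >>> t = Tree("S", [Tree("NP", [("the", "DT"), ("dog", "NN")]), ("barked", "VBD")])
    >>> tree2conlltags(t)
    [('the', 'DT', 'B-NP'), ('dog', 'NN', 'I-NP'), ('barked', 'VBD', 'O')]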
    """

    tags = []
    for child in t:
        try:
            category = child.label()
            prefix = "B-"
            for contents in child:
                if isinstance(contents, Tree):
                    raise ValueError(
                        "Tree is too deeply nested to be printed in CoNLL format"
                    )
                tags.append((contents[0], contents[1], prefix + category))
                prefix = "I-"
        except AttributeError:
            tags.append((child[0], child[1], "O"))
    return tags


def conlltags2tree(
    sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False
):
    """
    Convert a sequence of ``(word, tag, IOB-tag)`` triples (the CoNLL IOB
    format) to a tree.
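
    The inverse of ``tree2conlltags``, as a minimal sketch:

    >>> conlltags2tree([("the", "DT", "B-NP"), ("dog", "NN", "I-NP"), ("barked", "VBD", "O")])
    Tree('S', [Tree('NP', [('the', 'DT'), ('dog', 'NN')]), ('barked', 'VBD')])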
    """
    tree = Tree(root_label, [])
    for word, postag, chunktag in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as O
                tree.append((word, postag))
        elif chunktag.startswith("B-"):
            tree.append(Tree(chunktag[2:], [(word, postag)]))
        elif chunktag.startswith("I-"):
            if (
                len(tree) == 0
                or not isinstance(tree[-1], Tree)
                or tree[-1].label() != chunktag[2:]
            ):
                if strict:
                    raise ValueError("Bad conll tag sequence")
                else:
                    # Treat as B-*
                    tree.append(Tree(chunktag[2:], [(word, postag)]))
            else:
                tree[-1].append((word, postag))
        elif chunktag == "O":
            tree.append((word, postag))
        else:
            raise ValueError(f"Bad conll tag {chunktag!r}")
    return tree


def tree2conllstr(t):
    """
    Return a multiline string where each line contains a word, tag and IOB tag.
    Convert a tree to the CoNLL IOB string format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
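
    Building on the ``tree2conlltags`` sketch above (hand-built tree for
    illustration):

    >>> from nltk.tree import Tree
    >>> t = Tree("S", [Tree("NP", [("the", "DT"), ("dog", "NN")]), ("barked", "VBD")])
    >>> print(tree2conllstr(t))
    the DT B-NP
    dog NN I-NP
    barked VBD O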
    """
    lines = [" ".join(token) for token in tree2conlltags(t)]
    return "\n".join(lines)


### IEER

_IEER_DOC_RE = re.compile(
    r"<DOC>\s*"
    r"(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?"
    r"(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?"
    r"(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?"
    r"<BODY>\s*"
    r"(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?"
    r"<TEXT>(?P<text>.*?)</TEXT>\s*"
    r"</BODY>\s*</DOC>\s*",
    re.DOTALL,
)

_IEER_TYPE_RE = re.compile(r'<b_\w+\s+[^>]*?type="(?P<type>\w+)"')


def _ieer_read_text(s, root_label):
    stack = [Tree(root_label, [])]
    # s will be None if there is no headline in the text;
    # return the empty list in place of a Tree
    if s is None:
        return []
    for piece_m in re.finditer(r"<[^>]+>|[^\s<]+", s):
        piece = piece_m.group()
        try:
            if piece.startswith("<b_"):
                m = _IEER_TYPE_RE.match(piece)
                if m is None:
                    # A <b_...> tag without a type attribute is malformed.
                    raise ValueError(f"Malformed IEER type tag: {piece!r}")
                chunk = Tree(m.group("type"), [])
                stack[-1].append(chunk)
                stack.append(chunk)
            elif piece.startswith("<e_"):
                stack.pop()
            # elif piece.startswith('<'):
            #     print("ERROR:", piece)
            #     raise ValueError  # Unexpected HTML
            else:
                stack[-1].append(piece)
        except (IndexError, ValueError) as e:
            raise ValueError(
                f"Bad IEER string (error at character {piece_m.start():d})"
            ) from e
    if len(stack) != 1:
        raise ValueError("Bad IEER string")
    return stack[0]


def ieerstr2tree(
    s,
    chunk_types=[
        "LOCATION",
        "ORGANIZATION",
        "PERSON",
        "DURATION",
        "DATE",
        "CARDINAL",
        "PERCENT",
        "MONEY",
        "MEASURE",
    ],
    root_label="S",
):
    """
    Convert a string of chunked tagged text in the IEER named entity
    format into a chunk structure (a Tree). Chunks are of several
    types: LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    :rtype: Tree
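
    A minimal sketch with an invented IEER-style fragment:

    >>> print(ieerstr2tree('It was <b_ENAMEX type="PERSON">Bill<e_ENAMEX> .'))
    (S It was (PERSON Bill) .)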
    """

    # Try looking for a single document. If that doesn't work, then just
    # treat everything as if it was within the <TEXT>...</TEXT>.
    m = _IEER_DOC_RE.match(s)
    if m:
        return {
            "text": _ieer_read_text(m.group("text"), root_label),
            "docno": m.group("docno"),
            "doctype": m.group("doctype"),
            "date_time": m.group("date_time"),
            # 'headline': m.group('headline')
            # we want to capture NEs in the headline too!
            "headline": _ieer_read_text(m.group("headline"), root_label),
        }
    else:
        return _ieer_read_text(s, root_label)


def demo():
    s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
    import nltk

    t = nltk.chunk.tagstr2tree(s, chunk_label="NP")
    t.pprint()
    print()

    s = """
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
"""

    conll_tree = conllstr2tree(s, chunk_types=("NP", "PP"))
    conll_tree.pprint()

    # Demonstrate CoNLL output
    print("CoNLL output:")
    print(nltk.chunk.tree2conllstr(conll_tree))
    print()


if __name__ == "__main__":
    demo()

@@ -0,0 +1,101 @@
# Natural Language Toolkit: Classifiers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Classes and interfaces for labeling tokens with category labels (or
"class labels"). Typically, labels are represented with strings
(such as ``'health'`` or ``'sports'``). Classifiers can be used to
perform a wide range of classification tasks. For example,
classifiers can be used...

- to classify documents by topic
- to classify ambiguous words by which word sense is intended
- to classify acoustic signals by which phoneme they represent
- to classify sentences by their author

Features
========
In order to decide which category label is appropriate for a given
token, classifiers examine one or more 'features' of the token. These
"features" are typically chosen by hand, and indicate which aspects
of the token are relevant to the classification decision. For
example, a document classifier might use a separate feature for each
word, recording how often that word occurred in the document.

Featuresets
===========
The features describing a token are encoded using a "featureset",
which is a dictionary that maps from "feature names" to "feature
values". Feature names are unique strings that indicate what aspect
of the token is encoded by the feature. Examples include
``'prevword'``, for a feature whose value is the previous word; and
``'contains-word(library)'`` for a feature that is true when a document
contains the word ``'library'``. Feature values are typically
booleans, numbers, or strings, depending on which feature they
describe.

Featuresets are typically constructed using a "feature detector"
(also known as a "feature extractor"). A feature detector is a
function that takes a token (and sometimes information about its
context) as its input, and returns a featureset describing that token.
For example, the following feature detector converts a document
(stored as a list of words) to a featureset describing the set of
words included in the document:

>>> # Define a feature detector function.
>>> def document_features(document):
...     return dict([('contains-word(%s)' % w, True) for w in document])

Feature detectors are typically applied to each token before it is fed
to the classifier:

>>> # Classify each Gutenberg document.
>>> from nltk.corpus import gutenberg
>>> for fileid in gutenberg.fileids(): # doctest: +SKIP
...     doc = gutenberg.words(fileid) # doctest: +SKIP
...     print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP

The parameters that a feature detector expects will vary, depending on
the task and the needs of the feature detector. For example, a
feature detector for word sense disambiguation (WSD) might take as its
input a sentence, and the index of a word that should be classified,
and return a featureset for that word. The following feature detector
for WSD includes features describing the left and right contexts of
the target word:

>>> def wsd_features(sentence, index):
...     featureset = {}
...     for i in range(max(0, index-3), index):
...         featureset['left-context(%s)' % sentence[i]] = True
...     for i in range(index, min(index+3, len(sentence))):
...         featureset['right-context(%s)' % sentence[i]] = True
...     return featureset

Training Classifiers
====================
Most classifiers are built by training them on a list of hand-labeled
examples, known as the "training set". Training sets are represented
as lists of ``(featuredict, label)`` tuples.
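
A minimal sketch of training and querying a classifier on such a list
(the feature names and labels here are invented for illustration):

>>> train = [({'contains-word(python)': True}, 'tech'),
...          ({'contains-word(football)': True}, 'sports')]
>>> from nltk.classify import NaiveBayesClassifier
>>> classifier = NaiveBayesClassifier.train(train)
>>> classifier.classify({'contains-word(python)': True})
'tech'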
"""

from nltk.classify.api import ClassifierI, MultiClassifierI
from nltk.classify.decisiontree import DecisionTreeClassifier
from nltk.classify.maxent import (
    BinaryMaxentFeatureEncoding,
    ConditionalExponentialClassifier,
    MaxentClassifier,
    TypedMaxentFeatureEncoding,
)
from nltk.classify.megam import call_megam, config_megam
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier
from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify.senna import Senna
from nltk.classify.textcat import TextCat
from nltk.classify.util import accuracy, apply_features, log_likelihood
from nltk.classify.weka import WekaClassifier, config_weka
Some files were not shown because too many files have changed in this diff