Iliyan Angelov
2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions

View File

@@ -0,0 +1,11 @@
# Natural Language Toolkit: Miscellaneous modules
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.misc.babelfish import babelize_shell
from nltk.misc.chomsky import generate_chomsky
from nltk.misc.minimalset import MinimalSet
from nltk.misc.wordfinder import word_finder

View File

@@ -0,0 +1,10 @@
"""
This module previously provided an interface to the Babelfish online
translation service. That service is no longer available, so the
module is kept in the NLTK source code only to provide a better error
message for people following the NLTK Book 2.0.
"""
def babelize_shell():
print("Babelfish online translation service is no longer available.")

View File

@@ -0,0 +1,134 @@
# Chomsky random text generator, version 1.1, Raymond Hettinger, 2005/09/13
# https://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/440546
"""
CHOMSKY is an aid to writing linguistic papers in the style
of the great master. It is based on selected phrases taken
from actual books and articles written by Noam Chomsky.
Upon request, it assembles the phrases in the elegant
stylistic patterns that Chomsky is noted for.
To generate n sentences of linguistic wisdom, type
(CHOMSKY n) -- for example
(CHOMSKY 5) generates half a screen of linguistic truth.
"""
leadins = """To characterize a linguistic level L,
On the other hand,
This suggests that
It appears that
Furthermore,
We will bring evidence in favor of the following thesis:
To provide a constituent structure for T(Z,K),
From C1, it follows that
For any transformation which is sufficiently diversified in \
application to be of any interest,
Analogously,
Clearly,
Note that
Of course,
Suppose, for instance, that
Thus
With this clarification,
Conversely,
We have already seen that
By combining adjunctions and certain deformations,
I suggested that these results would follow from the assumption that
If the position of the trace in (99c) were only relatively \
inaccessible to movement,
However, this assumption is not correct, since
Comparing these examples with their parasitic gap counterparts in \
(96) and (97), we see that
In the discussion of resumptive pronouns following (81),
So far,
Nevertheless,
For one thing,
Summarizing, then, we assume that
A consequence of the approach just outlined is that
Presumably,
On our assumptions,
It may be, then, that
It must be emphasized, once again, that
Let us continue to suppose that
Notice, incidentally, that """
# List of LEADINs to buy time.
subjects = """ the notion of level of grammaticalness
a case of semigrammaticalness of a different sort
most of the methodological work in modern linguistics
a subset of English sentences interesting on quite independent grounds
the natural general principle that will subsume this case
an important property of these three types of EC
any associated supporting element
the appearance of parasitic gaps in domains relatively inaccessible \
to ordinary extraction
the speaker-hearer's linguistic intuition
the descriptive power of the base component
the earlier discussion of deviance
this analysis of a formative as a pair of sets of features
this selectionally introduced contextual feature
a descriptively adequate grammar
the fundamental error of regarding functional notions as categorial
relational information
the systematic use of complex symbols
the theory of syntactic features developed earlier"""
# List of SUBJECTs chosen for maximum professorial macho.
verbs = """can be defined in such a way as to impose
delimits
suffices to account for
cannot be arbitrary in
is not subject to
does not readily tolerate
raises serious doubts about
is not quite equivalent to
does not affect the structure of
may remedy and, at the same time, eliminate
is not to be considered in determining
is to be regarded as
is unspecified with respect to
is, apparently, determined by
is necessary to impose an interpretation on
appears to correlate rather closely with
is rather different from"""
# List of VERBs chosen for autorecursive obfuscation.
objects = """ problems of phonemic and morphological analysis.
a corpus of utterance tokens upon which conformity has been defined \
by the paired utterance test.
the traditional practice of grammarians.
the levels of acceptability from fairly high (e.g. (99a)) to virtual \
gibberish (e.g. (98d)).
a stipulation to place the constructions into these various categories.
a descriptive fact.
a parasitic gap construction.
the extended c-command discussed in connection with (34).
the ultimate standard that determines the accuracy of any proposed grammar.
the system of base rules exclusive of the lexicon.
irrelevant intervening contexts in selectional rules.
nondistinctness in the sense of distinctive feature theory.
a general convention regarding the forms of the grammar.
an abstract underlying order.
an important distinction in language use.
the requirement that branching is not tolerated within the dominance \
scope of a complex symbol.
the strong generative capacity of the theory."""
# List of OBJECTs selected for profound sententiousness.
import random
import textwrap
from itertools import chain, islice
def generate_chomsky(times=5, line_length=72):
parts = []
for part in (leadins, subjects, verbs, objects):
phraselist = list(map(str.strip, part.splitlines()))
random.shuffle(phraselist)
parts.append(phraselist)
output = chain.from_iterable(islice(zip(*parts), 0, times))
print(textwrap.fill(" ".join(output), line_length))
if __name__ == "__main__":
generate_chomsky()
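# Usage sketch, assuming the nltk.misc package import shown in the first file
# above. Each sentence strings together one randomly chosen leadin, subject,
# verb and object, and the result is wrapped to line_length columns, so the
# output varies from run to run:
#
#     >>> from nltk.misc import generate_chomsky
#     >>> generate_chomsky(times=2, line_length=72)   # doctest: +SKIP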

View File

@@ -0,0 +1,85 @@
# Natural Language Toolkit: Minimal Sets
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from collections import defaultdict
class MinimalSet:
"""
Find contexts where more than one possible target value can
appear. E.g. if targets are word-initial letters, and contexts
are the remainders of words, then we would like to find cases like
"fat" vs "cat", and "training" vs "draining". If targets are
parts-of-speech and contexts are words, then we would like to find
cases like wind (noun) 'air in rapid motion', vs wind (verb)
'coil, wrap'.
"""
def __init__(self, parameters=None):
"""
Create a new minimal set.
:param parameters: The (context, target, display) tuples for the item
:type parameters: list(tuple(str, str, str))
"""
self._targets = set() # the contrastive information
self._contexts = set() # what we are controlling for
self._seen = defaultdict(set) # to record what we have seen
self._displays = {} # what we will display
if parameters:
for context, target, display in parameters:
self.add(context, target, display)
def add(self, context, target, display):
"""
Add a new item to the minimal set, having the specified
context, target, and display form.
:param context: The context in which the item of interest appears
:type context: str
:param target: The item of interest
:type target: str
:param display: The information to be reported for each item
:type display: str
"""
# Store the set of targets that occurred in this context
self._seen[context].add(target)
# Keep track of which contexts and targets we have seen
self._contexts.add(context)
self._targets.add(target)
# For a given context and target, store the display form
self._displays[(context, target)] = display
def contexts(self, minimum=2):
"""
Determine which contexts occurred with enough distinct targets.
:param minimum: the minimum number of distinct target forms
:type minimum: int
:rtype: list
"""
return [c for c in self._contexts if len(self._seen[c]) >= minimum]
    def display(self, context, target, default=""):
        """
        Return the display form recorded for the given (context, target)
        pair, or ``default`` if that pair has not been seen.
        """
        if (context, target) in self._displays:
            return self._displays[(context, target)]
        else:
            return default
    def display_all(self, context):
        """
        Return the display forms of all targets that occurred in the
        given context.
        """
        result = []
        for target in self._targets:
            x = self.display(context, target)
            if x:
                result.append(x)
        return result
    def targets(self):
        """Return the set of all targets seen so far."""
        return self._targets
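# Worked example (a sketch of the usage described in the class docstring):
# targets are word-initial letters, contexts are the remainders of the words,
# and the display form is the word itself.
#
#     >>> items = ["fat", "cat", "training", "draining", "dog"]
#     >>> ms = MinimalSet([(w[1:], w[0], w) for w in items])
#     >>> sorted(ms.contexts())      # contexts seen with >= 2 distinct targets
#     ['at', 'raining']
#     >>> sorted(ms.display_all("at"))
#     ['cat', 'fat']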

View File

@@ -0,0 +1,176 @@
# Natural Language Toolkit: List Sorting
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
This module provides a variety of list sorting algorithms, to
illustrate that there are many different recipes for solving the same
problem, and to show how algorithms can be analyzed experimentally.
"""
# These algorithms are taken from:
# Levitin (2004) The Design and Analysis of Algorithms
##################################################################
# Selection Sort
##################################################################
def selection(a):
"""
Selection Sort: scan the list to find its smallest element, then
swap it with the first element. The remainder of the list is one
element smaller; apply the same method to this list, and so on.
"""
count = 0
for i in range(len(a) - 1):
min = i
for j in range(i + 1, len(a)):
if a[j] < a[min]:
min = j
count += 1
a[min], a[i] = a[i], a[min]
return count
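# A quick sanity check (a sketch; the return value is the operation count
# that demo() below reports for each algorithm):
#
#     >>> data = [29, 10, 14, 37, 13]
#     >>> count = selection(data)   # sorts in place, returns the count
#     >>> data
#     [10, 13, 14, 29, 37]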
##################################################################
# Bubble Sort
##################################################################
def bubble(a):
"""
Bubble Sort: compare adjacent elements of the list left-to-right,
and swap them if they are out of order. After one pass through
the list swapping adjacent items, the largest item will be in
the rightmost position. The remainder is one element smaller;
apply the same method to this list, and so on.
"""
count = 0
for i in range(len(a) - 1):
for j in range(len(a) - i - 1):
if a[j + 1] < a[j]:
a[j], a[j + 1] = a[j + 1], a[j]
count += 1
return count
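# A small illustration of the pass described above: one sweep of the inner
# loop over [3, 1, 4, 2] yields [1, 3, 2, 4], so the largest element ends up
# in the rightmost position; each later sweep covers a prefix that is one
# element shorter.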
##################################################################
# Merge Sort
##################################################################
def _merge_lists(b, c):
count = 0
i = j = 0
a = []
while i < len(b) and j < len(c):
count += 1
if b[i] <= c[j]:
a.append(b[i])
i += 1
else:
a.append(c[j])
j += 1
if i == len(b):
a += c[j:]
else:
a += b[i:]
return a, count
def merge(a):
"""
Merge Sort: split the list in half, and sort each half, then
combine the sorted halves.
"""
count = 0
if len(a) > 1:
midpoint = len(a) // 2
b = a[:midpoint]
c = a[midpoint:]
count_b = merge(b)
count_c = merge(c)
result, count_a = _merge_lists(b, c)
a[:] = result # copy the result back into a.
count = count_a + count_b + count_c
return count
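# The combining step is done by _merge_lists() above: given two sorted lists,
# it repeatedly moves the smaller front element across, counting one
# comparison per element moved, until one list is exhausted and the rest is
# appended without further comparisons. For example,
# _merge_lists([1, 4], [2, 3]) returns ([1, 2, 3, 4], 3).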
##################################################################
# Quick Sort
##################################################################
def _partition(a, l, r):
p = a[l]
i = l
j = r + 1
count = 0
while True:
while i < r:
i += 1
if a[i] >= p:
break
while j > l:
j -= 1
if j < l or a[j] <= p:
break
a[i], a[j] = a[j], a[i] # swap
count += 1
if i >= j:
break
a[i], a[j] = a[j], a[i] # undo last swap
a[l], a[j] = a[j], a[l]
return j, count
def _quick(a, l, r):
count = 0
if l < r:
s, count = _partition(a, l, r)
count += _quick(a, l, s - 1)
count += _quick(a, s + 1, r)
return count
def quick(a):
    """Quick Sort: partition around the first element, then sort each part."""
    return _quick(a, 0, len(a) - 1)
##################################################################
# Demonstration
##################################################################
def demo():
from random import shuffle
for size in (10, 20, 50, 100, 200, 500, 1000):
a = list(range(size))
# various sort methods
shuffle(a)
count_selection = selection(a)
shuffle(a)
count_bubble = bubble(a)
shuffle(a)
count_merge = merge(a)
shuffle(a)
count_quick = quick(a)
print(
("size=%5d: selection=%8d, bubble=%8d, " "merge=%6d, quick=%6d")
% (size, count_selection, count_bubble, count_merge, count_quick)
)
if __name__ == "__main__":
demo()

View File

@@ -0,0 +1,139 @@
# Natural Language Toolkit: Word Finder
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# Simplified from PHP version by Robert Klein <brathna@gmail.com>
# http://fswordfinder.sourceforge.net/
import random
# reverse a word with probability 0.5
def revword(word):
if random.randint(1, 2) == 1:
return word[::-1]
return word
# try to insert word at position x,y; direction encoded in xf,yf
def step(word, x, xf, y, yf, grid):
for i in range(len(word)):
if grid[xf(i)][yf(i)] != "" and grid[xf(i)][yf(i)] != word[i]:
return False
for i in range(len(word)):
grid[xf(i)][yf(i)] = word[i]
return True
# try to insert word at position x,y, in direction dir
# (1 = up-left diagonal, 2 = up, 3 = up-right diagonal, 4 = left;
#  x is the row index, y is the column index)
def check(word, dir, x, y, grid, rows, cols):
if dir == 1:
if x - len(word) < 0 or y - len(word) < 0:
return False
return step(word, x, lambda i: x - i, y, lambda i: y - i, grid)
elif dir == 2:
if x - len(word) < 0:
return False
return step(word, x, lambda i: x - i, y, lambda i: y, grid)
elif dir == 3:
if x - len(word) < 0 or y + (len(word) - 1) >= cols:
return False
return step(word, x, lambda i: x - i, y, lambda i: y + i, grid)
elif dir == 4:
if y - len(word) < 0:
return False
return step(word, x, lambda i: x, y, lambda i: y - i, grid)
def wordfinder(words, rows=20, cols=20, attempts=50, alph="ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
"""
Attempt to arrange words into a letter-grid with the specified
number of rows and columns. Try each word in several positions
and directions, until it can be fitted into the grid, or the
maximum number of allowable attempts is exceeded. Returns a tuple
consisting of the grid and the words that were successfully
placed.
:param words: the list of words to be put into the grid
:type words: list
:param rows: the number of rows in the grid
:type rows: int
:param cols: the number of columns in the grid
:type cols: int
:param attempts: the number of times to attempt placing a word
:type attempts: int
:param alph: the alphabet, to be used for filling blank cells
:type alph: str
:rtype: tuple
"""
# place longer words first
words = sorted(words, key=len, reverse=True)
grid = [] # the letter grid
used = [] # the words we used
# initialize the grid
for i in range(rows):
grid.append([""] * cols)
# try to place each word
for word in words:
word = word.strip().upper() # normalize
save = word # keep a record of the word
word = revword(word)
for attempt in range(attempts):
r = random.randint(0, len(word))
dir = random.choice([1, 2, 3, 4])
x = random.randint(0, rows)
y = random.randint(0, cols)
if dir == 1:
x += r
y += r
elif dir == 2:
x += r
elif dir == 3:
x += r
y -= r
elif dir == 4:
y += r
if 0 <= x < rows and 0 <= y < cols:
if check(word, dir, x, y, grid, rows, cols):
# used.append((save, dir, x, y, word))
used.append(save)
break
# Fill up the remaining spaces
for i in range(rows):
for j in range(cols):
if grid[i][j] == "":
grid[i][j] = random.choice(alph)
return grid, used
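# Usage sketch (a small grid; placement is random, so only the shapes and
# membership below are deterministic):
#
#     >>> grid, used = wordfinder(["cat", "dog", "bird"], rows=5, cols=5)
#     >>> len(grid), len(grid[0])
#     (5, 5)
#     >>> set(used) <= {"CAT", "DOG", "BIRD"}
#     True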
def word_finder():
from nltk.corpus import words
wordlist = words.words()
random.shuffle(wordlist)
wordlist = wordlist[:200]
wordlist = [w for w in wordlist if 3 <= len(w) <= 12]
grid, used = wordfinder(wordlist)
print("Word Finder\n")
for i in range(len(grid)):
for j in range(len(grid[i])):
print(grid[i][j], end=" ")
print()
print()
for i in range(len(used)):
print("%d:" % (i + 1), used[i])
if __name__ == "__main__":
word_finder()