updates
235
Backend/venv/lib/python3.12/site-packages/nltk/lm/__init__.py
Normal file
@@ -0,0 +1,235 @@
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
NLTK Language Modeling Module.
------------------------------

Currently this module covers only ngram language models, but it should be easy
to extend to neural models.


Preparing Data
==============

Before we train our ngram models it is necessary to make sure the data we put in
them is in the right format.
Let's say we have a text that is a list of sentences, where each sentence is
a list of strings. For simplicity we just consider a text consisting of
characters instead of words.

>>> text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]

If we want to train a bigram model, we need to turn this text into bigrams.
Here's what the first sentence of our text would look like if we use a function
from NLTK for this.

>>> from nltk.util import bigrams
>>> list(bigrams(text[0]))
[('a', 'b'), ('b', 'c')]

Notice how "b" occurs both as the first and second member of different bigrams
but "a" and "c" don't? Wouldn't it be nice to somehow indicate how often sentences
start with "a" and end with "c"?
A standard way to deal with this is to add special "padding" symbols to the
sentence before splitting it into ngrams.
Fortunately, NLTK also has a function for that, let's see what it does to the
first sentence.

>>> from nltk.util import pad_sequence
>>> list(pad_sequence(text[0],
... pad_left=True,
... left_pad_symbol="<s>",
... pad_right=True,
... right_pad_symbol="</s>",
... n=2))
['<s>', 'a', 'b', 'c', '</s>']

Note the `n` argument, that tells the function we need padding for bigrams.
Now, passing all these parameters every time is tedious and in most cases they
can be safely assumed as defaults anyway.
Thus our module provides a convenience function that has all these arguments
already set while the other arguments remain the same as for `pad_sequence`.

>>> from nltk.lm.preprocessing import pad_both_ends
>>> list(pad_both_ends(text[0], n=2))
['<s>', 'a', 'b', 'c', '</s>']

Combining the two parts discussed so far we get the following preparation steps
for one sentence.

>>> list(bigrams(pad_both_ends(text[0], n=2)))
[('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

To make our model more robust we could also train it on unigrams (single words)
as well as bigrams, its main source of information.
NLTK once again helpfully provides a function called `everygrams`.
While not the most efficient, it is conceptually simple.


>>> from nltk.util import everygrams
>>> padded_bigrams = list(pad_both_ends(text[0], n=2))
>>> list(everygrams(padded_bigrams, max_len=2))
[('<s>',), ('<s>', 'a'), ('a',), ('a', 'b'), ('b',), ('b', 'c'), ('c',), ('c', '</s>'), ('</s>',)]

We are almost ready to start counting ngrams, just one more step left.
During training and evaluation our model will rely on a vocabulary that
defines which words are "known" to the model.
To create this vocabulary we need to pad our sentences (just like for counting
ngrams) and then combine the sentences into one flat stream of words.

>>> from nltk.lm.preprocessing import flatten
>>> list(flatten(pad_both_ends(sent, n=2) for sent in text))
['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

In most cases we want to use the same text as the source for both vocabulary
and ngram counts.
Now that we understand what this means for our preprocessing, we can simply import
a function that does everything for us.

>>> from nltk.lm.preprocessing import padded_everygram_pipeline
>>> train, vocab = padded_everygram_pipeline(2, text)

So as to avoid re-creating the text in memory, both `train` and `vocab` are lazy
iterators. They are evaluated on demand at training time.
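
Being lazy, they are also exhausted once consumed, so if you need a second pass
over the data simply call `padded_everygram_pipeline` again. A small
illustration (`train2` and `vocab2` are throwaway names used only here):

>>> train2, vocab2 = padded_everygram_pipeline(2, text)
>>> consumed = [list(sent_ngrams) for sent_ngrams in train2]
>>> [list(sent_ngrams) for sent_ngrams in train2]
[]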


Training
========
Having prepared our data we are ready to start training a model.
As a simple example, let us train a Maximum Likelihood Estimator (MLE).
We only need to specify the highest ngram order to instantiate it.

>>> from nltk.lm import MLE
>>> lm = MLE(2)

This automatically creates an empty vocabulary...

>>> len(lm.vocab)
0

... which gets filled as we fit the model.

>>> lm.fit(train, vocab)
>>> print(lm.vocab)
<Vocabulary with cutoff=1 unk_label='<UNK>' and 9 items>
>>> len(lm.vocab)
9

The vocabulary helps us handle words that have not occurred during training.

>>> lm.vocab.lookup(text[0])
('a', 'b', 'c')
>>> lm.vocab.lookup(["aliens", "from", "Mars"])
('<UNK>', '<UNK>', '<UNK>')

Moreover, in some cases we want to ignore words that we did see during training
but that didn't occur frequently enough, to provide us useful information.
You can tell the vocabulary to ignore such words.
To find out how that works, check out the docs for the `Vocabulary` class.
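
As a quick, illustrative peek ahead (see the `Vocabulary` docs for the full
story), words whose counts fall below the `unk_cutoff` threshold are simply
treated as unknown:

>>> from nltk.lm import Vocabulary
>>> vocab_cut = Vocabulary(["a", "a", "b"], unk_cutoff=2)
>>> "a" in vocab_cut
True
>>> "b" in vocab_cut
False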


Using a Trained Model
=====================
When it comes to ngram models the training boils down to counting up the ngrams
from the training corpus.

>>> print(lm.counts)
<NgramCounter with 2 ngram orders and 24 ngrams>

This provides a convenient interface to access counts for unigrams...

>>> lm.counts['a']
2

...and bigrams (in this case "a b")

>>> lm.counts[['a']]['b']
1

And so on. However, the real purpose of training a language model is to have it
score how probable words are in certain contexts.
This being MLE, the model returns the item's relative frequency as its score.

>>> lm.score("a")
0.15384615384615385

Items that are not seen during training are mapped to the vocabulary's
"unknown label" token. This is "<UNK>" by default.

>>> lm.score("<UNK>") == lm.score("aliens")
True

Here's how you get the score for a word given some preceding context.
For example we want to know what is the chance that "b" is preceded by "a".

>>> lm.score("b", ["a"])
0.5

To avoid underflow when working with many small score values it makes sense to
take their logarithm.
For convenience this can be done with the `logscore` method.

>>> lm.logscore("a")
-2.700439718141092

Building on this method, we can also evaluate our model's cross-entropy and
perplexity with respect to sequences of ngrams.

>>> test = [('a', 'b'), ('c', 'd')]
>>> lm.entropy(test)
1.292481250360578
>>> lm.perplexity(test)
2.449489742783178

It is advisable to preprocess your test text exactly the same way as you did
the training text.
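
For example, a held-out sentence should be padded and split into bigrams with
the very same helpers used above before being handed to `entropy` or
`perplexity` (the sentence here is made up purely for illustration):

>>> test_sent = ['a', 'c']
>>> list(bigrams(pad_both_ends(test_sent, n=2)))
[('<s>', 'a'), ('a', 'c'), ('c', '</s>')]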

One cool feature of ngram models is that they can be used to generate text.

>>> lm.generate(1, random_seed=3)
'<s>'
>>> lm.generate(5, random_seed=3)
['<s>', 'a', 'b', 'c', 'd']

Provide `random_seed` if you want to consistently reproduce the same text all
other things being equal. Here we are using it to test the examples.

You can also condition your generation on some preceding text with the
`text_seed` argument.

>>> lm.generate(5, text_seed=['c'], random_seed=3)
['</s>', 'c', 'd', 'c', 'd']

Note that an ngram model is restricted in how much preceding context it can
take into account. For example, a trigram model can only condition its output
on 2 preceding words. If you pass in a 4-word context, the first two words
will be ignored.
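
With our bigram `lm` only the last word of the seed is used, so these two calls
behave identically (seeded for reproducibility; a small illustration):

>>> lm.generate(1, text_seed=['c', 'a'], random_seed=3)
'b'
>>> lm.generate(1, text_seed=['a'], random_seed=3)
'b'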
"""

from nltk.lm.counter import NgramCounter
from nltk.lm.models import (
    MLE,
    AbsoluteDiscountingInterpolated,
    KneserNeyInterpolated,
    Laplace,
    Lidstone,
    StupidBackoff,
    WittenBellInterpolated,
)
from nltk.lm.vocabulary import Vocabulary

__all__ = [
    "Vocabulary",
    "NgramCounter",
    "MLE",
    "Lidstone",
    "Laplace",
    "WittenBellInterpolated",
    "KneserNeyInterpolated",
    "AbsoluteDiscountingInterpolated",
    "StupidBackoff",
]
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
238
Backend/venv/lib/python3.12/site-packages/nltk/lm/api.py
Normal file
@@ -0,0 +1,238 @@
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2025 NLTK Project
# Authors: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Language Model Interface."""

import random
import warnings
from abc import ABCMeta, abstractmethod
from bisect import bisect
from itertools import accumulate

from nltk.lm.counter import NgramCounter
from nltk.lm.util import log_base2
from nltk.lm.vocabulary import Vocabulary


class Smoothing(metaclass=ABCMeta):
    """Ngram Smoothing Interface

    Implements Chen & Goodman 1995's idea that all smoothing algorithms have
    certain features in common. This should ideally allow smoothing algorithms to
    work both with Backoff and Interpolation.
    """

    def __init__(self, vocabulary, counter):
        """
        :param vocabulary: The Ngram vocabulary object.
        :type vocabulary: nltk.lm.vocab.Vocabulary
        :param counter: The counts of the vocabulary items.
        :type counter: nltk.lm.counter.NgramCounter
        """
        self.vocab = vocabulary
        self.counts = counter

    @abstractmethod
    def unigram_score(self, word):
        raise NotImplementedError()

    @abstractmethod
    def alpha_gamma(self, word, context):
        raise NotImplementedError()


def _mean(items):
    """Return average (aka mean) for sequence of items."""
    return sum(items) / len(items)


def _random_generator(seed_or_generator):
    if isinstance(seed_or_generator, random.Random):
        return seed_or_generator
    return random.Random(seed_or_generator)


def _weighted_choice(population, weights, random_generator=None):
    """Like random.choice, but with weights.

    Heavily inspired by python 3.6 `random.choices`.
    """
    if not population:
        raise ValueError("Can't choose from empty population")
    if len(population) != len(weights):
        raise ValueError("The number of weights does not match the population")
    cum_weights = list(accumulate(weights))
    total = cum_weights[-1]
    threshold = random_generator.random()
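    # `cum_weights` is non-decreasing, so `bisect` returns the first index whose
    # cumulative weight exceeds `total * threshold`, i.e. each item is drawn
    # with probability proportional to its weight.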
    return population[bisect(cum_weights, total * threshold)]


class LanguageModel(metaclass=ABCMeta):
    """ABC for Language Models.

    Cannot be directly instantiated itself.

    """

    def __init__(self, order, vocabulary=None, counter=None):
        """Creates new LanguageModel.

        :param vocabulary: If provided, this vocabulary will be used instead
            of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type counter: `nltk.lm.NgramCounter` or None
        :param ngrams_fn: If given, defines how sentences in training text are turned to ngram
            sequences.
        :type ngrams_fn: function or None
        :param pad_fn: If given, defines how sentences in training text are padded.
        :type pad_fn: function or None
        """
        self.order = order
        if vocabulary and not isinstance(vocabulary, Vocabulary):
            warnings.warn(
                f"The `vocabulary` argument passed to {self.__class__.__name__!r} "
                "must be an instance of `nltk.lm.Vocabulary`.",
                stacklevel=3,
            )
        self.vocab = Vocabulary() if vocabulary is None else vocabulary
        self.counts = NgramCounter() if counter is None else counter

    def fit(self, text, vocabulary_text=None):
        """Trains the model on a text.

        :param text: Training text as a sequence of sentences.

        """
        if not self.vocab:
            if vocabulary_text is None:
                raise ValueError(
                    "Cannot fit without a vocabulary or text to create it from."
                )
            self.vocab.update(vocabulary_text)
        self.counts.update(self.vocab.lookup(sent) for sent in text)

    def score(self, word, context=None):
        """Masks out of vocab (OOV) words and computes their model score.

        For model-specific logic of calculating scores, see the `unmasked_score`
        method.
        """
        return self.unmasked_score(
            self.vocab.lookup(word), self.vocab.lookup(context) if context else None
        )

    @abstractmethod
    def unmasked_score(self, word, context=None):
        """Score a word given some optional context.

        Concrete models are expected to provide an implementation.
        Note that this method does not mask its arguments with the OOV label.
        Use the `score` method for that.

        :param str word: Word for which we want the score
        :param tuple(str) context: Context the word is in.
            If `None`, compute unigram score.
        :param context: tuple(str) or None
        :rtype: float
        """
        raise NotImplementedError()

    def logscore(self, word, context=None):
        """Evaluate the log score of this word in this context.

        The arguments are the same as for `score` and `unmasked_score`.

        """
        return log_base2(self.score(word, context))

    def context_counts(self, context):
        """Helper method for retrieving counts for a given context.

        Assumes context has been checked and oov words in it masked.
        :type context: tuple(str) or None

        """
        return (
            self.counts[len(context) + 1][context] if context else self.counts.unigrams
        )

    def entropy(self, text_ngrams):
        """Calculate cross-entropy of model for given evaluation text.

        This implementation is based on the Shannon-McMillan-Breiman theorem,
        as used and referenced by Dan Jurafsky and Jordan Boyd-Graber.

        :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.
        :rtype: float

        """
        return -1 * _mean(
            [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams]
        )

    def perplexity(self, text_ngrams):
        """Calculates the perplexity of the given text.

        This is simply 2 ** cross-entropy for the text, so the arguments are the same.

        """
        return pow(2.0, self.entropy(text_ngrams))
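
    # Concretely, for evaluation ngrams (c_1, w_1), ..., (c_N, w_N) the two
    # methods above compute
    #     H = -(1 / N) * sum_i log2(P(w_i | c_i))
    #     perplexity = 2 ** H
    # so a lower perplexity means the model finds the text less surprising.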

    def generate(self, num_words=1, text_seed=None, random_seed=None):
        """Generate words from the model.

        :param int num_words: How many words to generate. By default 1.
        :param text_seed: Generation can be conditioned on preceding context.
        :param random_seed: A random seed or an instance of `random.Random`. If provided,
            makes the random sampling part of generation reproducible.
        :return: One (str) word or a list of words generated from model.

        Examples:

        >>> from nltk.lm import MLE
        >>> lm = MLE(2)
        >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c'])
        >>> lm.fit([[("a",), ("b",), ("c",)]])
        >>> lm.generate(random_seed=3)
        'a'
        >>> lm.generate(text_seed=['a'])
        'b'

        """
        text_seed = [] if text_seed is None else list(text_seed)
        random_generator = _random_generator(random_seed)
        # This is the base recursion case.
        if num_words == 1:
            context = (
                text_seed[-self.order + 1 :]
                if len(text_seed) >= self.order
                else text_seed
            )
            samples = self.context_counts(self.vocab.lookup(context))
            while context and not samples:
                context = context[1:] if len(context) > 1 else []
                samples = self.context_counts(self.vocab.lookup(context))
            # Sorting samples achieves two things:
            # - reproducible randomness when sampling
            # - turns Mapping into Sequence which `_weighted_choice` expects
            samples = sorted(samples)
            return _weighted_choice(
                samples,
                tuple(self.score(w, context) for w in samples),
                random_generator,
            )
        # We build up text one word at a time using the preceding context.
        generated = []
        for _ in range(num_words):
            generated.append(
                self.generate(
                    num_words=1,
                    text_seed=text_seed + generated,
                    random_seed=random_generator,
                )
            )
        return generated
163
Backend/venv/lib/python3.12/site-packages/nltk/lm/counter.py
Normal file
@@ -0,0 +1,163 @@
# Natural Language Toolkit
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Language Model Counter
----------------------
"""

from collections import defaultdict
from collections.abc import Sequence

from nltk.probability import ConditionalFreqDist, FreqDist


class NgramCounter:
    """Class for counting ngrams.

    Will count any ngram sequence you give it ;)

    First we need to make sure we are feeding the counter sentences of ngrams.

    >>> text = [["a", "b", "c", "d"], ["a", "c", "d", "c"]]
    >>> from nltk.util import ngrams
    >>> text_bigrams = [ngrams(sent, 2) for sent in text]
    >>> text_unigrams = [ngrams(sent, 1) for sent in text]

    The counting itself is very simple.

    >>> from nltk.lm import NgramCounter
    >>> ngram_counts = NgramCounter(text_bigrams + text_unigrams)

    You can conveniently access ngram counts using standard python dictionary notation.
    String keys will give you unigram counts.

    >>> ngram_counts['a']
    2
    >>> ngram_counts['aliens']
    0

    If you want to access counts for higher order ngrams, use a list or a tuple.
    These are treated as "context" keys, so what you get is a frequency distribution
    over all continuations after the given context.

    >>> sorted(ngram_counts[['a']].items())
    [('b', 1), ('c', 1)]
    >>> sorted(ngram_counts[('a',)].items())
    [('b', 1), ('c', 1)]

    This is equivalent to specifying explicitly the order of the ngram (in this case
    2 for bigram) and indexing on the context.

    >>> ngram_counts[2][('a',)] is ngram_counts[['a']]
    True

    Note that the keys in `ConditionalFreqDist` cannot be lists, only tuples!
    It is generally advisable to use the less verbose and more flexible square
    bracket notation.

    To get the count of the full ngram "a b", do this:

    >>> ngram_counts[['a']]['b']
    1

    Specifying the ngram order as a number can be useful for accessing all ngrams
    in that order.

    >>> ngram_counts[2]
    <ConditionalFreqDist with 4 conditions>

    The keys of this `ConditionalFreqDist` are the contexts we discussed earlier.
    Unigrams can also be accessed with a human-friendly alias.

    >>> ngram_counts.unigrams is ngram_counts[1]
    True

    Similarly to `collections.Counter`, you can update counts after initialization.

    >>> ngram_counts['e']
    0
    >>> ngram_counts.update([ngrams(["d", "e", "f"], 1)])
    >>> ngram_counts['e']
    1

    """

    def __init__(self, ngram_text=None):
        """Creates a new NgramCounter.

        If `ngram_text` is specified, counts ngrams from it, otherwise waits for
        `update` method to be called explicitly.

        :param ngram_text: Optional text containing sentences of ngrams, as for `update` method.
        :type ngram_text: Iterable(Iterable(tuple(str))) or None

        """
        self._counts = defaultdict(ConditionalFreqDist)
        self._counts[1] = self.unigrams = FreqDist()

        if ngram_text:
            self.update(ngram_text)

    def update(self, ngram_text):
        """Updates ngram counts from `ngram_text`.

        Expects `ngram_text` to be a sequence of sentences (sequences).
        Each sentence consists of ngrams as tuples of strings.

        :param Iterable(Iterable(tuple(str))) ngram_text: Text containing sentences of ngrams.
        :raises TypeError: if the ngrams are not tuples.

        """

        for sent in ngram_text:
            for ngram in sent:
                if not isinstance(ngram, tuple):
                    raise TypeError(
                        "Ngram <{}> isn't a tuple, " "but {}".format(ngram, type(ngram))
                    )

                ngram_order = len(ngram)
                if ngram_order == 1:
                    self.unigrams[ngram[0]] += 1
                    continue

                context, word = ngram[:-1], ngram[-1]
                self[ngram_order][context][word] += 1

    def N(self):
        """Returns grand total number of ngrams stored.

        This includes ngrams from all orders, so some duplication is expected.
        :rtype: int

        >>> from nltk.lm import NgramCounter
        >>> counts = NgramCounter([[("a", "b"), ("c",), ("d", "e")]])
        >>> counts.N()
        3

        """
        return sum(val.N() for val in self._counts.values())

    def __getitem__(self, item):
        """User-friendly access to ngram counts."""
        if isinstance(item, int):
            return self._counts[item]
        elif isinstance(item, str):
            return self._counts.__getitem__(1)[item]
        elif isinstance(item, Sequence):
            return self._counts.__getitem__(len(item) + 1)[tuple(item)]

    def __str__(self):
        return "<{} with {} ngram orders and {} ngrams>".format(
            self.__class__.__name__, len(self._counts), self.N()
        )

    def __len__(self):
        return self._counts.__len__()

    def __contains__(self, item):
        return item in self._counts
141
Backend/venv/lib/python3.12/site-packages/nltk/lm/models.py
Normal file
@@ -0,0 +1,141 @@
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
#         Manu Joseph <manujosephv@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Language Models"""

from nltk.lm.api import LanguageModel, Smoothing
from nltk.lm.smoothing import AbsoluteDiscounting, KneserNey, WittenBell


class MLE(LanguageModel):
    """Class for providing MLE ngram model scores.

    Inherits initialization from BaseNgramModel.
    """

    def unmasked_score(self, word, context=None):
        """Returns the MLE score for a word given a context.

        Args:
        - word is expected to be a string
        - context is expected to be something reasonably convertible to a tuple
        """
        return self.context_counts(context).freq(word)


class Lidstone(LanguageModel):
    """Provides Lidstone-smoothed scores.

    In addition to initialization arguments from BaseNgramModel also requires
    a number by which to increase the counts, gamma.
    """

    def __init__(self, gamma, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.gamma = gamma

    def unmasked_score(self, word, context=None):
        """Add-one smoothing: Lidstone or Laplace.

        To see what kind, look at `gamma` attribute on the class.

        """
        counts = self.context_counts(context)
        word_count = counts[word]
        norm_count = counts.N()
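        # This is the add-gamma estimate
        #     P(w | context) = (count(context, w) + gamma)
        #                      / (count(context) + gamma * |V|)
        # which reduces to Laplace (add-one) smoothing when gamma == 1.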
        return (word_count + self.gamma) / (norm_count + len(self.vocab) * self.gamma)


class Laplace(Lidstone):
    """Implements Laplace (add one) smoothing.

    Initialization identical to BaseNgramModel because gamma is always 1.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(1, *args, **kwargs)


class StupidBackoff(LanguageModel):
    """Provides StupidBackoff scores.

    In addition to initialization arguments from BaseNgramModel also requires
    a parameter alpha with which we scale the lower order probabilities.
    Note that this is not a true probability distribution as scores for ngrams
    of the same order do not sum up to unity.
    """

    def __init__(self, alpha=0.4, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha

    def unmasked_score(self, word, context=None):
        if not context:
            # Base recursion
            return self.counts.unigrams.freq(word)
        counts = self.context_counts(context)
        word_count = counts[word]
        norm_count = counts.N()
        if word_count > 0:
            return word_count / norm_count
        else:
            return self.alpha * self.unmasked_score(word, context[1:])
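
# A rough worked example of the backoff above (an illustrative sketch with
# made-up data; names like `sb` are hypothetical):
#
#     from nltk.lm import StupidBackoff
#     from nltk.lm.preprocessing import padded_everygram_pipeline
#     train, vocab = padded_everygram_pipeline(2, [["a", "b", "c"]])
#     sb = StupidBackoff(order=2)
#     sb.fit(train, vocab)
#     sb.score("c", ["a"])
#
# The bigram ("a", "c") was never seen, so the score backs off to
# alpha * unigram_freq("c") = 0.4 * (1 / 5) = 0.08.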


class InterpolatedLanguageModel(LanguageModel):
    """Logic common to all interpolated language models.

    The idea to abstract this comes from Chen & Goodman 1995.
    Do not instantiate this class directly!
    """

    def __init__(self, smoothing_cls, order, **kwargs):
        params = kwargs.pop("params", {})
        super().__init__(order, **kwargs)
        self.estimator = smoothing_cls(self.vocab, self.counts, **params)

    def unmasked_score(self, word, context=None):
        if not context:
            # The base recursion case: no context, we only have a unigram.
            return self.estimator.unigram_score(word)
        if not self.counts[context]:
            # It can also happen that we have no data for this context.
            # In that case we defer to the lower-order ngram.
            # This is the same as setting alpha to 0 and gamma to 1.
            alpha, gamma = 0, 1
        else:
            alpha, gamma = self.estimator.alpha_gamma(word, context)
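        # The recursion below implements
        #     P(w | context) = alpha(w, context)
        #                      + gamma(context) * P(w | context[1:])
        # and bottoms out at the estimator's unigram score.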
        return alpha + gamma * self.unmasked_score(word, context[1:])


class WittenBellInterpolated(InterpolatedLanguageModel):
    """Interpolated version of Witten-Bell smoothing."""

    def __init__(self, order, **kwargs):
        super().__init__(WittenBell, order, **kwargs)


class AbsoluteDiscountingInterpolated(InterpolatedLanguageModel):
    """Interpolated version of smoothing with absolute discount."""

    def __init__(self, order, discount=0.75, **kwargs):
        super().__init__(
            AbsoluteDiscounting, order, params={"discount": discount}, **kwargs
        )


class KneserNeyInterpolated(InterpolatedLanguageModel):
    """Interpolated version of Kneser-Ney smoothing."""

    def __init__(self, order, discount=0.1, **kwargs):
        if not (0 <= discount <= 1):
            raise ValueError(
                "Discount must be between 0 and 1 for probabilities to sum to unity."
            )
        super().__init__(
            KneserNey, order, params={"discount": discount, "order": order}, **kwargs
        )
51
Backend/venv/lib/python3.12/site-packages/nltk/lm/preprocessing.py
Normal file
@@ -0,0 +1,51 @@
# Natural Language Toolkit: Language Model Unit Tests
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from functools import partial
from itertools import chain

from nltk.util import everygrams, pad_sequence

flatten = chain.from_iterable
pad_both_ends = partial(
    pad_sequence,
    pad_left=True,
    left_pad_symbol="<s>",
    pad_right=True,
    right_pad_symbol="</s>",
)
pad_both_ends.__doc__ = """Pads both ends of a sentence to length specified by ngram order.

Following convention <s> pads the start of sentence </s> pads its end.
"""


def padded_everygrams(order, sentence):
    """Helper with some useful defaults.

    Applies pad_both_ends to sentence and follows it up with everygrams.
    """
    return everygrams(list(pad_both_ends(sentence, n=order)), max_len=order)


def padded_everygram_pipeline(order, text):
    """Default preprocessing for a sequence of sentences.

    Creates two iterators:

    - sentences padded and turned into sequences of `nltk.util.everygrams`
    - sentences padded as above and chained together for a flat stream of words

    :param order: Largest ngram length produced by `everygrams`.
    :param text: Text to iterate over. Expected to be an iterable of sentences.
    :type text: Iterable[Iterable[str]]
    :return: iterator over text as ngrams, iterator over text as vocabulary data
    """
    padding_fn = partial(pad_both_ends, n=order)
    return (
        (everygrams(list(padding_fn(sent)), max_len=order) for sent in text),
        flatten(map(padding_fn, text)),
    )
127
Backend/venv/lib/python3.12/site-packages/nltk/lm/smoothing.py
Normal file
@@ -0,0 +1,127 @@
# Natural Language Toolkit: Language Model Unit Tests
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
#         Manu Joseph <manujosephv@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Smoothing algorithms for language modeling.

According to Chen & Goodman 1995 these should work with both Backoff and
Interpolation.
"""
from operator import methodcaller

from nltk.lm.api import Smoothing
from nltk.probability import ConditionalFreqDist


def _count_values_gt_zero(distribution):
    """Count values that are greater than zero in a distribution.

    Assumes distribution is either a mapping with counts as values or
    an instance of `nltk.ConditionalFreqDist`.
    """
    as_count = (
        methodcaller("N")
        if isinstance(distribution, ConditionalFreqDist)
        else lambda count: count
    )
    # We explicitly check that values are > 0 to guard against negative counts.
    return sum(
        1 for dist_or_count in distribution.values() if as_count(dist_or_count) > 0
    )


class WittenBell(Smoothing):
    """Witten-Bell smoothing."""

    def __init__(self, vocabulary, counter, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)

    def alpha_gamma(self, word, context):
        alpha = self.counts[context].freq(word)
        gamma = self._gamma(context)
        return (1.0 - gamma) * alpha, gamma

    def _gamma(self, context):
        n_plus = _count_values_gt_zero(self.counts[context])
        return n_plus / (n_plus + self.counts[context].N())

    def unigram_score(self, word):
        return self.counts.unigrams.freq(word)


class AbsoluteDiscounting(Smoothing):
    """Smoothing with absolute discount."""

    def __init__(self, vocabulary, counter, discount=0.75, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)
        self.discount = discount

    def alpha_gamma(self, word, context):
        alpha = (
            max(self.counts[context][word] - self.discount, 0)
            / self.counts[context].N()
        )
        gamma = self._gamma(context)
        return alpha, gamma

    def _gamma(self, context):
        n_plus = _count_values_gt_zero(self.counts[context])
        return (self.discount * n_plus) / self.counts[context].N()

    def unigram_score(self, word):
        return self.counts.unigrams.freq(word)


class KneserNey(Smoothing):
    """Kneser-Ney Smoothing.

    This is an extension of smoothing with a discount.

    Resources:
    - https://pages.ucsd.edu/~rlevy/lign256/winter2008/kneser_ney_mini_example.pdf
    - https://www.youtube.com/watch?v=ody1ysUTD7o
    - https://medium.com/@dennyc/a-simple-numerical-example-for-kneser-ney-smoothing-nlp-4600addf38b8
    - https://www.cl.uni-heidelberg.de/courses/ss15/smt/scribe6.pdf
    - https://www-i6.informatik.rwth-aachen.de/publications/download/951/Kneser-ICASSP-1995.pdf
    """

    def __init__(self, vocabulary, counter, order, discount=0.1, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)
        self.discount = discount
        self._order = order

    def unigram_score(self, word):
        word_continuation_count, total_count = self._continuation_counts(word)
        return word_continuation_count / total_count

    def alpha_gamma(self, word, context):
        prefix_counts = self.counts[context]
        word_continuation_count, total_count = (
            (prefix_counts[word], prefix_counts.N())
            if len(context) + 1 == self._order
            else self._continuation_counts(word, context)
        )
        alpha = max(word_continuation_count - self.discount, 0.0) / total_count
        gamma = self.discount * _count_values_gt_zero(prefix_counts) / total_count
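        # Combined by `InterpolatedLanguageModel.unmasked_score` as
        #     alpha + gamma * P_lower(w | context[1:])
        # this gives the usual Kneser-Ney estimate: counts are discounted by
        # `discount`, gamma redistributes the discounted mass over the distinct
        # continuations of the context, and below the highest order
        # continuation counts replace raw counts.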
        return alpha, gamma

    def _continuation_counts(self, word, context=tuple()):
        """Count continuations that end with context and word.

        Continuations track unique ngram "types", regardless of how many
        instances were observed for each "type".
        This is different than raw ngram counts which track number of instances.
        """
        higher_order_ngrams_with_context = (
            counts
            for prefix_ngram, counts in self.counts[len(context) + 2].items()
            if prefix_ngram[1:] == context
        )
        higher_order_ngrams_with_word_count, total = 0, 0
        for counts in higher_order_ngrams_with_context:
            higher_order_ngrams_with_word_count += int(counts[word] > 0)
            total += _count_values_gt_zero(counts)
        return higher_order_ngrams_with_word_count, total
19
Backend/venv/lib/python3.12/site-packages/nltk/lm/util.py
Normal file
@@ -0,0 +1,19 @@
# Natural Language Toolkit
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Language Model Utilities"""

from math import log

NEG_INF = float("-inf")
POS_INF = float("inf")


def log_base2(score):
    """Convenience function for computing logarithms with base 2."""
    if score == 0.0:
        return NEG_INF
    return log(score, 2)
218
Backend/venv/lib/python3.12/site-packages/nltk/lm/vocabulary.py
Normal file
@@ -0,0 +1,218 @@
# Natural Language Toolkit
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Language Model Vocabulary"""

import sys
from collections import Counter
from collections.abc import Iterable
from functools import singledispatch
from itertools import chain


@singledispatch
def _dispatched_lookup(words, vocab):
    raise TypeError(f"Unsupported type for looking up in vocabulary: {type(words)}")


@_dispatched_lookup.register(Iterable)
def _(words, vocab):
    """Look up a sequence of words in the vocabulary.

    Returns an iterator over looked up words.

    """
    return tuple(_dispatched_lookup(w, vocab) for w in words)


@_dispatched_lookup.register(str)
def _string_lookup(word, vocab):
    """Looks up one word in the vocabulary."""
    return word if word in vocab else vocab.unk_label


class Vocabulary:
    """Stores language model vocabulary.

    Satisfies two common language modeling requirements for a vocabulary:

    - When checking membership and calculating its size, filters items
      by comparing their counts to a cutoff value.
    - Adds a special "unknown" token which unseen words are mapped to.

    >>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd']
    >>> from nltk.lm import Vocabulary
    >>> vocab = Vocabulary(words, unk_cutoff=2)

    Tokens with counts greater than or equal to the cutoff value will
    be considered part of the vocabulary.

    >>> vocab['c']
    3
    >>> 'c' in vocab
    True
    >>> vocab['d']
    2
    >>> 'd' in vocab
    True

    Tokens with frequency counts less than the cutoff value will be considered not
    part of the vocabulary even though their entries in the count dictionary are
    preserved.

    >>> vocab['b']
    1
    >>> 'b' in vocab
    False
    >>> vocab['aliens']
    0
    >>> 'aliens' in vocab
    False

    Keeping the count entries for seen words allows us to change the cutoff value
    without having to recalculate the counts.

    >>> vocab2 = Vocabulary(vocab.counts, unk_cutoff=1)
    >>> "b" in vocab2
    True

    The cutoff value influences not only membership checking but also the result of
    getting the size of the vocabulary using the built-in `len`.
    Note that while the number of keys in the vocabulary's counter stays the same,
    the items in the vocabulary differ depending on the cutoff.
    We use `sorted` to demonstrate because it keeps the order consistent.

    >>> sorted(vocab2.counts)
    ['-', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab2)
    ['-', '<UNK>', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab.counts)
    ['-', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab)
    ['<UNK>', 'a', 'c', 'd']

    In addition to items it gets populated with, the vocabulary stores a special
    token that stands in for so-called "unknown" items. By default it's "<UNK>".

    >>> "<UNK>" in vocab
    True

    We can look up words in a vocabulary using its `lookup` method.
    "Unseen" words (with counts less than cutoff) are looked up as the unknown label.
    If given one word (a string) as an input, this method will return a string.

    >>> vocab.lookup("a")
    'a'
    >>> vocab.lookup("aliens")
    '<UNK>'

    If given a sequence, it will return a tuple of the looked up words.

    >>> vocab.lookup(["p", 'a', 'r', 'd', 'b', 'c'])
    ('<UNK>', 'a', '<UNK>', 'd', '<UNK>', 'c')

    It's possible to update the counts after the vocabulary has been created.
    In general, the interface is the same as that of `collections.Counter`.

    >>> vocab['b']
    1
    >>> vocab.update(["b", "b", "c"])
    >>> vocab['b']
    3
    """

    def __init__(self, counts=None, unk_cutoff=1, unk_label="<UNK>"):
        """Create a new Vocabulary.

        :param counts: Optional iterable or `collections.Counter` instance to
                       pre-seed the Vocabulary. In case it is iterable, counts
                       are calculated.
        :param int unk_cutoff: Words that occur less frequently than this value
                               are not considered part of the vocabulary.
        :param unk_label: Label for marking words not part of vocabulary.

        """
        self.unk_label = unk_label
        if unk_cutoff < 1:
            raise ValueError(f"Cutoff value cannot be less than 1. Got: {unk_cutoff}")
        self._cutoff = unk_cutoff

        self.counts = Counter()
        self.update(counts if counts is not None else "")

    @property
    def cutoff(self):
        """Cutoff value.

        Items with count below this value are not considered part of vocabulary.

        """
        return self._cutoff

    def update(self, *counter_args, **counter_kwargs):
        """Update vocabulary counts.

        Wraps `collections.Counter.update` method.

        """
        self.counts.update(*counter_args, **counter_kwargs)
        self._len = sum(1 for _ in self)

    def lookup(self, words):
        """Look up one or more words in the vocabulary.

        If passed one word as a string will return that word or `self.unk_label`.
        Otherwise will assume it was passed a sequence of words, will try to look
        each of them up and return an iterator over the looked up words.

        :param words: Word(s) to look up.
        :type words: Iterable(str) or str
        :rtype: generator(str) or str
        :raises: TypeError for types other than strings or iterables

        >>> from nltk.lm import Vocabulary
        >>> vocab = Vocabulary(["a", "b", "c", "a", "b"], unk_cutoff=2)
        >>> vocab.lookup("a")
        'a'
        >>> vocab.lookup("aliens")
        '<UNK>'
        >>> vocab.lookup(["a", "b", "c", ["x", "b"]])
        ('a', 'b', '<UNK>', ('<UNK>', 'b'))

        """
        return _dispatched_lookup(words, self)

    def __getitem__(self, item):
        return self._cutoff if item == self.unk_label else self.counts[item]

    def __contains__(self, item):
        """Only consider items with counts GE to cutoff as being in the
        vocabulary."""
        return self[item] >= self.cutoff

    def __iter__(self):
        """Building on membership check define how to iterate over
        vocabulary."""
        return chain(
            (item for item in self.counts if item in self),
            [self.unk_label] if self.counts else [],
        )

    def __len__(self):
        """Computing size of vocabulary reflects the cutoff."""
        return self._len

    def __eq__(self, other):
        return (
            self.unk_label == other.unk_label
            and self.cutoff == other.cutoff
            and self.counts == other.counts
        )

    def __str__(self):
        return "<{} with cutoff={} unk_label='{}' and {} items>".format(
            self.__class__.__name__, self.cutoff, self.unk_label, len(self)
        )