updates
This commit is contained in:
163
Backend/venv/lib/python3.12/site-packages/nltk/lm/counter.py
Normal file
163
Backend/venv/lib/python3.12/site-packages/nltk/lm/counter.py
Normal file
@@ -0,0 +1,163 @@
|
||||
# Natural Language Toolkit
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""
|
||||
Language Model Counter
|
||||
----------------------
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
from collections.abc import Sequence
|
||||
|
||||
from nltk.probability import ConditionalFreqDist, FreqDist
|
||||
|
||||
|
||||
class NgramCounter:
    """Class for counting ngrams.

    Will count any ngram sequence you give it ;)

    First we need to make sure we are feeding the counter sentences of ngrams.

    >>> text = [["a", "b", "c", "d"], ["a", "c", "d", "c"]]
    >>> from nltk.util import ngrams
    >>> text_bigrams = [ngrams(sent, 2) for sent in text]
    >>> text_unigrams = [ngrams(sent, 1) for sent in text]

    The counting itself is very simple.

    >>> from nltk.lm import NgramCounter
    >>> ngram_counts = NgramCounter(text_bigrams + text_unigrams)

    You can conveniently access ngram counts using standard python dictionary notation.
    String keys will give you unigram counts.

    >>> ngram_counts['a']
    2
    >>> ngram_counts['aliens']
    0

    If you want to access counts for higher order ngrams, use a list or a tuple.
    These are treated as "context" keys, so what you get is a frequency distribution
    over all continuations after the given context.

    >>> sorted(ngram_counts[['a']].items())
    [('b', 1), ('c', 1)]
    >>> sorted(ngram_counts[('a',)].items())
    [('b', 1), ('c', 1)]

    This is equivalent to specifying explicitly the order of the ngram (in this case
    2 for bigram) and indexing on the context.

    >>> ngram_counts[2][('a',)] is ngram_counts[['a']]
    True

    Note that the keys in `ConditionalFreqDist` cannot be lists, only tuples!
    It is generally advisable to use the less verbose and more flexible square
    bracket notation.

    To get the count of the full ngram "a b", do this:

    >>> ngram_counts[['a']]['b']
    1

    Specifying the ngram order as a number can be useful for accessing all ngrams
    in that order.

    >>> ngram_counts[2]
    <ConditionalFreqDist with 4 conditions>

    The keys of this `ConditionalFreqDist` are the contexts we discussed earlier.
    Unigrams can also be accessed with a human-friendly alias.

    >>> ngram_counts.unigrams is ngram_counts[1]
    True

    Similarly to `collections.Counter`, you can update counts after initialization.

    >>> ngram_counts['e']
    0
    >>> ngram_counts.update([ngrams(["d", "e", "f"], 1)])
    >>> ngram_counts['e']
    1

    """

    def __init__(self, ngram_text=None):
        """Creates a new NgramCounter.

        If `ngram_text` is specified, counts ngrams from it, otherwise waits for
        `update` method to be called explicitly.

        :param ngram_text: Optional text containing sentences of ngrams, as for `update` method.
        :type ngram_text: Iterable(Iterable(tuple(str))) or None

        """
        # Maps ngram order -> counts for that order. Orders above 1 are
        # created on first access by the defaultdict factory.
        self._counts = defaultdict(ConditionalFreqDist)
        # Unigrams have no context, so order 1 is a flat FreqDist that is also
        # exposed under the `unigrams` alias (same object, not a copy).
        self._counts[1] = self.unigrams = FreqDist()

        # Compare against None instead of testing truthiness: a truth test
        # would call `__bool__`/`__len__` on the argument, which is unnecessary
        # for the documented "Iterable or None" contract. Updating from an
        # empty iterable is a no-op anyway.
        if ngram_text is not None:
            self.update(ngram_text)

    def update(self, ngram_text):
        """Updates ngram counts from `ngram_text`.

        Expects `ngram_text` to be a sequence of sentences (sequences).
        Each sentence consists of ngrams as tuples of strings.

        :param Iterable(Iterable(tuple(str))) ngram_text: Text containing sentences of ngrams.
        :raises TypeError: if the ngrams are not tuples.
        :raises IndexError: if an ngram is an empty tuple (it has no last word).

        """
        for sent in ngram_text:
            for ngram in sent:
                if not isinstance(ngram, tuple):
                    raise TypeError(
                        f"Ngram <{ngram}> isn't a tuple, but {type(ngram)}"
                    )

                ngram_order = len(ngram)
                if ngram_order == 1:
                    # Unigrams are keyed by the bare word, not a 1-tuple.
                    self.unigrams[ngram[0]] += 1
                    continue

                # Everything but the last item is the context; the last item
                # is the word being counted in that context.
                context, word = ngram[:-1], ngram[-1]
                self[ngram_order][context][word] += 1

    def N(self):
        """Returns grand total number of ngrams stored.

        This includes ngrams from all orders, so some duplication is expected.
        :rtype: int

        >>> from nltk.lm import NgramCounter
        >>> counts = NgramCounter([[("a", "b"), ("c",), ("d", "e")]])
        >>> counts.N()
        3

        """
        return sum(val.N() for val in self._counts.values())

    def __getitem__(self, item):
        """User-friendly access to ngram counts.

        :param item: an ``int`` selects all counts stored for that ngram order;
            a ``str`` is looked up in the unigram counts; any other sequence is
            converted to a tuple and used as a context in the counts of order
            ``len(item) + 1``.
        :raises TypeError: if `item` is not an int, a str or a sequence.

        """
        if isinstance(item, int):
            return self._counts[item]
        # `str` must be checked before `Sequence`, since strings are sequences.
        if isinstance(item, str):
            return self._counts[1][item]
        if isinstance(item, Sequence):
            # A context of length k conditions ngrams of order k + 1.
            return self._counts[len(item) + 1][tuple(item)]
        # Previously this fell through and returned None, hiding caller
        # mistakes until the result was used.
        raise TypeError(
            "NgramCounter keys must be int, str or a sequence, "
            f"not {type(item).__name__}"
        )

    def __str__(self):
        """Summarize the number of ngram orders and the total ngram count."""
        return "<{} with {} ngram orders and {} ngrams>".format(
            self.__class__.__name__, len(self._counts), self.N()
        )

    def __len__(self):
        """Return the number of ngram orders currently stored."""
        return len(self._counts)

    def __contains__(self, item):
        """Return whether `item` is one of the stored ngram orders."""
        return item in self._counts
|
||||
Reference in New Issue
Block a user