updates
This commit is contained in:
@@ -0,0 +1,193 @@
|
||||
# Natural Language Toolkit: Language ID module using TextCat algorithm
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Avital Pekker <avital.pekker@utoronto.ca>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A module for language identification using the TextCat algorithm.
|
||||
An implementation of the text categorization algorithm
|
||||
presented in Cavnar, W. B. and J. M. Trenkle,
|
||||
"N-Gram-Based Text Categorization".
|
||||
|
||||
The algorithm takes advantage of Zipf's law and uses
|
||||
n-gram frequencies to profile languages and text — yet to
be identified — then compares using a distance measure.
|
||||
|
||||
Language n-grams are provided by the "An Crubadan"
|
||||
project. A corpus reader was created separately to read
|
||||
those files.
|
||||
|
||||
For details regarding the algorithm, see:
|
||||
https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
|
||||
|
||||
For details about An Crubadan, see:
|
||||
https://borel.slu.edu/crubadan/index.html
|
||||
"""
|
||||
|
||||
from sys import maxsize
|
||||
|
||||
from nltk.util import trigrams
|
||||
|
||||
# Note: this is NOT "re" you're likely used to. The regex module
|
||||
# is an alternative to the standard re module that supports
|
||||
# Unicode codepoint properties with the \p{} syntax.
|
||||
# You may have to "pip install regex"
|
||||
try:
|
||||
import regex as re
|
||||
except ImportError:
|
||||
re = None
|
||||
######################################################################
|
||||
## Language identification using TextCat
|
||||
######################################################################
|
||||
|
||||
|
||||
class TextCat:
    """Language identifier implementing the Cavnar & Trenkle
    "N-Gram-Based Text Categorization" (TextCat) algorithm.

    Language trigram frequency profiles come from the An Crubadan
    project, accessed through ``nltk.corpus.crubadan``.  A text is
    classified by building its own trigram profile and summing, per
    trigram, the "out-of-place" rank distance against each language
    profile; the language with the smallest total distance wins.
    """

    # Class-level attributes, preserved from the original design.
    # _corpus is rebound per instance in __init__; last_distances is
    # rebound per call in guess_language.
    _corpus = None
    fingerprints = {}
    _START_CHAR = "<"
    _END_CHAR = ">"

    last_distances = {}

    def __init__(self):
        """Load the Crubadan corpus and pre-cache every language profile.

        :raises OSError: if the third-party ``regex`` module (needed for
            the ``\\p{}`` Unicode-property syntax in
            :meth:`remove_punctuation`) is not installed.
        """
        if not re:
            raise OSError(
                "classify.textcat requires the regex module that "
                "supports unicode. Try '$ pip install regex' and "
                "see https://pypi.python.org/pypi/regex for "
                "further details."
            )

        from nltk.corpus import crubadan

        self._corpus = crubadan
        # Load all language ngrams into cache
        for lang in self._corpus.langs():
            self._corpus.lang_freq(lang)

    def remove_punctuation(self, text):
        """Get rid of punctuation except apostrophes"""
        # \P{P} matches any non-punctuation codepoint, so the class
        # [^\P{P}\'] is "punctuation other than the apostrophe".
        return re.sub(r"[^\P{P}\']+", "", text)

    def profile(self, text):
        """Create FreqDist of trigrams within text"""
        from nltk import FreqDist, word_tokenize

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            # Pad each token with start/end markers so boundary trigrams
            # are distinct from interior ones.
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            # FreqDist is a Counter subclass: missing keys default to 0,
            # so no membership test is needed before incrementing.
            for tri in token_trigram_tuples:
                fingerprint["".join(tri)] += 1

        return fingerprint

    def calc_dist(self, lang, trigram, text_profile):
        """Calculate the "out-of-place" measure between the
        text and language profile for a single trigram.

        :return: the absolute difference of the trigram's rank
            (position in the profile's key order) between the two
            profiles, or ``sys.maxsize`` when the language profile does
            not contain the trigram at all.
        """
        lang_fd = self._corpus.lang_freq(lang)

        if trigram in lang_fd:
            idx_lang_profile = list(lang_fd.keys()).index(trigram)
            idx_text = list(text_profile.keys()).index(trigram)
            return abs(idx_lang_profile - idx_text)

        # Arbitrary penalty, but should be larger than any possible
        # trigram file length in terms of total lines.
        return maxsize

    def lang_dists(self, text):
        """Calculate the "out-of-place" measure between
        the text and all languages"""

        distances = {}
        profile = self.profile(text)
        # For all the languages
        for lang in self._corpus._all_lang_freq.keys():
            # Sum the distance metric over every trigram in the
            # input text to be identified.
            lang_dist = 0
            for trigram in profile:
                lang_dist += self.calc_dist(lang, trigram, profile)

            distances[lang] = lang_dist

        return distances

    def guess_language(self, text):
        """Find the language with the min distance
        to the text and return its ISO 639-3 code"""
        self.last_distances = self.lang_dists(text)

        return min(self.last_distances, key=self.last_distances.get)
|
||||
|
||||
|
||||
def demo():
    """Demonstrate TextCat on UDHR samples.

    For each language in ``langs``, assemble a text sample from the
    UDHR corpus, run :class:`TextCat` on it, and print the detected
    ISO 639-3 code alongside a human-friendly language name.
    """
    from nltk.corpus import udhr

    langs = [
        "Kurdish-UTF8",
        "Abkhaz-UTF8",
        "Farsi_Persian-UTF8",
        "Hindi-UTF8",
        "Hawaiian-UTF8",
        "Russian-UTF8",
        "Vietnamese-UTF8",
        "Serbian_Srpski-UTF8",
        "Esperanto-UTF8",
    ]

    friendly = {
        "kmr": "Northern Kurdish",
        "abk": "Abkhazian",
        "pes": "Iranian Persian",
        "hin": "Hindi",
        "haw": "Hawaiian",
        "rus": "Russian",
        "vie": "Vietnamese",
        "srp": "Serbian",
        "epo": "Esperanto",
    }

    tc = TextCat()

    for cur_lang in langs:
        # Get raw data from UDHR corpus
        raw_sentences = udhr.sents(cur_lang)

        # Generate a sample text of the language: each sentence is
        # prefixed with a space and its words joined by spaces.
        # NOTE: the original demo iterated range(len - 1), skipping the
        # final sentence; that behavior is preserved by the [:-1] slice.
        sample = "".join(
            " " + " ".join(sentence) for sentence in raw_sentences[:-1]
        )

        # Try to detect what it is
        print("Language snippet: " + sample[0:140] + "...")
        guess = tc.guess_language(sample)
        print(f"Language detection: {guess} ({friendly[guess]})")
        print("#" * 140)
|
||||
|
||||
|
||||
# Run the demonstration when executed as a script.
if __name__ == "__main__":
    demo()
|
||||
Reference in New Issue
Block a user