updates
This commit is contained in:
342
Backend/venv/lib/python3.12/site-packages/nltk/stem/lancaster.py
Normal file
342
Backend/venv/lib/python3.12/site-packages/nltk/stem/lancaster.py
Normal file
@@ -0,0 +1,342 @@
|
||||
# Natural Language Toolkit: Stemmers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Tomcavage <stomcava@law.upenn.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
|
||||
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
|
||||
"""
|
||||
import re
|
||||
|
||||
from nltk.stem.api import StemmerI
|
||||
|
||||
|
||||
class LancasterStemmer(StemmerI):
    """
    Lancaster Stemmer

        >>> from nltk.stem.lancaster import LancasterStemmer
        >>> st = LancasterStemmer()
        >>> st.stem('maximum') # Remove "-um" when word is intact
        'maxim'
        >>> st.stem('presumably') # Don't remove "-um" when word is not intact
        'presum'
        >>> st.stem('multiply') # No action taken if word ends with "-ply"
        'multiply'
        >>> st.stem('provision') # Replace "-sion" with "-j" to trigger "j" set of rules
        'provid'
        >>> st.stem('owed') # Word starting with vowel must contain at least 2 letters
        'ow'
        >>> st.stem('ear') # ditto
        'ear'
        >>> st.stem('saying') # Words starting with consonant must contain at least 3
        'say'
        >>> st.stem('crying') # letters and one of those letters must be a vowel
        'cry'
        >>> st.stem('string') # ditto
        'string'
        >>> st.stem('meant') # ditto
        'meant'
        >>> st.stem('cement') # ditto
        'cem'
        >>> st_pre = LancasterStemmer(strip_prefix_flag=True)
        >>> st_pre.stem('kilometer') # Test Prefix
        'met'
        >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t."))
        >>> st_custom.stem("ness") # Change s to t
        'nest'

    Rule syntax (as enforced by ``_RULE_PATTERN`` and interpreted by the
    stemming loop): the word ending written in reverse, an optional ``*``
    meaning "only if the word is still intact", one digit giving the number
    of trailing characters to remove, an optional string to append, and an
    optional terminator -- ``.`` stops stemming after the rule fires, while
    ``>`` (or no terminator) lets stemming continue with the new word.
    """

    # One compiled pattern serves both rule validation and rule parsing.
    # Groups: (reversed ending, intact flag, remove count, append string,
    # continuation flag).
    _RULE_PATTERN = re.compile(r"^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")

    # The rule list is static since it doesn't change between instances
    default_rule_tuple = (
        "ai*2.",  # -ia > - if intact
        "a*1.",  # -a > - if intact
        "bb1.",  # -bb > -b
        "city3s.",  # -ytic > -ys
        "ci2>",  # -ic > -
        "cn1t>",  # -nc > -nt
        "dd1.",  # -dd > -d
        "dei3y>",  # -ied > -y
        "deec2ss.",  # -ceed > -cess
        "dee1.",  # -eed > -ee
        "de2>",  # -ed > -
        "dooh4>",  # -hood > -
        "e1>",  # -e > -
        "feil1v.",  # -lief > -liev
        "fi2>",  # -if > -
        "gni3>",  # -ing > -
        "gai3y.",  # -iag > -y
        "ga2>",  # -ag > -
        "gg1.",  # -gg > -g
        "ht*2.",  # -th > - if intact
        "hsiug5ct.",  # -guish > -ct
        "hsi3>",  # -ish > -
        "i*1.",  # -i > - if intact
        "i1y>",  # -i > -y
        "ji1d.",  # -ij > -id -- see nois4j> & vis3j>
        "juf1s.",  # -fuj > -fus
        "ju1d.",  # -uj > -ud
        "jo1d.",  # -oj > -od
        "jeh1r.",  # -hej > -her
        "jrev1t.",  # -verj > -vert
        "jsim2t.",  # -misj > -mit
        "jn1d.",  # -nj > -nd
        "j1s.",  # -j > -s
        "lbaifi6.",  # -ifiabl > -
        "lbai4y.",  # -iabl > -y
        "lba3>",  # -abl > -
        "lbi3.",  # -ibl > -
        "lib2l>",  # -bil > -bl
        "lc1.",  # -cl > -c
        "lufi4y.",  # -iful > -y
        "luf3>",  # -ful > -
        "lu2.",  # -ul > -
        "lai3>",  # -ial > -
        "lau3>",  # -ual > -
        "la2>",  # -al > -
        "ll1.",  # -ll > -l
        "mui3.",  # -ium > -
        "mu*2.",  # -um > - if intact
        "msi3>",  # -ism > -
        "mm1.",  # -mm > -m
        "nois4j>",  # -sion > -j
        "noix4ct.",  # -xion > -ct
        "noi3>",  # -ion > -
        "nai3>",  # -ian > -
        "na2>",  # -an > -
        "nee0.",  # protect -een
        "ne2>",  # -en > -
        "nn1.",  # -nn > -n
        "pihs4>",  # -ship > -
        "pp1.",  # -pp > -p
        "re2>",  # -er > -
        "rae0.",  # protect -ear
        "ra2.",  # -ar > -
        "ro2>",  # -or > -
        "ru2>",  # -ur > -
        "rr1.",  # -rr > -r
        "rt1>",  # -tr > -t
        "rei3y>",  # -ier > -y
        "sei3y>",  # -ies > -y
        "sis2.",  # -sis > -s
        "si2>",  # -is > -
        "ssen4>",  # -ness > -
        "ss0.",  # protect -ss
        "suo3>",  # -ous > -
        "su*2.",  # -us > - if intact
        "s*1>",  # -s > - if intact
        "s0.",  # -s > -s
        "tacilp4y.",  # -plicat > -ply
        "ta2>",  # -at > -
        "tnem4>",  # -ment > -
        "tne3>",  # -ent > -
        "tna3>",  # -ant > -
        "tpir2b.",  # -ript > -rib
        "tpro2b.",  # -orpt > -orb
        "tcud1.",  # -duct > -duc
        "tpmus2.",  # -sumpt > -sum
        "tpec2iv.",  # -cept > -ceiv
        "tulo2v.",  # -olut > -olv
        "tsis0.",  # protect -sist
        "tsi3>",  # -ist > -
        "tt1.",  # -tt > -t
        "uqi3.",  # -iqu > -
        "ugo1.",  # -ogu > -og
        "vis3j>",  # -siv > -j
        "vie0.",  # protect -eiv
        "vi2>",  # -iv > -
        "ylb1>",  # -bly > -bl
        "yli3y>",  # -ily > -y
        "ylp0.",  # protect -ply
        "yl2>",  # -ly > -
        "ygo1.",  # -ogy > -og
        "yhp1.",  # -phy > -ph
        "ymo1.",  # -omy > -om
        "ypo1.",  # -opy > -op
        "yti3>",  # -ity > -
        "yte3>",  # -ety > -
        "ytl2.",  # -lty > -l
        "yrtsi5.",  # -istry > -
        "yra3>",  # -ary > -
        "yro3>",  # -ory > -
        "yfi3.",  # -ify > -
        "ycn2t>",  # -ncy > -nt
        "yca3>",  # -acy > -
        "zi2>",  # -iz > -
        "zy1s.",  # -yz > -ys
    )

    def __init__(self, rule_tuple=None, strip_prefix_flag=False):
        """Create an instance of the Lancaster stemmer.

        :param rule_tuple: Optional iterable of rule strings to use instead
            of ``default_rule_tuple``. A falsy value (``None``, empty tuple)
            selects the default rules.
        :param strip_prefix_flag: When true, ``stem`` removes one of a fixed
            set of prefixes (see ``__stripPrefix``) before applying rules.
        """
        # Setup an empty rule dictionary - it is filled in lazily by the
        # first call to stem() (or by an explicit call to parseRules()).
        self.rule_dictionary = {}
        # Check if a user wants to strip prefix
        self._strip_prefix = strip_prefix_flag
        # Check if a user wants to use his/her own rule tuples.
        self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple

    def parseRules(self, rule_tuple=None):
        """Validate the rules and index them by their first character.

        If this function is called as an individual method, without using
        the stem method, the rule_tuple argument will be compiled into
        self.rule_dictionary. If it is called within stem (or without an
        argument), self._rule_tuple will be used.

        :param rule_tuple: Optional iterable of rule strings; a falsy value
            falls back to the instance's own rule tuple.
        :raises ValueError: If any rule does not match the rule syntax.
        """
        # If there is no argument for the function, use class' own rule tuple.
        rule_tuple = rule_tuple if rule_tuple else self._rule_tuple

        # Empty any old rules from the rule set before adding new ones
        self.rule_dictionary = {}

        for rule in rule_tuple:
            if not self._RULE_PATTERN.match(rule):
                raise ValueError(f"The rule {rule} is invalid")
            # Rules are written with the ending reversed, so the first
            # character of a rule is the last letter of the words it targets.
            self.rule_dictionary.setdefault(rule[0], []).append(rule)

    def stem(self, word):
        """Stem a word using the Lancaster stemmer.

        :param word: The word to stem; it is lower-cased before processing.
        :return: The stemmed word.
        """
        # Lower-case the word, since all the rules are lower-cased
        word = word.lower()
        word = self.__stripPrefix(word) if self._strip_prefix else word

        # Save a copy of the original word
        intact_word = word

        # If rule dictionary is empty, parse rule tuple.
        if not self.rule_dictionary:
            self.parseRules()

        return self.__doStemming(word, intact_word)

    def __doStemming(self, word, intact_word):
        """Perform the actual word stemming.

        Repeatedly looks up the rules indexed under the word's last letter
        and applies the first one whose ending matches and whose result is
        acceptable, until a terminal rule fires or no rule applies.

        :param word: The (lower-cased) word to stem.
        :param intact_word: The word as it was before any rule was applied;
            rules carrying the ``*`` flag only fire while ``word`` still
            equals this value.
        :return: The stemmed word.
        """
        rule_pattern = self._RULE_PATTERN

        # The next step is fully determined by the current word, so seeing
        # the same word twice means the rules would loop forever (possible
        # with custom rules such as "s0>"). Remember every intermediate form
        # and stop as soon as one repeats.
        seen_words = {word}

        proceed = True

        while proceed:
            # Find the position of the last letter of the word to be stemmed
            last_letter_position = self.__getLastLetter(word)

            # Only stem the word if it has a last letter and a rule matching
            # that last letter
            if (
                last_letter_position < 0
                or word[last_letter_position] not in self.rule_dictionary
            ):
                break

            rule_was_applied = False

            # Go through each rule that matches the word's final letter
            for rule in self.rule_dictionary[word[last_letter_position]]:
                rule_match = rule_pattern.match(rule)
                if not rule_match:
                    continue

                (
                    ending_string,
                    intact_flag,
                    remove_total,
                    append_string,
                    cont_flag,
                ) = rule_match.groups()

                # Convert the number of chars to remove when stemming
                # from a string to an integer
                remove_total = int(remove_total)

                # Proceed only if word's ending matches rule's word ending
                if not word.endswith(ending_string[::-1]):
                    continue

                # "Intact" rules only fire on a word no rule has changed yet
                if intact_flag and word != intact_word:
                    continue

                if not self.__isAcceptable(word, remove_total):
                    continue

                word = self.__applyRule(word, remove_total, append_string)
                rule_was_applied = True

                # Stop on a terminal rule, or when this form was already
                # produced earlier (cycle guard, see above).
                if cont_flag == "." or word in seen_words:
                    proceed = False
                seen_words.add(word)
                break

            # If no rules apply, the word doesn't need any more stemming
            if not rule_was_applied:
                proceed = False

        return word

    def __getLastLetter(self, word):
        """Get the zero-based index of the last letter of the leading
        alphabetic run of this string.

        Scanning stops at the first non-alphabetic character, so letters
        after that character are ignored. Returns -1 when the string is
        empty or does not start with a letter.
        """
        last_letter = -1
        for position, char in enumerate(word):
            if not char.isalpha():
                break
            last_letter = position
        return last_letter

    def __isAcceptable(self, word, remove_total):
        """Determine if the word is acceptable for stemming.

        :param word: The current, non-empty word.
        :param remove_total: Number of trailing characters the rule removes.
        :return: True if the rule may be applied to the word.
        """
        remaining_length = len(word) - remove_total

        # If the word starts with a vowel, at least 2 characters must remain
        if word[0] in "aeiouy":
            return remaining_length >= 2

        # If the word starts with a consonant, at least 3 characters must
        # remain, and the second or third letter of the word must be a vowel
        if remaining_length >= 3:
            return word[1] in "aeiouy" or word[2] in "aeiouy"

        return False

    def __applyRule(self, word, remove_total, append_string):
        """Apply the stemming rule to the word.

        Removes ``remove_total`` characters from the end of the word and
        then appends ``append_string`` (which may be empty).
        """
        # Remove letters from the end of the word
        new_word_length = len(word) - remove_total
        word = word[0:new_word_length]

        # And add new letters to the end of the truncated word
        if append_string:
            word += append_string
        return word

    def __stripPrefix(self, word):
        """Remove prefix from a word.

        Only the first matching prefix from the fixed list below is removed.

        This function originally taken from Whoosh.

        """
        for prefix in (
            "kilo",
            "micro",
            "milli",
            "intra",
            "ultra",
            "mega",
            "nano",
            "pico",
            "pseudo",
        ):
            if word.startswith(prefix):
                return word[len(prefix) :]
        return word

    def __repr__(self):
        return "<LancasterStemmer>"
|
||||
Reference in New Issue
Block a user