updates
This commit is contained in:
717
Backend/venv/lib/python3.12/site-packages/nltk/stem/porter.py
Normal file
@@ -0,0 +1,717 @@
"""
Porter Stemmer

This is the Porter stemming algorithm. It follows the algorithm
presented in

Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137.

with some optional deviations that can be turned on or off with the
`mode` argument to the constructor.

Martin Porter, the algorithm's inventor, maintains a web page about the
algorithm at

    https://www.tartarus.org/~martin/PorterStemmer/

which includes another Python implementation and other implementations
in many languages.
"""

__docformat__ = "plaintext"

import re

from nltk.stem.api import StemmerI


class PorterStemmer(StemmerI):
    """
    A word stemmer based on the Porter stemming algorithm.

        Porter, M. "An algorithm for suffix stripping."
        Program 14.3 (1980): 130-137.

    See https://www.tartarus.org/~martin/PorterStemmer/ for the homepage
    of the algorithm.

    Martin Porter has endorsed several modifications to the Porter
    algorithm since writing his original paper, and those extensions are
    included in the implementations on his website. Additionally, others
    have proposed further improvements to the algorithm, including NLTK
    contributors. There are thus three modes that can be selected by
    passing the appropriate constant to the class constructor's `mode`
    attribute:

    - PorterStemmer.ORIGINAL_ALGORITHM

        An implementation that is faithful to the original paper.

        Note that Martin Porter has deprecated this version of the
        algorithm. Martin distributes implementations of the Porter
        Stemmer in many languages, hosted at:

        https://www.tartarus.org/~martin/PorterStemmer/

        and all of these implementations include his extensions. He
        strongly recommends against using the original, published
        version of the algorithm; only use this mode if you clearly
        understand why you are choosing to do so.

    - PorterStemmer.MARTIN_EXTENSIONS

        An implementation that only uses the modifications to the
        algorithm that are included in the implementations on Martin
        Porter's website. He has declared Porter frozen, so the
        behaviour of those implementations should never change.

    - PorterStemmer.NLTK_EXTENSIONS (default)

        An implementation that includes further improvements devised by
        NLTK contributors or taken from other modified implementations
        found on the web.

    For the best stemming, you should use the default NLTK_EXTENSIONS
    version. However, if you need to get the same results as either the
    original algorithm or one of Martin Porter's hosted versions for
    compatibility with an existing implementation or dataset, you can use
    one of the other modes instead.
    """

    # Modes the Stemmer can be instantiated in
    NLTK_EXTENSIONS = "NLTK_EXTENSIONS"
    MARTIN_EXTENSIONS = "MARTIN_EXTENSIONS"
    ORIGINAL_ALGORITHM = "ORIGINAL_ALGORITHM"

    def __init__(self, mode=NLTK_EXTENSIONS):
        if mode not in (
            self.NLTK_EXTENSIONS,
            self.MARTIN_EXTENSIONS,
            self.ORIGINAL_ALGORITHM,
        ):
            raise ValueError(
                "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, "
                "PorterStemmer.MARTIN_EXTENSIONS, or "
                "PorterStemmer.ORIGINAL_ALGORITHM"
            )

        self.mode = mode

        if self.mode == self.NLTK_EXTENSIONS:
            # This is a table of irregular forms. It is quite short,
            # but still reflects the errors actually drawn to Martin
            # Porter's attention over a 20 year period!
            irregular_forms = {
                "sky": ["sky", "skies"],
                "die": ["dying"],
                "lie": ["lying"],
                "tie": ["tying"],
                "news": ["news"],
                "inning": ["innings", "inning"],
                "outing": ["outings", "outing"],
                "canning": ["cannings", "canning"],
                "howe": ["howe"],
                "proceed": ["proceed"],
                "exceed": ["exceed"],
                "succeed": ["succeed"],
            }

            self.pool = {}
            for key in irregular_forms:
                for val in irregular_forms[key]:
                    self.pool[val] = key

        self.vowels = frozenset(["a", "e", "i", "o", "u"])

    def _is_consonant(self, word, i):
        """Returns True if word[i] is a consonant, False otherwise

        A consonant is defined in the paper as follows:

            A consonant in a word is a letter other than A, E, I, O or
            U, and other than Y preceded by a consonant. (The fact that
            the term `consonant' is defined to some extent in terms of
            itself does not make it ambiguous.) So in TOY the consonants
            are T and Y, and in SYZYGY they are S, Z and G. If a letter
            is not a consonant it is a vowel.
        """
        if word[i] in self.vowels:
            return False
        if word[i] == "y":
            if i == 0:
                return True
            else:
                return not self._is_consonant(word, i - 1)
        return True
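
    # Worked example (illustrative), matching the paper's TOY/SYZYGY cases: the
    # 'y' in "toy" follows a vowel and so counts as a consonant, while every
    # 'y' in "syzygy" follows a consonant and so counts as a vowel:
    #
    #     >>> p = PorterStemmer()
    #     >>> [p._is_consonant("toy", i) for i in range(3)]
    #     [True, False, True]
    #     >>> [p._is_consonant("syzygy", i) for i in range(6)]
    #     [True, False, True, False, True, False]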

    def _measure(self, stem):
        r"""Returns the 'measure' of stem, per definition in the paper

        From the paper:

            A consonant will be denoted by c, a vowel by v. A list
            ccc... of length greater than 0 will be denoted by C, and a
            list vvv... of length greater than 0 will be denoted by V.
            Any word, or part of a word, therefore has one of the four
            forms:

                CVCV ... C
                CVCV ... V
                VCVC ... C
                VCVC ... V

            These may all be represented by the single form

                [C]VCVC ... [V]

            where the square brackets denote arbitrary presence of their
            contents. Using (VC){m} to denote VC repeated m times, this
            may again be written as

                [C](VC){m}[V].

            m will be called the \measure\ of any word or word part when
            represented in this form. The case m = 0 covers the null
            word. Here are some examples:

                m=0    TR, EE, TREE, Y, BY.
                m=1    TROUBLE, OATS, TREES, IVY.
                m=2    TROUBLES, PRIVATE, OATEN, ORRERY.
        """
        cv_sequence = ""

        # Construct a string of 'c's and 'v's representing whether each
        # character in `stem` is a consonant or a vowel.
        # e.g. 'falafel' becomes 'cvcvcvc',
        #      'architecture' becomes 'vcccvcvccvcv'
        for i in range(len(stem)):
            if self._is_consonant(stem, i):
                cv_sequence += "c"
            else:
                cv_sequence += "v"

        # Count the number of 'vc' occurrences, which is equivalent to
        # the number of 'VC' occurrences in Porter's reduced form in the
        # docstring above, which is in turn equivalent to `m`
        return cv_sequence.count("vc")
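
    # Quick check (illustrative) against the m=0/1/2 examples quoted above:
    #
    #     >>> p = PorterStemmer()
    #     >>> [p._measure(s) for s in ("tree", "trouble", "troubles")]
    #     [0, 1, 2]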

    def _has_positive_measure(self, stem):
        return self._measure(stem) > 0

    def _contains_vowel(self, stem):
        """Returns True if stem contains a vowel, else False"""
        for i in range(len(stem)):
            if not self._is_consonant(stem, i):
                return True
        return False

    def _ends_double_consonant(self, word):
        """Implements condition *d from the paper

        Returns True if word ends with a double consonant
        """
        return (
            len(word) >= 2
            and word[-1] == word[-2]
            and self._is_consonant(word, len(word) - 1)
        )

    def _ends_cvc(self, word):
        """Implements condition *o from the paper

        From the paper:

            *o - the stem ends cvc, where the second c is not W, X or Y
                 (e.g. -WIL, -HOP).
        """
        return (
            len(word) >= 3
            and self._is_consonant(word, len(word) - 3)
            and not self._is_consonant(word, len(word) - 2)
            and self._is_consonant(word, len(word) - 1)
            and word[-1] not in ("w", "x", "y")
        ) or (
            self.mode == self.NLTK_EXTENSIONS
            and len(word) == 2
            and not self._is_consonant(word, 0)
            and self._is_consonant(word, 1)
        )

    def _replace_suffix(self, word, suffix, replacement):
        """Replaces `suffix` of `word` with `replacement`"""
        assert word.endswith(suffix), "Given word doesn't end with given suffix"
        if suffix == "":
            return word + replacement
        else:
            return word[: -len(suffix)] + replacement

    def _apply_rule_list(self, word, rules):
        """Applies the first applicable suffix-removal rule to the word

        Takes a word and a list of suffix-removal rules represented as
        3-tuples, with the first element being the suffix to remove,
        the second element being the string to replace it with, and the
        final element being the condition for the rule to be applicable,
        or None if the rule is unconditional.
        """
        for rule in rules:
            suffix, replacement, condition = rule
            if suffix == "*d" and self._ends_double_consonant(word):
                stem = word[:-2]
                if condition is None or condition(stem):
                    return stem + replacement
                else:
                    # Don't try any further rules
                    return word
            if word.endswith(suffix):
                stem = self._replace_suffix(word, suffix, "")
                if condition is None or condition(stem):
                    return stem + replacement
                else:
                    # Don't try any further rules
                    return word

        return word
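
    # Minimal sketch (illustrative) of the rule format: each rule is a
    # (suffix, replacement, condition) tuple, and only the first matching
    # suffix is tried:
    #
    #     >>> p = PorterStemmer()
    #     >>> rules = [("sses", "ss", None), ("s", "", None)]
    #     >>> p._apply_rule_list("caresses", rules), p._apply_rule_list("cats", rules)
    #     ('caress', 'cat')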

    def _step1a(self, word):
        """Implements Step 1a from "An algorithm for suffix stripping"

        From the paper:

            SSES -> SS                         caresses  ->  caress
            IES  -> I                          ponies    ->  poni
                                               ties      ->  ti
            SS   -> SS                         caress    ->  caress
            S    ->                            cats      ->  cat
        """
        # this NLTK-only rule extends the original algorithm, so
        # that 'flies'->'fli' but 'dies'->'die' etc
        if self.mode == self.NLTK_EXTENSIONS:
            if word.endswith("ies") and len(word) == 4:
                return self._replace_suffix(word, "ies", "ie")

        return self._apply_rule_list(
            word,
            [
                ("sses", "ss", None),  # SSES -> SS
                ("ies", "i", None),  # IES -> I
                ("ss", "ss", None),  # SS -> SS
                ("s", "", None),  # S ->
            ],
        )

    def _step1b(self, word):
        """Implements Step 1b from "An algorithm for suffix stripping"

        From the paper:

            (m>0) EED -> EE                    feed      ->  feed
                                               agreed    ->  agree
            (*v*) ED  ->                       plastered ->  plaster
                                               bled      ->  bled
            (*v*) ING ->                       motoring  ->  motor
                                               sing      ->  sing

        If the second or third of the rules in Step 1b is successful,
        the following is done:

            AT -> ATE                       conflat(ed)  ->  conflate
            BL -> BLE                       troubl(ed)   ->  trouble
            IZ -> IZE                       siz(ed)      ->  size
            (*d and not (*L or *S or *Z))
               -> single letter
                                            hopp(ing)    ->  hop
                                            tann(ed)     ->  tan
                                            fall(ing)    ->  fall
                                            hiss(ing)    ->  hiss
                                            fizz(ed)     ->  fizz
            (m=1 and *o) -> E               fail(ing)    ->  fail
                                            fil(ing)     ->  file

        The rule to map to a single letter causes the removal of one of
        the double letter pair. The -E is put back on -AT, -BL and -IZ,
        so that the suffixes -ATE, -BLE and -IZE can be recognised
        later. This E may be removed in step 4.
        """
        # this NLTK-only block extends the original algorithm, so that
        # 'spied'->'spi' but 'died'->'die' etc
        if self.mode == self.NLTK_EXTENSIONS:
            if word.endswith("ied"):
                if len(word) == 4:
                    return self._replace_suffix(word, "ied", "ie")
                else:
                    return self._replace_suffix(word, "ied", "i")

        # (m>0) EED -> EE
        if word.endswith("eed"):
            stem = self._replace_suffix(word, "eed", "")
            if self._measure(stem) > 0:
                return stem + "ee"
            else:
                return word

        rule_2_or_3_succeeded = False

        for suffix in ["ed", "ing"]:
            if word.endswith(suffix):
                intermediate_stem = self._replace_suffix(word, suffix, "")
                if self._contains_vowel(intermediate_stem):
                    rule_2_or_3_succeeded = True
                    break

        if not rule_2_or_3_succeeded:
            return word

        return self._apply_rule_list(
            intermediate_stem,
            [
                ("at", "ate", None),  # AT -> ATE
                ("bl", "ble", None),  # BL -> BLE
                ("iz", "ize", None),  # IZ -> IZE
                # (*d and not (*L or *S or *Z))
                # -> single letter
                (
                    "*d",
                    intermediate_stem[-1],
                    lambda stem: intermediate_stem[-1] not in ("l", "s", "z"),
                ),
                # (m=1 and *o) -> E
                (
                    "",
                    "e",
                    lambda stem: (self._measure(stem) == 1 and self._ends_cvc(stem)),
                ),
            ],
        )
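
    # Spot-check (illustrative) of the Step 1a/1b examples quoted above, using
    # the default NLTK_EXTENSIONS mode:
    #
    #     >>> p = PorterStemmer()
    #     >>> p._step1a("caresses"), p._step1a("ponies"), p._step1a("cats")
    #     ('caress', 'poni', 'cat')
    #     >>> p._step1b("plastered"), p._step1b("motoring"), p._step1b("feed")
    #     ('plaster', 'motor', 'feed')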

    def _step1c(self, word):
        """Implements Step 1c from "An algorithm for suffix stripping"

        From the paper:

        Step 1c

            (*v*) Y -> I                    happy        ->  happi
                                            sky          ->  sky
        """

        def nltk_condition(stem):
            """
            This has been modified from the original Porter algorithm so
            that y->i is only done when y is preceded by a consonant,
            but not if the stem is only a single consonant, i.e.

               (*c and not c) Y -> I

            So 'happy' -> 'happi', but
               'enjoy' -> 'enjoy'  etc

            This is a much better rule. Formerly 'enjoy'->'enjoi' and
            'enjoyment'->'enjoy'. Step 1c is perhaps done too soon; but
            with this modification that no longer really matters.

            Also, the removal of the contains_vowel(z) condition means
            that 'spy', 'fly', 'try' ... stem to 'spi', 'fli', 'tri' and
            conflate with 'spied', 'tried', 'flies' ...
            """
            return len(stem) > 1 and self._is_consonant(stem, len(stem) - 1)

        def original_condition(stem):
            return self._contains_vowel(stem)

        return self._apply_rule_list(
            word,
            [
                (
                    "y",
                    "i",
                    (
                        nltk_condition
                        if self.mode == self.NLTK_EXTENSIONS
                        else original_condition
                    ),
                )
            ],
        )

    def _step2(self, word):
        """Implements Step 2 from "An algorithm for suffix stripping"

        From the paper:

        Step 2

            (m>0) ATIONAL -> ATE       relational     ->  relate
            (m>0) TIONAL  -> TION      conditional    ->  condition
                                       rational       ->  rational
            (m>0) ENCI    -> ENCE      valenci        ->  valence
            (m>0) ANCI    -> ANCE      hesitanci      ->  hesitance
            (m>0) IZER    -> IZE       digitizer      ->  digitize
            (m>0) ABLI    -> ABLE      conformabli    ->  conformable
            (m>0) ALLI    -> AL        radicalli      ->  radical
            (m>0) ENTLI   -> ENT       differentli    ->  different
            (m>0) ELI     -> E         vileli         ->  vile
            (m>0) OUSLI   -> OUS       analogousli    ->  analogous
            (m>0) IZATION -> IZE       vietnamization ->  vietnamize
            (m>0) ATION   -> ATE       predication    ->  predicate
            (m>0) ATOR    -> ATE       operator       ->  operate
            (m>0) ALISM   -> AL        feudalism      ->  feudal
            (m>0) IVENESS -> IVE       decisiveness   ->  decisive
            (m>0) FULNESS -> FUL       hopefulness    ->  hopeful
            (m>0) OUSNESS -> OUS       callousness    ->  callous
            (m>0) ALITI   -> AL        formaliti      ->  formal
            (m>0) IVITI   -> IVE       sensitiviti    ->  sensitive
            (m>0) BILITI  -> BLE       sensibiliti    ->  sensible
        """

        if self.mode == self.NLTK_EXTENSIONS:
            # Instead of applying the ALLI -> AL rule after '(a)bli' per
            # the published algorithm, instead we apply it first, and,
            # if it succeeds, run the result through step2 again.
            if word.endswith("alli") and self._has_positive_measure(
                self._replace_suffix(word, "alli", "")
            ):
                return self._step2(self._replace_suffix(word, "alli", "al"))

        bli_rule = ("bli", "ble", self._has_positive_measure)
        abli_rule = ("abli", "able", self._has_positive_measure)

        rules = [
            ("ational", "ate", self._has_positive_measure),
            ("tional", "tion", self._has_positive_measure),
            ("enci", "ence", self._has_positive_measure),
            ("anci", "ance", self._has_positive_measure),
            ("izer", "ize", self._has_positive_measure),
            abli_rule if self.mode == self.ORIGINAL_ALGORITHM else bli_rule,
            ("alli", "al", self._has_positive_measure),
            ("entli", "ent", self._has_positive_measure),
            ("eli", "e", self._has_positive_measure),
            ("ousli", "ous", self._has_positive_measure),
            ("ization", "ize", self._has_positive_measure),
            ("ation", "ate", self._has_positive_measure),
            ("ator", "ate", self._has_positive_measure),
            ("alism", "al", self._has_positive_measure),
            ("iveness", "ive", self._has_positive_measure),
            ("fulness", "ful", self._has_positive_measure),
            ("ousness", "ous", self._has_positive_measure),
            ("aliti", "al", self._has_positive_measure),
            ("iviti", "ive", self._has_positive_measure),
            ("biliti", "ble", self._has_positive_measure),
        ]

        if self.mode == self.NLTK_EXTENSIONS:
            rules.append(("fulli", "ful", self._has_positive_measure))

            # The 'l' of the 'logi' -> 'log' rule is put with the stem,
            # so that short stems like 'geo' 'theo' etc work like
            # 'archaeo' 'philo' etc.
            rules.append(
                ("logi", "log", lambda stem: self._has_positive_measure(word[:-3]))
            )

        if self.mode == self.MARTIN_EXTENSIONS:
            rules.append(("logi", "log", self._has_positive_measure))

        return self._apply_rule_list(word, rules)
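
    # Spot-check (illustrative) of a few Step 2 mappings from the table above;
    # "rational" is left unchanged because its stem "r" has measure 0:
    #
    #     >>> p = PorterStemmer()
    #     >>> p._step2("relational"), p._step2("conditional"), p._step2("rational")
    #     ('relate', 'condition', 'rational')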

    def _step3(self, word):
        """Implements Step 3 from "An algorithm for suffix stripping"

        From the paper:

        Step 3

            (m>0) ICATE ->  IC              triplicate     ->  triplic
            (m>0) ATIVE ->                  formative      ->  form
            (m>0) ALIZE ->  AL              formalize      ->  formal
            (m>0) ICITI ->  IC              electriciti    ->  electric
            (m>0) ICAL  ->  IC              electrical     ->  electric
            (m>0) FUL   ->                  hopeful        ->  hope
            (m>0) NESS  ->                  goodness       ->  good
        """
        return self._apply_rule_list(
            word,
            [
                ("icate", "ic", self._has_positive_measure),
                ("ative", "", self._has_positive_measure),
                ("alize", "al", self._has_positive_measure),
                ("iciti", "ic", self._has_positive_measure),
                ("ical", "ic", self._has_positive_measure),
                ("ful", "", self._has_positive_measure),
                ("ness", "", self._has_positive_measure),
            ],
        )

    def _step4(self, word):
        """Implements Step 4 from "An algorithm for suffix stripping"

        Step 4

            (m>1) AL    ->                  revival        ->  reviv
            (m>1) ANCE  ->                  allowance      ->  allow
            (m>1) ENCE  ->                  inference      ->  infer
            (m>1) ER    ->                  airliner       ->  airlin
            (m>1) IC    ->                  gyroscopic     ->  gyroscop
            (m>1) ABLE  ->                  adjustable     ->  adjust
            (m>1) IBLE  ->                  defensible     ->  defens
            (m>1) ANT   ->                  irritant       ->  irrit
            (m>1) EMENT ->                  replacement    ->  replac
            (m>1) MENT  ->                  adjustment     ->  adjust
            (m>1) ENT   ->                  dependent      ->  depend
            (m>1 and (*S or *T)) ION ->     adoption       ->  adopt
            (m>1) OU    ->                  homologou      ->  homolog
            (m>1) ISM   ->                  communism      ->  commun
            (m>1) ATE   ->                  activate       ->  activ
            (m>1) ITI   ->                  angulariti     ->  angular
            (m>1) OUS   ->                  homologous     ->  homolog
            (m>1) IVE   ->                  effective      ->  effect
            (m>1) IZE   ->                  bowdlerize     ->  bowdler

        The suffixes are now removed. All that remains is a little
        tidying up.
        """
        measure_gt_1 = lambda stem: self._measure(stem) > 1

        return self._apply_rule_list(
            word,
            [
                ("al", "", measure_gt_1),
                ("ance", "", measure_gt_1),
                ("ence", "", measure_gt_1),
                ("er", "", measure_gt_1),
                ("ic", "", measure_gt_1),
                ("able", "", measure_gt_1),
                ("ible", "", measure_gt_1),
                ("ant", "", measure_gt_1),
                ("ement", "", measure_gt_1),
                ("ment", "", measure_gt_1),
                ("ent", "", measure_gt_1),
                # (m>1 and (*S or *T)) ION ->
                (
                    "ion",
                    "",
                    lambda stem: self._measure(stem) > 1 and stem[-1] in ("s", "t"),
                ),
                ("ou", "", measure_gt_1),
                ("ism", "", measure_gt_1),
                ("ate", "", measure_gt_1),
                ("iti", "", measure_gt_1),
                ("ous", "", measure_gt_1),
                ("ive", "", measure_gt_1),
                ("ize", "", measure_gt_1),
            ],
        )

    def _step5a(self, word):
        """Implements Step 5a from "An algorithm for suffix stripping"

        From the paper:

        Step 5a

            (m>1) E     ->                  probate        ->  probat
                                            rate           ->  rate
            (m=1 and not *o) E ->           cease          ->  ceas
        """
        # Note that Martin's test vocabulary and reference
        # implementations are inconsistent in how they handle the case
        # where two rules both refer to a suffix that matches the word
        # to be stemmed, but only the condition of the second one is
        # true.
        # Earlier in step 1b we had the rules:
        #     (m>0) EED -> EE
        #     (*v*) ED  ->
        # but the examples in the paper included "feed"->"feed", even
        # though (*v*) is true for "fe" and therefore the second rule
        # alone would map "feed"->"fe".
        # However, in THIS case, we need to handle the consecutive rules
        # differently and try both conditions (obviously; the second
        # rule here would be redundant otherwise). Martin's paper makes
        # no explicit mention of the inconsistency; you have to infer it
        # from the examples.
        # For this reason, we can't use _apply_rule_list here.
        if word.endswith("e"):
            stem = self._replace_suffix(word, "e", "")
            if self._measure(stem) > 1:
                return stem
            if self._measure(stem) == 1 and not self._ends_cvc(stem):
                return stem
        return word

    def _step5b(self, word):
        """Implements Step 5b from "An algorithm for suffix stripping"

        From the paper:

        Step 5b

            (m > 1 and *d and *L) -> single letter
                                            controll       ->  control
                                            roll           ->  roll
        """
        return self._apply_rule_list(
            word, [("ll", "l", lambda stem: self._measure(word[:-1]) > 1)]
        )

    def stem(self, word, to_lowercase=True):
        """
        :param to_lowercase: if `to_lowercase=True`, the word is converted
            to lowercase before stemming
        """
        stem = word.lower() if to_lowercase else word

        if self.mode == self.NLTK_EXTENSIONS and word in self.pool:
            return self.pool[stem]

        if self.mode != self.ORIGINAL_ALGORITHM and len(word) <= 2:
            # With this line, strings of length 1 or 2 don't go through
            # the stemming process, although no mention is made of this
            # in the published algorithm.
            return stem

        stem = self._step1a(stem)
        stem = self._step1b(stem)
        stem = self._step1c(stem)
        stem = self._step2(stem)
        stem = self._step3(stem)
        stem = self._step4(stem)
        stem = self._step5a(stem)
        stem = self._step5b(stem)

        return stem
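
    # Typical usage (a short illustrative sketch in the default mode);
    # `to_lowercase=False` leaves the input case untouched:
    #
    #     >>> stemmer = PorterStemmer()
    #     >>> [stemmer.stem(w) for w in ["caresses", "flies", "died", "agreed", "plotted"]]
    #     ['caress', 'fli', 'die', 'agre', 'plot']
    #     >>> stemmer.stem("Running", to_lowercase=False)
    #     'Run'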

    def __repr__(self):
        return "<PorterStemmer>"


def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """

    from nltk import stem
    from nltk.corpus import treebank

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.fileids()[:3]:
        for word, tag in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = " ".join(stemmed)
    results = re.sub(r"(.{,70})\s", r"\1\n", results + " ").rstrip()

    # Convert the original to a string, and word wrap it.
    original = " ".join(orig)
    original = re.sub(r"(.{,70})\s", r"\1\n", original + " ").rstrip()

    # Print the results.
    print("-Original-".center(70).replace(" ", "*").replace("-", " "))
    print(original)
    print("-Results-".center(70).replace(" ", "*").replace("-", " "))
    print(results)
    print("*" * 70)