updates
This commit is contained in:
717
Backend/venv/lib/python3.12/site-packages/nltk/stem/porter.py
Normal file
@@ -0,0 +1,717 @@
"""
Porter Stemmer

This is the Porter stemming algorithm. It follows the algorithm
presented in

Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137.

with some optional deviations that can be turned on or off with the
`mode` argument to the constructor.

Martin Porter, the algorithm's inventor, maintains a web page about the
algorithm at

    https://www.tartarus.org/~martin/PorterStemmer/

which includes another Python implementation and other implementations
in many languages.
"""

__docformat__ = "plaintext"

import re

from nltk.stem.api import StemmerI


class PorterStemmer(StemmerI):
    """
    A word stemmer based on the Porter stemming algorithm.

        Porter, M. "An algorithm for suffix stripping."
        Program 14.3 (1980): 130-137.

    See https://www.tartarus.org/~martin/PorterStemmer/ for the homepage
    of the algorithm.

    Martin Porter has endorsed several modifications to the Porter
    algorithm since writing his original paper, and those extensions are
    included in the implementations on his website. Additionally, others
    have proposed further improvements to the algorithm, including NLTK
    contributors. There are thus three modes that can be selected by
    passing the appropriate constant to the class constructor's `mode`
    attribute:

    - PorterStemmer.ORIGINAL_ALGORITHM

        An implementation that is faithful to the original paper.

        Note that Martin Porter has deprecated this version of the
        algorithm. Martin distributes implementations of the Porter
        Stemmer in many languages, hosted at:

        https://www.tartarus.org/~martin/PorterStemmer/

        and all of these implementations include his extensions. He
        strongly recommends against using the original, published
        version of the algorithm; only use this mode if you clearly
        understand why you are choosing to do so.

    - PorterStemmer.MARTIN_EXTENSIONS

        An implementation that only uses the modifications to the
        algorithm that are included in the implementations on Martin
        Porter's website. He has declared Porter frozen, so the
        behaviour of those implementations should never change.

    - PorterStemmer.NLTK_EXTENSIONS (default)

        An implementation that includes further improvements devised by
        NLTK contributors or taken from other modified implementations
        found on the web.

    For the best stemming, you should use the default NLTK_EXTENSIONS
    version. However, if you need to get the same results as either the
    original algorithm or one of Martin Porter's hosted versions for
    compatibility with an existing implementation or dataset, you can use
    one of the other modes instead.
    """

    # Modes the Stemmer can be instantiated in
    NLTK_EXTENSIONS = "NLTK_EXTENSIONS"
    MARTIN_EXTENSIONS = "MARTIN_EXTENSIONS"
    ORIGINAL_ALGORITHM = "ORIGINAL_ALGORITHM"

    def __init__(self, mode=NLTK_EXTENSIONS):
        if mode not in (
            self.NLTK_EXTENSIONS,
            self.MARTIN_EXTENSIONS,
            self.ORIGINAL_ALGORITHM,
        ):
            raise ValueError(
                "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, "
                "PorterStemmer.MARTIN_EXTENSIONS, or "
                "PorterStemmer.ORIGINAL_ALGORITHM"
            )

        self.mode = mode

        if self.mode == self.NLTK_EXTENSIONS:
            # This is a table of irregular forms. It is quite short,
            # but still reflects the errors actually drawn to Martin
            # Porter's attention over a 20 year period!
            irregular_forms = {
                "sky": ["sky", "skies"],
                "die": ["dying"],
                "lie": ["lying"],
                "tie": ["tying"],
                "news": ["news"],
                "inning": ["innings", "inning"],
                "outing": ["outings", "outing"],
                "canning": ["cannings", "canning"],
                "howe": ["howe"],
                "proceed": ["proceed"],
                "exceed": ["exceed"],
                "succeed": ["succeed"],
            }

            self.pool = {}
            for key in irregular_forms:
                for val in irregular_forms[key]:
                    self.pool[val] = key

        self.vowels = frozenset(["a", "e", "i", "o", "u"])

    def _is_consonant(self, word, i):
        """Returns True if word[i] is a consonant, False otherwise

        A consonant is defined in the paper as follows:

            A consonant in a word is a letter other than A, E, I, O or
            U, and other than Y preceded by a consonant. (The fact that
            the term `consonant' is defined to some extent in terms of
            itself does not make it ambiguous.) So in TOY the consonants
            are T and Y, and in SYZYGY they are S, Z and G. If a letter
            is not a consonant it is a vowel.
        """
        if word[i] in self.vowels:
            return False
        if word[i] == "y":
            if i == 0:
                return True
            else:
                return not self._is_consonant(word, i - 1)
        return True
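
    # Worked example (illustrative), matching the paper's TOY/SYZYGY cases: the
    # 'y' in "toy" follows a vowel and so counts as a consonant, while every
    # 'y' in "syzygy" follows a consonant and so counts as a vowel:
    #
    #     >>> p = PorterStemmer()
    #     >>> [p._is_consonant("toy", i) for i in range(3)]
    #     [True, False, True]
    #     >>> [p._is_consonant("syzygy", i) for i in range(6)]
    #     [True, False, True, False, True, False]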

    def _measure(self, stem):
        r"""Returns the 'measure' of stem, per definition in the paper

        From the paper:

            A consonant will be denoted by c, a vowel by v. A list
            ccc... of length greater than 0 will be denoted by C, and a
            list vvv... of length greater than 0 will be denoted by V.
            Any word, or part of a word, therefore has one of the four
            forms:

                CVCV ... C
                CVCV ... V
                VCVC ... C
                VCVC ... V

            These may all be represented by the single form

                [C]VCVC ... [V]

            where the square brackets denote arbitrary presence of their
            contents. Using (VC){m} to denote VC repeated m times, this
            may again be written as

                [C](VC){m}[V].

            m will be called the \measure\ of any word or word part when
            represented in this form. The case m = 0 covers the null
            word. Here are some examples:

                m=0    TR, EE, TREE, Y, BY.
                m=1    TROUBLE, OATS, TREES, IVY.
                m=2    TROUBLES, PRIVATE, OATEN, ORRERY.
        """
        cv_sequence = ""

        # Construct a string of 'c's and 'v's representing whether each
        # character in `stem` is a consonant or a vowel.
        # e.g. 'falafel' becomes 'cvcvcvc',
        #      'architecture' becomes 'vcccvcvccvcv'
        for i in range(len(stem)):
            if self._is_consonant(stem, i):
                cv_sequence += "c"
            else:
                cv_sequence += "v"

        # Count the number of 'vc' occurrences, which is equivalent to
        # the number of 'VC' occurrences in Porter's reduced form in the
        # docstring above, which is in turn equivalent to `m`
        return cv_sequence.count("vc")
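
    # Quick check (illustrative) against the m=0/1/2 examples quoted above:
    #
    #     >>> p = PorterStemmer()
    #     >>> [p._measure(s) for s in ("tree", "trouble", "troubles")]
    #     [0, 1, 2]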

    def _has_positive_measure(self, stem):
        return self._measure(stem) > 0

    def _contains_vowel(self, stem):
        """Returns True if stem contains a vowel, else False"""
        for i in range(len(stem)):
            if not self._is_consonant(stem, i):
                return True
        return False

    def _ends_double_consonant(self, word):
        """Implements condition *d from the paper

        Returns True if word ends with a double consonant
        """
        return (
            len(word) >= 2
            and word[-1] == word[-2]
            and self._is_consonant(word, len(word) - 1)
        )

    def _ends_cvc(self, word):
        """Implements condition *o from the paper

        From the paper:

            *o - the stem ends cvc, where the second c is not W, X or Y
                 (e.g. -WIL, -HOP).
        """
        return (
            len(word) >= 3
            and self._is_consonant(word, len(word) - 3)
            and not self._is_consonant(word, len(word) - 2)
            and self._is_consonant(word, len(word) - 1)
            and word[-1] not in ("w", "x", "y")
        ) or (
            self.mode == self.NLTK_EXTENSIONS
            and len(word) == 2
            and not self._is_consonant(word, 0)
            and self._is_consonant(word, 1)
        )

    def _replace_suffix(self, word, suffix, replacement):
        """Replaces `suffix` of `word` with `replacement`"""
        assert word.endswith(suffix), "Given word doesn't end with given suffix"
        if suffix == "":
            return word + replacement
        else:
            return word[: -len(suffix)] + replacement

    def _apply_rule_list(self, word, rules):
        """Applies the first applicable suffix-removal rule to the word

        Takes a word and a list of suffix-removal rules represented as
        3-tuples, with the first element being the suffix to remove,
        the second element being the string to replace it with, and the
        final element being the condition for the rule to be applicable,
        or None if the rule is unconditional.
        """
        for rule in rules:
            suffix, replacement, condition = rule
            if suffix == "*d" and self._ends_double_consonant(word):
                stem = word[:-2]
                if condition is None or condition(stem):
                    return stem + replacement
                else:
                    # Don't try any further rules
                    return word
            if word.endswith(suffix):
                stem = self._replace_suffix(word, suffix, "")
                if condition is None or condition(stem):
                    return stem + replacement
                else:
                    # Don't try any further rules
                    return word

        return word
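
    # Minimal sketch (illustrative) of the rule format: each rule is a
    # (suffix, replacement, condition) tuple, and only the first matching
    # suffix is tried:
    #
    #     >>> p = PorterStemmer()
    #     >>> rules = [("sses", "ss", None), ("s", "", None)]
    #     >>> p._apply_rule_list("caresses", rules), p._apply_rule_list("cats", rules)
    #     ('caress', 'cat')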

    def _step1a(self, word):
        """Implements Step 1a from "An algorithm for suffix stripping"

        From the paper:

            SSES -> SS                         caresses  ->  caress
            IES  -> I                          ponies    ->  poni
                                               ties      ->  ti
            SS   -> SS                         caress    ->  caress
            S    ->                            cats      ->  cat
        """
        # this NLTK-only rule extends the original algorithm, so
        # that 'flies'->'fli' but 'dies'->'die' etc
        if self.mode == self.NLTK_EXTENSIONS:
            if word.endswith("ies") and len(word) == 4:
                return self._replace_suffix(word, "ies", "ie")

        return self._apply_rule_list(
            word,
            [
                ("sses", "ss", None),  # SSES -> SS
                ("ies", "i", None),  # IES -> I
                ("ss", "ss", None),  # SS -> SS
                ("s", "", None),  # S ->
            ],
        )

    def _step1b(self, word):
        """Implements Step 1b from "An algorithm for suffix stripping"

        From the paper:

            (m>0) EED -> EE                    feed      ->  feed
                                               agreed    ->  agree
            (*v*) ED  ->                       plastered ->  plaster
                                               bled      ->  bled
            (*v*) ING ->                       motoring  ->  motor
                                               sing      ->  sing

        If the second or third of the rules in Step 1b is successful,
        the following is done:

            AT -> ATE                       conflat(ed)  ->  conflate
            BL -> BLE                       troubl(ed)   ->  trouble
            IZ -> IZE                       siz(ed)      ->  size
            (*d and not (*L or *S or *Z))
               -> single letter
                                            hopp(ing)    ->  hop
                                            tann(ed)     ->  tan
                                            fall(ing)    ->  fall
                                            hiss(ing)    ->  hiss
                                            fizz(ed)     ->  fizz
            (m=1 and *o) -> E               fail(ing)    ->  fail
                                            fil(ing)     ->  file

        The rule to map to a single letter causes the removal of one of
        the double letter pair. The -E is put back on -AT, -BL and -IZ,
        so that the suffixes -ATE, -BLE and -IZE can be recognised
        later. This E may be removed in step 4.
        """
        # this NLTK-only block extends the original algorithm, so that
        # 'spied'->'spi' but 'died'->'die' etc
        if self.mode == self.NLTK_EXTENSIONS:
            if word.endswith("ied"):
                if len(word) == 4:
                    return self._replace_suffix(word, "ied", "ie")
                else:
                    return self._replace_suffix(word, "ied", "i")

        # (m>0) EED -> EE
        if word.endswith("eed"):
            stem = self._replace_suffix(word, "eed", "")
            if self._measure(stem) > 0:
                return stem + "ee"
            else:
                return word

        rule_2_or_3_succeeded = False

        for suffix in ["ed", "ing"]:
            if word.endswith(suffix):
                intermediate_stem = self._replace_suffix(word, suffix, "")
                if self._contains_vowel(intermediate_stem):
                    rule_2_or_3_succeeded = True
                    break

        if not rule_2_or_3_succeeded:
            return word

        return self._apply_rule_list(
            intermediate_stem,
            [
                ("at", "ate", None),  # AT -> ATE
                ("bl", "ble", None),  # BL -> BLE
                ("iz", "ize", None),  # IZ -> IZE
                # (*d and not (*L or *S or *Z))
                # -> single letter
                (
                    "*d",
                    intermediate_stem[-1],
                    lambda stem: intermediate_stem[-1] not in ("l", "s", "z"),
                ),
                # (m=1 and *o) -> E
                (
                    "",
                    "e",
                    lambda stem: (self._measure(stem) == 1 and self._ends_cvc(stem)),
                ),
            ],
        )
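
    # Spot-check (illustrative) of the Step 1a/1b examples quoted above, using
    # the default NLTK_EXTENSIONS mode:
    #
    #     >>> p = PorterStemmer()
    #     >>> p._step1a("caresses"), p._step1a("ponies"), p._step1a("cats")
    #     ('caress', 'poni', 'cat')
    #     >>> p._step1b("plastered"), p._step1b("motoring"), p._step1b("feed")
    #     ('plaster', 'motor', 'feed')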

    def _step1c(self, word):
        """Implements Step 1c from "An algorithm for suffix stripping"

        From the paper:

        Step 1c

            (*v*) Y -> I                    happy        ->  happi
                                            sky          ->  sky
        """

        def nltk_condition(stem):
            """
            This has been modified from the original Porter algorithm so
            that y->i is only done when y is preceded by a consonant,
            but not if the stem is only a single consonant, i.e.

               (*c and not c) Y -> I

            So 'happy' -> 'happi', but
               'enjoy' -> 'enjoy'  etc

            This is a much better rule. Formerly 'enjoy'->'enjoi' and
            'enjoyment'->'enjoy'. Step 1c is perhaps done too soon; but
            with this modification that no longer really matters.

            Also, the removal of the contains_vowel(z) condition means
            that 'spy', 'fly', 'try' ... stem to 'spi', 'fli', 'tri' and
            conflate with 'spied', 'tried', 'flies' ...
            """
            return len(stem) > 1 and self._is_consonant(stem, len(stem) - 1)

        def original_condition(stem):
            return self._contains_vowel(stem)

        return self._apply_rule_list(
            word,
            [
                (
                    "y",
                    "i",
                    (
                        nltk_condition
                        if self.mode == self.NLTK_EXTENSIONS
                        else original_condition
                    ),
                )
            ],
        )

    def _step2(self, word):
        """Implements Step 2 from "An algorithm for suffix stripping"

        From the paper:

        Step 2

            (m>0) ATIONAL -> ATE       relational     ->  relate
            (m>0) TIONAL  -> TION      conditional    ->  condition
                                       rational       ->  rational
            (m>0) ENCI    -> ENCE      valenci        ->  valence
            (m>0) ANCI    -> ANCE      hesitanci      ->  hesitance
            (m>0) IZER    -> IZE       digitizer      ->  digitize
            (m>0) ABLI    -> ABLE      conformabli    ->  conformable
            (m>0) ALLI    -> AL        radicalli      ->  radical
            (m>0) ENTLI   -> ENT       differentli    ->  different
            (m>0) ELI     -> E         vileli         ->  vile
            (m>0) OUSLI   -> OUS       analogousli    ->  analogous
            (m>0) IZATION -> IZE       vietnamization ->  vietnamize
            (m>0) ATION   -> ATE       predication    ->  predicate
            (m>0) ATOR    -> ATE       operator       ->  operate
            (m>0) ALISM   -> AL        feudalism      ->  feudal
            (m>0) IVENESS -> IVE       decisiveness   ->  decisive
            (m>0) FULNESS -> FUL       hopefulness    ->  hopeful
            (m>0) OUSNESS -> OUS       callousness    ->  callous
            (m>0) ALITI   -> AL        formaliti      ->  formal
            (m>0) IVITI   -> IVE       sensitiviti    ->  sensitive
            (m>0) BILITI  -> BLE       sensibiliti    ->  sensible
        """

        if self.mode == self.NLTK_EXTENSIONS:
            # Instead of applying the ALLI -> AL rule after '(a)bli' per
            # the published algorithm, instead we apply it first, and,
            # if it succeeds, run the result through step2 again.
            if word.endswith("alli") and self._has_positive_measure(
                self._replace_suffix(word, "alli", "")
            ):
                return self._step2(self._replace_suffix(word, "alli", "al"))

        bli_rule = ("bli", "ble", self._has_positive_measure)
        abli_rule = ("abli", "able", self._has_positive_measure)

        rules = [
            ("ational", "ate", self._has_positive_measure),
            ("tional", "tion", self._has_positive_measure),
            ("enci", "ence", self._has_positive_measure),
            ("anci", "ance", self._has_positive_measure),
            ("izer", "ize", self._has_positive_measure),
            abli_rule if self.mode == self.ORIGINAL_ALGORITHM else bli_rule,
            ("alli", "al", self._has_positive_measure),
            ("entli", "ent", self._has_positive_measure),
            ("eli", "e", self._has_positive_measure),
            ("ousli", "ous", self._has_positive_measure),
            ("ization", "ize", self._has_positive_measure),
            ("ation", "ate", self._has_positive_measure),
            ("ator", "ate", self._has_positive_measure),
            ("alism", "al", self._has_positive_measure),
            ("iveness", "ive", self._has_positive_measure),
            ("fulness", "ful", self._has_positive_measure),
            ("ousness", "ous", self._has_positive_measure),
            ("aliti", "al", self._has_positive_measure),
            ("iviti", "ive", self._has_positive_measure),
            ("biliti", "ble", self._has_positive_measure),
        ]

        if self.mode == self.NLTK_EXTENSIONS:
            rules.append(("fulli", "ful", self._has_positive_measure))

            # The 'l' of the 'logi' -> 'log' rule is put with the stem,
            # so that short stems like 'geo' 'theo' etc work like
            # 'archaeo' 'philo' etc.
            rules.append(
                ("logi", "log", lambda stem: self._has_positive_measure(word[:-3]))
            )

        if self.mode == self.MARTIN_EXTENSIONS:
            rules.append(("logi", "log", self._has_positive_measure))

        return self._apply_rule_list(word, rules)
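
    # Spot-check (illustrative) of a few Step 2 mappings from the table above;
    # "rational" is left unchanged because its stem "r" has measure 0:
    #
    #     >>> p = PorterStemmer()
    #     >>> p._step2("relational"), p._step2("conditional"), p._step2("rational")
    #     ('relate', 'condition', 'rational')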

    def _step3(self, word):
        """Implements Step 3 from "An algorithm for suffix stripping"

        From the paper:

        Step 3

            (m>0) ICATE ->  IC              triplicate     ->  triplic
            (m>0) ATIVE ->                  formative      ->  form
            (m>0) ALIZE ->  AL              formalize      ->  formal
            (m>0) ICITI ->  IC              electriciti    ->  electric
            (m>0) ICAL  ->  IC              electrical     ->  electric
            (m>0) FUL   ->                  hopeful        ->  hope
            (m>0) NESS  ->                  goodness       ->  good
        """
        return self._apply_rule_list(
            word,
            [
                ("icate", "ic", self._has_positive_measure),
                ("ative", "", self._has_positive_measure),
                ("alize", "al", self._has_positive_measure),
                ("iciti", "ic", self._has_positive_measure),
                ("ical", "ic", self._has_positive_measure),
                ("ful", "", self._has_positive_measure),
                ("ness", "", self._has_positive_measure),
            ],
        )

    def _step4(self, word):
        """Implements Step 4 from "An algorithm for suffix stripping"

        Step 4

            (m>1) AL    ->                  revival        ->  reviv
            (m>1) ANCE  ->                  allowance      ->  allow
            (m>1) ENCE  ->                  inference      ->  infer
            (m>1) ER    ->                  airliner       ->  airlin
            (m>1) IC    ->                  gyroscopic     ->  gyroscop
            (m>1) ABLE  ->                  adjustable     ->  adjust
            (m>1) IBLE  ->                  defensible     ->  defens
            (m>1) ANT   ->                  irritant       ->  irrit
            (m>1) EMENT ->                  replacement    ->  replac
            (m>1) MENT  ->                  adjustment     ->  adjust
            (m>1) ENT   ->                  dependent      ->  depend
            (m>1 and (*S or *T)) ION ->     adoption       ->  adopt
            (m>1) OU    ->                  homologou      ->  homolog
            (m>1) ISM   ->                  communism      ->  commun
            (m>1) ATE   ->                  activate       ->  activ
            (m>1) ITI   ->                  angulariti     ->  angular
            (m>1) OUS   ->                  homologous     ->  homolog
            (m>1) IVE   ->                  effective      ->  effect
            (m>1) IZE   ->                  bowdlerize     ->  bowdler

        The suffixes are now removed. All that remains is a little
        tidying up.
        """
        measure_gt_1 = lambda stem: self._measure(stem) > 1

        return self._apply_rule_list(
            word,
            [
                ("al", "", measure_gt_1),
                ("ance", "", measure_gt_1),
                ("ence", "", measure_gt_1),
                ("er", "", measure_gt_1),
                ("ic", "", measure_gt_1),
                ("able", "", measure_gt_1),
                ("ible", "", measure_gt_1),
                ("ant", "", measure_gt_1),
                ("ement", "", measure_gt_1),
                ("ment", "", measure_gt_1),
                ("ent", "", measure_gt_1),
                # (m>1 and (*S or *T)) ION ->
                (
                    "ion",
                    "",
                    lambda stem: self._measure(stem) > 1 and stem[-1] in ("s", "t"),
                ),
                ("ou", "", measure_gt_1),
                ("ism", "", measure_gt_1),
                ("ate", "", measure_gt_1),
                ("iti", "", measure_gt_1),
                ("ous", "", measure_gt_1),
                ("ive", "", measure_gt_1),
                ("ize", "", measure_gt_1),
            ],
        )

    def _step5a(self, word):
        """Implements Step 5a from "An algorithm for suffix stripping"

        From the paper:

        Step 5a

            (m>1) E     ->                  probate        ->  probat
                                            rate           ->  rate
            (m=1 and not *o) E ->           cease          ->  ceas
        """
        # Note that Martin's test vocabulary and reference
        # implementations are inconsistent in how they handle the case
        # where two rules both refer to a suffix that matches the word
        # to be stemmed, but only the condition of the second one is
        # true.
        # Earlier in step 1b we had the rules:
        #     (m>0) EED -> EE
        #     (*v*) ED  ->
        # but the examples in the paper included "feed"->"feed", even
        # though (*v*) is true for "fe" and therefore the second rule
        # alone would map "feed"->"fe".
        # However, in THIS case, we need to handle the consecutive rules
        # differently and try both conditions (obviously; the second
        # rule here would be redundant otherwise). Martin's paper makes
        # no explicit mention of the inconsistency; you have to infer it
        # from the examples.
        # For this reason, we can't use _apply_rule_list here.
        if word.endswith("e"):
            stem = self._replace_suffix(word, "e", "")
            if self._measure(stem) > 1:
                return stem
            if self._measure(stem) == 1 and not self._ends_cvc(stem):
                return stem
        return word

    def _step5b(self, word):
        """Implements Step 5b from "An algorithm for suffix stripping"

        From the paper:

        Step 5b

            (m > 1 and *d and *L) -> single letter
                                            controll       ->  control
                                            roll           ->  roll
        """
        return self._apply_rule_list(
            word, [("ll", "l", lambda stem: self._measure(word[:-1]) > 1)]
        )

    def stem(self, word, to_lowercase=True):
        """
        :param to_lowercase: if `to_lowercase=True`, the word is converted
            to lowercase before stemming
        """
        stem = word.lower() if to_lowercase else word

        if self.mode == self.NLTK_EXTENSIONS and word in self.pool:
            return self.pool[stem]

        if self.mode != self.ORIGINAL_ALGORITHM and len(word) <= 2:
            # With this line, strings of length 1 or 2 don't go through
            # the stemming process, although no mention is made of this
            # in the published algorithm.
            return stem

        stem = self._step1a(stem)
        stem = self._step1b(stem)
        stem = self._step1c(stem)
        stem = self._step2(stem)
        stem = self._step3(stem)
        stem = self._step4(stem)
        stem = self._step5a(stem)
        stem = self._step5b(stem)

        return stem
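
    # Typical usage (a short illustrative sketch in the default mode);
    # `to_lowercase=False` leaves the input case untouched:
    #
    #     >>> stemmer = PorterStemmer()
    #     >>> [stemmer.stem(w) for w in ["caresses", "flies", "died", "agreed", "plotted"]]
    #     ['caress', 'fli', 'die', 'agre', 'plot']
    #     >>> stemmer.stem("Running", to_lowercase=False)
    #     'Run'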

    def __repr__(self):
        return "<PorterStemmer>"


def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """

    from nltk import stem
    from nltk.corpus import treebank

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.fileids()[:3]:
        for word, tag in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = " ".join(stemmed)
    results = re.sub(r"(.{,70})\s", r"\1\n", results + " ").rstrip()

    # Convert the original to a string, and word wrap it.
    original = " ".join(orig)
    original = re.sub(r"(.{,70})\s", r"\1\n", original + " ").rstrip()

    # Print the results.
    print("-Original-".center(70).replace(" ", "*").replace("-", " "))
    print(original)
    print("-Results-".center(70).replace(" ", "*").replace("-", " "))
    print(results)
    print("*" * 70)