# Natural Language Toolkit: vader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: C.J. Hutto <Clayton.Hutto@gtri.gatech.edu>
#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
#         Pierpaolo Pantone <24alsecondo@gmail.com> (modifications)
#         George Berry <geb97@cornell.edu> (modifications)
#         Malavika Suresh <malavika.suresh0794@gmail.com> (modifications)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
# Modifications to the original VADER code have been made in order to
# integrate it into NLTK. These have involved changes to ensure Python 3
# compatibility, and refactoring to achieve greater modularity.

"""
|
||||
If you use the VADER sentiment analysis tools, please cite:
|
||||
|
||||
Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
|
||||
Sentiment Analysis of Social Media Text. Eighth International Conference on
|
||||
Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
|
||||
"""
|
||||
|
||||
import math
|
||||
import re
|
||||
import string
|
||||
from itertools import product
|
||||
|
||||
import nltk.data
|
||||
from nltk.util import pairwise
|
||||
|
||||
|
||||
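# A minimal usage sketch (hypothetical session; assumes the vader_lexicon
# resource has already been fetched with nltk.download("vader_lexicon")):
#
#     >>> sia = SentimentIntensityAnalyzer()
#     >>> sia.polarity_scores("VADER is smart, handsome, and funny!")
#     {'neg': 0.0, 'neu': 0.254, 'pos': 0.746, 'compound': 0.8316}
#
# Exact numbers depend on the lexicon version; the shape of the result dict
# (neg/neu/pos/compound) is what callers should rely on.
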
class VaderConstants:
    """
    A class to keep the Vader lists and constants.
    """

    ##Constants##
    # (empirically derived mean sentiment intensity rating increase for booster words)
    B_INCR = 0.293
    B_DECR = -0.293

    # (empirically derived mean sentiment intensity rating increase for using
    # ALLCAPs to emphasize a word)
    C_INCR = 0.733

    N_SCALAR = -0.74

    NEGATE = {
        "aint",
        "arent",
        "cannot",
        "cant",
        "couldnt",
        "darent",
        "didnt",
        "doesnt",
        "ain't",
        "aren't",
        "can't",
        "couldn't",
        "daren't",
        "didn't",
        "doesn't",
        "dont",
        "hadnt",
        "hasnt",
        "havent",
        "isnt",
        "mightnt",
        "mustnt",
        "neither",
        "don't",
        "hadn't",
        "hasn't",
        "haven't",
        "isn't",
        "mightn't",
        "mustn't",
        "neednt",
        "needn't",
        "never",
        "none",
        "nope",
        "nor",
        "not",
        "nothing",
        "nowhere",
        "oughtnt",
        "shant",
        "shouldnt",
        "uhuh",
        "wasnt",
        "werent",
        "oughtn't",
        "shan't",
        "shouldn't",
        "uh-uh",
        "wasn't",
        "weren't",
        "without",
        "wont",
        "wouldnt",
        "won't",
        "wouldn't",
        "rarely",
        "seldom",
        "despite",
    }

    # booster/dampener 'intensifiers' or 'degree adverbs'
    # https://en.wiktionary.org/wiki/Category:English_degree_adverbs
    BOOSTER_DICT = {
        "absolutely": B_INCR,
        "amazingly": B_INCR,
        "awfully": B_INCR,
        "completely": B_INCR,
        "considerably": B_INCR,
        "decidedly": B_INCR,
        "deeply": B_INCR,
        "effing": B_INCR,
        "enormously": B_INCR,
        "entirely": B_INCR,
        "especially": B_INCR,
        "exceptionally": B_INCR,
        "extremely": B_INCR,
        "fabulously": B_INCR,
        "flipping": B_INCR,
        "flippin": B_INCR,
        "fricking": B_INCR,
        "frickin": B_INCR,
        "frigging": B_INCR,
        "friggin": B_INCR,
        "fully": B_INCR,
        "fucking": B_INCR,
        "greatly": B_INCR,
        "hella": B_INCR,
        "highly": B_INCR,
        "hugely": B_INCR,
        "incredibly": B_INCR,
        "intensely": B_INCR,
        "majorly": B_INCR,
        "more": B_INCR,
        "most": B_INCR,
        "particularly": B_INCR,
        "purely": B_INCR,
        "quite": B_INCR,
        "really": B_INCR,
        "remarkably": B_INCR,
        "so": B_INCR,
        "substantially": B_INCR,
        "thoroughly": B_INCR,
        "totally": B_INCR,
        "tremendously": B_INCR,
        "uber": B_INCR,
        "unbelievably": B_INCR,
        "unusually": B_INCR,
        "utterly": B_INCR,
        "very": B_INCR,
        "almost": B_DECR,
        "barely": B_DECR,
        "hardly": B_DECR,
        "just enough": B_DECR,
        "kind of": B_DECR,
        "kinda": B_DECR,
        "kindof": B_DECR,
        "kind-of": B_DECR,
        "less": B_DECR,
        "little": B_DECR,
        "marginally": B_DECR,
        "occasionally": B_DECR,
        "partly": B_DECR,
        "scarcely": B_DECR,
        "slightly": B_DECR,
        "somewhat": B_DECR,
        "sort of": B_DECR,
        "sorta": B_DECR,
        "sortof": B_DECR,
        "sort-of": B_DECR,
    }

    # check for special case idioms using a sentiment-laden keyword known to SAGE
    SPECIAL_CASE_IDIOMS = {
        "the shit": 3,
        "the bomb": 3,
        "bad ass": 1.5,
        "yeah right": -2,
        "cut the mustard": 2,
        "kiss of death": -1.5,
        "hand to mouth": -2,
    }

    # for removing punctuation
    REGEX_REMOVE_PUNCTUATION = re.compile(f"[{re.escape(string.punctuation)}]")

    PUNC_LIST = [
        ".",
        "!",
        "?",
        ",",
        ";",
        ":",
        "-",
        "'",
        '"',
        "!!",
        "!!!",
        "??",
        "???",
        "?!?",
        "!?!",
        "?!?!",
        "!?!?",
    ]

    def __init__(self):
        pass

    def negated(self, input_words, include_nt=True):
        """
        Determine if input contains negation words
        """
        neg_words = self.NEGATE
        if any(word.lower() in neg_words for word in input_words):
            return True
        if include_nt:
            if any("n't" in word.lower() for word in input_words):
                return True
        for first, second in pairwise(input_words):
            if second.lower() == "least" and first.lower() != "at":
                return True
        return False

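    # Illustrative calls for negated() above (hypothetical inputs, not part
    # of the original code):
    #     negated(["not", "good"])  -> True  ("not" is in NEGATE)
    #     negated(["dont", "stop"]) -> True  ("dont" is in NEGATE)
    #     negated(["very", "good"]) -> False
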
    def normalize(self, score, alpha=15):
        """
        Normalize the score to be between -1 and 1 using an alpha that
        approximates the max expected value
        """
        norm_score = score / math.sqrt((score * score) + alpha)
        return norm_score

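    # Worked example for normalize() above: a raw score of 4 maps to
    # 4 / sqrt(4 * 4 + 15) = 4 / sqrt(31) ~= 0.718, and the result approaches
    # +/-1 asymptotically as the raw score grows.
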
    def scalar_inc_dec(self, word, valence, is_cap_diff):
        """
        Check if the preceding words increase, decrease, or negate/nullify the
        valence
        """
        scalar = 0.0
        word_lower = word.lower()
        if word_lower in self.BOOSTER_DICT:
            scalar = self.BOOSTER_DICT[word_lower]
            if valence < 0:
                scalar *= -1
            # check if booster/dampener word is in ALLCAPS (while others aren't)
            if word.isupper() and is_cap_diff:
                if valence > 0:
                    scalar += self.C_INCR
                else:
                    scalar -= self.C_INCR
        return scalar
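
    # Illustrative call (hypothetical input): with a positive valence and
    # is_cap_diff=True, scalar_inc_dec("VERY", 1.9, True) returns
    # B_INCR + C_INCR = 0.293 + 0.733 = 1.026.
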
class SentiText:
    """
    Identify sentiment-relevant string-level properties of input text.
    """

    def __init__(self, text, punc_list, regex_remove_punctuation):
        if not isinstance(text, str):
            # coerce bytes (or any other object) to str; calling .encode()
            # on a non-string, as the original code did, fails for bytes input
            text = text.decode("utf-8") if isinstance(text, bytes) else str(text)
        self.text = text
        self.PUNC_LIST = punc_list
        self.REGEX_REMOVE_PUNCTUATION = regex_remove_punctuation
        # doesn't separate words from adjacent punctuation
        # (keeps emoticons & contractions)
        self.words_and_emoticons = self._words_and_emoticons()
        self.is_cap_diff = self.allcap_differential(self.words_and_emoticons)

    def _words_plus_punc(self):
        """
        Returns mapping of form:
        {
            'cat,': 'cat',
            ',cat': 'cat',
        }
        """
        no_punc_text = self.REGEX_REMOVE_PUNCTUATION.sub("", self.text)
        # removes punctuation (but loses emoticons & contractions)
        words_only = no_punc_text.split()
        # remove singletons
        words_only = {w for w in words_only if len(w) > 1}
        # the product gives ('cat', ',') and (',', 'cat')
        punc_before = {"".join(p): p[1] for p in product(self.PUNC_LIST, words_only)}
        punc_after = {"".join(p): p[0] for p in product(words_only, self.PUNC_LIST)}
        words_punc_dict = punc_before
        words_punc_dict.update(punc_after)
        return words_punc_dict

    def _words_and_emoticons(self):
        """
        Removes leading and trailing punctuation
        Leaves contractions and most emoticons
        Does not preserve punc-plus-letter emoticons (e.g. :D)
        """
        wes = self.text.split()
        words_punc_dict = self._words_plus_punc()
        wes = [we for we in wes if len(we) > 1]
        for i, we in enumerate(wes):
            if we in words_punc_dict:
                wes[i] = words_punc_dict[we]
        return wes

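    # Illustrative behaviour (hypothetical input): for the text
    # "VADER is GREAT!!!" this returns ["VADER", "is", "GREAT"], because
    # "GREAT!!!" appears as a key in the _words_plus_punc() mapping and is
    # replaced by the bare word "GREAT".
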
    def allcap_differential(self, words):
        """
        Check whether just some words in the input are ALL CAPS

        :param list words: The words to inspect
        :returns: `True` if some but not all items in `words` are ALL CAPS
        """
        is_different = False
        allcap_words = 0
        for word in words:
            if word.isupper():
                allcap_words += 1
        cap_differential = len(words) - allcap_words
        if 0 < cap_differential < len(words):
            is_different = True
        return is_different
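
    # Illustrative calls (hypothetical inputs):
    #     allcap_differential(["FOO", "bar"]) -> True  (mixed casing)
    #     allcap_differential(["FOO", "BAR"]) -> False (everything is ALL CAPS)
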
class SentimentIntensityAnalyzer:
    """
    Give a sentiment intensity score to sentences.
    """

    def __init__(
        self,
        lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt",
    ):
        self.lexicon_file = nltk.data.load(lexicon_file)
        self.lexicon = self.make_lex_dict()
        self.constants = VaderConstants()

    def make_lex_dict(self):
        """
        Convert lexicon file to a dictionary
        """
        lex_dict = {}
        for line in self.lexicon_file.split("\n"):
            if not line.strip():
                # skip blank lines (e.g. a trailing newline in the resource),
                # which would otherwise break the tuple unpacking below
                continue
            (word, measure) = line.strip().split("\t")[0:2]
            lex_dict[word] = float(measure)
        return lex_dict

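    # The lexicon file parsed above is tab-separated; a line might look like
    # this (hypothetical values, tabs shown as \t):
    #
    #     good\t1.9\t0.9434\t[2, 1, 3, 2, 2, 2, 2, 2, 1, 2]
    #
    # Only the first two fields (token and mean valence) are used here.
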
    def polarity_scores(self, text):
        """
        Return a float for sentiment strength based on the input text.
        Positive values are positive valence, negative values are negative
        valence.

        :note: Hashtags are not taken into consideration (e.g. #BAD is neutral). If you
            are interested in processing the text in the hashtags too, then we recommend
            preprocessing your data to remove the #, after which the hashtag text may be
            matched as if it was a normal word in the sentence.
        """
        sentitext = SentiText(
            text, self.constants.PUNC_LIST, self.constants.REGEX_REMOVE_PUNCTUATION
        )
        sentiments = []
        words_and_emoticons = sentitext.words_and_emoticons
        # use enumerate rather than list.index() so that repeated words get
        # their true positions, not the index of the first occurrence
        for i, item in enumerate(words_and_emoticons):
            valence = 0
            # "kind of" and the booster words act as modifiers, not as
            # sentiment-bearing tokens in their own right
            if (
                i < len(words_and_emoticons) - 1
                and item.lower() == "kind"
                and words_and_emoticons[i + 1].lower() == "of"
            ) or item.lower() in self.constants.BOOSTER_DICT:
                sentiments.append(valence)
                continue

            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)

        sentiments = self._but_check(words_and_emoticons, sentiments)

        return self.score_valence(sentiments, text)

    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
        is_cap_diff = sentitext.is_cap_diff
        words_and_emoticons = sentitext.words_and_emoticons
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            # get the sentiment valence
            valence = self.lexicon[item_lowercase]

            # check if sentiment-laden word is in ALL CAPS (while others aren't)
            if item.isupper() and is_cap_diff:
                if valence > 0:
                    valence += self.constants.C_INCR
                else:
                    valence -= self.constants.C_INCR

            for start_i in range(0, 3):
                if (
                    i > start_i
                    and words_and_emoticons[i - (start_i + 1)].lower()
                    not in self.lexicon
                ):
                    # dampen the scalar modifier of preceding words and emoticons
                    # (excluding the ones that immediately precede the item) based
                    # on their distance from the current item.
                    s = self.constants.scalar_inc_dec(
                        words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff
                    )
                    if start_i == 1 and s != 0:
                        s = s * 0.95
                    if start_i == 2 and s != 0:
                        s = s * 0.9
                    valence = valence + s
                    valence = self._never_check(
                        valence, words_and_emoticons, start_i, i
                    )
                    if start_i == 2:
                        valence = self._idioms_check(valence, words_and_emoticons, i)

                        # future work: consider other sentiment-laden idioms
                        # other_idioms =
                        # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2,
                        #  "upper hand": 1, "break a leg": 2,
                        #  "cooking with gas": 2, "in the black": 2, "in the red": -2,
                        #  "on the ball": 2, "under the weather": -2}

            valence = self._least_check(valence, words_and_emoticons, i)

        sentiments.append(valence)
        return sentiments

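    # Worked example of the distance dampening in sentiment_valence() above
    # (assuming "good" has a lexicon valence of 1.9 and that neither booster
    # is itself in the lexicon): for "really very good", "very" (one word
    # back) contributes B_INCR = 0.293 and "really" (two words back)
    # contributes 0.293 * 0.95 ~= 0.278, giving a valence of roughly
    # 1.9 + 0.293 + 0.278 = 2.471 before further checks.
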
    def _least_check(self, valence, words_and_emoticons, i):
        # check for negation case using "least"
        if (
            i > 1
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            if (
                words_and_emoticons[i - 2].lower() != "at"
                and words_and_emoticons[i - 2].lower() != "very"
            ):
                valence = valence * self.constants.N_SCALAR
        elif (
            i > 0
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            valence = valence * self.constants.N_SCALAR
        return valence

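    # Illustrative behaviour of _least_check() above (hypothetical inputs,
    # assuming "least" is not itself in the lexicon): in "least impressive"
    # the valence of "impressive" is flipped and scaled by N_SCALAR, whereas
    # "at least impressive" and "very least impressive" are left untouched.
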
    def _but_check(self, words_and_emoticons, sentiments):
        words_and_emoticons = [w_e.lower() for w_e in words_and_emoticons]
        but = {"but"} & set(words_and_emoticons)
        if but:
            bi = words_and_emoticons.index(next(iter(but)))
            for sidx, sentiment in enumerate(sentiments):
                if sidx < bi:
                    sentiments[sidx] = sentiment * 0.5
                elif sidx > bi:
                    sentiments[sidx] = sentiment * 1.5
        return sentiments

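    # Worked example: with sentiments [1.0, 0.0, 2.0] and "but" at index 1,
    # scores before "but" are halved and scores after are boosted, giving
    # [0.5, 0.0, 3.0] — sentiment after "but" dominates the sentence.
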
    def _idioms_check(self, valence, words_and_emoticons, i):
        # only called from sentiment_valence() with start_i == 2, so i >= 3
        # and the i - 3 .. i - 1 lookbacks below are always in range
        onezero = f"{words_and_emoticons[i - 1]} {words_and_emoticons[i]}"

        twoonezero = "{} {} {}".format(
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
            words_and_emoticons[i],
        )

        twoone = f"{words_and_emoticons[i - 2]} {words_and_emoticons[i - 1]}"

        threetwoone = "{} {} {}".format(
            words_and_emoticons[i - 3],
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
        )

        threetwo = "{} {}".format(
            words_and_emoticons[i - 3], words_and_emoticons[i - 2]
        )

        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]

        for seq in sequences:
            if seq in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[seq]
                break

        if len(words_and_emoticons) - 1 > i:
            zeroone = f"{words_and_emoticons[i]} {words_and_emoticons[i + 1]}"
            if zeroone in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroone]
        if len(words_and_emoticons) - 1 > i + 1:
            zeroonetwo = "{} {} {}".format(
                words_and_emoticons[i],
                words_and_emoticons[i + 1],
                words_and_emoticons[i + 2],
            )
            if zeroonetwo in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroonetwo]

        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
        if (
            threetwo in self.constants.BOOSTER_DICT
            or twoone in self.constants.BOOSTER_DICT
        ):
            valence = valence + self.constants.B_DECR
        return valence

    def _never_check(self, valence, words_and_emoticons, start_i, i):
        if start_i == 0:
            if self.constants.negated([words_and_emoticons[i - 1]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 1:
            if words_and_emoticons[i - 2] == "never" and (
                words_and_emoticons[i - 1] == "so"
                or words_and_emoticons[i - 1] == "this"
            ):
                valence = valence * 1.5
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 2:
            # note that `and` binds tighter than `or` here: the 1.25 boost
            # fires for "never" + "so"/"this" three and two words back, or for
            # a bare "so"/"this" one word back
            if (
                words_and_emoticons[i - 3] == "never"
                and (
                    words_and_emoticons[i - 2] == "so"
                    or words_and_emoticons[i - 2] == "this"
                )
                or (
                    words_and_emoticons[i - 1] == "so"
                    or words_and_emoticons[i - 1] == "this"
                )
            ):
                valence = valence * 1.25
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        return valence

    def _punctuation_emphasis(self, sum_s, text):
        # add emphasis from exclamation points and question marks
        ep_amplifier = self._amplify_ep(text)
        qm_amplifier = self._amplify_qm(text)
        punct_emph_amplifier = ep_amplifier + qm_amplifier
        return punct_emph_amplifier

    def _amplify_ep(self, text):
        # check for added emphasis resulting from exclamation points (up to 4 of them)
        ep_count = text.count("!")
        if ep_count > 4:
            ep_count = 4
        # (empirically derived mean sentiment intensity rating increase for
        # exclamation points)
        ep_amplifier = ep_count * 0.292
        return ep_amplifier

    def _amplify_qm(self, text):
        # check for added emphasis resulting from question marks (2 or 3+)
        qm_count = text.count("?")
        qm_amplifier = 0
        if qm_count > 1:
            if qm_count <= 3:
                # (empirically derived mean sentiment intensity rating increase for
                # question marks)
                qm_amplifier = qm_count * 0.18
            else:
                qm_amplifier = 0.96
        return qm_amplifier

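    # Worked examples for the two amplifiers above: "Great!!!" contributes
    # 3 * 0.292 = 0.876 via _amplify_ep, while "Really??" contributes
    # 2 * 0.18 = 0.36 via _amplify_qm (capped at 0.96 for 4+ question marks).
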
    def _sift_sentiment_scores(self, sentiments):
        # want separate positive versus negative sentiment scores
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in sentiments:
            if sentiment_score > 0:
                # compensates for neutral words that are counted as 1
                pos_sum += float(sentiment_score) + 1
            if sentiment_score < 0:
                # when used with math.fabs(), compensates for neutrals
                neg_sum += float(sentiment_score) - 1
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count

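    # Illustrative call (hypothetical input): _sift_sentiment_scores([1.5, -0.5, 0])
    # returns (2.5, -1.5, 1) — each non-zero score is pushed one unit away from
    # zero so that neutral words (counted as 1 each) stay comparable in scale.
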
    def score_valence(self, sentiments, text):
        if sentiments:
            sum_s = float(sum(sentiments))
            # compute and add emphasis from punctuation in text
            punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
            if sum_s > 0:
                sum_s += punct_emph_amplifier
            elif sum_s < 0:
                sum_s -= punct_emph_amplifier

            compound = self.constants.normalize(sum_s)
            # discriminate between positive, negative and neutral sentiment scores
            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)

            if pos_sum > math.fabs(neg_sum):
                pos_sum += punct_emph_amplifier
            elif pos_sum < math.fabs(neg_sum):
                neg_sum -= punct_emph_amplifier

            total = pos_sum + math.fabs(neg_sum) + neu_count
            pos = math.fabs(pos_sum / total)
            neg = math.fabs(neg_sum / total)
            neu = math.fabs(neu_count / total)

        else:
            compound = 0.0
            pos = 0.0
            neg = 0.0
            neu = 0.0

        sentiment_dict = {
            "neg": round(neg, 3),
            "neu": round(neu, 3),
            "pos": round(pos, 3),
            "compound": round(compound, 4),
        }

        return sentiment_dict
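
    # Worked example for the compound score above: with sum_s = 2.0 and a
    # single "!" in the text, punct_emph_amplifier = 0.292, so sum_s becomes
    # 2.292 and compound = 2.292 / sqrt(2.292**2 + 15) ~= 0.51.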