updates

2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions
--- a/Backend/venv/lib/python3.12/site-packages/nltk/sentiment/init.py
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/sentiment/init.py
@@ -0,0 +1,13 @@
+# Natural Language Toolkit: Sentiment Analysis
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+NLTK Sentiment Analysis Package
+
+"""
+from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer
+from nltk.sentiment.vader import SentimentIntensityAnalyzer
--- a/Backend/venv/lib/python3.12/site-packages/nltk/sentiment/pycache/init.cpython-312.pyc
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/sentiment/pycache/init.cpython-312.pyc
--- a/Backend/venv/lib/python3.12/site-packages/nltk/sentiment/pycache/sentiment_analyzer.cpython-312.pyc
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/sentiment/pycache/sentiment_analyzer.cpython-312.pyc
--- a/Backend/venv/lib/python3.12/site-packages/nltk/sentiment/pycache/util.cpython-312.pyc
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/sentiment/pycache/util.cpython-312.pyc
--- a/Backend/venv/lib/python3.12/site-packages/nltk/sentiment/pycache/vader.cpython-312.pyc
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/sentiment/pycache/vader.cpython-312.pyc
--- a/Backend/venv/lib/python3.12/site-packages/nltk/sentiment/sentiment_analyzer.py
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/sentiment/sentiment_analyzer.py
@@ -0,0 +1,255 @@
+#
+# Natural Language Toolkit: Sentiment Analyzer
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A SentimentAnalyzer is a tool to implement and facilitate Sentiment Analysis tasks
+using NLTK features and classifiers, especially for teaching and demonstrative
+purposes.
+"""
+
+import sys
+from collections import defaultdict
+
+from nltk.classify.util import accuracy as eval_accuracy
+from nltk.classify.util import apply_features
+from nltk.collocations import BigramCollocationFinder
+from nltk.metrics import BigramAssocMeasures
+from nltk.metrics import f_measure as eval_f_measure
+from nltk.metrics import precision as eval_precision
+from nltk.metrics import recall as eval_recall
+from nltk.probability import FreqDist
+
+
+class SentimentAnalyzer:
+    """
+    A Sentiment Analysis tool based on machine learning approaches.
+
+    The analyzer stores a registry of feature extractor functions (together
+    with the keyword arguments each one must be called with) and, optionally,
+    a classifier used by `classify` and `evaluate`.
+    """
+
+    def __init__(self, classifier=None):
+        # Maps each extractor function to the list of keyword-argument sets it
+        # has been registered with (one extractor call per set).
+        self.feat_extractors = defaultdict(list)
+        self.classifier = classifier
+
+    def all_words(self, documents, labeled=None):
+        """
+        Return all words/tokens from the documents (with duplicates).
+
+        :param documents: a list of (words, label) tuples.
+        :param labeled: if `True`, assume that each document is represented by a
+            (words, label) tuple: (list(str), str). If `False`, each document is
+            considered as being a simple list of strings: list(str). If `None`
+            (default), the layout is guessed from the type of the first
+            document; an empty collection is treated as unlabeled.
+        :rtype: list(str)
+        :return: A list of all words/tokens in `documents`.
+        """
+        if labeled is None:
+            labeled = bool(documents) and isinstance(documents[0], tuple)
+        all_words = []
+        if labeled:
+            for words, _sentiment in documents:
+                all_words.extend(words)
+        else:
+            for words in documents:
+                all_words.extend(words)
+        return all_words
+
+    def apply_features(self, documents, labeled=None):
+        """
+        Apply all feature extractor functions to the documents. This is a wrapper
+        around `nltk.classify.util.apply_features`.
+
+        If `labeled=False`, return featuresets as:
+            [feature_func(doc) for doc in documents]
+        If `labeled=True`, return featuresets as:
+            [(feature_func(tok), label) for (tok, label) in toks]
+
+        :param documents: a list of documents. `If labeled=True`, the method expects
+            a list of (words, label) tuples.
+        :param labeled: forwarded unchanged to `nltk.classify.util.apply_features`.
+        :rtype: LazyMap
+        """
+        return apply_features(self.extract_features, documents, labeled)
+
+    def unigram_word_feats(self, words, top_n=None, min_freq=0):
+        """
+        Return most common top_n word features.
+
+        :param words: a list of words/tokens.
+        :param top_n: number of best words/tokens to use, sorted by frequency.
+        :param min_freq: only words occurring strictly more than `min_freq`
+            times are returned.
+        :rtype: list(str)
+        :return: A list of `top_n` words/tokens (with no duplicates) sorted by
+            frequency.
+        """
+        # Stopwords are not removed
+        unigram_feats_freqs = FreqDist(word for word in words)
+        # `most_common` already yields (word, count) pairs, so the count is
+        # used directly instead of being looked up a second time.
+        return [
+            word
+            for word, freq in unigram_feats_freqs.most_common(top_n)
+            if freq > min_freq
+        ]
+
+    def bigram_collocation_feats(
+        self, documents, top_n=None, min_freq=3, assoc_measure=BigramAssocMeasures.pmi
+    ):
+        """
+        Return `top_n` bigram features (using `assoc_measure`).
+        Note that this method is based on bigram collocations measures, and not
+        on simple bigram frequency.
+
+        :param documents: a list (or iterable) of tokens.
+        :param top_n: number of best words/tokens to use, sorted by association
+            measure.
+        :param assoc_measure: bigram association measure to use as score function.
+        :param min_freq: the minimum number of occurrences of bigrams to take
+            into consideration.
+
+        :return: `top_n` ngrams scored by the given association measure.
+        """
+        finder = BigramCollocationFinder.from_documents(documents)
+        finder.apply_freq_filter(min_freq)
+        return finder.nbest(assoc_measure, top_n)
+
+    def classify(self, instance):
+        """
+        Classify a single instance applying the features that have already been
+        stored in the SentimentAnalyzer.
+
+        :param instance: a list (or iterable) of tokens.
+        :return: the classification result given by applying the classifier.
+        """
+        instance_feats = self.apply_features([instance], labeled=False)
+        return self.classifier.classify(instance_feats[0])
+
+    def add_feat_extractor(self, function, **kwargs):
+        """
+        Add a new function to extract features from a document. This function will
+        be used in extract_features().
+        Important: in this step our kwargs are only representing additional parameters,
+        and NOT the document we have to parse. The document will always be the first
+        parameter in the parameter list, and it will be added in the extract_features()
+        function.
+
+        :param function: the extractor function to add to the list of feature extractors.
+        :param kwargs: additional parameters required by the `function` function.
+        """
+        self.feat_extractors[function].append(kwargs)
+
+    def extract_features(self, document):
+        """
+        Apply extractor functions (and their parameters) to the present document.
+        We pass `document` as the first parameter of the extractor functions.
+        If we want to use the same extractor function multiple times, we have to
+        add it to the extractors with `add_feat_extractor` using multiple sets of
+        parameters (one for each call of the extractor function).
+
+        :param document: the document that will be passed as argument to the
+            feature extractor functions.
+        :return: A dictionary of populated features extracted from the document.
+            When two calls produce the same feature name, the later call wins.
+        :rtype: dict
+        """
+        all_features = {}
+        for extractor, param_sets in self.feat_extractors.items():
+            for param_set in param_sets:
+                # Merge after every call: merging only once per extractor
+                # would silently drop all but the last parameter set.
+                all_features.update(extractor(document, **param_set))
+        return all_features
+
+    def train(self, trainer, training_set, save_classifier=None, **kwargs):
+        """
+        Train classifier on the training set, optionally saving the output in the
+        file specified by `save_classifier`.
+        Additional arguments depend on the specific trainer used. For example,
+        a MaxentClassifier can use `max_iter` parameter to specify the number
+        of iterations, while a NaiveBayesClassifier cannot.
+
+        :param trainer: `train` method of a classifier.
+            E.g.: NaiveBayesClassifier.train
+        :param training_set: the training set to be passed as argument to the
+            classifier `train` method.
+        :param save_classifier: the filename of the file where the classifier
+            will be stored (optional).
+        :param kwargs: additional parameters that will be passed as arguments to
+            the classifier `train` function.
+        :return: A classifier instance trained on the training set. It is also
+            stored in `self.classifier`.
+        :rtype: whatever `trainer` returns.
+        """
+        print("Training classifier")
+        self.classifier = trainer(training_set, **kwargs)
+        if save_classifier:
+            self.save_file(self.classifier, save_classifier)
+
+        return self.classifier
+
+    def save_file(self, content, filename):
+        """
+        Store `content` in `filename`. Can be used to store a SentimentAnalyzer.
+
+        :param content: the object to pickle.
+        :param filename: path of the file to (over)write in binary mode.
+        """
+        import pickle
+
+        print("Saving", filename, file=sys.stderr)
+        with open(filename, "wb") as storage_file:
+            # The protocol=2 parameter is for python2 compatibility
+            pickle.dump(content, storage_file, protocol=2)
+
+    def evaluate(
+        self,
+        test_set,
+        classifier=None,
+        accuracy=True,
+        f_measure=True,
+        precision=True,
+        recall=True,
+        verbose=False,
+    ):
+        """
+        Evaluate and print classifier performance on the test set.
+
+        :param test_set: A list of (tokens, label) tuples to use as gold set.
+        :param classifier: a classifier instance (previously trained). Defaults
+            to `self.classifier`.
+        :param accuracy: if `True`, evaluate classifier accuracy.
+        :param f_measure: if `True`, evaluate classifier f_measure.
+        :param precision: if `True`, evaluate classifier precision.
+        :param recall: if `True`, evaluate classifier recall.
+        :param verbose: if `True`, print every metric (sorted by name).
+        :return: evaluation results.
+        :rtype: dict(str): float
+        """
+        if classifier is None:
+            classifier = self.classifier
+        print(f"Evaluating {type(classifier).__name__} results...")
+        metrics_results = {}
+        if accuracy:
+            metrics_results["Accuracy"] = eval_accuracy(classifier, test_set)
+
+        # For every label, collect the indices of the instances that carry it
+        # in the gold set and of those the classifier assigned it to.
+        gold_results = defaultdict(set)
+        test_results = defaultdict(set)
+        labels = set()
+        for i, (feats, label) in enumerate(test_set):
+            labels.add(label)
+            gold_results[label].add(i)
+            observed = classifier.classify(feats)
+            test_results[observed].add(i)
+
+        for label in labels:
+            reference = gold_results[label]
+            predicted = test_results[label]
+            if precision:
+                metrics_results[f"Precision [{label}]"] = eval_precision(
+                    reference, predicted
+                )
+            if recall:
+                metrics_results[f"Recall [{label}]"] = eval_recall(
+                    reference, predicted
+                )
+            if f_measure:
+                metrics_results[f"F-measure [{label}]"] = eval_f_measure(
+                    reference, predicted
+                )
+
+        # Print evaluation results (in alphabetical order)
+        if verbose:
+            for result in sorted(metrics_results):
+                print(f"{result}: {metrics_results[result]}")
+
+        return metrics_results
--- a/Backend/venv/lib/python3.12/site-packages/nltk/sentiment/util.py
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/sentiment/util.py
@@ -0,0 +1,887 @@
+#
+# Natural Language Toolkit: Sentiment Analyzer
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Utility methods for Sentiment Analysis.
+"""
+
+import codecs
+import csv
+import json
+import random
+import re
+import sys
+import time
+from copy import deepcopy
+
+import nltk
+from nltk.corpus import CategorizedPlaintextCorpusReader
+from nltk.data import load
+from nltk.tokenize import PunktTokenizer
+from nltk.tokenize.casual import EMOTICON_RE
+
+# ////////////////////////////////////////////////////////////
+# { Regular expressions
+# ////////////////////////////////////////////////////////////
+
+# Regular expression for negation by Christopher Potts
+# It matches either a token that is exactly one of the listed negation words
+# (the alternation is anchored with ^ and $), or any token that contains the
+# contraction "n't". No re.IGNORECASE flag is used, so matching is
+# case-sensitive.
+NEGATION = r"""
+    (?:
+        ^(?:never|no|nothing|nowhere|noone|none|not|
+            havent|hasnt|hadnt|cant|couldnt|shouldnt|
+            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
+        )$
+    )
+    |
+    n't"""
+
+# re.VERBOSE makes the whitespace and line breaks inside NEGATION insignificant.
+NEGATION_RE = re.compile(NEGATION, re.VERBOSE)
+
+# A token made of exactly one clause-level punctuation mark: . : ; ! ?
+CLAUSE_PUNCT = r"^[.:;!?]$"
+CLAUSE_PUNCT_RE = re.compile(CLAUSE_PUNCT)
+
+# Happy and sad emoticons
+# json2csv_preprocess intersects the emoticons found in a tweet with these two
+# sets to detect (and optionally skip) tweets that contain both kinds.
+
+# Emoticons treated as expressing a positive sentiment.
+HAPPY = {
+    ":-)",
+    ":)",
+    ";)",
+    ":o)",
+    ":]",
+    ":3",
+    ":c)",
+    ":>",
+    "=]",
+    "8)",
+    "=)",
+    ":}",
+    ":^)",
+    ":-D",
+    ":D",
+    "8-D",
+    "8D",
+    "x-D",
+    "xD",
+    "X-D",
+    "XD",
+    "=-D",
+    "=D",
+    "=-3",
+    "=3",
+    ":-))",
+    ":'-)",
+    ":')",
+    ":*",
+    ":^*",
+    ">:P",
+    ":-P",
+    ":P",
+    "X-P",
+    "x-p",
+    "xp",
+    "XP",
+    ":-p",
+    ":p",
+    "=p",
+    ":-b",
+    ":b",
+    ">:)",
+    ">;)",
+    ">:-)",
+    "<3",
+}
+
+# Emoticons treated as expressing a negative sentiment.
+SAD = {
+    ":L",
+    ":-/",
+    ">:/",
+    ":S",
+    ">:[",
+    ":@",
+    ":-(",
+    ":[",
+    ":-||",
+    "=L",
+    ":<",
+    ":-[",
+    ":-<",
+    "=\\",
+    "=/",
+    ">:(",
+    ":(",
+    ">.<",
+    ":'-(",
+    ":'(",
+    ":\\",
+    ":-c",
+    ":c",
+    ":{",
+    ">:\\",
+    ";(",
+}
+
+
+def timer(method):
+    """
+    A timer decorator to measure execution performance of methods.
+
+    The wrapped callable is invoked unchanged; after it returns, its
+    wall-clock duration is printed to stdout. Runs that round to less than
+    ten seconds are reported in seconds with millisecond precision, longer
+    runs as hours, minutes and seconds.
+
+    :param method: the callable to time.
+    :return: a wrapper with the same call signature and return value as
+        `method`.
+    """
+    import functools
+
+    @functools.wraps(method)
+    def timed(*args, **kw):
+        start = time.time()
+        result = method(*args, **kw)
+        tot_time = time.time() - start
+        # Floor division on a float yields a float (e.g. 0.0); convert so the
+        # report reads "0h 1m 5s" rather than "0.0h 1.0m 5s".
+        hours = int(tot_time // 3600)
+        mins = int(tot_time // 60 % 60)
+        secs = int(round(tot_time % 60))
+        if hours == 0 and mins == 0 and secs < 10:
+            # Format the elapsed time, not the method name: applying ".3f" to
+            # a string raises ValueError.
+            print(f"[TIMER] {method.__name__}(): {tot_time:.3f} seconds")
+        else:
+            print(f"[TIMER] {method.__name__}(): {hours}h {mins}m {secs}s")
+        return result
+
+    return timed
+
+
+# ////////////////////////////////////////////////////////////
+# { Feature extractor functions
+# ////////////////////////////////////////////////////////////
+"""
+Feature extractor functions are declared outside the SentimentAnalyzer class.
+Users should have the possibility to create their own feature extractors
+without modifying SentimentAnalyzer.
+"""
+
+
+def extract_unigram_feats(document, unigrams, handle_negation=False):
+    """
+    Populate a dictionary of unigram features, reflecting the presence/absence in
+    the document of each of the tokens in `unigrams`.
+
+    :param document: a list of words/tokens.
+    :param unigrams: a list of words/tokens whose presence/absence has to be
+        checked in `document`.
+    :param handle_negation: if `handle_negation == True` apply `mark_negation`
+        method to `document` before checking for unigram presence/absence.
+    :return: a dictionary of unigram features {unigram : boolean}.
+
+    >>> words = ['ice', 'police', 'riot']
+    >>> document = 'ice is melting due to global warming'.split()
+    >>> sorted(extract_unigram_feats(document, words).items())
+    [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
+    """
+    if handle_negation:
+        document = mark_negation(document)
+    # Build the lookup set once rather than once per unigram.
+    tokens_present = set(document)
+    return {f"contains({word})": word in tokens_present for word in unigrams}
+
+
+def extract_bigram_feats(document, bigrams):
+    """
+    Populate a dictionary of bigram features, reflecting the presence/absence in
+    the document of each of the tokens in `bigrams`. This extractor function only
+    considers contiguous bigrams obtained by `nltk.bigrams`.
+
+    :param document: a list of words/tokens.
+    :param bigrams: a list of bigrams (two-item sequences, normally tuples)
+        whose presence/absence has to be checked in `document`.
+    :return: a dictionary of bigram features {bigram : boolean}.
+
+    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
+    >>> document = 'ice is melting due to global warming'.split()
+    >>> sorted(extract_bigram_feats(document, bigrams).items()) # doctest: +NORMALIZE_WHITESPACE
+    [('contains(global - warming)', True), ('contains(love - you)', False),
+    ('contains(police - prevented)', False)]
+    """
+    # Compute the document's bigrams once instead of regenerating and scanning
+    # them for every candidate bigram.
+    document_bigrams = set(nltk.bigrams(document))
+    features = {}
+    for bigr in bigrams:
+        # tuple() lets the membership test work for any two-item sequence.
+        features[f"contains({bigr[0]} - {bigr[1]})"] = (
+            tuple(bigr) in document_bigrams
+        )
+    return features
+
+
+# ////////////////////////////////////////////////////////////
+# { Helper Functions
+# ////////////////////////////////////////////////////////////
+
+
+def mark_negation(document, double_neg_flip=False, shallow=False):
+    """
+    Tag with a _NEG suffix every word lying between a negation and the next
+    clause-level punctuation mark.
+
+    :param document: a list of words/tokens, or a tuple (words, label).
+    :param double_neg_flip: if True, double negation is considered affirmation
+        (the negation scope is toggled every time a negation is found).
+    :param shallow: if True, the original document is modified in place;
+        otherwise a deep copy is modified and the original is left untouched.
+    :return: the modified document (the original object when
+        `shallow == True`, a copy otherwise).
+
+    >>> sent = "I didn't like this movie . It was bad .".split()
+    >>> mark_negation(sent)
+    ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.']
+    """
+    if not shallow:
+        document = deepcopy(document)
+    # A labeled document carries its tokens as first item; the label is ignored.
+    is_labeled = document and isinstance(document[0], (tuple, list))
+    tokens = document[0] if is_labeled else document
+
+    in_scope = False
+    for position, token in enumerate(tokens):
+        if NEGATION_RE.search(token):
+            if in_scope and not double_neg_flip:
+                # A further negation inside an open scope is marked like any
+                # other word.
+                tokens[position] += "_NEG"
+            else:
+                # Open the scope, or close it when double negation flips it.
+                in_scope = not in_scope
+        elif in_scope:
+            if CLAUSE_PUNCT_RE.search(token):
+                in_scope = False
+            else:
+                tokens[position] += "_NEG"
+
+    return document
+
+
+def output_markdown(filename, **kwargs):
+    """
+    Append the results of an analysis to a file as a markdown fragment.
+
+    The fragment starts with a separator and the current date/time, followed
+    by one bullet per keyword argument (sorted by name). Dict values are
+    rendered as a nested list of sorted "key: value" entries, list values as a
+    nested list of their items, and anything else inline.
+
+    :param filename: the file to append to.
+    :param kwargs: the named values to report.
+    """
+    with codecs.open(filename, "at") as outfile:
+        pieces = ["\n*** \n\n"]
+        pieces.append("{} \n\n".format(time.strftime("%d/%m/%Y, %H:%M")))
+        for name in sorted(kwargs):
+            value = kwargs[name]
+            if isinstance(value, dict):
+                pieces.append(f"  - **{name}:**\n")
+                for entry in sorted(value):
+                    pieces.append(f"    - {entry}: {value[entry]} \n")
+            elif isinstance(value, list):
+                pieces.append(f"  - **{name}:**\n")
+                for entry in value:
+                    pieces.append(f"    - {entry}\n")
+            else:
+                pieces.append(f"  - **{name}:** {value} \n")
+        outfile.write("".join(pieces))
+
+
+def split_train_test(all_instances, n=None):
+    """
+    Shuffle the dataset and split `n` of its instances into train and test sets.
+
+    The global random generator is seeded with a fixed value before shuffling,
+    and `all_instances` is shuffled in place.
+
+    :param all_instances: a list of instances (e.g. documents) that will be split.
+    :param n: the number of instances to consider (in case we want to use only a
+        subset). A falsy value, or a value larger than the dataset, selects all
+        instances.
+    :return: two lists of instances. Train set is 8/10 of the total and test set
+        is 2/10 of the total.
+    """
+    random.seed(12345)
+    random.shuffle(all_instances)
+    total = len(all_instances)
+    if not n or n > total:
+        n = total
+    cutoff = int(0.8 * n)
+    return all_instances[:cutoff], all_instances[cutoff:n]
+
+
+def _show_plot(x_values, y_values, x_labels=None, y_labels=None):
+    """
+    Display `y_values` against `x_values` as red circle markers.
+
+    :param x_values: the x coordinates of the points.
+    :param y_values: the y coordinates of the points; the y axis is fixed to
+        the range [-1.2, 1.2].
+    :param x_labels: optional tick labels for `x_values` (drawn vertically).
+    :param y_labels: optional tick labels for the y positions -1, 0 and 1.
+    :raises ImportError: if matplotlib is not installed.
+    """
+    try:
+        import matplotlib.pyplot as plt
+    except ImportError as e:
+        raise ImportError(
+            "The plot function requires matplotlib to be installed. "
+            "See https://matplotlib.org/"
+        ) from e
+
+    plt.locator_params(axis="y", nbins=3)
+    axes = plt.axes()
+    axes.yaxis.grid()
+    # "ro" already means red circles; also passing color="red" is redundant
+    # and makes matplotlib emit a warning.
+    plt.plot(x_values, y_values, "ro")
+    # Positional limits work on every matplotlib version; the ymin/ymax
+    # keywords are no longer accepted by current releases.
+    plt.ylim(-1.2, 1.2)
+    plt.tight_layout(pad=5)
+    if x_labels:
+        plt.xticks(x_values, x_labels, rotation="vertical")
+    if y_labels:
+        plt.yticks([-1, 0, 1], y_labels, rotation="horizontal")
+    # Pad margins so that markers are not clipped by the axes
+    plt.margins(0.2)
+    plt.show()
+
+
+# ////////////////////////////////////////////////////////////
+# { Parsing and conversion functions
+# ////////////////////////////////////////////////////////////
+
+
+def json2csv_preprocess(
+    json_file,
+    outfile,
+    fields,
+    encoding="utf8",
+    errors="replace",
+    gzip_compress=False,
+    skip_retweets=True,
+    skip_tongue_tweets=True,
+    skip_ambiguous_tweets=True,
+    strip_off_emoticons=True,
+    remove_duplicates=True,
+    limit=None,
+):
+    """
+    Convert json file to csv file, preprocessing each row to obtain a suitable
+    dataset for tweets Sentiment Analysis.
+
+    The input is read line by line, each line being parsed as one JSON
+    document. The filtering options below are applied to the "text" field
+    and are skipped altogether when "text" is not one of `fields`.
+
+    :param json_file: the original json file containing tweets.
+    :param outfile: the output csv filename.
+    :param fields: a list of fields that will be extracted from the json file and
+        kept in the output csv file.
+    :param encoding: the encoding of the files.
+    :param errors: the error handling strategy for the output writer.
+    :param gzip_compress: if True, create a compressed GZIP file.
+
+    :param skip_retweets: if True, remove retweets.
+    :param skip_tongue_tweets: if True, remove tweets containing ":P" and ":-P"
+        emoticons.
+    :param skip_ambiguous_tweets: if True, remove tweets containing both happy
+        and sad emoticons.
+    :param strip_off_emoticons: if True, strip off emoticons from all tweets.
+    :param remove_duplicates: if True, remove tweets appearing more than once.
+    :param limit: an integer to set the number of tweets to convert. After the
+        limit is reached the conversion will stop. It can be useful to create
+        subsets of the original tweets json data.
+    """
+    with codecs.open(json_file, encoding=encoding) as fp:
+        (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
+        # Close the output even if a row fails to parse or to be written.
+        try:
+            # write the list of fields as header
+            writer.writerow(fields)
+
+            # Position of the tweet text in each row, or None when the text is
+            # not exported (in which case no filter can be applied).
+            text_idx = fields.index("text") if "text" in fields else None
+            # Texts already written; a set keeps the duplicate check O(1).
+            seen_texts = set()
+            written = 0
+            for line in fp:
+                tweet = json.loads(line)
+                row = extract_fields(tweet, fields)
+                if text_idx is not None:
+                    text = row[text_idx]
+                    # Remove retweets
+                    if skip_retweets and re.search(r"\bRT\b", text):
+                        continue
+                    # Remove tweets containing ":P" and ":-P" emoticons
+                    if skip_tongue_tweets and re.search(r"\:\-?P\b", text):
+                        continue
+                    # Remove tweets containing both happy and sad emoticons
+                    if skip_ambiguous_tweets:
+                        found = set(EMOTICON_RE.findall(text))
+                        if (found & HAPPY) and (found & SAD):
+                            continue
+                    # Strip off emoticons from all tweets
+                    if strip_off_emoticons:
+                        row[text_idx] = re.sub(
+                            r"(?!\n)\s+", " ", EMOTICON_RE.sub("", text)
+                        )
+                    # Remove duplicate tweets (compared after emoticon stripping)
+                    if remove_duplicates:
+                        if row[text_idx] in seen_texts:
+                            continue
+                        seen_texts.add(row[text_idx])
+                writer.writerow(row)
+                written += 1
+                if limit and written >= limit:
+                    break
+        finally:
+            outf.close()
+
+
+def parse_tweets_set(
+    filename, label, word_tokenizer=None, sent_tokenizer=None, skip_header=True
+):
+    """
+    Parse csv file containing tweets and output data a list of (text, label) tuples.
+
+    Every row of the csv file must have exactly two columns (tweet id, text).
+    A progress counter is written to stdout while the file is read.
+
+    :param filename: the input csv filename.
+    :param label: the label to be appended to each tweet contained in the csv file.
+    :param word_tokenizer: the tokenizer instance that will be used to tokenize
+        each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()).
+        If no word_tokenizer is specified, tweets will not be tokenized.
+    :param sent_tokenizer: the tokenizer that will be used to split each tweet into
+        sentences. Only used together with `word_tokenizer`; defaults to a
+        PunktTokenizer.
+    :param skip_header: if True, skip the first line of the csv file (which usually
+        contains headers).
+
+    :return: a list of (text, label) tuples.
+    :raises ValueError: if a row does not have exactly two columns.
+    """
+    tweets = []
+    # The sentence tokenizer is only needed when tweets are tokenized, so do
+    # not build the default one when it would go unused.
+    if word_tokenizer and not sent_tokenizer:
+        sent_tokenizer = PunktTokenizer()
+
+    # newline="" is what the csv module expects of the files it reads.
+    with open(filename, "rt", newline="") as csvfile:
+        reader = csv.reader(csvfile)
+        if skip_header:
+            next(reader, None)  # skip the header
+        count = 0
+        for count, (tweet_id, text) in enumerate(reader, start=1):
+            sys.stdout.write(f"Loaded {count} tweets\r")
+            # Apply sentence and word tokenizer to text
+            if word_tokenizer:
+                tweet = [
+                    w
+                    for sent in sent_tokenizer.tokenize(text)
+                    for w in word_tokenizer.tokenize(sent)
+                ]
+            else:
+                tweet = text
+            tweets.append((tweet, label))
+
+    print(f"Loaded {count} tweets")
+    return tweets
+
+
+# ////////////////////////////////////////////////////////////
+# { Demos
+# ////////////////////////////////////////////////////////////
+
+
+def demo_tweets(trainer, n_instances=None, output=None):
+    """
+    Train and test a classifier on the `twitter_samples` corpus, with tweets
+    tokenized using TweetTokenizer.
+    Features are composed of:
+
+    - 1000 most frequent unigrams
+    - 100 top bigrams (using BigramAssocMeasures.pmi)
+
+    The positive and negative tweets are first converted to the csv files
+    "positive_tweets.csv" and "negative_tweets.csv".
+
+    :param trainer: `train` method of a classifier.
+    :param n_instances: the number of total tweets that have to be used for
+        training and testing. Tweets will be equally split between positive and
+        negative.
+    :param output: the output file where results have to be reported.
+    """
+    from nltk.corpus import stopwords, twitter_samples
+    from nltk.sentiment import SentimentAnalyzer
+    from nltk.tokenize import TweetTokenizer
+
+    tokenizer = TweetTokenizer(preserve_case=False)
+
+    # Half of the requested instances is taken from each polarity.
+    if n_instances is not None:
+        n_instances = int(n_instances / 2)
+
+    fields = ["id", "text"]
+    positive_csv = "positive_tweets.csv"
+    json2csv_preprocess(
+        twitter_samples.abspath("positive_tweets.json"),
+        positive_csv,
+        fields,
+        limit=n_instances,
+    )
+    negative_csv = "negative_tweets.csv"
+    json2csv_preprocess(
+        twitter_samples.abspath("negative_tweets.json"),
+        negative_csv,
+        fields,
+        limit=n_instances,
+    )
+
+    neg_docs = parse_tweets_set(negative_csv, label="neg", word_tokenizer=tokenizer)
+    pos_docs = parse_tweets_set(positive_csv, label="pos", word_tokenizer=tokenizer)
+
+    # Positive and negative instances are split separately to keep a balanced
+    # class distribution in both train and test sets.
+    train_pos, test_pos = split_train_test(pos_docs)
+    train_neg, test_neg = split_train_test(neg_docs)
+    training_tweets = train_pos + train_neg
+    testing_tweets = test_pos + test_neg
+
+    analyzer = SentimentAnalyzer()
+    vocabulary = list(analyzer.all_words(training_tweets))
+
+    # Unigram features: the most frequent words of the training tweets
+    analyzer.add_feat_extractor(
+        extract_unigram_feats,
+        unigrams=analyzer.unigram_word_feats(vocabulary, top_n=1000),
+    )
+
+    # Bigram features: the best collocations of the training tweets
+    analyzer.add_feat_extractor(
+        extract_bigram_feats,
+        bigrams=analyzer.bigram_collocation_feats(
+            [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12
+        ),
+    )
+
+    training_set = analyzer.apply_features(training_tweets)
+    test_set = analyzer.apply_features(testing_tweets)
+
+    classifier = analyzer.train(trainer, training_set)
+    try:
+        classifier.show_most_informative_features()
+    except AttributeError:
+        print(
+            "Your classifier does not provide a show_most_informative_features() method."
+        )
+    results = analyzer.evaluate(test_set)
+
+    if output:
+        extractor_names = [
+            extractor.__name__ for extractor in analyzer.feat_extractors
+        ]
+        output_markdown(
+            output,
+            Dataset="labeled_tweets",
+            Classifier=type(classifier).__name__,
+            Tokenizer=tokenizer.__class__.__name__,
+            Feats=extractor_names,
+            Results=results,
+            Instances=n_instances,
+        )
+
+
+def demo_movie_reviews(trainer, n_instances=None, output=None):
+    """
+    Train and test a classifier on instances of the Movie Reviews dataset.
+    The corpus has been preprocessed using the default sentence tokenizer and
+    WordPunctTokenizer.
+    Features are composed of:
+
+    - most frequent unigrams
+
+    :param trainer: `train` method of a classifier.
+    :param n_instances: the number of total reviews that have to be used for
+        training and testing. Reviews will be equally split between positive and
+        negative.
+    :param output: the output file where results have to be reported.
+    """
+    from nltk.corpus import movie_reviews
+    from nltk.sentiment import SentimentAnalyzer
+
+    # Half of the requested instances is taken from each polarity.
+    if n_instances is not None:
+        n_instances = int(n_instances / 2)
+
+    pos_docs = []
+    for pos_id in movie_reviews.fileids("pos")[:n_instances]:
+        pos_docs.append((list(movie_reviews.words(pos_id)), "pos"))
+    neg_docs = []
+    for neg_id in movie_reviews.fileids("neg")[:n_instances]:
+        neg_docs.append((list(movie_reviews.words(neg_id)), "neg"))
+
+    # Positive and negative instances are split separately to keep a balanced
+    # class distribution in both train and test sets.
+    train_pos, test_pos = split_train_test(pos_docs)
+    train_neg, test_neg = split_train_test(neg_docs)
+    training_docs = train_pos + train_neg
+    testing_docs = test_pos + test_neg
+
+    analyzer = SentimentAnalyzer()
+    vocabulary = analyzer.all_words(training_docs)
+
+    # Unigram features: the words frequent enough in the training reviews
+    analyzer.add_feat_extractor(
+        extract_unigram_feats,
+        unigrams=analyzer.unigram_word_feats(vocabulary, min_freq=4),
+    )
+    # Turn both datasets into their feature-value representation
+    training_set = analyzer.apply_features(training_docs)
+    test_set = analyzer.apply_features(testing_docs)
+
+    classifier = analyzer.train(trainer, training_set)
+    try:
+        classifier.show_most_informative_features()
+    except AttributeError:
+        print(
+            "Your classifier does not provide a show_most_informative_features() method."
+        )
+    results = analyzer.evaluate(test_set)
+
+    if output:
+        extractor_names = [
+            extractor.__name__ for extractor in analyzer.feat_extractors
+        ]
+        output_markdown(
+            output,
+            Dataset="Movie_reviews",
+            Classifier=type(classifier).__name__,
+            Tokenizer="WordPunctTokenizer",
+            Feats=extractor_names,
+            Results=results,
+            Instances=n_instances,
+        )
+
+
+def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
+    """
+    Train and test a classifier on instances of the Subjective Dataset by Pang and
+    Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
+    All tokens (words and punctuation marks) are separated by a whitespace, so
+    we use the basic WhitespaceTokenizer to parse the data.
+
+    :param trainer: `train` method of a classifier.
+    :param save_analyzer: if true, store the SentimentAnalyzer in the pickle
+        file "sa_subjectivity.pickle".
+    :param n_instances: the number of total sentences that have to be used for
+        training and testing. Sentences will be equally split between
+        subjective and objective. When `None`, every sentence is used.
+    :param output: the output file where results have to be reported.
+    :return: the trained SentimentAnalyzer.
+    """
+    from nltk.corpus import subjectivity
+    from nltk.sentiment import SentimentAnalyzer
+
+    # Each class contributes half of the requested total.
+    if n_instances is not None:
+        n_instances = int(n_instances / 2)
+
+    subj_docs = [
+        (sent, "subj") for sent in subjectivity.sents(categories="subj")[:n_instances]
+    ]
+    obj_docs = [
+        (sent, "obj") for sent in subjectivity.sents(categories="obj")[:n_instances]
+    ]
+
+    # We separately split subjective and objective instances to keep a balanced
+    # uniform class distribution in both train and test sets.
+    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
+    train_obj_docs, test_obj_docs = split_train_test(obj_docs)
+
+    training_docs = train_subj_docs + train_obj_docs
+    testing_docs = test_subj_docs + test_obj_docs
+
+    sentim_analyzer = SentimentAnalyzer()
+    # NOTE(review): the vocabulary is collected from negation-marked copies of
+    # the training documents, while the features below are applied to the
+    # unmarked documents - confirm this asymmetry is intended.
+    all_words_neg = sentim_analyzer.all_words(
+        [mark_negation(doc) for doc in training_docs]
+    )
+
+    # Add simple unigram word features handling negation
+    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
+    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
+
+    # Apply features to obtain a feature-value representation of our datasets
+    training_set = sentim_analyzer.apply_features(training_docs)
+    test_set = sentim_analyzer.apply_features(testing_docs)
+
+    classifier = sentim_analyzer.train(trainer, training_set)
+    try:
+        classifier.show_most_informative_features()
+    except AttributeError:
+        print(
+            "Your classifier does not provide a show_most_informative_features() method."
+        )
+    results = sentim_analyzer.evaluate(test_set)
+
+    # Plain truthiness test instead of comparing against the True singleton.
+    if save_analyzer:
+        sentim_analyzer.save_file(sentim_analyzer, "sa_subjectivity.pickle")
+
+    if output:
+        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
+        output_markdown(
+            output,
+            Dataset="subjectivity",
+            Classifier=type(classifier).__name__,
+            Tokenizer="WhitespaceTokenizer",
+            Feats=extr,
+            Instances=n_instances,
+            Results=results,
+        )
+
+    return sentim_analyzer
+
+
+def demo_sent_subjectivity(text):
+    """
+    Print whether a single sentence is subjective or objective, as decided by
+    a stored SentimentAnalyzer.
+
+    The analyzer is loaded from "sa_subjectivity.pickle"; when that lookup
+    fails a new one is trained with NaiveBayesClassifier and saved.
+
+    :param text: a sentence whose subjectivity has to be classified.
+    """
+    from nltk.classify import NaiveBayesClassifier
+    from nltk.tokenize import regexp
+
+    whitespace_tokenizer = regexp.WhitespaceTokenizer()
+    try:
+        analyzer = load("sa_subjectivity.pickle")
+    except LookupError:
+        print("Cannot find the sentiment analyzer you want to load.")
+        print("Training a new one using NaiveBayesClassifier.")
+        analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)
+
+    # Tokenize on whitespace, then lower-case every token.
+    tokens = whitespace_tokenizer.tokenize(text)
+    lowered_tokens = [token.lower() for token in tokens]
+    label = analyzer.classify(lowered_tokens)
+    print(label)
+
+
+def demo_liu_hu_lexicon(sentence, plot=False):
+    """
+    Basic example of sentiment classification using Liu and Hu opinion lexicon.
+    This function simply counts the number of positive, negative and neutral words
+    in the sentence and classifies it depending on which polarity is more represented.
+    Words that do not appear in the lexicon are considered as neutral.
+
+    The result ("Positive", "Negative" or "Neutral") is printed, not returned.
+
+    :param sentence: a sentence whose polarity has to be classified.
+    :param plot: if true, plot a visual representation of the sentence polarity.
+    """
+    from nltk.corpus import opinion_lexicon
+    from nltk.tokenize import treebank
+
+    tokenizer = treebank.TreebankWordTokenizer()
+    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]
+
+    # Fetch each word list once, outside the loop, and keep it as a set so that
+    # every token costs a single hash lookup instead of a fresh lexicon call.
+    positive_words = set(opinion_lexicon.positive())
+    negative_words = set(opinion_lexicon.negative())
+
+    # Per-token polarity: 1 positive, -1 negative, 0 neutral (also the y axis).
+    y = []
+    for word in tokenized_sent:
+        if word in positive_words:
+            y.append(1)
+        elif word in negative_words:
+            y.append(-1)
+        else:
+            y.append(0)
+
+    pos_words = y.count(1)
+    neg_words = y.count(-1)
+
+    if pos_words > neg_words:
+        print("Positive")
+    elif pos_words < neg_words:
+        print("Negative")
+    else:
+        print("Neutral")
+
+    if plot:
+        x = list(range(len(tokenized_sent)))  # x axis for the plot
+        _show_plot(
+            x, y, x_labels=tokenized_sent, y_labels=["Negative", "Neutral", "Positive"]
+        )
+
+
+def demo_vader_instance(text):
+    """
+    Print the polarity scores that the Vader approach assigns to a text.
+
+    :param text: a text whose polarity has to be evaluated.
+    """
+    from nltk.sentiment import SentimentIntensityAnalyzer
+
+    scores = SentimentIntensityAnalyzer().polarity_scores(text)
+    print(scores)
+
+
+def demo_vader_tweets(n_instances=None, output=None):
+    """
+    Classify 10000 positive and negative tweets using Vader approach.
+
+    Tweets are exported to "positive_tweets.csv" and "negative_tweets.csv" in
+    the current directory, split per class, and only the test portion is
+    scored. A tweet is labelled "pos" when its compound score is strictly
+    positive and "neg" otherwise. Accuracy plus per-label precision, recall
+    and F-measure are printed.
+
+    :param n_instances: the number of total tweets that have to be classified.
+        Half of them are taken from each polarity.
+    :param output: the output file where results have to be reported.
+    """
+    from collections import defaultdict
+
+    from nltk.corpus import twitter_samples
+    from nltk.metrics import accuracy as eval_accuracy
+    from nltk.metrics import f_measure as eval_f_measure
+    from nltk.metrics import precision as eval_precision
+    from nltk.metrics import recall as eval_recall
+    from nltk.sentiment import SentimentIntensityAnalyzer
+
+    if n_instances is not None:
+        n_instances = int(n_instances / 2)
+
+    fields = ["id", "text"]
+    positive_csv = "positive_tweets.csv"
+    negative_csv = "negative_tweets.csv"
+    # Export the positive tweets first, then the negative ones.
+    for json_name, csv_name in (
+        ("positive_tweets.json", positive_csv),
+        ("negative_tweets.json", negative_csv),
+    ):
+        json2csv_preprocess(
+            twitter_samples.abspath(json_name),
+            csv_name,
+            fields,
+            strip_off_emoticons=False,
+            limit=n_instances,
+        )
+
+    pos_docs = parse_tweets_set(positive_csv, label="pos")
+    neg_docs = parse_tweets_set(negative_csv, label="neg")
+
+    # We separately split positive and negative instances to keep a balanced
+    # uniform class distribution in the test set. Vader needs no training, so
+    # the training portions are discarded.
+    _, test_pos_docs = split_train_test(pos_docs)
+    _, test_neg_docs = split_train_test(neg_docs)
+    testing_tweets = test_pos_docs + test_neg_docs
+
+    vader_analyzer = SentimentIntensityAnalyzer()
+
+    # Index sets per label (for precision/recall/F-measure) and parallel label
+    # sequences (for accuracy).
+    gold_results = defaultdict(set)
+    test_results = defaultdict(set)
+    acc_gold_results = []
+    acc_test_results = []
+    labels = set()
+    for i, (text, label) in enumerate(testing_tweets):
+        labels.add(label)
+        gold_results[label].add(i)
+        acc_gold_results.append(label)
+        score = vader_analyzer.polarity_scores(text)["compound"]
+        observed = "pos" if score > 0 else "neg"
+        acc_test_results.append(observed)
+        test_results[observed].add(i)
+
+    metrics_results = {}
+    # Accuracy does not depend on the label, so compute it once. It is skipped
+    # when nothing was classified, as before.
+    if labels:
+        metrics_results["Accuracy"] = eval_accuracy(acc_gold_results, acc_test_results)
+    for label in labels:
+        precision_score = eval_precision(gold_results[label], test_results[label])
+        metrics_results[f"Precision [{label}]"] = precision_score
+        recall_score = eval_recall(gold_results[label], test_results[label])
+        metrics_results[f"Recall [{label}]"] = recall_score
+        f_measure_score = eval_f_measure(gold_results[label], test_results[label])
+        metrics_results[f"F-measure [{label}]"] = f_measure_score
+
+    for result in sorted(metrics_results):
+        print(f"{result}: {metrics_results[result]}")
+
+    if output:
+        output_markdown(
+            output,
+            Approach="Vader",
+            Dataset="labeled_tweets",
+            Instances=n_instances,
+            Results=metrics_results,
+        )
+
+
+if __name__ == "__main__":
+    # Script entry point: build a few trainer callables and run one demo.
+    # Importing scikit-learn here means it is required to run this file as a
+    # script, even though only the Naive Bayes demo is enabled below.
+    from sklearn.svm import LinearSVC
+
+    from nltk.classify import MaxentClassifier, NaiveBayesClassifier
+    from nltk.classify.scikitlearn import SklearnClassifier
+    # NOTE(review): neither name is referenced inside this block; presumably
+    # they are bound here as module globals for helpers defined earlier in the
+    # file - confirm before removing.
+    from nltk.twitter.common import _outf_writer, extract_fields
+
+    # `train` callables, suitable for the `trainer` parameter of the demos.
+    naive_bayes = NaiveBayesClassifier.train
+    svm = SklearnClassifier(LinearSVC()).train
+    maxent = MaxentClassifier.train
+
+    # Only the tweets demo runs by default; uncomment a line to run another.
+    demo_tweets(naive_bayes)
+    # demo_movie_reviews(svm)
+    # demo_subjectivity(svm)
+    # demo_sent_subjectivity("she's an artist , but hasn't picked up a brush in a year . ")
+    # demo_liu_hu_lexicon("This movie was actually neither that funny, nor super witty.", plot=True)
+    # demo_vader_instance("This movie was actually neither that funny, nor super witty.")
+    # demo_vader_tweets()
--- a/Backend/venv/lib/python3.12/site-packages/nltk/sentiment/vader.py
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/sentiment/vader.py
@@ -0,0 +1,633 @@
+# Natural Language Toolkit: vader
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: C.J. Hutto <Clayton.Hutto@gtri.gatech.edu>
+#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
+#         Pierpaolo Pantone <24alsecondo@gmail.com> (modifications)
+#         George Berry <geb97@cornell.edu> (modifications)
+#         Malavika Suresh <malavika.suresh0794@gmail.com> (modifications)
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+#
+# Modifications to the original VADER code have been made in order to
+# integrate it into NLTK. These have involved changes to
+# ensure Python 3 compatibility, and refactoring to achieve greater modularity.
+
+"""
+If you use the VADER sentiment analysis tools, please cite:
+
+Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
+Sentiment Analysis of Social Media Text. Eighth International Conference on
+Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
+"""
+
+import math
+import re
+import string
+from itertools import product
+
+import nltk.data
+from nltk.util import pairwise
+
+
+class VaderConstants:
+    """
+    Holder for the Vader word lists and numeric constants, together with the
+    small helpers that interpret them.
+    """
+
+    # Empirically derived mean sentiment intensity rating change for booster
+    # (B_INCR) and dampener (B_DECR) words.
+    B_INCR = 0.293
+    B_DECR = -0.293
+
+    # Empirically derived mean sentiment intensity rating increase for using
+    # ALLCAPs to emphasize a word.
+    C_INCR = 0.733
+
+    # Multiplier applied to a valence that is found to be negated.
+    N_SCALAR = -0.74
+
+    # Negation words, with and without the apostrophe.
+    NEGATE = {
+        "aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt",
+        "doesnt", "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt",
+        "mustnt", "neednt", "oughtnt", "shant", "shouldnt", "uhuh", "wasnt",
+        "werent", "wont", "wouldnt",
+        "ain't", "aren't", "can't", "couldn't", "daren't", "didn't",
+        "doesn't", "don't", "hadn't", "hasn't", "haven't", "isn't",
+        "mightn't", "mustn't", "needn't", "oughtn't", "shan't", "shouldn't",
+        "uh-uh", "wasn't", "weren't", "won't", "wouldn't",
+        "neither", "never", "none", "nope", "nor", "not", "nothing",
+        "nowhere", "without", "rarely", "seldom", "despite",
+    }
+
+    # booster/dampener 'intensifiers' or 'degree adverbs'
+    # https://en.wiktionary.org/wiki/Category:English_degree_adverbs
+    # Boosters come first, dampeners second, in the order given here.
+    BOOSTER_DICT = {
+        **dict.fromkeys(
+            (
+                "absolutely", "amazingly", "awfully", "completely",
+                "considerably", "decidedly", "deeply", "effing", "enormously",
+                "entirely", "especially", "exceptionally", "extremely",
+                "fabulously", "flipping", "flippin", "fricking", "frickin",
+                "frigging", "friggin", "fully", "fucking", "greatly", "hella",
+                "highly", "hugely", "incredibly", "intensely", "majorly",
+                "more", "most", "particularly", "purely", "quite", "really",
+                "remarkably", "so", "substantially", "thoroughly", "totally",
+                "tremendously", "uber", "unbelievably", "unusually", "utterly",
+                "very",
+            ),
+            B_INCR,
+        ),
+        **dict.fromkeys(
+            (
+                "almost", "barely", "hardly", "just enough", "kind of",
+                "kinda", "kindof", "kind-of", "less", "little", "marginally",
+                "occasionally", "partly", "scarcely", "slightly", "somewhat",
+                "sort of", "sorta", "sortof", "sort-of",
+            ),
+            B_DECR,
+        ),
+    }
+
+    # check for special case idioms using a sentiment-laden keyword known to SAGE
+    SPECIAL_CASE_IDIOMS = {
+        "the shit": 3,
+        "the bomb": 3,
+        "bad ass": 1.5,
+        "yeah right": -2,
+        "cut the mustard": 2,
+        "kiss of death": -1.5,
+        "hand to mouth": -2,
+    }
+
+    # for removing punctuation
+    REGEX_REMOVE_PUNCTUATION = re.compile(f"[{re.escape(string.punctuation)}]")
+
+    # Punctuation marks and runs that may be attached to a word.
+    PUNC_LIST = [
+        ".", "!", "?", ",", ";", ":", "-", "'", '"',
+        "!!", "!!!", "??", "???",
+        "?!?", "!?!", "?!?!", "!?!?",
+    ]
+
+    def __init__(self):
+        """Nothing to set up: every constant lives on the class."""
+
+    def negated(self, input_words, include_nt=True):
+        """
+        Determine if input contains negation words
+
+        :param input_words: the words to inspect.
+        :param include_nt: when true, any word containing "n't" also counts.
+        :return: `True` if a word is in NEGATE, contains "n't" (optional), or
+            is "least" not directly preceded by "at"; `False` otherwise.
+        """
+        for word in input_words:
+            if word.lower() in self.NEGATE:
+                return True
+        if include_nt:
+            for word in input_words:
+                if "n't" in word.lower():
+                    return True
+        # "least" negates unless it is part of "at least".
+        return any(
+            nxt.lower() == "least" and prev.lower() != "at"
+            for prev, nxt in pairwise(input_words)
+        )
+
+    def normalize(self, score, alpha=15):
+        """
+        Normalize the score to be between -1 and 1 using an alpha that
+        approximates the max expected value
+        """
+        return score / math.sqrt((score * score) + alpha)
+
+    def scalar_inc_dec(self, word, valence, is_cap_diff):
+        """
+        Check if the preceding words increase, decrease, or negate/nullify the
+        valence
+
+        :param word: the candidate booster/dampener word.
+        :param valence: valence of the word being modified; its sign decides
+            the sign of the returned scalar.
+        :param is_cap_diff: whether only some words of the text are ALL CAPS.
+        :return: the amount to add to the valence, 0.0 for non-booster words.
+        """
+        key = word.lower()
+        if key not in self.BOOSTER_DICT:
+            return 0.0
+
+        scalar = self.BOOSTER_DICT[key]
+        if valence < 0:
+            scalar *= -1
+        # check if booster/dampener word is in ALLCAPS (while others aren't)
+        if word.isupper() and is_cap_diff:
+            if valence > 0:
+                scalar += self.C_INCR
+            else:
+                scalar -= self.C_INCR
+        return scalar
+
+
+class SentiText:
+    """
+    Identify sentiment-relevant string-level properties of input text.
+
+    After construction an instance exposes:
+
+    - ``text``: the input as a ``str``
+    - ``words_and_emoticons``: whitespace-separated tokens longer than one
+      character, with surrounding punctuation from ``punc_list`` removed
+    - ``is_cap_diff``: whether some, but not all, of those tokens are ALL CAPS
+    """
+
+    def __init__(self, text, punc_list, regex_remove_punctuation):
+        """
+        :param text: the text to analyse. ``bytes``/``bytearray`` input is
+            decoded as UTF-8; any other non-string object is converted with
+            ``str()``.
+        :param punc_list: punctuation strings that may be attached to a word.
+        :param regex_remove_punctuation: compiled pattern matching every
+            punctuation character to strip.
+        :raises UnicodeDecodeError: if byte input is not valid UTF-8.
+        """
+        # The former `str(text.encode("utf-8"))` was a Python 2 leftover: it
+        # raised AttributeError for bytes and would otherwise have produced
+        # the "b'...'" repr instead of the text itself.
+        if isinstance(text, (bytes, bytearray)):
+            text = text.decode("utf-8")
+        elif not isinstance(text, str):
+            text = str(text)
+        self.text = text
+        self.PUNC_LIST = punc_list
+        self.REGEX_REMOVE_PUNCTUATION = regex_remove_punctuation
+        # doesn't separate words from
+        # adjacent punctuation (keeps emoticons & contractions)
+        self.words_and_emoticons = self._words_and_emoticons()
+        self.is_cap_diff = self.allcap_differential(self.words_and_emoticons)
+
+    def _words_plus_punc(self):
+        """
+        Returns mapping of form:
+        {
+            'cat,': 'cat',
+            ',cat': 'cat',
+        }
+        """
+        # removes punctuation (but loses emoticons & contractions)
+        no_punc_text = self.REGEX_REMOVE_PUNCTUATION.sub("", self.text)
+        # remove singletons
+        words_only = {w for w in no_punc_text.split() if len(w) > 1}
+        # the product gives ('cat', ',') and (',', 'cat')
+        words_punc_dict = {
+            "".join(p): p[1] for p in product(self.PUNC_LIST, words_only)
+        }
+        words_punc_dict.update(
+            {"".join(p): p[0] for p in product(words_only, self.PUNC_LIST)}
+        )
+        return words_punc_dict
+
+    def _words_and_emoticons(self):
+        """
+        Removes leading and trailing punctuation
+        Leaves contractions and most emoticons
+            Does not preserve punc-plus-letter emoticons (e.g. :D)
+
+        :return: list of tokens; single-character tokens are dropped.
+        """
+        words_punc_dict = self._words_plus_punc()
+        # A token that is exactly "word + punctuation" (or the reverse) is
+        # replaced by the bare word; every other token is kept unchanged.
+        return [
+            words_punc_dict.get(we, we) for we in self.text.split() if len(we) > 1
+        ]
+
+    def allcap_differential(self, words):
+        """
+        Check whether just some words in the input are ALL CAPS
+
+        :param list words: The words to inspect
+        :returns: `True` if some but not all items in `words` are ALL CAPS
+        """
+        allcap_words = sum(1 for word in words if word.isupper())
+        return 0 < allcap_words < len(words)
+
+
+class SentimentIntensityAnalyzer:
+    """
+    Give a sentiment intensity score to sentences.
+
+    The lexicon (token -> valence) is loaded once at construction time; the
+    word lists and numeric constants of the rules come from VaderConstants.
+    """
+
+    def __init__(
+        self,
+        lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt",
+    ):
+        """
+        :param lexicon_file: resource name of the lexicon, loaded through
+            ``nltk.data.load``.
+        """
+        self.lexicon_file = nltk.data.load(lexicon_file)
+        self.lexicon = self.make_lex_dict()
+        self.constants = VaderConstants()
+
+    def make_lex_dict(self):
+        """
+        Convert lexicon file to a dictionary
+
+        Every non-blank line holds tab-separated fields, of which the first
+        two are the token and its valence. Blank lines (for example the one
+        produced by a trailing newline) are skipped.
+
+        :return: dict mapping each token to its valence as a float.
+        """
+        lex_dict = {}
+        for line in self.lexicon_file.split("\n"):
+            line = line.strip()
+            if not line:
+                continue
+            (word, measure) = line.split("\t")[0:2]
+            lex_dict[word] = float(measure)
+        return lex_dict
+
+    def polarity_scores(self, text):
+        """
+        Return sentiment strength scores for the input text.
+
+        :param text: the text to score.
+        :return: a dict with the keys "neg", "neu", "pos" and "compound".
+            "compound" is the normalized sum of the valences (positive values
+            are positive valence, negative values are negative valence); the
+            other three are the proportions of the text in each category.
+
+        :note: Hashtags are not taken into consideration (e.g. #BAD is neutral). If you
+            are interested in processing the text in the hashtags too, then we recommend
+            preprocessing your data to remove the #, after which the hashtag text may be
+            matched as if it was a normal word in the sentence.
+        """
+        sentitext = SentiText(
+            text, self.constants.PUNC_LIST, self.constants.REGEX_REMOVE_PUNCTUATION
+        )
+        sentiments = []
+        words_and_emoticons = sentitext.words_and_emoticons
+        last_index = len(words_and_emoticons) - 1
+        # enumerate() gives the real position of every token; list.index()
+        # returns the first occurrence, which made repeated tokens be scored
+        # with the context of their first appearance.
+        for i, item in enumerate(words_and_emoticons):
+            valence = 0
+            item_lowercase = item.lower()
+            starts_kind_of = (
+                i < last_index
+                and item_lowercase == "kind"
+                and words_and_emoticons[i + 1].lower() == "of"
+            )
+            # Boosters/dampeners carry no valence of their own; they only
+            # modify the words that follow them.
+            if starts_kind_of or item_lowercase in self.constants.BOOSTER_DICT:
+                sentiments.append(valence)
+                continue
+
+            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)
+
+        sentiments = self._but_check(words_and_emoticons, sentiments)
+
+        # Punctuation emphasis is counted on the normalized string form.
+        return self.score_valence(sentiments, sentitext.text)
+
+    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
+        """
+        Compute the valence of the token at position ``i`` and append it to
+        ``sentiments``.
+
+        A token that is not in the lexicon keeps the incoming ``valence``.
+        Otherwise the lexicon value is adjusted for ALL CAPS emphasis and for
+        up to three preceding tokens (boosters, negations, idioms) and for a
+        preceding "least".
+
+        :param valence: valence to record for tokens missing from the lexicon.
+        :param sentitext: the SentiText holding the tokens and ``is_cap_diff``.
+        :param item: the token itself, in its original case.
+        :param i: position of ``item`` in ``sentitext.words_and_emoticons``.
+        :param sentiments: valences collected so far; extended in place.
+        :return: ``sentiments``, with one more entry.
+        """
+        is_cap_diff = sentitext.is_cap_diff
+        words_and_emoticons = sentitext.words_and_emoticons
+        item_lowercase = item.lower()
+        if item_lowercase in self.lexicon:
+            # get the sentiment valence
+            valence = self.lexicon[item_lowercase]
+
+            # check if sentiment laden word is in ALL CAPS (while others aren't)
+            if item.isupper() and is_cap_diff:
+                if valence > 0:
+                    valence += self.constants.C_INCR
+                else:
+                    valence -= self.constants.C_INCR
+
+            for start_i in range(0, 3):
+                if i <= start_i:
+                    # fewer than start_i + 1 tokens precede the item
+                    break
+                preceding = words_and_emoticons[i - (start_i + 1)]
+                if preceding.lower() in self.lexicon:
+                    continue
+                # dampen the scalar modifier of preceding words and emoticons
+                # (excluding the ones that immediately precede the item) based
+                # on their distance from the current item.
+                s = self.constants.scalar_inc_dec(preceding, valence, is_cap_diff)
+                if start_i == 1 and s != 0:
+                    s = s * 0.95
+                if start_i == 2 and s != 0:
+                    s = s * 0.9
+                valence = valence + s
+                valence = self._never_check(valence, words_and_emoticons, start_i, i)
+                if start_i == 2:
+                    valence = self._idioms_check(valence, words_and_emoticons, i)
+
+                    # future work: consider other sentiment-laden idioms
+                    # other_idioms =
+                    # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2,
+                    #  "upper hand": 1, "break a leg": 2,
+                    #  "cooking with gas": 2, "in the black": 2, "in the red": -2,
+                    #  "on the ball": 2,"under the weather": -2}
+
+            valence = self._least_check(valence, words_and_emoticons, i)
+
+        sentiments.append(valence)
+        return sentiments
+
+    def _least_check(self, valence, words_and_emoticons, i):
+        """
+        Negate the valence when the token at ``i`` directly follows "least",
+        unless that "least" is itself preceded by "at" or "very".
+        """
+        # check for negation case using "least"
+        if (
+            i > 1
+            and words_and_emoticons[i - 1].lower() not in self.lexicon
+            and words_and_emoticons[i - 1].lower() == "least"
+        ):
+            if (
+                words_and_emoticons[i - 2].lower() != "at"
+                and words_and_emoticons[i - 2].lower() != "very"
+            ):
+                valence = valence * self.constants.N_SCALAR
+        elif (
+            i > 0
+            and words_and_emoticons[i - 1].lower() not in self.lexicon
+            and words_and_emoticons[i - 1].lower() == "least"
+        ):
+            # "least" is the very first token: nothing can precede it.
+            valence = valence * self.constants.N_SCALAR
+        return valence
+
+    def _but_check(self, words_and_emoticons, sentiments):
+        """
+        Re-weight the valences around the first "but": entries before it are
+        halved, entries after it are multiplied by 1.5.
+
+        :return: ``sentiments``, modified in place.
+        """
+        lowered = [w_e.lower() for w_e in words_and_emoticons]
+        if "but" in lowered:
+            bi = lowered.index("but")
+            for sidx, sentiment in enumerate(sentiments):
+                if sidx < bi:
+                    sentiments[sidx] = sentiment * 0.5
+                elif sidx > bi:
+                    sentiments[sidx] = sentiment * 1.5
+        return sentiments
+
+    def _idioms_check(self, valence, words_and_emoticons, i):
+        """
+        Replace the valence when the token at ``i`` is part of a special-case
+        idiom, and dampen it when a two-word dampener (e.g. "kind of")
+        precedes it.
+
+        ``sentiment_valence`` only calls this with ``i >= 3``, so the look-back
+        indices below never wrap around.
+        """
+        onezero = f"{words_and_emoticons[i - 1]} {words_and_emoticons[i]}"
+
+        twoonezero = "{} {} {}".format(
+            words_and_emoticons[i - 2],
+            words_and_emoticons[i - 1],
+            words_and_emoticons[i],
+        )
+
+        twoone = f"{words_and_emoticons[i - 2]} {words_and_emoticons[i - 1]}"
+
+        threetwoone = "{} {} {}".format(
+            words_and_emoticons[i - 3],
+            words_and_emoticons[i - 2],
+            words_and_emoticons[i - 1],
+        )
+
+        threetwo = "{} {}".format(
+            words_and_emoticons[i - 3], words_and_emoticons[i - 2]
+        )
+
+        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]
+
+        # Only the first matching look-back sequence is used.
+        for seq in sequences:
+            if seq in self.constants.SPECIAL_CASE_IDIOMS:
+                valence = self.constants.SPECIAL_CASE_IDIOMS[seq]
+                break
+
+        # Idioms starting at the current token override the look-back result.
+        if len(words_and_emoticons) - 1 > i:
+            zeroone = f"{words_and_emoticons[i]} {words_and_emoticons[i + 1]}"
+            if zeroone in self.constants.SPECIAL_CASE_IDIOMS:
+                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroone]
+        if len(words_and_emoticons) - 1 > i + 1:
+            zeroonetwo = "{} {} {}".format(
+                words_and_emoticons[i],
+                words_and_emoticons[i + 1],
+                words_and_emoticons[i + 2],
+            )
+            if zeroonetwo in self.constants.SPECIAL_CASE_IDIOMS:
+                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroonetwo]
+
+        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
+        if (
+            threetwo in self.constants.BOOSTER_DICT
+            or twoone in self.constants.BOOSTER_DICT
+        ):
+            valence = valence + self.constants.B_DECR
+        return valence
+
+    def _never_check(self, valence, words_and_emoticons, start_i, i):
+        """
+        Apply negation coming from the token ``start_i + 1`` positions before
+        ``i``, treating "never so"/"never this" as emphasis, not negation.
+
+        The comparisons with "never", "so" and "this" are case-sensitive.
+        """
+        if start_i == 0:
+            if self.constants.negated([words_and_emoticons[i - 1]]):
+                valence = valence * self.constants.N_SCALAR
+        if start_i == 1:
+            if words_and_emoticons[i - 2] == "never" and (
+                words_and_emoticons[i - 1] == "so"
+                or words_and_emoticons[i - 1] == "this"
+            ):
+                valence = valence * 1.5
+            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
+                valence = valence * self.constants.N_SCALAR
+        if start_i == 2:
+            # `and` binds tighter than `or`: a "so"/"this" right before the
+            # item triggers the emphasis even without a "never".
+            if (
+                words_and_emoticons[i - 3] == "never"
+                and (
+                    words_and_emoticons[i - 2] == "so"
+                    or words_and_emoticons[i - 2] == "this"
+                )
+            ) or (
+                words_and_emoticons[i - 1] == "so"
+                or words_and_emoticons[i - 1] == "this"
+            ):
+                valence = valence * 1.25
+            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
+                valence = valence * self.constants.N_SCALAR
+        return valence
+
+    def _punctuation_emphasis(self, sum_s, text):
+        """
+        Return the emphasis added by exclamation points and question marks.
+
+        ``sum_s`` is accepted for interface compatibility and is not used.
+        """
+        # add emphasis from exclamation points and question marks
+        return self._amplify_ep(text) + self._amplify_qm(text)
+
+    def _amplify_ep(self, text):
+        """Return the emphasis from exclamation points (at most 4 count)."""
+        # check for added emphasis resulting from exclamation points (up to 4 of them)
+        ep_count = min(text.count("!"), 4)
+        # (empirically derived mean sentiment intensity rating increase for
+        # exclamation points)
+        return ep_count * 0.292
+
+    def _amplify_qm(self, text):
+        """Return the emphasis from question marks (none below two)."""
+        # check for added emphasis resulting from question marks (2 or 3+)
+        qm_count = text.count("?")
+        qm_amplifier = 0
+        if qm_count > 1:
+            if qm_count <= 3:
+                # (empirically derived mean sentiment intensity rating increase for
+                # question marks)
+                qm_amplifier = qm_count * 0.18
+            else:
+                qm_amplifier = 0.96
+        return qm_amplifier
+
+    def _sift_sentiment_scores(self, sentiments):
+        """
+        Split the valences into a positive sum, a negative sum and a count of
+        neutral entries.
+
+        :return: tuple ``(pos_sum, neg_sum, neu_count)``.
+        """
+        # want separate positive versus negative sentiment scores
+        pos_sum = 0.0
+        neg_sum = 0.0
+        neu_count = 0
+        for sentiment_score in sentiments:
+            if sentiment_score > 0:
+                # compensates for neutral words that are counted as 1
+                pos_sum += float(sentiment_score) + 1
+            elif sentiment_score < 0:
+                # when used with math.fabs(), compensates for neutrals
+                neg_sum += float(sentiment_score) - 1
+            elif sentiment_score == 0:
+                neu_count += 1
+        return pos_sum, neg_sum, neu_count
+
+    def score_valence(self, sentiments, text):
+        """
+        Turn per-token valences into the final score dict.
+
+        :param sentiments: valence of every token of the text.
+        :param text: the text, used to count "!" and "?" for emphasis.
+        :return: dict with "neg", "neu", "pos" (rounded to 3 decimals) and
+            "compound" (rounded to 4); all zero when ``sentiments`` is empty.
+        """
+        if sentiments:
+            sum_s = float(sum(sentiments))
+            # compute and add emphasis from punctuation in text
+            punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
+            if sum_s > 0:
+                sum_s += punct_emph_amplifier
+            elif sum_s < 0:
+                sum_s -= punct_emph_amplifier
+
+            compound = self.constants.normalize(sum_s)
+            # discriminate between positive, negative and neutral sentiment scores
+            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)
+
+            # The emphasis goes to whichever polarity dominates.
+            if pos_sum > math.fabs(neg_sum):
+                pos_sum += punct_emph_amplifier
+            elif pos_sum < math.fabs(neg_sum):
+                neg_sum -= punct_emph_amplifier
+
+            total = pos_sum + math.fabs(neg_sum) + neu_count
+            pos = math.fabs(pos_sum / total)
+            neg = math.fabs(neg_sum / total)
+            neu = math.fabs(neu_count / total)
+
+        else:
+            compound = 0.0
+            pos = 0.0
+            neg = 0.0
+            neu = 0.0
+
+        sentiment_dict = {
+            "neg": round(neg, 3),
+            "neu": round(neu, 3),
+            "pos": round(pos, 3),
+            "compound": round(compound, 4),
+        }
+
+        return sentiment_dict