updates
This commit is contained in:
@@ -0,0 +1,13 @@
|
||||
# Natural Language Toolkit: Sentiment Analysis
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
NLTK Sentiment Analysis Package
|
||||
|
||||
"""
|
||||
from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer
|
||||
from nltk.sentiment.vader import SentimentIntensityAnalyzer
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,255 @@
|
||||
#
|
||||
# Natural Language Toolkit: Sentiment Analyzer
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A SentimentAnalyzer is a tool to implement and facilitate Sentiment Analysis tasks
|
||||
using NLTK features and classifiers, especially for teaching and demonstrative
|
||||
purposes.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
|
||||
from nltk.classify.util import accuracy as eval_accuracy
|
||||
from nltk.classify.util import apply_features
|
||||
from nltk.collocations import BigramCollocationFinder
|
||||
from nltk.metrics import BigramAssocMeasures
|
||||
from nltk.metrics import f_measure as eval_f_measure
|
||||
from nltk.metrics import precision as eval_precision
|
||||
from nltk.metrics import recall as eval_recall
|
||||
from nltk.probability import FreqDist
|
||||
|
||||
|
||||
class SentimentAnalyzer:
    """
    A Sentiment Analysis tool based on machine learning approaches.

    Feature extractor functions are registered with `add_feat_extractor` and
    applied to documents by `extract_features`. The classifier is either
    passed to the constructor or produced by `train`.
    """

    def __init__(self, classifier=None):
        # Maps each extractor function to the list of keyword-argument sets it
        # has to be called with (one call per set).
        self.feat_extractors = defaultdict(list)
        self.classifier = classifier

    def all_words(self, documents, labeled=None):
        """
        Return all words/tokens from the documents (with duplicates).

        :param documents: a list of (words, label) tuples.
        :param labeled: if `True`, assume that each document is represented by a
            (words, label) tuple: (list(str), str). If `False`, each document is
            considered as being a simple list of strings: list(str). If `None`,
            the documents are treated as labeled when the first one is a tuple.
        :rtype: list(str)
        :return: A list of all words/tokens in `documents`.
        """
        if labeled is None:
            labeled = documents and isinstance(documents[0], tuple)
        if labeled:
            token_lists = (words for words, _sentiment in documents)
        else:
            token_lists = documents

        all_words = []
        for words in token_lists:
            all_words.extend(words)
        return all_words

    def apply_features(self, documents, labeled=None):
        """
        Apply all feature extractor functions to the documents. This is a wrapper
        around `nltk.classify.util.apply_features`.

        If `labeled=False`, return featuresets as:
            [feature_func(doc) for doc in documents]
        If `labeled=True`, return featuresets as:
            [(feature_func(tok), label) for (tok, label) in toks]

        :param documents: a list of documents. `If labeled=True`, the method expects
            a list of (words, label) tuples.
        :param labeled: forwarded unchanged to `nltk.classify.util.apply_features`.
        :rtype: LazyMap
        """
        # Inside the method body this name resolves to the module-level
        # function imported from nltk.classify.util, not to this method.
        return apply_features(self.extract_features, documents, labeled)

    def unigram_word_feats(self, words, top_n=None, min_freq=0):
        """
        Return most common top_n word features.

        :param words: a list of words/tokens.
        :param top_n: number of best words/tokens to use, sorted by frequency.
        :param min_freq: only words occurring strictly more than `min_freq`
            times are kept.
        :rtype: list(str)
        :return: A list of `top_n` words/tokens (with no duplicates) sorted by
            frequency.
        """
        # Stopwords are not removed
        unigram_feats_freqs = FreqDist(word for word in words)
        return [
            word
            for word, freq in unigram_feats_freqs.most_common(top_n)
            if freq > min_freq
        ]

    def bigram_collocation_feats(
        self, documents, top_n=None, min_freq=3, assoc_measure=BigramAssocMeasures.pmi
    ):
        """
        Return `top_n` bigram features (using `assoc_measure`).
        Note that this method is based on bigram collocations measures, and not
        on simple bigram frequency.

        :param documents: a list (or iterable) of tokens.
        :param top_n: number of best words/tokens to use, sorted by association
            measure.
        :param assoc_measure: bigram association measure to use as score function.
        :param min_freq: the minimum number of occurrences of bigrams to take
            into consideration.

        :return: `top_n` ngrams scored by the given association measure.
        """
        finder = BigramCollocationFinder.from_documents(documents)
        finder.apply_freq_filter(min_freq)
        return finder.nbest(assoc_measure, top_n)

    def classify(self, instance):
        """
        Classify a single instance applying the features that have already been
        stored in the SentimentAnalyzer.

        :param instance: a list (or iterable) of tokens.
        :return: the classification result given by applying the classifier.
        """
        instance_feats = self.apply_features([instance], labeled=False)
        return self.classifier.classify(instance_feats[0])

    def add_feat_extractor(self, function, **kwargs):
        """
        Add a new function to extract features from a document. This function will
        be used in extract_features().
        Important: in this step our kwargs are only representing additional parameters,
        and NOT the document we have to parse. The document will always be the first
        parameter in the parameter list, and it will be added in the extract_features()
        function.

        :param function: the extractor function to add to the list of feature extractors.
        :param kwargs: additional parameters required by the `function` function.
        """
        self.feat_extractors[function].append(kwargs)

    def extract_features(self, document):
        """
        Apply extractor functions (and their parameters) to the present document.
        We pass `document` as the first parameter of the extractor functions.
        If we want to use the same extractor function multiple times, we have to
        add it to the extractors with `add_feat_extractor` using multiple sets of
        parameters (one for each call of the extractor function).

        :param document: the document that will be passed as argument to the
            feature extractor functions.
        :return: A dictionary of populated features extracted from the document.
            When two extractor calls produce the same key, the later one wins.
        :rtype: dict
        """
        all_features = {}
        for extractor, param_sets in self.feat_extractors.items():
            for param_set in param_sets:
                all_features.update(extractor(document, **param_set))
        return all_features

    def train(self, trainer, training_set, save_classifier=None, **kwargs):
        """
        Train classifier on the training set, optionally saving the output in the
        file specified by `save_classifier`.
        Additional arguments depend on the specific trainer used. For example,
        a MaxentClassifier can use `max_iter` parameter to specify the number
        of iterations, while a NaiveBayesClassifier cannot.

        :param trainer: `train` method of a classifier.
            E.g.: NaiveBayesClassifier.train
        :param training_set: the training set to be passed as argument to the
            classifier `train` method.
        :param save_classifier: the filename of the file where the classifier
            will be stored (optional).
        :param kwargs: additional parameters that will be passed as arguments to
            the classifier `train` function.
        :return: A classifier instance trained on the training set. It is also
            stored in `self.classifier`.
        :rtype: whatever type `trainer` returns
        """
        print("Training classifier")
        self.classifier = trainer(training_set, **kwargs)
        if save_classifier:
            self.save_file(self.classifier, save_classifier)

        return self.classifier

    def save_file(self, content, filename):
        """
        Store `content` in `filename`. Can be used to store a SentimentAnalyzer.

        :param content: the object to pickle.
        :param filename: path of the file to (over)write in binary mode.
        """
        import pickle

        print("Saving", filename, file=sys.stderr)
        with open(filename, "wb") as storage_file:
            # The protocol=2 parameter is for python2 compatibility
            pickle.dump(content, storage_file, protocol=2)

    def evaluate(
        self,
        test_set,
        classifier=None,
        accuracy=True,
        f_measure=True,
        precision=True,
        recall=True,
        verbose=False,
    ):
        """
        Evaluate and print classifier performance on the test set.

        :param test_set: A list of (tokens, label) tuples to use as gold set.
        :param classifier: a classifier instance (previously trained). Defaults
            to `self.classifier`.
        :param accuracy: if `True`, evaluate classifier accuracy.
        :param f_measure: if `True`, evaluate classifier f_measure.
        :param precision: if `True`, evaluate classifier precision.
        :param recall: if `True`, evaluate classifier recall.
        :param verbose: if `True`, print every metric (in alphabetical order).
        :return: evaluation results.
        :rtype: dict(str): float
        """
        if classifier is None:
            classifier = self.classifier
        print(f"Evaluating {type(classifier).__name__} results...")

        metrics_results = {}
        if accuracy:
            metrics_results["Accuracy"] = eval_accuracy(classifier, test_set)

        # The per-label bookkeeping needs one classification per test instance,
        # so skip it entirely when no per-label metric was requested.
        if precision or recall or f_measure:
            gold_results = defaultdict(set)
            test_results = defaultdict(set)
            labels = set()
            for i, (feats, label) in enumerate(test_set):
                labels.add(label)
                gold_results[label].add(i)
                observed = classifier.classify(feats)
                test_results[observed].add(i)

            for label in labels:
                reference = gold_results[label]
                predicted = test_results[label]
                if precision:
                    metrics_results[f"Precision [{label}]"] = eval_precision(
                        reference, predicted
                    )
                if recall:
                    metrics_results[f"Recall [{label}]"] = eval_recall(
                        reference, predicted
                    )
                if f_measure:
                    metrics_results[f"F-measure [{label}]"] = eval_f_measure(
                        reference, predicted
                    )

        # Print evaluation results (in alphabetical order)
        if verbose:
            for result in sorted(metrics_results):
                print(f"{result}: {metrics_results[result]}")

        return metrics_results
|
||||
887
Backend/venv/lib/python3.12/site-packages/nltk/sentiment/util.py
Normal file
887
Backend/venv/lib/python3.12/site-packages/nltk/sentiment/util.py
Normal file
@@ -0,0 +1,887 @@
|
||||
#
|
||||
# Natural Language Toolkit: Sentiment Analyzer
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Utility methods for Sentiment Analysis.
|
||||
"""
|
||||
|
||||
import codecs
|
||||
import csv
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from copy import deepcopy
|
||||
|
||||
import nltk
|
||||
from nltk.corpus import CategorizedPlaintextCorpusReader
|
||||
from nltk.data import load
|
||||
from nltk.tokenize import PunktTokenizer
|
||||
from nltk.tokenize.casual import EMOTICON_RE
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# { Regular expressions
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
# Negation cue pattern by Christopher Potts. It is compiled in verbose mode,
# so the line breaks inside the literal are ignored by the regex engine.
# A token matches when it is exactly one of the listed negation words, or
# when it contains the contraction "n't" anywhere.
NEGATION = r"""
(?:
^(?:never|no|nothing|nowhere|noone|none|not|
havent|hasnt|hadnt|cant|couldnt|shouldnt|
wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
)$
)
|
n't"""

NEGATION_RE = re.compile(NEGATION, flags=re.VERBOSE)

# A token consisting of exactly one clause-level punctuation mark.
CLAUSE_PUNCT = r"^[.:;!?]$"
CLAUSE_PUNCT_RE = re.compile(CLAUSE_PUNCT)
|
||||
|
||||
# Happy and sad emoticons.
# Both collections are sets so they can be intersected directly with the
# emoticons found in a text.

HAPPY = {
    ":-)", ":)", ";)", ":o)", ":]", ":3", ":c)", ":>", "=]", "8)", "=)", ":}",
    ":^)", ":-D", ":D", "8-D", "8D", "x-D", "xD", "X-D", "XD", "=-D", "=D",
    "=-3", "=3", ":-))", ":'-)", ":')", ":*", ":^*", ">:P", ":-P", ":P",
    "X-P", "x-p", "xp", "XP", ":-p", ":p", "=p", ":-b", ":b", ">:)", ">;)",
    ">:-)", "<3",
}

SAD = {
    ":L", ":-/", ">:/", ":S", ">:[", ":@", ":-(", ":[", ":-||",
    "=L", ":<", ":-[", ":-<", "=\\", "=/", ">:(", ":(", ">.<",
    ":'-(", ":'(", ":\\", ":-c", ":c", ":{", ">:\\", ";(",
}
|
||||
|
||||
|
||||
def timer(method):
    """
    A timer decorator to measure execution performance of methods.

    The returned wrapper calls `method` with the given arguments, prints the
    elapsed wall-clock time to stdout and returns `method`'s result unchanged.

    :param method: the callable to time.
    :return: a wrapper with the same name and docstring as `method`.
    """
    from functools import wraps

    @wraps(method)
    def timed(*args, **kw):
        start = time.time()
        result = method(*args, **kw)
        tot_time = time.time() - start

        # Floor division on a float yields a float; convert so the report
        # reads "1h 2m 3s" rather than "1.0h 2.0m 3s".
        hours = int(tot_time // 3600)
        mins = int(tot_time // 60 % 60)
        secs = int(round(tot_time % 60))
        if hours == 0 and mins == 0 and secs < 10:
            # Short runs are reported with millisecond precision. The elapsed
            # time is what must be formatted here: applying ".3f" to the
            # method name (a str) raises ValueError.
            print(f"[TIMER] {method.__name__}(): {tot_time:.3f} seconds")
        else:
            print(f"[TIMER] {method.__name__}(): {hours}h {mins}m {secs}s")
        return result

    return timed
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# { Feature extractor functions
|
||||
# ////////////////////////////////////////////////////////////
|
||||
"""
|
||||
Feature extractor functions are declared outside the SentimentAnalyzer class.
|
||||
Users should have the possibility to create their own feature extractors
|
||||
without modifying SentimentAnalyzer.
|
||||
"""
|
||||
|
||||
|
||||
def extract_unigram_feats(document, unigrams, handle_negation=False):
    """
    Populate a dictionary of unigram features, reflecting the presence/absence in
    the document of each of the tokens in `unigrams`.

    :param document: a list of words/tokens.
    :param unigrams: a list of words/tokens whose presence/absence has to be
        checked in `document`.
    :param handle_negation: if `handle_negation == True` apply `mark_negation`
        method to `document` before checking for unigram presence/absence.
    :return: a dictionary of unigram features {unigram : boolean}.

    >>> words = ['ice', 'police', 'riot']
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_unigram_feats(document, words).items())
    [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
    """
    if handle_negation:
        document = mark_negation(document)
    # Build the lookup set once: rebuilding it per unigram is quadratic and
    # gives wrong answers when `document` is a one-shot iterator.
    doc_tokens = set(document)
    return {f"contains({word})": word in doc_tokens for word in unigrams}
|
||||
|
||||
|
||||
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams (hashable 2-tuples of tokens) whose
        presence/absence has to be checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items()) # doctest: +NORMALIZE_WHITESPACE
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    # Materialise the document bigrams once. Regenerating them for every
    # candidate costs O(len(document) * len(bigrams)) and returns False for
    # every candidate after the first when `document` is a one-shot iterator.
    doc_bigrams = set(nltk.bigrams(document))
    return {
        f"contains({bigr[0]} - {bigr[1]})": bigr in doc_bigrams for bigr in bigrams
    }
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# { Helper Functions
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def mark_negation(document, double_neg_flip=False, shallow=False):
    """
    Tag with a `_NEG` suffix every word lying between a negation cue and the
    next clause-level punctuation mark.

    :param document: a list of words/tokens, or a tuple (words, label).
    :param double_neg_flip: if True, double negation is considered affirmation
        (the negation scope is toggled each time a negation cue is found).
    :param shallow: if True, the method will modify the original document in place.
    :return: if `shallow == True` the original document is modified and
        returned. If `shallow == False` a modified deep copy is returned and the
        original is left untouched.

    >>> sent = "I didn't like this movie . It was bad .".split()
    >>> mark_negation(sent)
    ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.']
    """
    if not shallow:
        document = deepcopy(document)

    # A document whose first element is itself a sequence is taken to be a
    # (words, label) pair; only the words are processed.
    labeled = document and isinstance(document[0], (tuple, list))
    tokens = document[0] if labeled else document

    in_scope = False
    for position, token in enumerate(tokens):
        if NEGATION_RE.search(token):
            if in_scope and not double_neg_flip:
                # A further negation cue inside an open scope is tagged like
                # any other word.
                tokens[position] += "_NEG"
            else:
                # The cue opens a scope (or, with double_neg_flip, toggles it)
                # and is itself left untagged.
                in_scope = not in_scope
        elif in_scope:
            if CLAUSE_PUNCT_RE.search(token):
                # Clause punctuation closes the current scope.
                in_scope = False
            else:
                tokens[position] += "_NEG"

    return document
|
||||
|
||||
|
||||
def output_markdown(filename, **kwargs):
    """
    Write the output of an analysis to a file.

    A timestamped section is appended to `filename` (the file is created when
    missing). Keyword arguments are written in alphabetical order of their
    names: dict values as a list of their sorted entries, list values as a list
    of their items, anything else inline.

    :param filename: path of the markdown file to append to. It is written as
        UTF-8 regardless of the platform's default encoding.
    :param kwargs: the named values to report.
    """
    parts = ["\n*** \n\n", "{} \n\n".format(time.strftime("%d/%m/%Y, %H:%M"))]
    for key in sorted(kwargs):
        value = kwargs[key]
        if isinstance(value, dict):
            parts.append(f" - **{key}:**\n")
            parts.extend(f" - {entry}: {value[entry]} \n" for entry in sorted(value))
        elif isinstance(value, list):
            parts.append(f" - **{key}:**\n")
            parts.extend(f" - {entry}\n" for entry in value)
        else:
            parts.append(f" - **{key}:** {value} \n")

    # The text is assembled before the file is opened, so a formatting error
    # does not leave a freshly created empty file behind.
    with open(filename, "at", encoding="utf-8") as outfile:
        outfile.write("".join(parts))
|
||||
|
||||
|
||||
def split_train_test(all_instances, n=None):
    """
    Randomly split `n` instances of the dataset into train and test sets.

    The shuffle uses a private generator seeded with a fixed value, so the
    split is reproducible and the global `random` state is left untouched.
    The input is copied before shuffling, so the caller's sequence is not
    reordered.

    :param all_instances: a list of instances (e.g. documents) that will be split.
    :param n: the number of instances to consider (in case we want to use only a
        subset). A falsy value, or a value larger than the dataset, means all
        instances.
    :return: two lists of instances. Train set is 8/10 of the total and test set
        is 2/10 of the total.
    """
    shuffled = list(all_instances)
    random.Random(12345).shuffle(shuffled)

    if not n or n > len(shuffled):
        n = len(shuffled)
    cut = int(0.8 * n)

    return shuffled[:cut], shuffled[cut:n]
|
||||
|
||||
|
||||
def _show_plot(x_values, y_values, x_labels=None, y_labels=None):
|
||||
try:
|
||||
import matplotlib.pyplot as plt
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"The plot function requires matplotlib to be installed."
|
||||
"See https://matplotlib.org/"
|
||||
) from e
|
||||
|
||||
plt.locator_params(axis="y", nbins=3)
|
||||
axes = plt.axes()
|
||||
axes.yaxis.grid()
|
||||
plt.plot(x_values, y_values, "ro", color="red")
|
||||
plt.ylim(ymin=-1.2, ymax=1.2)
|
||||
plt.tight_layout(pad=5)
|
||||
if x_labels:
|
||||
plt.xticks(x_values, x_labels, rotation="vertical")
|
||||
if y_labels:
|
||||
plt.yticks([-1, 0, 1], y_labels, rotation="horizontal")
|
||||
# Pad margins so that markers are not clipped by the axes
|
||||
plt.margins(0.2)
|
||||
plt.show()
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# { Parsing and conversion functions
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def json2csv_preprocess(
    json_file,
    outfile,
    fields,
    encoding="utf8",
    errors="replace",
    gzip_compress=False,
    skip_retweets=True,
    skip_tongue_tweets=True,
    skip_ambiguous_tweets=True,
    strip_off_emoticons=True,
    remove_duplicates=True,
    limit=None,
):
    """
    Convert json file to csv file, preprocessing each row to obtain a suitable
    dataset for tweets Sentiment Analysis.

    The filters below are applied only when `"text"` is one of `fields`;
    otherwise every row is written unchanged.

    :param json_file: the original json file containing tweets (one JSON
        document per line).
    :param outfile: the output csv filename.
    :param fields: a list of fields that will be extracted from the json file and
        kept in the output csv file.
    :param encoding: the encoding of the files.
    :param errors: the error handling strategy for the output writer.
    :param gzip_compress: if True, create a compressed GZIP file.

    :param skip_retweets: if True, remove retweets.
    :param skip_tongue_tweets: if True, remove tweets containing ":P" and ":-P"
        emoticons.
    :param skip_ambiguous_tweets: if True, remove tweets containing both happy
        and sad emoticons.
    :param strip_off_emoticons: if True, strip off emoticons from all tweets.
    :param remove_duplicates: if True, remove tweets appearing more than once.
    :param limit: an integer to set the number of tweets to convert. After the
        limit is reached the conversion will stop. It can be useful to create
        subsets of the original tweets json data.
    """
    # NOTE(review): these two helpers are used below but are not imported in the
    # visible part of this module; they are taken from nltk.twitter.common --
    # confirm that this is the intended source.
    from nltk.twitter.common import _outf_writer, extract_fields

    # Position of the tweet text inside each extracted row, if requested.
    text_idx = fields.index("text") if "text" in fields else None
    seen_texts = set()
    written = 0

    with codecs.open(json_file, encoding=encoding) as fp:
        writer, outf = _outf_writer(outfile, encoding, errors, gzip_compress)
        try:
            # write the list of fields as header
            writer.writerow(fields)

            for line in fp:
                tweet = json.loads(line)
                row = extract_fields(tweet, fields)

                if text_idx is not None:
                    text = row[text_idx]
                    # Remove retweets
                    if skip_retweets and re.search(r"\bRT\b", text):
                        continue
                    # Remove tweets containing ":P" and ":-P" emoticons
                    if skip_tongue_tweets and re.search(r"\:\-?P\b", text):
                        continue
                    # Remove tweets containing both happy and sad emoticons
                    if skip_ambiguous_tweets:
                        emoticons = set(EMOTICON_RE.findall(text))
                        if emoticons & HAPPY and emoticons & SAD:
                            continue
                    # Strip off emoticons from all tweets
                    if strip_off_emoticons:
                        row[text_idx] = re.sub(
                            r"(?!\n)\s+", " ", EMOTICON_RE.sub("", text)
                        )
                    # Remove duplicate tweets (compared after emoticon stripping)
                    if remove_duplicates:
                        if row[text_idx] in seen_texts:
                            continue
                        seen_texts.add(row[text_idx])

                writer.writerow(row)
                written += 1
                if limit and written >= limit:
                    break
        finally:
            # Close the output even when a line fails to parse.
            outf.close()
|
||||
|
||||
|
||||
def parse_tweets_set(
    filename,
    label,
    word_tokenizer=None,
    sent_tokenizer=None,
    skip_header=True,
    encoding="utf8",
):
    """
    Parse csv file containing tweets and output data a list of (text, label) tuples.

    :param filename: the input csv filename. Every row must have exactly two
        columns: the tweet id and the tweet text.
    :param label: the label to be appended to each tweet contained in the csv file.
    :param word_tokenizer: the tokenizer instance that will be used to tokenize
        each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()).
        If no word_tokenizer is specified, tweets will not be tokenized.
    :param sent_tokenizer: the tokenizer that will be used to split each tweet into
        sentences. Only used together with `word_tokenizer`; defaults to a
        `PunktTokenizer`.
    :param skip_header: if True, skip the first line of the csv file (which usually
        contains headers).
    :param encoding: the encoding of the csv file. The default matches the
        default output encoding of `json2csv_preprocess`.

    :return: a list of (text, label) tuples.
    """
    tweets = []
    # The sentence tokenizer is needed only when tweets are tokenized, so the
    # default Punkt model is not loaded when raw text is requested.
    if word_tokenizer and not sent_tokenizer:
        sent_tokenizer = PunktTokenizer()

    # newline="" lets the csv module handle line endings inside quoted fields.
    with open(filename, "rt", newline="", encoding=encoding) as csvfile:
        reader = csv.reader(csvfile)
        if skip_header:
            next(reader, None)  # skip the header
        i = 0
        for _tweet_id, text in reader:
            i += 1
            sys.stdout.write(f"Loaded {i} tweets\r")
            # Apply sentence and word tokenizer to text
            if word_tokenizer:
                tweet = [
                    w
                    for sent in sent_tokenizer.tokenize(text)
                    for w in word_tokenizer.tokenize(sent)
                ]
            else:
                tweet = text
            tweets.append((tweet, label))

    print(f"Loaded {i} tweets")
    return tweets
|
||||
|
||||
|
||||
# ////////////////////////////////////////////////////////////
|
||||
# { Demos
|
||||
# ////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
def demo_tweets(trainer, n_instances=None, output=None):
    """
    Train and evaluate a classifier on the twitter_samples corpus, with tweets
    tokenized by TweetTokenizer.
    Features are composed of:

    - 1000 most frequent unigrams
    - 100 top bigrams (using BigramAssocMeasures.pmi)

    The positive and negative samples are first converted to
    `positive_tweets.csv` and `negative_tweets.csv` in the current directory.

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total tweets that have to be used for
        training and testing. Tweets will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import stopwords, twitter_samples
    from nltk.sentiment import SentimentAnalyzer
    from nltk.tokenize import TweetTokenizer

    # Other TweetTokenizer set-ups that can be tried here:
    #   TweetTokenizer(preserve_case=True, strip_handles=True)
    #   TweetTokenizer(reduce_len=True, strip_handles=True)
    tokenizer = TweetTokenizer(preserve_case=False)

    # Half of the requested instances go to each polarity.
    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ["id", "text"]
    conversions = (
        ("positive_tweets.json", "positive_tweets.csv"),
        ("negative_tweets.json", "negative_tweets.csv"),
    )
    for json_name, csv_name in conversions:
        json_path = twitter_samples.abspath(json_name)
        json2csv_preprocess(json_path, csv_name, fields, limit=n_instances)
    positive_csv = conversions[0][1]
    negative_csv = conversions[1][1]

    neg_docs = parse_tweets_set(negative_csv, label="neg", word_tokenizer=tokenizer)
    pos_docs = parse_tweets_set(positive_csv, label="pos", word_tokenizer=tokenizer)

    # Each polarity is split on its own so that both the train and the test
    # set keep a balanced class distribution.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    # Stop words are deliberately kept; to drop them, filter this list against
    # stopwords.words('english').
    all_words = list(sentim_analyzer.all_words(training_tweets))

    # Unigram presence features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Bigram collocation presence features
    training_token_lists = [tweet[0] for tweet in training_tweets]
    bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats(
        training_token_lists, top_n=100, min_freq=12
    )
    sentim_analyzer.add_feat_extractor(
        extract_bigram_feats, bigrams=bigram_collocs_feats
    )

    training_set = sentim_analyzer.apply_features(training_tweets)
    test_set = sentim_analyzer.apply_features(testing_tweets)

    # Trainer-specific options can be forwarded here, e.g. max_iter=4.
    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print(
            "Your classifier does not provide a show_most_informative_features() method."
        )
    results = sentim_analyzer.evaluate(test_set)

    if not output:
        return
    extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
    output_markdown(
        output,
        Dataset="labeled_tweets",
        Classifier=type(classifier).__name__,
        Tokenizer=tokenizer.__class__.__name__,
        Feats=extr,
        Results=results,
        Instances=n_instances,
    )
|
||||
|
||||
|
||||
def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train and evaluate a classifier on the Movie Reviews dataset.
    The corpus has been preprocessed using the default sentence tokenizer and
    WordPunctTokenizer.
    Features are composed of:

    - most frequent unigrams

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total reviews that have to be used for
        training and testing. Reviews will be equally split between positive and
        negative. When `None`, all reviews are used.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import movie_reviews
    from nltk.sentiment import SentimentAnalyzer

    # Half of the requested instances go to each polarity.
    if n_instances is not None:
        n_instances = int(n_instances / 2)

    def labeled_reviews(category):
        # (tokens, label) pairs for the first `n_instances` files of a category.
        file_ids = movie_reviews.fileids(category)[:n_instances]
        return [
            (list(movie_reviews.words(file_id)), category) for file_id in file_ids
        ]

    pos_docs = labeled_reviews("pos")
    neg_docs = labeled_reviews("neg")

    # Each polarity is split on its own so that both the train and the test
    # set keep a balanced class distribution.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_docs = train_pos_docs + train_neg_docs
    testing_docs = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words = sentim_analyzer.all_words(training_docs)

    # Unigram presence features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Turn both datasets into their feature-value representation
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print(
            "Your classifier does not provide a show_most_informative_features() method."
        )
    results = sentim_analyzer.evaluate(test_set)

    if not output:
        return
    extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
    output_markdown(
        output,
        Dataset="Movie_reviews",
        Classifier=type(classifier).__name__,
        Tokenizer="WordPunctTokenizer",
        Feats=extr,
        Results=results,
        Instances=n_instances,
    )
|
||||
|
||||
|
||||
def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
    """
    Train and test a classifier on instances of the Subjective Dataset by Pang and
    Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
    All tokens (words and punctuation marks) are separated by a whitespace, so
    we use the basic WhitespaceTokenizer to parse the data.

    :param trainer: `train` method of a classifier.
    :param save_analyzer: if true, store the SentimentAnalyzer in a pickle file.
    :param n_instances: the number of total sentences that have to be used for
        training and testing. Sentences will be equally split between positive
        and negative.
    :param output: the output file where results have to be reported.
    :return: the SentimentAnalyzer that was trained and evaluated.
    """
    from nltk.corpus import subjectivity
    from nltk.sentiment import SentimentAnalyzer

    if n_instances is not None:
        # From here on n_instances is the per-class count (half of the total).
        n_instances = int(n_instances / 2)

    subj_docs = [
        (sent, "subj") for sent in subjectivity.sents(categories="subj")[:n_instances]
    ]
    obj_docs = [
        (sent, "obj") for sent in subjectivity.sents(categories="obj")[:n_instances]
    ]

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs + train_obj_docs
    testing_docs = test_subj_docs + test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words(
        [mark_negation(doc) for doc in training_docs]
    )

    # Add simple unigram word features handling negation
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        # Not every classifier exposes this method; the demo carries on.
        print(
            "Your classifier does not provide a show_most_informative_features() method."
        )
    results = sentim_analyzer.evaluate(test_set)

    # Plain truthiness test instead of comparing against True.
    if save_analyzer:
        sentim_analyzer.save_file(sentim_analyzer, "sa_subjectivity.pickle")

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(
            output,
            Dataset="subjectivity",
            Classifier=type(classifier).__name__,
            Tokenizer="WhitespaceTokenizer",
            Feats=extr,
            Instances=n_instances,
            Results=results,
        )

    return sentim_analyzer
|
||||
|
||||
|
||||
def demo_sent_subjectivity(text):
    """
    Print the subjectivity label predicted for a single sentence.

    A previously stored SentimentAnalyzer is loaded from
    "sa_subjectivity.pickle"; when that lookup fails, a new analyzer is
    trained with NaiveBayesClassifier through `demo_subjectivity` (which is
    also asked to save it).

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp

    whitespace_tokenizer = regexp.WhitespaceTokenizer()
    try:
        analyzer = load("sa_subjectivity.pickle")
    except LookupError:
        print("Cannot find the sentiment analyzer you want to load.")
        print("Training a new one using NaiveBayesClassifier.")
        analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # Split on whitespace, then lower-case every token before classifying.
    lowered_tokens = [token.lower() for token in whitespace_tokenizer.tokenize(text)]
    print(analyzer.classify(lowered_tokens))
|
||||
|
||||
|
||||
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered as neutral.

    The result ("Positive", "Negative" or "Neutral") is printed, not returned.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if true, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    # Load each word list once, instead of calling positive()/negative()
    # again for every token, and use sets for the membership tests.
    positive_lexicon = set(opinion_lexicon.positive())
    negative_lexicon = set(opinion_lexicon.negative())

    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []
    pos_words = 0
    neg_words = 0

    for word in tokenized_sent:
        if word in positive_lexicon:
            pos_words += 1
            y.append(1)  # positive
        elif word in negative_lexicon:
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    if pos_words > neg_words:
        print("Positive")
    elif pos_words < neg_words:
        print("Negative")
    else:
        print("Neutral")

    if plot:
        _show_plot(
            x, y, x_labels=tokenized_sent, y_labels=["Negative", "Neutral", "Positive"]
        )
|
||||
|
||||
|
||||
def demo_vader_instance(text):
    """
    Print the Vader polarity scores computed for a text.

    :param text: a text whose polarity has to be evaluated.
    """
    from nltk.sentiment import SentimentIntensityAnalyzer

    scores = SentimentIntensityAnalyzer().polarity_scores(text)
    print(scores)
|
||||
|
||||
|
||||
def demo_vader_tweets(n_instances=None, output=None):
    """
    Classify 10000 positive and negative tweets using Vader approach.

    A tweet is labelled "pos" when its Vader compound score is greater than
    zero and "neg" otherwise. Accuracy and per-label precision, recall and
    F-measure are printed, and optionally written with `output_markdown`.

    :param n_instances: the number of total tweets that have to be classified.
    :param output: the output file where results have to be reported.
    """
    from collections import defaultdict

    from nltk.corpus import twitter_samples

    # These local imports are the nltk.metrics scorers that compare two
    # label sequences / index sets.
    from nltk.metrics import accuracy as eval_accuracy
    from nltk.metrics import f_measure as eval_f_measure
    from nltk.metrics import precision as eval_precision
    from nltk.metrics import recall as eval_recall
    from nltk.sentiment import SentimentIntensityAnalyzer

    if n_instances is not None:
        # From here on n_instances is the per-class count (half of the total).
        n_instances = int(n_instances / 2)

    fields = ["id", "text"]
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = "positive_tweets.csv"
    json2csv_preprocess(
        positive_json,
        positive_csv,
        fields,
        strip_off_emoticons=False,
        limit=n_instances,
    )

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = "negative_tweets.csv"
    json2csv_preprocess(
        negative_json,
        negative_csv,
        fields,
        strip_off_emoticons=False,
        limit=n_instances,
    )

    pos_docs = parse_tweets_set(positive_csv, label="pos")
    neg_docs = parse_tweets_set(negative_csv, label="neg")

    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in the test set. Only the test portions are
    # scored below; the training portions are not used by this demo.
    _, test_pos_docs = split_train_test(pos_docs)
    _, test_neg_docs = split_train_test(neg_docs)

    testing_tweets = test_pos_docs + test_neg_docs

    vader_analyzer = SentimentIntensityAnalyzer()

    gold_results = defaultdict(set)
    test_results = defaultdict(set)
    acc_gold_results = []
    acc_test_results = []
    labels = set()
    for i, (text, label) in enumerate(testing_tweets):
        labels.add(label)
        gold_results[label].add(i)
        acc_gold_results.append(label)
        score = vader_analyzer.polarity_scores(text)["compound"]
        # A compound score of exactly zero is counted as negative.
        observed = "pos" if score > 0 else "neg"
        acc_test_results.append(observed)
        test_results[observed].add(i)

    metrics_results = {}
    if labels:
        # Accuracy does not depend on the label, so compute it a single time;
        # it is skipped when there is nothing to evaluate.
        metrics_results["Accuracy"] = eval_accuracy(acc_gold_results, acc_test_results)
    for label in labels:
        precision_score = eval_precision(gold_results[label], test_results[label])
        metrics_results[f"Precision [{label}]"] = precision_score
        recall_score = eval_recall(gold_results[label], test_results[label])
        metrics_results[f"Recall [{label}]"] = recall_score
        f_measure_score = eval_f_measure(gold_results[label], test_results[label])
        metrics_results[f"F-measure [{label}]"] = f_measure_score

    for result in sorted(metrics_results):
        print(f"{result}: {metrics_results[result]}")

    if output:
        output_markdown(
            output,
            Approach="Vader",
            Dataset="labeled_tweets",
            Instances=n_instances,
            Results=metrics_results,
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: build a few classifier `train` callables and run
    # one demo. The remaining demos are kept as commented-out examples.
    from sklearn.svm import LinearSVC

    from nltk.classify import MaxentClassifier, NaiveBayesClassifier
    from nltk.classify.scikitlearn import SklearnClassifier

    # NOTE(review): these two names are not referenced in this block;
    # presumably helpers elsewhere in the module read them as module
    # globals -- confirm before removing.
    from nltk.twitter.common import _outf_writer, extract_fields

    # Trainer callables accepted by the demo functions.
    naive_bayes = NaiveBayesClassifier.train
    svm = SklearnClassifier(LinearSVC()).train
    maxent = MaxentClassifier.train

    demo_tweets(naive_bayes)
    # demo_movie_reviews(svm)
    # demo_subjectivity(svm)
    # demo_sent_subjectivity("she's an artist , but hasn't picked up a brush in a year . ")
    # demo_liu_hu_lexicon("This movie was actually neither that funny, nor super witty.", plot=True)
    # demo_vader_instance("This movie was actually neither that funny, nor super witty.")
    # demo_vader_tweets()
|
||||
@@ -0,0 +1,633 @@
|
||||
# Natural Language Toolkit: vader
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: C.J. Hutto <Clayton.Hutto@gtri.gatech.edu>
|
||||
# Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
|
||||
# Pierpaolo Pantone <24alsecondo@gmail.com> (modifications)
|
||||
# George Berry <geb97@cornell.edu> (modifications)
|
||||
# Malavika Suresh <malavika.suresh0794@gmail.com> (modifications)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
# Modifications to the original VADER code have been made in order to
|
||||
# integrate it into NLTK. These have involved changes to
|
||||
# ensure Python 3 compatibility, and refactoring to achieve greater modularity.
|
||||
|
||||
"""
|
||||
If you use the VADER sentiment analysis tools, please cite:
|
||||
|
||||
Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
|
||||
Sentiment Analysis of Social Media Text. Eighth International Conference on
|
||||
Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
|
||||
"""
|
||||
|
||||
import math
|
||||
import re
|
||||
import string
|
||||
from itertools import product
|
||||
|
||||
import nltk.data
|
||||
from nltk.util import pairwise
|
||||
|
||||
|
||||
class VaderConstants:
|
||||
"""
|
||||
A class to keep the Vader lists and constants.
|
||||
"""
|
||||
|
||||
##Constants##
|
||||
# (empirically derived mean sentiment intensity rating increase for booster words)
|
||||
B_INCR = 0.293
|
||||
B_DECR = -0.293
|
||||
|
||||
# (empirically derived mean sentiment intensity rating increase for using
|
||||
# ALLCAPs to emphasize a word)
|
||||
C_INCR = 0.733
|
||||
|
||||
N_SCALAR = -0.74
|
||||
|
||||
NEGATE = {
|
||||
"aint",
|
||||
"arent",
|
||||
"cannot",
|
||||
"cant",
|
||||
"couldnt",
|
||||
"darent",
|
||||
"didnt",
|
||||
"doesnt",
|
||||
"ain't",
|
||||
"aren't",
|
||||
"can't",
|
||||
"couldn't",
|
||||
"daren't",
|
||||
"didn't",
|
||||
"doesn't",
|
||||
"dont",
|
||||
"hadnt",
|
||||
"hasnt",
|
||||
"havent",
|
||||
"isnt",
|
||||
"mightnt",
|
||||
"mustnt",
|
||||
"neither",
|
||||
"don't",
|
||||
"hadn't",
|
||||
"hasn't",
|
||||
"haven't",
|
||||
"isn't",
|
||||
"mightn't",
|
||||
"mustn't",
|
||||
"neednt",
|
||||
"needn't",
|
||||
"never",
|
||||
"none",
|
||||
"nope",
|
||||
"nor",
|
||||
"not",
|
||||
"nothing",
|
||||
"nowhere",
|
||||
"oughtnt",
|
||||
"shant",
|
||||
"shouldnt",
|
||||
"uhuh",
|
||||
"wasnt",
|
||||
"werent",
|
||||
"oughtn't",
|
||||
"shan't",
|
||||
"shouldn't",
|
||||
"uh-uh",
|
||||
"wasn't",
|
||||
"weren't",
|
||||
"without",
|
||||
"wont",
|
||||
"wouldnt",
|
||||
"won't",
|
||||
"wouldn't",
|
||||
"rarely",
|
||||
"seldom",
|
||||
"despite",
|
||||
}
|
||||
|
||||
# booster/dampener 'intensifiers' or 'degree adverbs'
|
||||
# https://en.wiktionary.org/wiki/Category:English_degree_adverbs
|
||||
|
||||
BOOSTER_DICT = {
|
||||
"absolutely": B_INCR,
|
||||
"amazingly": B_INCR,
|
||||
"awfully": B_INCR,
|
||||
"completely": B_INCR,
|
||||
"considerably": B_INCR,
|
||||
"decidedly": B_INCR,
|
||||
"deeply": B_INCR,
|
||||
"effing": B_INCR,
|
||||
"enormously": B_INCR,
|
||||
"entirely": B_INCR,
|
||||
"especially": B_INCR,
|
||||
"exceptionally": B_INCR,
|
||||
"extremely": B_INCR,
|
||||
"fabulously": B_INCR,
|
||||
"flipping": B_INCR,
|
||||
"flippin": B_INCR,
|
||||
"fricking": B_INCR,
|
||||
"frickin": B_INCR,
|
||||
"frigging": B_INCR,
|
||||
"friggin": B_INCR,
|
||||
"fully": B_INCR,
|
||||
"fucking": B_INCR,
|
||||
"greatly": B_INCR,
|
||||
"hella": B_INCR,
|
||||
"highly": B_INCR,
|
||||
"hugely": B_INCR,
|
||||
"incredibly": B_INCR,
|
||||
"intensely": B_INCR,
|
||||
"majorly": B_INCR,
|
||||
"more": B_INCR,
|
||||
"most": B_INCR,
|
||||
"particularly": B_INCR,
|
||||
"purely": B_INCR,
|
||||
"quite": B_INCR,
|
||||
"really": B_INCR,
|
||||
"remarkably": B_INCR,
|
||||
"so": B_INCR,
|
||||
"substantially": B_INCR,
|
||||
"thoroughly": B_INCR,
|
||||
"totally": B_INCR,
|
||||
"tremendously": B_INCR,
|
||||
"uber": B_INCR,
|
||||
"unbelievably": B_INCR,
|
||||
"unusually": B_INCR,
|
||||
"utterly": B_INCR,
|
||||
"very": B_INCR,
|
||||
"almost": B_DECR,
|
||||
"barely": B_DECR,
|
||||
"hardly": B_DECR,
|
||||
"just enough": B_DECR,
|
||||
"kind of": B_DECR,
|
||||
"kinda": B_DECR,
|
||||
"kindof": B_DECR,
|
||||
"kind-of": B_DECR,
|
||||
"less": B_DECR,
|
||||
"little": B_DECR,
|
||||
"marginally": B_DECR,
|
||||
"occasionally": B_DECR,
|
||||
"partly": B_DECR,
|
||||
"scarcely": B_DECR,
|
||||
"slightly": B_DECR,
|
||||
"somewhat": B_DECR,
|
||||
"sort of": B_DECR,
|
||||
"sorta": B_DECR,
|
||||
"sortof": B_DECR,
|
||||
"sort-of": B_DECR,
|
||||
}
|
||||
|
||||
# check for special case idioms using a sentiment-laden keyword known to SAGE
|
||||
SPECIAL_CASE_IDIOMS = {
|
||||
"the shit": 3,
|
||||
"the bomb": 3,
|
||||
"bad ass": 1.5,
|
||||
"yeah right": -2,
|
||||
"cut the mustard": 2,
|
||||
"kiss of death": -1.5,
|
||||
"hand to mouth": -2,
|
||||
}
|
||||
|
||||
# for removing punctuation
|
||||
REGEX_REMOVE_PUNCTUATION = re.compile(f"[{re.escape(string.punctuation)}]")
|
||||
|
||||
PUNC_LIST = [
|
||||
".",
|
||||
"!",
|
||||
"?",
|
||||
",",
|
||||
";",
|
||||
":",
|
||||
"-",
|
||||
"'",
|
||||
'"',
|
||||
"!!",
|
||||
"!!!",
|
||||
"??",
|
||||
"???",
|
||||
"?!?",
|
||||
"!?!",
|
||||
"?!?!",
|
||||
"!?!?",
|
||||
]
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def negated(self, input_words, include_nt=True):
|
||||
"""
|
||||
Determine if input contains negation words
|
||||
"""
|
||||
neg_words = self.NEGATE
|
||||
if any(word.lower() in neg_words for word in input_words):
|
||||
return True
|
||||
if include_nt:
|
||||
if any("n't" in word.lower() for word in input_words):
|
||||
return True
|
||||
for first, second in pairwise(input_words):
|
||||
if second.lower() == "least" and first.lower() != "at":
|
||||
return True
|
||||
return False
|
||||
|
||||
def normalize(self, score, alpha=15):
|
||||
"""
|
||||
Normalize the score to be between -1 and 1 using an alpha that
|
||||
approximates the max expected value
|
||||
"""
|
||||
norm_score = score / math.sqrt((score * score) + alpha)
|
||||
return norm_score
|
||||
|
||||
def scalar_inc_dec(self, word, valence, is_cap_diff):
|
||||
"""
|
||||
Check if the preceding words increase, decrease, or negate/nullify the
|
||||
valence
|
||||
"""
|
||||
scalar = 0.0
|
||||
word_lower = word.lower()
|
||||
if word_lower in self.BOOSTER_DICT:
|
||||
scalar = self.BOOSTER_DICT[word_lower]
|
||||
if valence < 0:
|
||||
scalar *= -1
|
||||
# check if booster/dampener word is in ALLCAPS (while others aren't)
|
||||
if word.isupper() and is_cap_diff:
|
||||
if valence > 0:
|
||||
scalar += self.C_INCR
|
||||
else:
|
||||
scalar -= self.C_INCR
|
||||
return scalar
|
||||
|
||||
|
||||
class SentiText:
    """
    Identify sentiment-relevant string-level properties of input text.

    After construction the instance exposes:

    - ``text``: the input as a ``str``;
    - ``words_and_emoticons``: the whitespace-separated tokens longer than
      one character, with leading/trailing punctuation removed where it
      could be matched;
    - ``is_cap_diff``: whether some, but not all, of those tokens are
      ALL CAPS.
    """

    def __init__(self, text, punc_list, regex_remove_punctuation):
        """
        :param text: the text to analyse. ``bytes``/``bytearray`` are decoded
            as UTF-8; any other non-``str`` object is converted with ``str()``.
        :param punc_list: punctuation strings that may be attached to a word.
        :param regex_remove_punctuation: compiled regex whose matches are
            deleted to obtain the punctuation-free text.
        """
        # Coerce non-str input so the string methods used below work.
        if isinstance(text, (bytes, bytearray)):
            text = text.decode("utf-8")
        elif not isinstance(text, str):
            text = str(text)
        self.text = text
        self.PUNC_LIST = punc_list
        self.REGEX_REMOVE_PUNCTUATION = regex_remove_punctuation
        self.words_and_emoticons = self._words_and_emoticons()
        # doesn't separate words from
        # adjacent punctuation (keeps emoticons & contractions)
        self.is_cap_diff = self.allcap_differential(self.words_and_emoticons)

    def _words_plus_punc(self):
        """
        Returns mapping of form:
        {
            'cat,': 'cat',
            ',cat': 'cat',
        }
        """
        no_punc_text = self.REGEX_REMOVE_PUNCTUATION.sub("", self.text)
        # removes punctuation (but loses emoticons & contractions);
        # singletons are dropped
        words_only = {w for w in no_punc_text.split() if len(w) > 1}
        # the product gives ('cat', ',') and (',', 'cat')
        words_punc_dict = {
            punc + word: word for punc, word in product(self.PUNC_LIST, words_only)
        }
        words_punc_dict.update(
            {word + punc: word for word, punc in product(words_only, self.PUNC_LIST)}
        )
        return words_punc_dict

    def _words_and_emoticons(self):
        """
        Removes leading and trailing punctuation
        Leaves contractions and most emoticons
        Does not preserve punc-plus-letter emoticons (e.g. :D)
        """
        words_punc_dict = self._words_plus_punc()
        # Single-character tokens are dropped; tokens that match a known
        # word+punctuation form are replaced by the bare word.
        return [
            words_punc_dict.get(token, token)
            for token in self.text.split()
            if len(token) > 1
        ]

    def allcap_differential(self, words):
        """
        Check whether just some words in the input are ALL CAPS

        :param list words: The words to inspect
        :returns: `True` if some but not all items in `words` are ALL CAPS
        """
        allcap_words = sum(1 for word in words if word.isupper())
        return 0 < allcap_words < len(words)
|
||||
|
||||
|
||||
class SentimentIntensityAnalyzer:
    """
    Give a sentiment intensity score to sentences.
    """

    def __init__(
        self,
        lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt",
    ):
        """
        :param lexicon_file: resource name handed to `nltk.data.load`; its
            content is parsed by `make_lex_dict`.
        """
        self.lexicon_file = nltk.data.load(lexicon_file)
        self.lexicon = self.make_lex_dict()
        self.constants = VaderConstants()

    def make_lex_dict(self):
        """
        Convert lexicon file to a dictionary

        Each non-blank line is split on tabs; the first field is the word and
        the second its valence. Blank lines (for example the one produced by
        a trailing newline) are skipped.

        :return: a dict mapping each word to its valence as a float.
        """
        lex_dict = {}
        for line in self.lexicon_file.split("\n"):
            line = line.strip()
            if not line:
                continue
            word, measure = line.split("\t")[0:2]
            lex_dict[word] = float(measure)
        return lex_dict

    def polarity_scores(self, text):
        """
        Return a dict of sentiment strengths for the input text, with the
        keys "neg", "neu", "pos" and "compound".
        A positive "compound" value means positive valence, a negative one
        means negative valence.

        :note: Hashtags are not taken into consideration (e.g. #BAD is neutral). If you
            are interested in processing the text in the hashtags too, then we recommend
            preprocessing your data to remove the #, after which the hashtag text may be
            matched as if it was a normal word in the sentence.
        """
        sentitext = SentiText(
            text, self.constants.PUNC_LIST, self.constants.REGEX_REMOVE_PUNCTUATION
        )
        sentiments = []
        words_and_emoticons = sentitext.words_and_emoticons
        last_index = len(words_and_emoticons) - 1
        # enumerate() gives every token its own position; looking the token
        # up with list.index() returns the first occurrence, so a repeated
        # word would be scored against the wrong neighbours.
        for i, item in enumerate(words_and_emoticons):
            valence = 0
            item_lowercase = item.lower()
            if (
                i < last_index
                and item_lowercase == "kind"
                and words_and_emoticons[i + 1].lower() == "of"
            ) or item_lowercase in self.constants.BOOSTER_DICT:
                # Boosters and "kind of" carry no valence of their own.
                sentiments.append(valence)
                continue

            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)

        sentiments = self._but_check(words_and_emoticons, sentiments)

        # Score punctuation on the same string the tokens were taken from.
        return self.score_valence(sentiments, sentitext.text)

    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
        """
        Compute the valence of the token at position `i` and append it to
        `sentiments`.

        :param valence: the starting valence, kept when `item` is not in the
            lexicon.
        :param sentitext: the SentiText holding the tokens and the ALL-CAPS flag.
        :param item: the token being scored.
        :param i: position of `item` in `sentitext.words_and_emoticons`.
        :param sentiments: the list of valences computed so far.
        :return: `sentiments`, with the valence for `item` appended.
        """
        is_cap_diff = sentitext.is_cap_diff
        words_and_emoticons = sentitext.words_and_emoticons
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            # get the sentiment valence
            valence = self.lexicon[item_lowercase]

            # check if sentiment laden word is in ALL CAPS (while others aren't)
            if item.isupper() and is_cap_diff:
                if valence > 0:
                    valence += self.constants.C_INCR
                else:
                    valence -= self.constants.C_INCR

            # Look at up to three preceding tokens that are not lexicon words.
            for start_i in range(0, 3):
                if (
                    i > start_i
                    and words_and_emoticons[i - (start_i + 1)].lower()
                    not in self.lexicon
                ):
                    # dampen the scalar modifier of preceding words and emoticons
                    # (excluding the ones that immediately precede the item) based
                    # on their distance from the current item.
                    s = self.constants.scalar_inc_dec(
                        words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff
                    )
                    if start_i == 1 and s != 0:
                        s = s * 0.95
                    if start_i == 2 and s != 0:
                        s = s * 0.9
                    valence = valence + s
                    valence = self._never_check(
                        valence, words_and_emoticons, start_i, i
                    )
                    if start_i == 2:
                        valence = self._idioms_check(valence, words_and_emoticons, i)

                        # future work: consider other sentiment-laden idioms
                        # other_idioms =
                        # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2,
                        #  "upper hand": 1, "break a leg": 2,
                        #  "cooking with gas": 2, "in the black": 2, "in the red": -2,
                        #  "on the ball": 2,"under the weather": -2}

            valence = self._least_check(valence, words_and_emoticons, i)

        sentiments.append(valence)
        return sentiments

    def _least_check(self, valence, words_and_emoticons, i):
        """
        Negate `valence` when the token before position `i` is "least",
        unless it is preceded by "at" or "very".
        """
        # check for negation case using "least"
        if (
            i > 1
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            if (
                words_and_emoticons[i - 2].lower() != "at"
                and words_and_emoticons[i - 2].lower() != "very"
            ):
                valence = valence * self.constants.N_SCALAR
        elif (
            i > 0
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            valence = valence * self.constants.N_SCALAR
        return valence

    def _but_check(self, words_and_emoticons, sentiments):
        """
        Re-weight the valences around the first "but": those before it are
        halved, those after it are multiplied by 1.5.

        `sentiments` is modified in place and also returned.
        """
        lowered = [w_e.lower() for w_e in words_and_emoticons]
        if "but" in lowered:
            bi = lowered.index("but")
            for sidx, sentiment in enumerate(sentiments):
                if sidx < bi:
                    sentiments[sidx] = sentiment * 0.5
                elif sidx > bi:
                    sentiments[sidx] = sentiment * 1.5
        return sentiments

    def _idioms_check(self, valence, words_and_emoticons, i):
        """
        Replace `valence` when the tokens around position `i` form one of
        `SPECIAL_CASE_IDIOMS`, and dampen it when a two-word booster from
        `BOOSTER_DICT` (such as "sort of") precedes the token.

        Indexes back to `i - 3`; the visible caller only invokes it with
        `i >= 3`.
        """
        onezero = f"{words_and_emoticons[i - 1]} {words_and_emoticons[i]}"

        twoonezero = "{} {} {}".format(
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
            words_and_emoticons[i],
        )

        twoone = f"{words_and_emoticons[i - 2]} {words_and_emoticons[i - 1]}"

        threetwoone = "{} {} {}".format(
            words_and_emoticons[i - 3],
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
        )

        threetwo = "{} {}".format(
            words_and_emoticons[i - 3], words_and_emoticons[i - 2]
        )

        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]

        # The first matching backward-looking sequence wins.
        for seq in sequences:
            if seq in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[seq]
                break

        # Forward-looking idioms that start at the current token.
        if len(words_and_emoticons) - 1 > i:
            zeroone = f"{words_and_emoticons[i]} {words_and_emoticons[i + 1]}"
            if zeroone in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroone]
        if len(words_and_emoticons) - 1 > i + 1:
            zeroonetwo = "{} {} {}".format(
                words_and_emoticons[i],
                words_and_emoticons[i + 1],
                words_and_emoticons[i + 2],
            )
            if zeroonetwo in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroonetwo]

        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
        if (
            threetwo in self.constants.BOOSTER_DICT
            or twoone in self.constants.BOOSTER_DICT
        ):
            valence = valence + self.constants.B_DECR
        return valence

    def _never_check(self, valence, words_and_emoticons, start_i, i):
        """
        Apply negation, or "never so/this" emphasis, coming from the token
        `start_i + 1` positions before `i`.

        :param start_i: 0, 1 or 2, the distance (minus one) of the preceding
            token being examined.
        """
        if start_i == 0:
            if self.constants.negated([words_and_emoticons[i - 1]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 1:
            if words_and_emoticons[i - 2] == "never" and (
                words_and_emoticons[i - 1] == "so"
                or words_and_emoticons[i - 1] == "this"
            ):
                valence = valence * 1.5
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 2:
            # NOTE(review): `and` binds tighter than `or`, so a bare
            # "so"/"this" right before the token triggers this branch even
            # without "never"; the parentheses only make that explicit.
            if (
                words_and_emoticons[i - 3] == "never"
                and (
                    words_and_emoticons[i - 2] == "so"
                    or words_and_emoticons[i - 2] == "this"
                )
            ) or (
                words_and_emoticons[i - 1] == "so"
                or words_and_emoticons[i - 1] == "this"
            ):
                valence = valence * 1.25
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        return valence

    def _punctuation_emphasis(self, sum_s, text):
        """
        Return the emphasis contributed by exclamation points and question
        marks in `text`. `sum_s` is accepted but not used.
        """
        # add emphasis from exclamation points and question marks
        return self._amplify_ep(text) + self._amplify_qm(text)

    def _amplify_ep(self, text):
        """Return 0.292 per exclamation point in `text`, counting at most 4."""
        # check for added emphasis resulting from exclamation points (up to 4 of them)
        ep_count = min(text.count("!"), 4)
        # (empirically derived mean sentiment intensity rating increase for
        # exclamation points)
        return ep_count * 0.292

    def _amplify_qm(self, text):
        """
        Return the emphasis from question marks in `text`: 0 for fewer than
        two, 0.18 each for two or three, and 0.96 for more than three.
        """
        # check for added emphasis resulting from question marks (2 or 3+)
        qm_count = text.count("?")
        qm_amplifier = 0
        if qm_count > 1:
            if qm_count <= 3:
                # (empirically derived mean sentiment intensity rating increase for
                # question marks)
                qm_amplifier = qm_count * 0.18
            else:
                qm_amplifier = 0.96
        return qm_amplifier

    def _sift_sentiment_scores(self, sentiments):
        """
        Split the valences into a positive sum, a negative sum and a count
        of neutral (zero) entries.

        :return: the tuple `(pos_sum, neg_sum, neu_count)`.
        """
        # want separate positive versus negative sentiment scores
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in sentiments:
            if sentiment_score > 0:
                # compensates for neutral words that are counted as 1
                pos_sum += float(sentiment_score) + 1
            if sentiment_score < 0:
                # when used with math.fabs(), compensates for neutrals
                neg_sum += float(sentiment_score) - 1
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count

    def score_valence(self, sentiments, text):
        """
        Combine the per-token valences into the final scores.

        :param sentiments: the per-token valences.
        :param text: the text, used only to count "!" and "?".
        :return: a dict with "neg", "neu" and "pos" rounded to 3 decimals
            and "compound" rounded to 4; all are 0.0 when `sentiments` is
            empty.
        """
        if sentiments:
            sum_s = float(sum(sentiments))
            # compute and add emphasis from punctuation in text
            punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
            if sum_s > 0:
                sum_s += punct_emph_amplifier
            elif sum_s < 0:
                sum_s -= punct_emph_amplifier

            compound = self.constants.normalize(sum_s)
            # discriminate between positive, negative and neutral sentiment scores
            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)

            # The punctuation emphasis goes to whichever side dominates.
            if pos_sum > math.fabs(neg_sum):
                pos_sum += punct_emph_amplifier
            elif pos_sum < math.fabs(neg_sum):
                neg_sum -= punct_emph_amplifier

            total = pos_sum + math.fabs(neg_sum) + neu_count
            pos = math.fabs(pos_sum / total)
            neg = math.fabs(neg_sum / total)
            neu = math.fabs(neu_count / total)

        else:
            compound = 0.0
            pos = 0.0
            neg = 0.0
            neu = 0.0

        sentiment_dict = {
            "neg": round(neg, 3),
            "neu": round(neu, 3),
            "pos": round(pos, 3),
            "compound": round(compound, 4),
        }

        return sentiment_dict
|
||||
Reference in New Issue
Block a user