updates
@@ -0,0 +1,35 @@
# Natural Language Toolkit: Twitter
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
NLTK Twitter Package

This package contains classes for retrieving Tweet documents using the
Twitter API.

"""
try:
    import twython
except ImportError:
    import warnings

    warnings.warn(
        "The twython library has not been installed. "
        "Some functionality from the twitter package will not be available."
    )
else:
    from nltk.twitter.util import Authenticate, credsfromfile
    from nltk.twitter.twitterclient import (
        Streamer,
        Query,
        Twitter,
        TweetViewer,
        TweetWriter,
    )


from nltk.twitter.common import json2csv
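
# --- Illustrative sketch, not part of the vendored file ---
# The try/except guard above means the package-level names (Streamer, Query,
# Twitter, TweetViewer, TweetWriter, Authenticate, credsfromfile) only exist
# when twython is importable; json2csv is always available. Typical usage,
# assuming credentials are set up as described in util.py:
#
#     from nltk.twitter import Twitter
#
#     tw = Twitter()
#     tw.tweets(keywords="love, hate", stream=False, limit=10)  # search past Tweets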
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
145
Backend/venv/lib/python3.12/site-packages/nltk/twitter/api.py
Normal file
@@ -0,0 +1,145 @@
# Natural Language Toolkit: Twitter API
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
#         Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
This module provides an interface for TweetHandlers, and support for timezone
handling.
"""

import time as _time
from abc import ABCMeta, abstractmethod
from datetime import datetime, timedelta, timezone, tzinfo


class LocalTimezoneOffsetWithUTC(tzinfo):
    """
    This is not intended to be a general purpose class for dealing with the
    local timezone. In particular:

    * it assumes that the date passed has been created using
      `datetime(..., tzinfo=Local)`, where `Local` is an instance of
      the object `LocalTimezoneOffsetWithUTC`;
    * for such an object, it returns the offset with UTC, used for date comparisons.

    Reference: https://docs.python.org/3/library/datetime.html
    """

    STDOFFSET = timedelta(seconds=-_time.timezone)

    if _time.daylight:
        DSTOFFSET = timedelta(seconds=-_time.altzone)
    else:
        DSTOFFSET = STDOFFSET

    def utcoffset(self, dt):
        """
        Access the relevant time offset.
        """
        return self.DSTOFFSET


LOCAL = LocalTimezoneOffsetWithUTC()


class BasicTweetHandler(metaclass=ABCMeta):
    """
    Minimal implementation of `TweetHandler`.

    Counts the number of Tweets and decides when the client should stop
    fetching them.
    """

    def __init__(self, limit=20):
        self.limit = limit
        self.counter = 0

        """
        A flag to indicate to the client whether to stop fetching data given
        some condition (e.g., reaching a date limit).
        """
        self.do_stop = False

        """
        Stores the id of the last fetched Tweet to handle pagination.
        """
        self.max_id = None

    def do_continue(self):
        """
        Returns `False` if the client should stop fetching Tweets.
        """
        return self.counter < self.limit and not self.do_stop


class TweetHandlerI(BasicTweetHandler):
    """
    Interface class whose subclasses should implement a handle method that
    Twitter clients can delegate to.
    """

    def __init__(self, limit=20, upper_date_limit=None, lower_date_limit=None):
        """
        :param int limit: The number of data items to process in the current\
            round of processing.

        :param tuple upper_date_limit: The date at which to stop collecting\
            new data. This should be entered as a tuple which can serve as the\
            argument to `datetime.datetime`.\
            E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:40 pm on April 1 2015.

        :param tuple lower_date_limit: The date at which to stop collecting\
            new data. See `upper_date_limit` for formatting.
        """
        BasicTweetHandler.__init__(self, limit)

        self.upper_date_limit = None
        self.lower_date_limit = None
        if upper_date_limit:
            self.upper_date_limit = datetime(*upper_date_limit, tzinfo=LOCAL)
        if lower_date_limit:
            self.lower_date_limit = datetime(*lower_date_limit, tzinfo=LOCAL)

        self.startingup = True

    @abstractmethod
    def handle(self, data):
        """
        Deal appropriately with data returned by the Twitter API
        """

    @abstractmethod
    def on_finish(self):
        """
        Actions when the tweet limit has been reached
        """

    def check_date_limit(self, data, verbose=False):
        """
        Validate date limits.
        """
        if self.upper_date_limit or self.lower_date_limit:
            date_fmt = "%a %b %d %H:%M:%S +0000 %Y"
            tweet_date = datetime.strptime(data["created_at"], date_fmt).replace(
                tzinfo=timezone.utc
            )
            if (self.upper_date_limit and tweet_date > self.upper_date_limit) or (
                self.lower_date_limit and tweet_date < self.lower_date_limit
            ):
                if self.upper_date_limit:
                    message = "earlier"
                    date_limit = self.upper_date_limit
                else:
                    message = "later"
                    date_limit = self.lower_date_limit
                if verbose:
                    print(
                        "Date limit {} is {} than date of current tweet {}".format(
                            date_limit, message, tweet_date
                        )
                    )
                self.do_stop = True
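
# --- Illustrative sketch, not part of the vendored file ---
# A minimal concrete subclass of TweetHandlerI: a client increments
# `counter` and calls `handle` once per Tweet; `check_date_limit` sets
# `do_stop` when a Tweet falls outside the configured date window.
class PrintingHandler(TweetHandlerI):
    def handle(self, data):
        print(data["text"])
        self.check_date_limit(data, verbose=True)

    def on_finish(self):
        print(f"Processed {self.counter} Tweets")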
270
Backend/venv/lib/python3.12/site-packages/nltk/twitter/common.py
Normal file
@@ -0,0 +1,270 @@
# Natural Language Toolkit: Twitter client
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
#         Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Utility functions for the `twitterclient` module which do not require
the `twython` library to have been installed.
"""
import csv
import gzip
import json

from nltk.internals import deprecated

HIER_SEPARATOR = "."


def extract_fields(tweet, fields):
    """
    Extract field values from a full tweet and return them as a list

    :param json tweet: The tweet in JSON format
    :param list fields: The fields to be extracted from the tweet
    :rtype: list(str)
    """
    out = []
    for field in fields:
        try:
            _add_field_to_out(tweet, field, out)
        except TypeError as e:
            raise RuntimeError(
                f"Fatal error when extracting fields. Cannot find field {field}"
            ) from e
    return out


def _add_field_to_out(json, field, out):
    if _is_composed_key(field):
        key, value = _get_key_value_composed(field)
        _add_field_to_out(json[key], value, out)
    else:
        out += [json[field]]


def _is_composed_key(field):
    return HIER_SEPARATOR in field


def _get_key_value_composed(field):
    out = field.split(HIER_SEPARATOR)
    # there could be up to 3 levels
    key = out[0]
    value = HIER_SEPARATOR.join(out[1:])
    return key, value


def _get_entity_recursive(json, entity):
    if not json:
        return None
    elif isinstance(json, dict):
        for key, value in json.items():
            if key == entity:
                return value
            # 'entities' and 'extended_entities' are wrappers in Twitter json
            # structure that contain other Twitter objects. See:
            # https://dev.twitter.com/overview/api/entities-in-twitter-objects

            if key == "entities" or key == "extended_entities":
                candidate = _get_entity_recursive(value, entity)
                if candidate is not None:
                    return candidate
        return None
    elif isinstance(json, list):
        for item in json:
            candidate = _get_entity_recursive(item, entity)
            if candidate is not None:
                return candidate
        return None
    else:
        return None


def json2csv(
    fp, outfile, fields, encoding="utf8", errors="replace", gzip_compress=False
):
    """
    Extract selected fields from a file of line-separated JSON tweets and
    write to a file in CSV format.

    This utility function allows a file of full tweets to be easily converted
    to a CSV file for easier processing. For example, just TweetIDs or
    just the text content of the Tweets can be extracted.

    Additionally, the function allows combinations of fields of other Twitter
    objects (mainly the users, see below).

    For Twitter entities (e.g. hashtags of a Tweet), and for geolocation, see
    `json2csv_entities`

    :param fp: A file object containing the full tweets, one per line

    :param str outfile: The name of the text file where results should be\
        written

    :param list fields: The list of fields to be extracted. Useful examples\
        are 'id_str' for the tweetID and 'text' for the text of the tweet. See\
        <https://dev.twitter.com/overview/api/tweets> for a full list of fields.\
        e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']\
        Additionally, it allows IDs from other Twitter objects, e. g.,\
        ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count']

    :param errors: Behaviour for encoding errors, see\
        https://docs.python.org/3/library/codecs.html#codec-base-classes

    :param gzip_compress: if `True`, output files are compressed with gzip
    """
    (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
    # write the list of fields as header
    writer.writerow(fields)
    # process the file
    for line in fp:
        tweet = json.loads(line)
        row = extract_fields(tweet, fields)
        writer.writerow(row)
    outf.close()
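
# --- Illustrative sketch, not part of the vendored file ---
# Dumping Tweet IDs, text, and two fields of the nested user object to CSV;
# the dotted ("composed") field names are resolved by _add_field_to_out.
# The input and output filenames here are hypothetical.
#
#     from nltk.twitter.common import json2csv
#
#     with open("tweets.20150430-223406.json") as fp:
#         json2csv(fp, "tweets.csv",
#                  ["id_str", "text", "user.id", "user.followers_count"])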


@deprecated("Use open() and csv.writer() directly instead.")
def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
    """Get a CSV writer with optional compression."""
    return _outf_writer(outfile, encoding, errors, gzip_compress)


def _outf_writer(outfile, encoding, errors, gzip_compress=False):
    if gzip_compress:
        outf = gzip.open(outfile, "wt", newline="", encoding=encoding, errors=errors)
    else:
        outf = open(outfile, "w", newline="", encoding=encoding, errors=errors)
    writer = csv.writer(outf)
    return (writer, outf)


def json2csv_entities(
    tweets_file,
    outfile,
    main_fields,
    entity_type,
    entity_fields,
    encoding="utf8",
    errors="replace",
    gzip_compress=False,
):
    """
    Extract selected fields from a file of line-separated JSON tweets and
    write to a file in CSV format.

    This utility function allows a file of full Tweets to be easily converted
    to a CSV file for easier processing of Twitter entities. For example, the
    hashtags or media elements of a tweet can be extracted.

    It returns one line per entity of a Tweet, e.g. if a tweet has two hashtags
    there will be two lines in the output file, one per hashtag

    :param tweets_file: the file-like object containing full Tweets

    :param str outfile: The path of the text file where results should be\
        written

    :param list main_fields: The list of fields to be extracted from the main\
        object, usually the tweet. Useful examples: 'id_str' for the tweetID. See\
        <https://dev.twitter.com/overview/api/tweets> for a full list of fields.
        e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']
        If `entity_type` is expressed with hierarchy, then it is the list of\
        fields of the object that corresponds to the key of the entity_type,\
        (e.g., for entity_type='user.urls', the fields in the main_fields list\
        belong to the user object; for entity_type='place.bounding_box', the\
        fields in the main_fields list belong to the place object of the tweet).

    :param str entity_type: The name of the entity: 'hashtags', 'media',\
        'urls' and 'user_mentions' for the tweet object. For a user object,\
        this needs to be expressed with a hierarchy: `'user.urls'`. For the\
        bounding box of the Tweet location, use `'place.bounding_box'`.

    :param list entity_fields: The list of fields to be extracted from the\
        entity. E.g. `['text']` (of the Tweet)

    :param errors: Behaviour for encoding errors, see\
        https://docs.python.org/3/library/codecs.html#codec-base-classes

    :param gzip_compress: if `True`, output files are compressed with gzip
    """

    (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
    header = get_header_field_list(main_fields, entity_type, entity_fields)
    writer.writerow(header)
    for line in tweets_file:
        tweet = json.loads(line)
        if _is_composed_key(entity_type):
            key, value = _get_key_value_composed(entity_type)
            object_json = _get_entity_recursive(tweet, key)
            if not object_json:
                # this can happen in the case of "place"
                continue
            object_fields = extract_fields(object_json, main_fields)
            items = _get_entity_recursive(object_json, value)
            _write_to_file(object_fields, items, entity_fields, writer)
        else:
            tweet_fields = extract_fields(tweet, main_fields)
            items = _get_entity_recursive(tweet, entity_type)
            _write_to_file(tweet_fields, items, entity_fields, writer)
    outf.close()
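
# --- Illustrative sketch, not part of the vendored file ---
# One CSV row per entity: a Tweet with two hashtags produces two rows, each
# repeating the main fields. The input and output filenames are hypothetical.
#
#     from nltk.twitter.common import json2csv_entities
#
#     with open("tweets.20150430-223406.json") as fp:
#         json2csv_entities(fp, "hashtags.csv", ["id_str"], "hashtags", ["text"])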


def get_header_field_list(main_fields, entity_type, entity_fields):
    if _is_composed_key(entity_type):
        key, value = _get_key_value_composed(entity_type)
        main_entity = key
        sub_entity = value
    else:
        main_entity = None
        sub_entity = entity_type

    if main_entity:
        output1 = [HIER_SEPARATOR.join([main_entity, x]) for x in main_fields]
    else:
        output1 = main_fields
    output2 = [HIER_SEPARATOR.join([sub_entity, x]) for x in entity_fields]
    return output1 + output2


def _write_to_file(object_fields, items, entity_fields, writer):
    if not items:
        # it could be that the entity is just not present for the tweet
        # e.g. tweet hashtag is always present, even as [], however
        # tweet media may not be present
        return
    if isinstance(items, dict):
        # this happens e.g. for "place" of a tweet
        row = object_fields
        # there might be composed keys in the list of required fields
        entity_field_values = [x for x in entity_fields if not _is_composed_key(x)]
        entity_field_composed = [x for x in entity_fields if _is_composed_key(x)]
        for field in entity_field_values:
            value = items[field]
            if isinstance(value, list):
                row += value
            else:
                row += [value]
        # now check required dictionaries
        for d in entity_field_composed:
            kd, vd = _get_key_value_composed(d)
            json_dict = items[kd]
            if not isinstance(json_dict, dict):
                raise RuntimeError(
                    f"Key {kd} does not contain a dictionary in the json file"
                )
            row += [json_dict[vd]]
        writer.writerow(row)
        return
    # in general it is a list
    for item in items:
        row = object_fields + extract_fields(item, entity_fields)
        writer.writerow(row)
@@ -0,0 +1,306 @@
# Natural Language Toolkit: Twitter client
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
#         Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Examples to demo the :py:mod:`twitterclient` code.

These demo functions should all run, with the following caveats:

* You must have obtained API keys from Twitter, and installed them according to
  the instructions in the `twitter HOWTO <https://www.nltk.org/howto/twitter.html>`_.

* If you are on a slow network, some of the calls to the Twitter API may
  timeout.

* If you are being rate limited while searching, you will receive a 420
  error response.

* Your terminal window / console must be able to display UTF-8 encoded characters.

For documentation about the Twitter APIs, see `The Streaming APIs Overview
<https://dev.twitter.com/streaming/overview>`_ and `The REST APIs Overview
<https://dev.twitter.com/rest/public>`_.

For error codes see Twitter's
`Error Codes and Responses <https://dev.twitter.com/overview/api/response-codes>`
"""

import datetime
import json
from functools import wraps
from io import StringIO

from nltk.twitter import (
    Query,
    Streamer,
    TweetViewer,
    TweetWriter,
    Twitter,
    credsfromfile,
)

SPACER = "###################################"


def verbose(func):
    """Decorator for demo functions"""

    @wraps(func)
    def with_formatting(*args, **kwargs):
        print()
        print(SPACER)
        print("Using %s" % (func.__name__))
        print(SPACER)
        return func(*args, **kwargs)

    return with_formatting


def yesterday():
    """
    Get yesterday's datetime as a 6-tuple.
    """
    date = datetime.datetime.now()
    date -= datetime.timedelta(days=1)
    date_tuple = date.timetuple()[:6]
    return date_tuple


def setup():
    """
    Initialize global variables for the demos.
    """
    global USERIDS, FIELDS

    USERIDS = ["759251", "612473", "15108702", "6017542", "2673523800"]
    # UserIDs corresponding to
    # @CNN, @BBCNews, @ReutersLive, @BreakingNews, @AJELive
    FIELDS = ["id_str"]


@verbose
def twitterclass_demo():
    """
    Use the simplified :class:`Twitter` class to write some tweets to a file.
    """
    tw = Twitter()
    print("Track from the public stream\n")
    tw.tweets(keywords="love, hate", limit=10)  # public stream
    print(SPACER)
    print("Search past Tweets\n")
    tw = Twitter()
    tw.tweets(keywords="love, hate", stream=False, limit=10)  # search past tweets
    print(SPACER)
    print(
        "Follow two accounts in the public stream"
        + " -- be prepared to wait a few minutes\n"
    )
    tw = Twitter()
    tw.tweets(follow=["759251", "6017542"], stream=True, limit=5)  # public stream


@verbose
def sampletoscreen_demo(limit=20):
    """
    Sample from the Streaming API and send output to terminal.
    """
    oauth = credsfromfile()
    client = Streamer(**oauth)
    client.register(TweetViewer(limit=limit))
    client.sample()


@verbose
def tracktoscreen_demo(track="taylor swift", limit=10):
    """
    Track keywords from the public Streaming API and send output to terminal.
    """
    oauth = credsfromfile()
    client = Streamer(**oauth)
    client.register(TweetViewer(limit=limit))
    client.filter(track=track)


@verbose
def search_demo(keywords="nltk"):
    """
    Use the REST API to search for past tweets containing a given keyword.
    """
    oauth = credsfromfile()
    client = Query(**oauth)
    for tweet in client.search_tweets(keywords=keywords, limit=10):
        print(tweet["text"])


@verbose
def tweets_by_user_demo(user="NLTK_org", count=200):
    """
    Use the REST API to search for past tweets by a given user.
    """
    oauth = credsfromfile()
    client = Query(**oauth)
    client.register(TweetWriter())
    client.user_tweets(user, count)


@verbose
def lookup_by_userid_demo():
    """
    Use the REST API to convert a userID to a screen name.
    """
    oauth = credsfromfile()
    client = Query(**oauth)
    user_info = client.user_info_from_id(USERIDS)
    for info in user_info:
        name = info["screen_name"]
        followers = info["followers_count"]
        following = info["friends_count"]
        print(f"{name}, followers: {followers}, following: {following}")


@verbose
def followtoscreen_demo(limit=10):
    """
    Using the Streaming API, select just the tweets from a specified list of
    userIDs.

    This will only give results in a reasonable time if the users in
    question produce a high volume of tweets, and may even so show some delay.
    """
    oauth = credsfromfile()
    client = Streamer(**oauth)
    client.register(TweetViewer(limit=limit))
    client.statuses.filter(follow=USERIDS)


@verbose
def streamtofile_demo(limit=20):
    """
    Write 20 tweets sampled from the public Streaming API to a file.
    """
    oauth = credsfromfile()
    client = Streamer(**oauth)
    client.register(TweetWriter(limit=limit, repeat=False))
    client.statuses.sample()


@verbose
def limit_by_time_demo(keywords="nltk"):
    """
    Query the REST API for Tweets about NLTK since yesterday and send
    the output to terminal.

    This example makes the assumption that there are sufficient Tweets since
    yesterday for the date to be an effective cut-off.
    """
    date = yesterday()
    dt_date = datetime.datetime(*date)
    oauth = credsfromfile()
    client = Query(**oauth)
    client.register(TweetViewer(limit=100, lower_date_limit=date))

    print(f"Cutoff date: {dt_date}\n")

    for tweet in client.search_tweets(keywords=keywords):
        print("{} ".format(tweet["created_at"]), end="")
        client.handler.handle(tweet)


@verbose
def corpusreader_demo():
    """
    Use `TwitterCorpusReader` to read a file of tweets, and print out

    * some full tweets in JSON format;
    * some raw strings from the tweets (i.e., the value of the `text` field); and
    * the result of tokenising the raw strings.

    """
    from nltk.corpus import twitter_samples as tweets

    print()
    print("Complete tweet documents")
    print(SPACER)
    for tweet in tweets.docs("tweets.20150430-223406.json")[:1]:
        print(json.dumps(tweet, indent=1, sort_keys=True))

    print()
    print("Raw tweet strings:")
    print(SPACER)
    for text in tweets.strings("tweets.20150430-223406.json")[:15]:
        print(text)

    print()
    print("Tokenized tweet strings:")
    print(SPACER)
    for toks in tweets.tokenized("tweets.20150430-223406.json")[:15]:
        print(toks)


@verbose
def expand_tweetids_demo():
    """
    Given a file object containing a list of Tweet IDs, fetch the
    corresponding full Tweets, if available.

    """
    ids_f = StringIO(
        """\
588665495492124672
588665495487909888
588665495508766721
588665495513006080
588665495517200384
588665495487811584
588665495525588992
588665495487844352
588665495492014081
588665495512948737"""
    )
    oauth = credsfromfile()
    client = Query(**oauth)
    hydrated = client.expand_tweetids(ids_f)

    for tweet in hydrated:
        id_str = tweet["id_str"]
        print(f"id: {id_str}")
        text = tweet["text"]
        if text.startswith("@null"):
            text = "[Tweet not available]"
        print(text + "\n")


ALL = [
    twitterclass_demo,
    sampletoscreen_demo,
    tracktoscreen_demo,
    search_demo,
    tweets_by_user_demo,
    lookup_by_userid_demo,
    followtoscreen_demo,
    streamtofile_demo,
    limit_by_time_demo,
    corpusreader_demo,
    expand_tweetids_demo,
]

"""
Select demo functions to run. E.g. replace the following line with "DEMOS =
ALL[8:]" to execute only the final three demos.
"""
DEMOS = ALL[:]

if __name__ == "__main__":
    setup()

    for demo in DEMOS:
        demo()

    print("\n" + SPACER)
    print("All demos completed")
    print(SPACER)
@@ -0,0 +1,562 @@
# Natural Language Toolkit: Twitter client
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
#         Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT


"""
NLTK Twitter client

This module offers methods for collecting and processing Tweets. Most of the
functionality depends on access to the Twitter APIs, and this is handled via
the third party Twython library.

If one of the methods below returns an integer, it is probably a `Twitter
error code <https://dev.twitter.com/overview/api/response-codes>`_. For
example, the response of '420' means that you have reached the limit of the
requests you can currently make to the Twitter API. Currently, `rate limits
for the search API <https://dev.twitter.com/rest/public/rate-limiting>`_ are
divided into 15 minute windows.
"""

import datetime
import gzip
import itertools
import json
import os
import time

import requests
from twython import Twython, TwythonStreamer
from twython.exceptions import TwythonError, TwythonRateLimitError

from nltk.twitter.api import BasicTweetHandler, TweetHandlerI
from nltk.twitter.util import credsfromfile, guess_path


class Streamer(TwythonStreamer):
    """
    Retrieve data from the Twitter Streaming API.

    The streaming API requires
    `OAuth 1.0 <https://en.wikipedia.org/wiki/OAuth>`_ authentication.
    """

    def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret):
        self.handler = None
        self.do_continue = True
        TwythonStreamer.__init__(
            self, app_key, app_secret, oauth_token, oauth_token_secret
        )

    def register(self, handler):
        """
        Register a method for handling Tweets.

        :param TweetHandlerI handler: method for viewing
        """
        self.handler = handler

    def on_success(self, data):
        """
        :param data: response from Twitter API
        """
        if self.do_continue:
            if self.handler is not None:
                if "text" in data:
                    self.handler.counter += 1
                    self.handler.handle(data)
                    self.do_continue = self.handler.do_continue()
            else:
                raise ValueError("No data handler has been registered.")
        else:
            self.disconnect()
            self.handler.on_finish()

    def on_error(self, status_code, data):
        """
        :param status_code: The status code returned by the Twitter API
        :param data: The response from Twitter API

        """
        print(status_code)

    def sample(self):
        """
        Wrapper for 'statuses / sample' API call
        """
        while self.do_continue:
            # Stream in an endless loop until limit is reached. See twython
            # issue 288: https://github.com/ryanmcgrath/twython/issues/288
            # colditzjb commented on 9 Dec 2014

            try:
                self.statuses.sample()
            except requests.exceptions.ChunkedEncodingError as e:
                if e is not None:
                    print(f"Error (stream will continue): {e}")
                continue

    def filter(self, track="", follow="", lang="en"):
        """
        Wrapper for 'statuses / filter' API call
        """
        while self.do_continue:
            # Stream in an endless loop until limit is reached

            try:
                if track == "" and follow == "":
                    msg = "Please supply a value for 'track', 'follow'"
                    raise ValueError(msg)
                self.statuses.filter(track=track, follow=follow, lang=lang)
            except requests.exceptions.ChunkedEncodingError as e:
                if e is not None:
                    print(f"Error (stream will continue): {e}")
                continue


class Query(Twython):
    """
    Retrieve data from the Twitter REST API.
    """

    def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret):
        """
        :param app_key: (optional) Your applications key
        :param app_secret: (optional) Your applications secret key
        :param oauth_token: (optional) When using **OAuth 1**, combined with
            oauth_token_secret to make authenticated calls
        :param oauth_token_secret: (optional) When using **OAuth 1** combined
            with oauth_token to make authenticated calls
        """
        self.handler = None
        self.do_continue = True
        Twython.__init__(self, app_key, app_secret, oauth_token, oauth_token_secret)

    def register(self, handler):
        """
        Register a method for handling Tweets.

        :param TweetHandlerI handler: method for viewing or writing Tweets to a file.
        """
        self.handler = handler

    def expand_tweetids(self, ids_f, verbose=True):
        """
        Given a file object containing a list of Tweet IDs, fetch the
        corresponding full Tweets from the Twitter API.

        The API call `statuses/lookup` will fail to retrieve a Tweet if the
        user has deleted it.

        This call to the Twitter API is rate-limited. See
        <https://dev.twitter.com/rest/reference/get/statuses/lookup> for details.

        :param ids_f: input file object consisting of Tweet IDs, one to a line
        :return: iterable of Tweet objects in JSON format
        """
        ids = [line.strip() for line in ids_f if line]

        if verbose:
            print(f"Counted {len(ids)} Tweet IDs in {ids_f}.")

        # The Twitter endpoint takes lists of up to 100 ids, so we chunk the
        # ids.
        id_chunks = [ids[i : i + 100] for i in range(0, len(ids), 100)]

        chunked_tweets = (self.lookup_status(id=chunk) for chunk in id_chunks)

        return itertools.chain.from_iterable(chunked_tweets)

    def _search_tweets(self, keywords, limit=100, lang="en"):
        """
        Assumes that the handler has been informed. Fetches Tweets from
        search_tweets generator output and passes them to handler

        :param str keywords: A list of query terms to search for, written as\
            a comma-separated string.
        :param int limit: Number of Tweets to process
        :param str lang: language
        """
        while True:
            tweets = self.search_tweets(
                keywords=keywords, limit=limit, lang=lang, max_id=self.handler.max_id
            )
            for tweet in tweets:
                self.handler.handle(tweet)
            if not (self.handler.do_continue() and self.handler.repeat):
                break
        self.handler.on_finish()

    def search_tweets(
        self,
        keywords,
        limit=100,
        lang="en",
        max_id=None,
        retries_after_twython_exception=0,
    ):
        """
        Call the REST API ``'search/tweets'`` endpoint with some plausible
        defaults. See `the Twitter search documentation
        <https://dev.twitter.com/rest/public/search>`_ for more information
        about admissible search parameters.

        :param str keywords: A list of query terms to search for, written as\
            a comma-separated string
        :param int limit: Number of Tweets to process
        :param str lang: language
        :param int max_id: id of the last tweet fetched
        :param int retries_after_twython_exception: number of retries when\
            searching Tweets before raising an exception
        :rtype: python generator
        """
        if not self.handler:
            # if no handler is provided, `BasicTweetHandler` provides minimum
            # functionality for limiting the number of Tweets retrieved
            self.handler = BasicTweetHandler(limit=limit)

        count_from_query = 0
        if max_id:
            self.handler.max_id = max_id
        else:
            results = self.search(
                q=keywords, count=min(100, limit), lang=lang, result_type="recent"
            )
            count = len(results["statuses"])
            if count == 0:
                print("No Tweets available through REST API for those keywords")
                return
            count_from_query = count
            self.handler.max_id = results["statuses"][count - 1]["id"] - 1

            for result in results["statuses"]:
                yield result
                self.handler.counter += 1
                if not self.handler.do_continue():
                    return

        # Pagination loop: keep fetching Tweets until the desired count is
        # reached while dealing with Twitter rate limits.
        retries = 0
        while count_from_query < limit:
            try:
                mcount = min(100, limit - count_from_query)
                results = self.search(
                    q=keywords,
                    count=mcount,
                    lang=lang,
                    max_id=self.handler.max_id,
                    result_type="recent",
                )
            except TwythonRateLimitError as e:
                print(f"Waiting for 15 minutes -{e}")
                time.sleep(15 * 60)  # wait 15 minutes
                continue
            except TwythonError as e:
                print(f"Fatal error in Twython request -{e}")
                if retries_after_twython_exception == retries:
                    raise e
                retries += 1

            count = len(results["statuses"])
            if count == 0:
                print("No more Tweets available through rest api")
                return
            count_from_query += count
            # the max_id is also present in the Tweet metadata
            # results['search_metadata']['next_results'], but as part of a
            # query and difficult to fetch. This is doing the equivalent
            # (last tweet id minus one)
            self.handler.max_id = results["statuses"][count - 1]["id"] - 1

            for result in results["statuses"]:
                yield result
                self.handler.counter += 1
                if not self.handler.do_continue():
                    return
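
    # --- Illustrative sketch, not part of the vendored file ---
    # search_tweets paginates by storing max_id (last fetched id minus one)
    # on the handler and re-querying until `limit` Tweets have been yielded,
    # so a search can be resumed from a known Tweet id (the id below is an
    # arbitrary example value):
    #
    #     client = Query(**credsfromfile())
    #     for tweet in client.search_tweets(keywords="nltk", limit=50,
    #                                       max_id=588665495512948737):
    #         print(tweet["id_str"])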

    def user_info_from_id(self, userids):
        """
        Convert a list of userIDs into a variety of information about the users.

        See <https://dev.twitter.com/rest/reference/get/users/show>.

        :param list userids: A list of integer strings corresponding to Twitter userIDs
        :rtype: list(json)
        """
        return [self.show_user(user_id=userid) for userid in userids]

    def user_tweets(self, screen_name, limit, include_rts="false"):
        """
        Return a collection of the most recent Tweets posted by the user

        :param str screen_name: The user's screen name; the initial '@' symbol\
            should be omitted
        :param int limit: The number of Tweets to recover; 200 is the maximum allowed
        :param str include_rts: Whether to include statuses which have been\
            retweeted by the user; possible values are 'true' and 'false'
        """
        data = self.get_user_timeline(
            screen_name=screen_name, count=limit, include_rts=include_rts
        )
        for item in data:
            self.handler.handle(item)


class Twitter:
    """
    Wrapper class with restricted functionality and fewer options.
    """

    def __init__(self):
        self._oauth = credsfromfile()
        self.streamer = Streamer(**self._oauth)
        self.query = Query(**self._oauth)

    def tweets(
        self,
        keywords="",
        follow="",
        to_screen=True,
        stream=True,
        limit=100,
        date_limit=None,
        lang="en",
        repeat=False,
        gzip_compress=False,
    ):
        """
        Process some Tweets in a simple manner.

        :param str keywords: Keywords to use for searching or filtering
        :param list follow: UserIDs to use for filtering Tweets from the public stream
        :param bool to_screen: If `True`, display the tweet texts on the screen,\
            otherwise print to a file

        :param bool stream: If `True`, use the live public stream,\
            otherwise search past public Tweets

        :param int limit: The number of data items to process in the current\
            round of processing.

        :param tuple date_limit: The date at which to stop collecting\
            new data. This should be entered as a tuple which can serve as the\
            argument to `datetime.datetime`.\
            E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:40 pm on April 1 2015.
            Note that, in the case of streaming, this is the maximum date, i.e.\
            a date in the future; if not, it is the minimum date, i.e. a date\
            in the past

        :param str lang: language

        :param bool repeat: A flag to determine whether multiple files should\
            be written. If `True`, the length of each file will be set by the\
            value of `limit`. Use only if `to_screen` is `False`. See also
            :py:func:`handle`.

        :param gzip_compress: if `True`, output files are compressed with gzip.
        """
        if stream:
            upper_date_limit = date_limit
            lower_date_limit = None
        else:
            upper_date_limit = None
            lower_date_limit = date_limit

        if to_screen:
            handler = TweetViewer(
                limit=limit,
                upper_date_limit=upper_date_limit,
                lower_date_limit=lower_date_limit,
            )
        else:
            handler = TweetWriter(
                limit=limit,
                upper_date_limit=upper_date_limit,
                lower_date_limit=lower_date_limit,
                repeat=repeat,
                gzip_compress=gzip_compress,
            )

        if stream:
            self.streamer.register(handler)
            if keywords == "" and follow == "":
                self.streamer.sample()
            else:
                self.streamer.filter(track=keywords, follow=follow, lang=lang)
        else:
            self.query.register(handler)
            if keywords == "":
                raise ValueError("Please supply at least one keyword to search for.")
            else:
                self.query._search_tweets(keywords, limit=limit, lang=lang)


class TweetViewer(TweetHandlerI):
    """
    Handle data by sending it to the terminal.
    """

    def handle(self, data):
        """
        Direct data to `sys.stdout`

        :return: return ``False`` if processing should cease, otherwise return ``True``.
        :rtype: bool
        :param data: Tweet object returned by Twitter API
        """
        text = data["text"]
        print(text)

        self.check_date_limit(data)
        if self.do_stop:
            return

    def on_finish(self):
        print(f"Written {self.counter} Tweets")


class TweetWriter(TweetHandlerI):
    """
    Handle data by writing it to a file.
    """

    def __init__(
        self,
        limit=2000,
        upper_date_limit=None,
        lower_date_limit=None,
        fprefix="tweets",
        subdir="twitter-files",
        repeat=False,
        gzip_compress=False,
    ):
        """
        The difference between the upper and lower date limits depends on
        whether Tweets are coming in an ascending date order (i.e. when
        streaming) or descending date order (i.e. when searching past Tweets).

        :param int limit: number of data items to process in the current\
            round of processing.

        :param tuple upper_date_limit: The date at which to stop collecting new\
            data. This should be entered as a tuple which can serve as the\
            argument to `datetime.datetime`. E.g. `upper_date_limit=(2015, 4, 1, 12,\
            40)` for 12:40 pm on April 1 2015.

        :param tuple lower_date_limit: The date at which to stop collecting new\
            data. See `upper_date_limit` for formatting.

        :param str fprefix: The prefix to use in creating file names for Tweet\
            collections.

        :param str subdir: The name of the directory where Tweet collection\
            files should be stored.

        :param bool repeat: flag to determine whether multiple files should be\
            written. If `True`, the length of each file will be set by the value\
            of `limit`. See also :py:func:`handle`.

        :param gzip_compress: if `True`, output files are compressed with gzip.
        """
        self.fprefix = fprefix
        self.subdir = guess_path(subdir)
        self.gzip_compress = gzip_compress
        self.fname = self.timestamped_file()
        self.repeat = repeat
        self.output = None
        TweetHandlerI.__init__(self, limit, upper_date_limit, lower_date_limit)

    def timestamped_file(self):
        """
        :return: timestamped file name
        :rtype: str
        """
        subdir = self.subdir
        fprefix = self.fprefix
        if subdir:
            if not os.path.exists(subdir):
                os.mkdir(subdir)

        fname = os.path.join(subdir, fprefix)
        fmt = "%Y%m%d-%H%M%S"
        timestamp = datetime.datetime.now().strftime(fmt)
        if self.gzip_compress:
            suffix = ".gz"
        else:
            suffix = ""
        outfile = f"{fname}.{timestamp}.json{suffix}"
        return outfile

    def handle(self, data):
        """
        Write Twitter data as line-delimited JSON into one or more files.

        :return: return `False` if processing should cease, otherwise return `True`.
        :param data: tweet object returned by Twitter API
        """
        if self.startingup:
            if self.gzip_compress:
                self.output = gzip.open(self.fname, "w")
            else:
                self.output = open(self.fname, "w")
            print(f"Writing to {self.fname}")

        json_data = json.dumps(data)
        if self.gzip_compress:
            self.output.write((json_data + "\n").encode("utf-8"))
        else:
            self.output.write(json_data + "\n")

        self.check_date_limit(data)
        if self.do_stop:
            return

        self.startingup = False

    def on_finish(self):
        print(f"Written {self.counter} Tweets")
        if self.output:
            self.output.close()

    def do_continue(self):
        if not self.repeat:
            return TweetHandlerI.do_continue(self)

        if self.do_stop:
            # stop for a functional cause (e.g. date limit)
            return False

        if self.counter == self.limit:
            # repeat is True, thus close output file and
            # create a new one
            self._restart_file()
        return True

    def _restart_file(self):
        self.on_finish()
        self.fname = self.timestamped_file()
        self.startingup = True
        self.counter = 0
147
Backend/venv/lib/python3.12/site-packages/nltk/twitter/util.py
Normal file
@@ -0,0 +1,147 @@
# Natural Language Toolkit: Twitter client
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
#         Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Authentication utilities to accompany `twitterclient`.
"""

import os
import pprint

from twython import Twython


def credsfromfile(creds_file=None, subdir=None, verbose=False):
    """
    Convenience function for authentication
    """
    return Authenticate().load_creds(
        creds_file=creds_file, subdir=subdir, verbose=verbose
    )


class Authenticate:
    """
    Methods for authenticating with Twitter.
    """

    def __init__(self):
        self.creds_file = "credentials.txt"
        self.creds_fullpath = None

        self.oauth = {}
        try:
            self.twitter_dir = os.environ["TWITTER"]
            self.creds_subdir = self.twitter_dir
        except KeyError:
            self.twitter_dir = None
            self.creds_subdir = None

    def load_creds(self, creds_file=None, subdir=None, verbose=False):
        """
        Read OAuth credentials from a text file.

        File format for OAuth 1::

            app_key=YOUR_APP_KEY
            app_secret=YOUR_APP_SECRET
            oauth_token=OAUTH_TOKEN
            oauth_token_secret=OAUTH_TOKEN_SECRET

        File format for OAuth 2::

            app_key=YOUR_APP_KEY
            app_secret=YOUR_APP_SECRET
            access_token=ACCESS_TOKEN

        :param str creds_file: File containing credentials. ``None`` (default) reads
            data from `TWITTER/'credentials.txt'`
        """
        if creds_file is not None:
            self.creds_file = creds_file

        if subdir is None:
            if self.creds_subdir is None:
                msg = (
                    "Supply a value to the 'subdir' parameter or"
                    + " set the TWITTER environment variable."
                )
                raise ValueError(msg)
        else:
            self.creds_subdir = subdir

        self.creds_fullpath = os.path.normpath(
            os.path.join(self.creds_subdir, self.creds_file)
        )

        if not os.path.isfile(self.creds_fullpath):
            raise OSError(f"Cannot find file {self.creds_fullpath}")

        with open(self.creds_fullpath) as infile:
            if verbose:
                print(f"Reading credentials file {self.creds_fullpath}")

            for line in infile:
                if "=" in line:
                    name, value = line.split("=", 1)
                    self.oauth[name.strip()] = value.strip()

        self._validate_creds_file(verbose=verbose)

        return self.oauth
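
    # --- Illustrative sketch, not part of the vendored file ---
    # With the TWITTER environment variable pointing at a directory that
    # contains a credentials.txt in the OAuth 1 format shown above,
    # authentication reduces to:
    #
    #     oauth = Authenticate().load_creds(verbose=True)
    #     # equivalent to: oauth = credsfromfile()
    #     # oauth is a dict keyed by app_key, app_secret, oauth_token, ...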

    def _validate_creds_file(self, verbose=False):
        """Check validity of a credentials file."""
        oauth1 = False
        oauth1_keys = ["app_key", "app_secret", "oauth_token", "oauth_token_secret"]
        oauth2 = False
        oauth2_keys = ["app_key", "app_secret", "access_token"]
        if all(k in self.oauth for k in oauth1_keys):
            oauth1 = True
        elif all(k in self.oauth for k in oauth2_keys):
            oauth2 = True

        if not (oauth1 or oauth2):
            msg = f"Missing or incorrect entries in {self.creds_file}\n"
            msg += pprint.pformat(self.oauth)
            raise ValueError(msg)
        elif verbose:
            print(f'Credentials file "{self.creds_file}" looks good')


def add_access_token(creds_file=None):
    """
    For OAuth 2, retrieve an access token for an app and append it to a
    credentials file.
    """
    if creds_file is None:
        path = os.path.dirname(__file__)
        creds_file = os.path.join(path, "credentials2.txt")
    oauth2 = credsfromfile(creds_file=creds_file)
    app_key = oauth2["app_key"]
    app_secret = oauth2["app_secret"]

    twitter = Twython(app_key, app_secret, oauth_version=2)
    access_token = twitter.obtain_access_token()
    tok = f"access_token={access_token}\n"
    with open(creds_file, "a") as outfile:
        print(tok, file=outfile)


def guess_path(pth):
    """
    If the path is not absolute, guess that it is a subdirectory of the
    user's home directory.

    :param str pth: The pathname of the directory where files of tweets should be written
    """
    if os.path.isabs(pth):
        return pth
    else:
        return os.path.expanduser(os.path.join("~", pth))
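
# --- Illustrative sketch, not part of the vendored file ---
# guess_path is what TweetWriter uses to place its output directory:
# a relative name such as "twitter-files" resolves to ~/twitter-files,
# while an absolute path is returned unchanged.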