This commit is contained in:
Iliyan Angelov
2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions


@@ -0,0 +1,35 @@
# Natural Language Toolkit: Twitter
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
NLTK Twitter Package
This package contains classes for retrieving Tweet documents using the
Twitter API.
"""
try:
import twython
except ImportError:
import warnings
warnings.warn(
"The twython library has not been installed. "
"Some functionality from the twitter package will not be available."
)
else:
from nltk.twitter.util import Authenticate, credsfromfile
from nltk.twitter.twitterclient import (
Streamer,
Query,
Twitter,
TweetViewer,
TweetWriter,
)
from nltk.twitter.common import json2csv
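# Illustrative usage sketch (not executed on import): assuming twython is
# installed and valid credentials can be loaded with `credsfromfile()`, a
# minimal end-to-end run might look like:
#
#     from nltk.twitter import Twitter
#     tw = Twitter()
#     tw.tweets(keywords="nltk", stream=False, limit=10)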


@@ -0,0 +1,145 @@
# Natural Language Toolkit: Twitter API
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
This module provides an interface for TweetHandlers, and support for timezone
handling.
"""
import time as _time
from abc import ABCMeta, abstractmethod
from datetime import datetime, timedelta, timezone, tzinfo
class LocalTimezoneOffsetWithUTC(tzinfo):
"""
This is not intended to be a general purpose class for dealing with the
local timezone. In particular:
* it assumes that the date passed has been created using
`datetime(..., tzinfo=Local)`, where `Local` is an instance of
the object `LocalTimezoneOffsetWithUTC`;
* for such an object, it returns the offset from UTC, used for date comparisons.
Reference: https://docs.python.org/3/library/datetime.html
"""
STDOFFSET = timedelta(seconds=-_time.timezone)
if _time.daylight:
DSTOFFSET = timedelta(seconds=-_time.altzone)
else:
DSTOFFSET = STDOFFSET
def utcoffset(self, dt):
"""
Access the relevant time offset.
"""
return self.DSTOFFSET
LOCAL = LocalTimezoneOffsetWithUTC()
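# Example (illustrative): the tuple-style date limits used elsewhere in this
# package are turned into aware datetimes with this offset, e.g.
#
#     cutoff = datetime(2015, 4, 1, 12, 40, tzinfo=LOCAL)
#
# which can then be compared directly against UTC tweet timestamps.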
class BasicTweetHandler(metaclass=ABCMeta):
"""
Minimal implementation of `TweetHandler`.
Counts the number of Tweets and decides when the client should stop
fetching them.
"""
def __init__(self, limit=20):
self.limit = limit
self.counter = 0
"""
A flag to indicate to the client whether to stop fetching data given
some condition (e.g., reaching a date limit).
"""
self.do_stop = False
"""
Stores the id of the last fetched Tweet to handle pagination.
"""
self.max_id = None
def do_continue(self):
"""
Returns `False` if the client should stop fetching Tweets.
"""
return self.counter < self.limit and not self.do_stop
class TweetHandlerI(BasicTweetHandler):
"""
Interface class whose subclasses should implement a handle method that
Twitter clients can delegate to.
"""
def __init__(self, limit=20, upper_date_limit=None, lower_date_limit=None):
"""
:param int limit: The number of data items to process in the current\
round of processing.
:param tuple upper_date_limit: The date at which to stop collecting\
new data. This should be entered as a tuple which can serve as the\
argument to `datetime.datetime`.\
E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:40 pm on April 1 2015.
:param tuple lower_date_limit: The date at which to stop collecting\
new data. See `upper_date_limit` for formatting.
"""
BasicTweetHandler.__init__(self, limit)
self.upper_date_limit = None
self.lower_date_limit = None
if upper_date_limit:
self.upper_date_limit = datetime(*upper_date_limit, tzinfo=LOCAL)
if lower_date_limit:
self.lower_date_limit = datetime(*lower_date_limit, tzinfo=LOCAL)
self.startingup = True
@abstractmethod
def handle(self, data):
"""
Deal appropriately with data returned by the Twitter API
"""
@abstractmethod
def on_finish(self):
"""
Actions when the tweet limit has been reached
"""
def check_date_limit(self, data, verbose=False):
"""
Validate date limits.
"""
if self.upper_date_limit or self.lower_date_limit:
date_fmt = "%a %b %d %H:%M:%S +0000 %Y"
tweet_date = datetime.strptime(data["created_at"], date_fmt).replace(
tzinfo=timezone.utc
)
if (self.upper_date_limit and tweet_date > self.upper_date_limit) or (
self.lower_date_limit and tweet_date < self.lower_date_limit
):
if self.upper_date_limit:
message = "earlier"
date_limit = self.upper_date_limit
else:
message = "later"
date_limit = self.lower_date_limit
if verbose:
print(
"Date limit {} is {} than date of current tweet {}".format(
date_limit, message, tweet_date
)
)
self.do_stop = True
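# Illustrative sketch (hypothetical subclass, not part of this module): a
# concrete handler only needs `handle` and `on_finish`, e.g.
#
#     class TweetCollector(TweetHandlerI):
#         """Collect tweet texts in memory."""
#         def __init__(self, limit=20):
#             TweetHandlerI.__init__(self, limit)
#             self.texts = []
#
#         def handle(self, data):
#             self.check_date_limit(data)
#             self.texts.append(data.get("text", ""))
#
#         def on_finish(self):
#             print(f"Collected {len(self.texts)} Tweets")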


@@ -0,0 +1,270 @@
# Natural Language Toolkit: Twitter client
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Utility functions for the `twitterclient` module which do not require
the `twython` library to have been installed.
"""
import csv
import gzip
import json
from nltk.internals import deprecated
HIER_SEPARATOR = "."
def extract_fields(tweet, fields):
"""
Extract field values from a full tweet and return them as a list
:param json tweet: The tweet in JSON format
:param list fields: The fields to be extracted from the tweet
:rtype: list(str)
"""
out = []
for field in fields:
try:
_add_field_to_out(tweet, field, out)
except TypeError as e:
raise RuntimeError(
f"Fatal error when extracting fields. Cannot find field {field}"
) from e
return out
def _add_field_to_out(json, field, out):
if _is_composed_key(field):
key, value = _get_key_value_composed(field)
_add_field_to_out(json[key], value, out)
else:
out += [json[field]]
def _is_composed_key(field):
return HIER_SEPARATOR in field
def _get_key_value_composed(field):
out = field.split(HIER_SEPARATOR)
# there could be up to 3 levels
key = out[0]
value = HIER_SEPARATOR.join(out[1:])
return key, value
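# For example, the composed field "user.urls.url" splits into
# ("user", "urls.url"); the remainder is resolved recursively.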
def _get_entity_recursive(json, entity):
if not json:
return None
elif isinstance(json, dict):
for key, value in json.items():
if key == entity:
return value
# 'entities' and 'extended_entities' are wrappers in Twitter json
# structure that contain other Twitter objects. See:
# https://dev.twitter.com/overview/api/entities-in-twitter-objects
if key == "entities" or key == "extended_entities":
candidate = _get_entity_recursive(value, entity)
if candidate is not None:
return candidate
return None
elif isinstance(json, list):
for item in json:
candidate = _get_entity_recursive(item, entity)
if candidate is not None:
return candidate
return None
else:
return None
def json2csv(
fp, outfile, fields, encoding="utf8", errors="replace", gzip_compress=False
):
"""
Extract selected fields from a file of line-separated JSON tweets and
write to a file in CSV format.
This utility function allows a file of full tweets to be easily converted
to a CSV file for easier processing. For example, just TweetIDs or
just the text content of the Tweets can be extracted.
Additionally, the function allows combinations of fields of other Twitter
objects (mainly the users, see below).
For Twitter entities (e.g. hashtags of a Tweet), and for geolocation, see
`json2csv_entities`
:param fp: A file-like object of line-delimited JSON Tweets
:param str outfile: The name of the text file where results should be\
written
:param list fields: The list of fields to be extracted. Useful examples\
are 'id_str' for the tweetID and 'text' for the text of the tweet. See\
<https://dev.twitter.com/overview/api/tweets> for a full list of fields.\
e.g. ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']\
Additionally, it allows IDs from other Twitter objects, e.g.,\
['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count']
:param errors: Behaviour for encoding errors, see\
https://docs.python.org/3/library/codecs.html#codec-base-classes
:param gzip_compress: if `True`, output files are compressed with gzip
"""
(writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
# write the list of fields as header
writer.writerow(fields)
# process the file
for line in fp:
tweet = json.loads(line)
row = extract_fields(tweet, fields)
writer.writerow(row)
outf.close()
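# Illustrative usage sketch (file names are hypothetical): convert a file of
# line-delimited JSON Tweets into a CSV of selected fields:
#
#     with open("tweets.json") as fp:
#         json2csv(fp, "tweets.csv", ["id_str", "text", "user.screen_name"])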
@deprecated("Use open() and csv.writer() directly instead.")
def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
"""Get a CSV writer with optional compression."""
return _outf_writer(outfile, encoding, errors, gzip_compress)
def _outf_writer(outfile, encoding, errors, gzip_compress=False):
if gzip_compress:
outf = gzip.open(outfile, "wt", newline="", encoding=encoding, errors=errors)
else:
outf = open(outfile, "w", newline="", encoding=encoding, errors=errors)
writer = csv.writer(outf)
return (writer, outf)
def json2csv_entities(
tweets_file,
outfile,
main_fields,
entity_type,
entity_fields,
encoding="utf8",
errors="replace",
gzip_compress=False,
):
"""
Extract selected fields from a file of line-separated JSON tweets and
write to a file in CSV format.
This utility function allows a file of full Tweets to be easily converted
to a CSV file for easier processing of Twitter entities. For example, the
hashtags or media elements of a tweet can be extracted.
It returns one line per entity of a Tweet, e.g. if a tweet has two hashtags
there will be two lines in the output file, one per hashtag.
:param tweets_file: the file-like object containing full Tweets
:param str outfile: The path of the text file where results should be\
written
:param list main_fields: The list of fields to be extracted from the main\
object, usually the tweet. Useful examples: 'id_str' for the tweetID. See\
<https://dev.twitter.com/overview/api/tweets> for a full list of fields.
e.g. ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']
If `entity_type` is expressed with hierarchy, then it is the list of\
fields of the object that corresponds to the key of the entity_type,\
(e.g., for entity_type='user.urls', the fields in the main_fields list\
belong to the user object; for entity_type='place.bounding_box', the\
fields in the main_fields list belong to the place object of the tweet).
:param str entity_type: The name of the entity: 'hashtags', 'media',\
'urls' and 'user_mentions' for the tweet object. For a user object,\
this needs to be expressed with a hierarchy: `'user.urls'`. For the\
bounding box of the Tweet location, use `'place.bounding_box'`.
:param list entity_fields: The list of fields to be extracted from the\
entity. E.g. `['text']` (of the Tweet)
:param errors: Behaviour for encoding errors, see\
https://docs.python.org/3/library/codecs.html#codec-base-classes
:param gzip_compress: if `True`, output files are compressed with gzip
"""
(writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
header = get_header_field_list(main_fields, entity_type, entity_fields)
writer.writerow(header)
for line in tweets_file:
tweet = json.loads(line)
if _is_composed_key(entity_type):
key, value = _get_key_value_composed(entity_type)
object_json = _get_entity_recursive(tweet, key)
if not object_json:
# this can happen in the case of "place"
continue
object_fields = extract_fields(object_json, main_fields)
items = _get_entity_recursive(object_json, value)
_write_to_file(object_fields, items, entity_fields, writer)
else:
tweet_fields = extract_fields(tweet, main_fields)
items = _get_entity_recursive(tweet, entity_type)
_write_to_file(tweet_fields, items, entity_fields, writer)
outf.close()
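# Illustrative usage sketch (file names are hypothetical): write one row per
# hashtag, keyed by the tweet ID:
#
#     with open("tweets.json") as fp:
#         json2csv_entities(fp, "hashtags.csv", ["id_str"], "hashtags", ["text"])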
def get_header_field_list(main_fields, entity_type, entity_fields):
if _is_composed_key(entity_type):
key, value = _get_key_value_composed(entity_type)
main_entity = key
sub_entity = value
else:
main_entity = None
sub_entity = entity_type
if main_entity:
output1 = [HIER_SEPARATOR.join([main_entity, x]) for x in main_fields]
else:
output1 = main_fields
output2 = [HIER_SEPARATOR.join([sub_entity, x]) for x in entity_fields]
return output1 + output2
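# For example:
#     get_header_field_list(["id_str"], "hashtags", ["text"])
#         -> ["id_str", "hashtags.text"]
#     get_header_field_list(["id"], "user.urls", ["url"])
#         -> ["user.id", "urls.url"]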
def _write_to_file(object_fields, items, entity_fields, writer):
if not items:
# the entity may simply be absent for this tweet:
# e.g. 'hashtags' is always present (possibly as []), whereas
# 'media' may be missing altogether
return
if isinstance(items, dict):
# this happens e.g. for "place" of a tweet
row = object_fields
# there might be composed keys in the list of required fields
entity_field_values = [x for x in entity_fields if not _is_composed_key(x)]
entity_field_composed = [x for x in entity_fields if _is_composed_key(x)]
for field in entity_field_values:
value = items[field]
if isinstance(value, list):
row += value
else:
row += [value]
# now check required dictionaries
for d in entity_field_composed:
kd, vd = _get_key_value_composed(d)
json_dict = items[kd]
if not isinstance(json_dict, dict):
raise RuntimeError(
f"Key {kd} does not contain a dictionary in the json file"
)
row += [json_dict[vd]]
writer.writerow(row)
return
# in general it is a list
for item in items:
row = object_fields + extract_fields(item, entity_fields)
writer.writerow(row)


@@ -0,0 +1,306 @@
# Natural Language Toolkit: Twitter client
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Examples to demo the :py:mod:`twitterclient` code.
These demo functions should all run, with the following caveats:
* You must have obtained API keys from Twitter, and installed them according to
the instructions in the `twitter HOWTO <https://www.nltk.org/howto/twitter.html>`_.
* If you are on a slow network, some of the calls to the Twitter API may
timeout.
* If you are being rate limited while searching, you will receive a 420
error response.
* Your terminal window / console must be able to display UTF-8 encoded characters.
For documentation about the Twitter APIs, see `The Streaming APIs Overview
<https://dev.twitter.com/streaming/overview>`_ and `The REST APIs Overview
<https://dev.twitter.com/rest/public>`_.
For error codes see Twitter's
`Error Codes and Responses <https://dev.twitter.com/overview/api/response-codes>`
"""
import datetime
import json
from functools import wraps
from io import StringIO
from nltk.twitter import (
Query,
Streamer,
TweetViewer,
TweetWriter,
Twitter,
credsfromfile,
)
SPACER = "###################################"
def verbose(func):
"""Decorator for demo functions"""
@wraps(func)
def with_formatting(*args, **kwargs):
print()
print(SPACER)
print("Using %s" % (func.__name__))
print(SPACER)
return func(*args, **kwargs)
return with_formatting
def yesterday():
"""
Get yesterday's datetime as a 6-tuple (year, month, day, hour, minute, second).
"""
date = datetime.datetime.now()
date -= datetime.timedelta(days=1)
date_tuple = date.timetuple()[:6]
return date_tuple
def setup():
"""
Initialize global variables for the demos.
"""
global USERIDS, FIELDS
USERIDS = ["759251", "612473", "15108702", "6017542", "2673523800"]
# UserIDs corresponding to @CNN, @BBCNews, @ReutersLive, @BreakingNews, @AJELive
FIELDS = ["id_str"]
@verbose
def twitterclass_demo():
"""
Use the simplified :class:`Twitter` class to write some tweets to a file.
"""
tw = Twitter()
print("Track from the public stream\n")
tw.tweets(keywords="love, hate", limit=10) # public stream
print(SPACER)
print("Search past Tweets\n")
tw = Twitter()
tw.tweets(keywords="love, hate", stream=False, limit=10) # search past tweets
print(SPACER)
print(
"Follow two accounts in the public stream"
+ " -- be prepared to wait a few minutes\n"
)
tw = Twitter()
tw.tweets(follow=["759251", "6017542"], stream=True, limit=5) # public stream
@verbose
def sampletoscreen_demo(limit=20):
"""
Sample from the Streaming API and send output to terminal.
"""
oauth = credsfromfile()
client = Streamer(**oauth)
client.register(TweetViewer(limit=limit))
client.sample()
@verbose
def tracktoscreen_demo(track="taylor swift", limit=10):
"""
Track keywords from the public Streaming API and send output to terminal.
"""
oauth = credsfromfile()
client = Streamer(**oauth)
client.register(TweetViewer(limit=limit))
client.filter(track=track)
@verbose
def search_demo(keywords="nltk"):
"""
Use the REST API to search for past tweets containing a given keyword.
"""
oauth = credsfromfile()
client = Query(**oauth)
for tweet in client.search_tweets(keywords=keywords, limit=10):
print(tweet["text"])
@verbose
def tweets_by_user_demo(user="NLTK_org", count=200):
"""
Use the REST API to search for past tweets by a given user.
"""
oauth = credsfromfile()
client = Query(**oauth)
client.register(TweetWriter())
client.user_tweets(user, count)
@verbose
def lookup_by_userid_demo():
"""
Use the REST API to convert a userID to a screen name.
"""
oauth = credsfromfile()
client = Query(**oauth)
user_info = client.user_info_from_id(USERIDS)
for info in user_info:
name = info["screen_name"]
followers = info["followers_count"]
following = info["friends_count"]
print(f"{name}, followers: {followers}, following: {following}")
@verbose
def followtoscreen_demo(limit=10):
"""
Using the Streaming API, select just the tweets from a specified list of
userIDs.
This will only give results in a reasonable time if the users in
question produce a high volume of Tweets, and even then may show some delay.
"""
oauth = credsfromfile()
client = Streamer(**oauth)
client.register(TweetViewer(limit=limit))
client.statuses.filter(follow=USERIDS)
@verbose
def streamtofile_demo(limit=20):
"""
Write 20 tweets sampled from the public Streaming API to a file.
"""
oauth = credsfromfile()
client = Streamer(**oauth)
client.register(TweetWriter(limit=limit, repeat=False))
client.statuses.sample()
@verbose
def limit_by_time_demo(keywords="nltk"):
"""
Query the REST API for Tweets about NLTK since yesterday and send
the output to terminal.
This example makes the assumption that there are sufficient Tweets since
yesterday for the date to be an effective cut-off.
"""
date = yesterday()
dt_date = datetime.datetime(*date)
oauth = credsfromfile()
client = Query(**oauth)
client.register(TweetViewer(limit=100, lower_date_limit=date))
print(f"Cutoff date: {dt_date}\n")
for tweet in client.search_tweets(keywords=keywords):
print("{} ".format(tweet["created_at"]), end="")
client.handler.handle(tweet)
@verbose
def corpusreader_demo():
"""
Use `TwitterCorpusReader` to read a file of tweets, and print out
* some full tweets in JSON format;
* some raw strings from the tweets (i.e., the value of the `text` field); and
* the result of tokenising the raw strings.
"""
from nltk.corpus import twitter_samples as tweets
print()
print("Complete tweet documents")
print(SPACER)
for tweet in tweets.docs("tweets.20150430-223406.json")[:1]:
print(json.dumps(tweet, indent=1, sort_keys=True))
print()
print("Raw tweet strings:")
print(SPACER)
for text in tweets.strings("tweets.20150430-223406.json")[:15]:
print(text)
print()
print("Tokenized tweet strings:")
print(SPACER)
for toks in tweets.tokenized("tweets.20150430-223406.json")[:15]:
print(toks)
@verbose
def expand_tweetids_demo():
"""
Given a file object containing a list of Tweet IDs, fetch the
corresponding full Tweets, if available.
"""
ids_f = StringIO(
"""\
588665495492124672
588665495487909888
588665495508766721
588665495513006080
588665495517200384
588665495487811584
588665495525588992
588665495487844352
588665495492014081
588665495512948737"""
)
oauth = credsfromfile()
client = Query(**oauth)
hydrated = client.expand_tweetids(ids_f)
for tweet in hydrated:
id_str = tweet["id_str"]
print(f"id: {id_str}")
text = tweet["text"]
if text.startswith("@null"):
text = "[Tweet not available]"
print(text + "\n")
ALL = [
twitterclass_demo,
sampletoscreen_demo,
tracktoscreen_demo,
search_demo,
tweets_by_user_demo,
lookup_by_userid_demo,
followtoscreen_demo,
streamtofile_demo,
limit_by_time_demo,
corpusreader_demo,
expand_tweetids_demo,
]
"""
Select demo functions to run. E.g. replace the following line with "DEMOS =
ALL[8:]" to execute only the final three demos.
"""
DEMOS = ALL[:]
if __name__ == "__main__":
setup()
for demo in DEMOS:
demo()
print("\n" + SPACER)
print("All demos completed")
print(SPACER)


@@ -0,0 +1,562 @@
# Natural Language Toolkit: Twitter client
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
NLTK Twitter client
This module offers methods for collecting and processing Tweets. Most of the
functionality depends on access to the Twitter APIs, and this is handled via
the third party Twython library.
If one of the methods below returns an integer, it is probably a `Twitter
error code <https://dev.twitter.com/overview/api/response-codes>`_. For
example, the response of '420' means that you have reached the limit of the
requests you can currently make to the Twitter API. Currently, `rate limits
for the search API <https://dev.twitter.com/rest/public/rate-limiting>`_ are
divided into 15 minute windows.
"""
import datetime
import gzip
import itertools
import json
import os
import time
import requests
from twython import Twython, TwythonStreamer
from twython.exceptions import TwythonError, TwythonRateLimitError
from nltk.twitter.api import BasicTweetHandler, TweetHandlerI
from nltk.twitter.util import credsfromfile, guess_path
class Streamer(TwythonStreamer):
"""
Retrieve data from the Twitter Streaming API.
The streaming API requires
`OAuth 1.0 <https://en.wikipedia.org/wiki/OAuth>`_ authentication.
"""
def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret):
self.handler = None
self.do_continue = True
TwythonStreamer.__init__(
self, app_key, app_secret, oauth_token, oauth_token_secret
)
def register(self, handler):
"""
Register a method for handling Tweets.
:param TweetHandlerI handler: method for viewing
"""
self.handler = handler
def on_success(self, data):
"""
:param data: response from Twitter API
"""
if self.do_continue:
if self.handler is not None:
if "text" in data:
self.handler.counter += 1
self.handler.handle(data)
self.do_continue = self.handler.do_continue()
else:
raise ValueError("No data handler has been registered.")
else:
self.disconnect()
self.handler.on_finish()
def on_error(self, status_code, data):
"""
:param status_code: The status code returned by the Twitter API
:param data: The response from Twitter API
"""
print(status_code)
def sample(self):
"""
Wrapper for 'statuses / sample' API call
"""
while self.do_continue:
# Stream in an endless loop until limit is reached. See twython
# issue 288: https://github.com/ryanmcgrath/twython/issues/288
# colditzjb commented on 9 Dec 2014
try:
self.statuses.sample()
except requests.exceptions.ChunkedEncodingError as e:
if e is not None:
print(f"Error (stream will continue): {e}")
continue
def filter(self, track="", follow="", lang="en"):
"""
Wrapper for 'statuses / filter' API call
"""
while self.do_continue:
# Stream in an endless loop until limit is reached
try:
if track == "" and follow == "":
msg = "Please supply a value for 'track' or 'follow'."
raise ValueError(msg)
self.statuses.filter(track=track, follow=follow, lang=lang)
except requests.exceptions.ChunkedEncodingError as e:
if e is not None:
print(f"Error (stream will continue): {e}")
continue
class Query(Twython):
"""
Retrieve data from the Twitter REST API.
"""
def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret):
"""
:param app_key: (optional) Your application's key
:param app_secret: (optional) Your application's secret key
:param oauth_token: (optional) When using **OAuth 1**, combined with
oauth_token_secret to make authenticated calls
:param oauth_token_secret: (optional) When using **OAuth 1** combined
with oauth_token to make authenticated calls
"""
self.handler = None
self.do_continue = True
Twython.__init__(self, app_key, app_secret, oauth_token, oauth_token_secret)
def register(self, handler):
"""
Register a method for handling Tweets.
:param TweetHandlerI handler: method for viewing or writing Tweets to a file.
"""
self.handler = handler
def expand_tweetids(self, ids_f, verbose=True):
"""
Given a file object containing a list of Tweet IDs, fetch the
corresponding full Tweets from the Twitter API.
The API call `statuses/lookup` will fail to retrieve a Tweet if the
user has deleted it.
This call to the Twitter API is rate-limited. See
<https://dev.twitter.com/rest/reference/get/statuses/lookup> for details.
:param ids_f: input file object consisting of Tweet IDs, one to a line
:return: iterable of Tweet objects in JSON format
"""
ids = [line.strip() for line in ids_f if line]
if verbose:
print(f"Counted {len(ids)} Tweet IDs in {ids_f}.")
# The Twitter endpoint takes lists of up to 100 ids, so we chunk the
# ids.
id_chunks = [ids[i : i + 100] for i in range(0, len(ids), 100)]
chunked_tweets = (self.lookup_status(id=chunk) for chunk in id_chunks)
return itertools.chain.from_iterable(chunked_tweets)
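# Illustrative usage sketch (the IDs file is hypothetical):
#     oauth = credsfromfile()
#     client = Query(**oauth)
#     with open("tweet_ids.txt") as ids_f:
#         for tweet in client.expand_tweetids(ids_f):
#             print(tweet["text"])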
def _search_tweets(self, keywords, limit=100, lang="en"):
"""
Assumes that a handler has been registered. Fetches Tweets from the
`search_tweets` generator and passes them to the handler.
:param str keywords: A list of query terms to search for, written as\
a comma-separated string.
:param int limit: Number of Tweets to process
:param str lang: language
"""
while True:
tweets = self.search_tweets(
keywords=keywords, limit=limit, lang=lang, max_id=self.handler.max_id
)
for tweet in tweets:
self.handler.handle(tweet)
if not (self.handler.do_continue() and self.handler.repeat):
break
self.handler.on_finish()
def search_tweets(
self,
keywords,
limit=100,
lang="en",
max_id=None,
retries_after_twython_exception=0,
):
"""
Call the REST API ``'search/tweets'`` endpoint with some plausible
defaults. See `the Twitter search documentation
<https://dev.twitter.com/rest/public/search>`_ for more information
about admissible search parameters.
:param str keywords: A list of query terms to search for, written as\
a comma-separated string
:param int limit: Number of Tweets to process
:param str lang: language
:param int max_id: id of the last tweet fetched
:param int retries_after_twython_exception: number of retries when\
searching Tweets before raising an exception
:rtype: python generator
"""
if not self.handler:
# if no handler is provided, `BasicTweetHandler` provides minimum
# functionality for limiting the number of Tweets retrieved
self.handler = BasicTweetHandler(limit=limit)
count_from_query = 0
if max_id:
self.handler.max_id = max_id
else:
results = self.search(
q=keywords, count=min(100, limit), lang=lang, result_type="recent"
)
count = len(results["statuses"])
if count == 0:
print("No Tweets available through REST API for those keywords")
return
count_from_query = count
self.handler.max_id = results["statuses"][count - 1]["id"] - 1
for result in results["statuses"]:
yield result
self.handler.counter += 1
if not self.handler.do_continue():
return
# Pagination loop: keep fetching Tweets until the desired count is
# reached while dealing with Twitter rate limits.
retries = 0
while count_from_query < limit:
try:
mcount = min(100, limit - count_from_query)
results = self.search(
q=keywords,
count=mcount,
lang=lang,
max_id=self.handler.max_id,
result_type="recent",
)
except TwythonRateLimitError as e:
print(f"Waiting for 15 minutes -{e}")
time.sleep(15 * 60) # wait 15 minutes
continue
except TwythonError as e:
print(f"Fatal error in Twython request -{e}")
if retries_after_twython_exception == retries:
raise e
retries += 1
count = len(results["statuses"])
if count == 0:
print("No more Tweets available through the REST API")
return
count_from_query += count
# the max_id is also present in the Tweet metadata
# results['search_metadata']['next_results'], but as part of a
# query and difficult to fetch. This is doing the equivalent
# (last tweet id minus one)
self.handler.max_id = results["statuses"][count - 1]["id"] - 1
for result in results["statuses"]:
yield result
self.handler.counter += 1
if not self.handler.do_continue():
return
def user_info_from_id(self, userids):
"""
Convert a list of userIDs into a variety of information about the users.
See <https://dev.twitter.com/rest/reference/get/users/show>.
:param list userids: A list of integer strings corresponding to Twitter userIDs
:rtype: list(json)
"""
return [self.show_user(user_id=userid) for userid in userids]
def user_tweets(self, screen_name, limit, include_rts="false"):
"""
Return a collection of the most recent Tweets posted by the user
:param str screen_name: The user's screen name; the initial '@' symbol\
should be omitted
:param int limit: The number of Tweets to recover; 200 is the maximum allowed
:param str include_rts: Whether to include statuses which have been\
retweeted by the user; possible values are 'true' and 'false'
"""
data = self.get_user_timeline(
screen_name=screen_name, count=limit, include_rts=include_rts
)
for item in data:
self.handler.handle(item)
class Twitter:
"""
Wrapper class with restricted functionality and fewer options.
"""
def __init__(self):
self._oauth = credsfromfile()
self.streamer = Streamer(**self._oauth)
self.query = Query(**self._oauth)
def tweets(
self,
keywords="",
follow="",
to_screen=True,
stream=True,
limit=100,
date_limit=None,
lang="en",
repeat=False,
gzip_compress=False,
):
"""
Process some Tweets in a simple manner.
:param str keywords: Keywords to use for searching or filtering
:param list follow: UserIDs to use for filtering Tweets from the public stream
:param bool to_screen: If `True`, display the tweet texts on the screen,\
otherwise print to a file
:param bool stream: If `True`, use the live public stream,\
otherwise search past public Tweets
:param int limit: The number of data items to process in the current\
round of processing.
:param tuple date_limit: The date at which to stop collecting\
new data. This should be entered as a tuple which can serve as the\
argument to `datetime.datetime`.\
E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:40 pm on April 1 2015.
Note that, in the case of streaming, this is the maximum date, i.e.\
a date in the future; if not, it is the minimum date, i.e. a date\
in the past
:param str lang: language
:param bool repeat: A flag to determine whether multiple files should\
be written. If `True`, the length of each file will be set by the\
value of `limit`. Use only if `to_screen` is `False`. See also
:py:func:`handle`.
:param gzip_compress: if `True`, output files are compressed with gzip.
"""
if stream:
upper_date_limit = date_limit
lower_date_limit = None
else:
upper_date_limit = None
lower_date_limit = date_limit
if to_screen:
handler = TweetViewer(
limit=limit,
upper_date_limit=upper_date_limit,
lower_date_limit=lower_date_limit,
)
else:
handler = TweetWriter(
limit=limit,
upper_date_limit=upper_date_limit,
lower_date_limit=lower_date_limit,
repeat=repeat,
gzip_compress=gzip_compress,
)
if stream:
self.streamer.register(handler)
if keywords == "" and follow == "":
self.streamer.sample()
else:
self.streamer.filter(track=keywords, follow=follow, lang=lang)
else:
self.query.register(handler)
if keywords == "":
raise ValueError("Please supply at least one keyword to search for.")
else:
self.query._search_tweets(keywords, limit=limit, lang=lang)
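# Illustrative usage sketch (taken from the demo module): follow two accounts
# on the public stream, stopping after five Tweets:
#
#     tw = Twitter()
#     tw.tweets(follow=["759251", "6017542"], stream=True, limit=5)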
class TweetViewer(TweetHandlerI):
"""
Handle data by sending it to the terminal.
"""
def handle(self, data):
"""
Direct data to `sys.stdout`
:return: return ``False`` if processing should cease, otherwise return ``True``.
:rtype: bool
:param data: Tweet object returned by Twitter API
"""
text = data["text"]
print(text)
self.check_date_limit(data)
if self.do_stop:
return
def on_finish(self):
print(f"Written {self.counter} Tweets")
class TweetWriter(TweetHandlerI):
"""
Handle data by writing it to a file.
"""
def __init__(
self,
limit=2000,
upper_date_limit=None,
lower_date_limit=None,
fprefix="tweets",
subdir="twitter-files",
repeat=False,
gzip_compress=False,
):
"""
The difference between the upper and lower date limits depends on
whether Tweets are coming in an ascending date order (i.e. when
streaming) or descending date order (i.e. when searching past Tweets).
:param int limit: number of data items to process in the current\
round of processing.
:param tuple upper_date_limit: The date at which to stop collecting new\
data. This should be entered as a tuple which can serve as the\
argument to `datetime.datetime`. E.g. `upper_date_limit=(2015, 4, 1, 12,\
40)` for 12:40 pm on April 1 2015.
:param tuple lower_date_limit: The date at which to stop collecting new\
data. See `upper_date_limit` for formatting.
:param str fprefix: The prefix to use in creating file names for Tweet\
collections.
:param str subdir: The name of the directory where Tweet collection\
files should be stored.
:param bool repeat: flag to determine whether multiple files should be\
written. If `True`, the length of each file will be set by the value\
of `limit`. See also :py:func:`handle`.
:param gzip_compress: if `True`, output files are compressed with gzip.
"""
self.fprefix = fprefix
self.subdir = guess_path(subdir)
self.gzip_compress = gzip_compress
self.fname = self.timestamped_file()
self.repeat = repeat
self.output = None
TweetHandlerI.__init__(self, limit, upper_date_limit, lower_date_limit)
def timestamped_file(self):
"""
:return: timestamped file name
:rtype: str
"""
subdir = self.subdir
fprefix = self.fprefix
if subdir:
if not os.path.exists(subdir):
os.mkdir(subdir)
fname = os.path.join(subdir, fprefix)
fmt = "%Y%m%d-%H%M%S"
timestamp = datetime.datetime.now().strftime(fmt)
if self.gzip_compress:
suffix = ".gz"
else:
suffix = ""
outfile = f"{fname}.{timestamp}.json{suffix}"
return outfile
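# e.g. "<subdir>/tweets.20150430-223406.json", with a ".gz" suffix added
# when gzip_compress is True.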
def handle(self, data):
"""
Write Twitter data as line-delimited JSON into one or more files.
:return: return `False` if processing should cease, otherwise return `True`.
:param data: tweet object returned by Twitter API
"""
if self.startingup:
if self.gzip_compress:
self.output = gzip.open(self.fname, "w")
else:
self.output = open(self.fname, "w")
print(f"Writing to {self.fname}")
json_data = json.dumps(data)
if self.gzip_compress:
self.output.write((json_data + "\n").encode("utf-8"))
else:
self.output.write(json_data + "\n")
self.check_date_limit(data)
if self.do_stop:
return
self.startingup = False
def on_finish(self):
print(f"Written {self.counter} Tweets")
if self.output:
self.output.close()
def do_continue(self):
if not self.repeat:
return TweetHandlerI.do_continue(self)
if self.do_stop:
# stop for a functional cause (e.g. date limit)
return False
if self.counter == self.limit:
# repeat is True, thus close output file and
# create a new one
self._restart_file()
return True
def _restart_file(self):
self.on_finish()
self.fname = self.timestamped_file()
self.startingup = True
self.counter = 0


@@ -0,0 +1,147 @@
# Natural Language Toolkit: Twitter client
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Authentication utilities to accompany `twitterclient`.
"""
import os
import pprint
from twython import Twython
def credsfromfile(creds_file=None, subdir=None, verbose=False):
"""
Convenience function for authentication
"""
return Authenticate().load_creds(
creds_file=creds_file, subdir=subdir, verbose=verbose
)
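# Illustrative usage sketch: assuming the TWITTER environment variable points
# at a directory containing a 'credentials.txt' file in the format described
# in `Authenticate.load_creds` below:
#
#     oauth = credsfromfile()
#     client = Query(**oauth)   # or Streamer(**oauth)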
class Authenticate:
"""
Methods for authenticating with Twitter.
"""
def __init__(self):
self.creds_file = "credentials.txt"
self.creds_fullpath = None
self.oauth = {}
try:
self.twitter_dir = os.environ["TWITTER"]
self.creds_subdir = self.twitter_dir
except KeyError:
self.twitter_dir = None
self.creds_subdir = None
def load_creds(self, creds_file=None, subdir=None, verbose=False):
"""
Read OAuth credentials from a text file.
File format for OAuth 1::
app_key=YOUR_APP_KEY
app_secret=YOUR_APP_SECRET
oauth_token=OAUTH_TOKEN
oauth_token_secret=OAUTH_TOKEN_SECRET
File format for OAuth 2::
app_key=YOUR_APP_KEY
app_secret=YOUR_APP_SECRET
access_token=ACCESS_TOKEN
:param str creds_file: File containing credentials. ``None`` (default) reads
data from `TWITTER/'credentials.txt'`
"""
if creds_file is not None:
self.creds_file = creds_file
if subdir is None:
if self.creds_subdir is None:
msg = (
"Supply a value to the 'subdir' parameter or"
+ " set the TWITTER environment variable."
)
raise ValueError(msg)
else:
self.creds_subdir = subdir
self.creds_fullpath = os.path.normpath(
os.path.join(self.creds_subdir, self.creds_file)
)
if not os.path.isfile(self.creds_fullpath):
raise OSError(f"Cannot find file {self.creds_fullpath}")
with open(self.creds_fullpath) as infile:
if verbose:
print(f"Reading credentials file {self.creds_fullpath}")
for line in infile:
if "=" in line:
name, value = line.split("=", 1)
self.oauth[name.strip()] = value.strip()
self._validate_creds_file(verbose=verbose)
return self.oauth
def _validate_creds_file(self, verbose=False):
"""Check validity of a credentials file."""
oauth1 = False
oauth1_keys = ["app_key", "app_secret", "oauth_token", "oauth_token_secret"]
oauth2 = False
oauth2_keys = ["app_key", "app_secret", "access_token"]
if all(k in self.oauth for k in oauth1_keys):
oauth1 = True
elif all(k in self.oauth for k in oauth2_keys):
oauth2 = True
if not (oauth1 or oauth2):
msg = f"Missing or incorrect entries in {self.creds_file}\n"
msg += pprint.pformat(self.oauth)
raise ValueError(msg)
elif verbose:
print(f'Credentials file "{self.creds_file}" looks good')
def add_access_token(creds_file=None):
"""
For OAuth 2, retrieve an access token for an app and append it to a
credentials file.
"""
if creds_file is None:
path = os.path.dirname(__file__)
creds_file = os.path.join(path, "credentials2.txt")
oauth2 = credsfromfile(creds_file=creds_file)
app_key = oauth2["app_key"]
app_secret = oauth2["app_secret"]
twitter = Twython(app_key, app_secret, oauth_version=2)
access_token = twitter.obtain_access_token()
tok = f"access_token={access_token}"
with open(creds_file, "a") as outfile:
print(tok, file=outfile)
def guess_path(pth):
"""
If the path is not absolute, guess that it is a subdirectory of the
user's home directory.
:param str pth: The pathname of the directory where files of tweets should be written
"""
if os.path.isabs(pth):
return pth
else:
return os.path.expanduser(os.path.join("~", pth))
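# For example, guess_path("twitter-files") expands to something like
# "/home/<user>/twitter-files" on Linux, while an absolute path is returned
# unchanged.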