Files
Nuliga2/env/lib/python3.6/site-packages/twitter/archiver.py
2024-11-18 08:59:34 +01:00

458 lines
17 KiB
Python

"""USAGE
twitter-archiver [options] <-|user> [<user> ...]
DESCRIPTION
Archive tweets of users, sorted by date from oldest to newest, in
the following format: <id> <date> <<screen_name>> <tweet_text>
Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
resume archiving on next run. Archive file name is the user name.
Provide "-" instead of users to read users from standard input.
OPTIONS
-o --oauth authenticate to Twitter using OAuth (default: no)
-s --save-dir <path> directory to save archives (default: current dir)
-a --api-rate see current API rate limit status
-t --timeline <file> archive own timeline into given file name (requires
OAuth, max 800 statuses)
-m --mentions <file> archive own mentions instead of timeline into
given file name (requires OAuth, max 800 statuses)
-v --favorites archive user's favorites instead of timeline
-f --follow-redirects follow redirects of urls
-r --redirect-sites follow redirects for this comma separated list of hosts
-d --dms <file> archive own direct messages (both received and
sent) into given file name.
-i --isoformat store dates in ISO format (specifically RFC 3339)
AUTHENTICATION
Authenticate to Twitter using OAuth to archive tweets of private profiles
and have higher API rate limits. OAuth authentication tokens are stored
in ~/.twitter-archiver_oauth.
"""
from __future__ import print_function
import os, sys, time as _time, calendar, functools
from datetime import time, date, datetime
from getopt import gnu_getopt as getopt, GetoptError
try:
import urllib.request as urllib2
import http.client as httplib
except ImportError:
import urllib2
import httplib
# T-Archiver (Twitter-Archiver) application registered by @stalkr_
# OAuth consumer credentials of that application; main() hands them to
# oauth_dance() and OAuth() when the --oauth option is given.
# NOTE(review): these are hard-coded in the source - confirm that is intended.
CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ'
CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
from .api import Twitter, TwitterError
from .oauth import OAuth, read_token_file
from .oauth_dance import oauth_dance
from .auth import NoAuth
from .util import Fail, err, expand_line, parse_host_list
from .follow import lookup
from .timezones import utc as UTC, Local
def parse_args(args, options):
    """Fill *options* from the command-line arguments in *args*.

    Recognised switches store True and valued options store their argument
    under the matching key of *options*; whatever is left over (the user
    names, or "-") is stored under 'extra_args'. -h/--help prints the module
    docstring and exits with status 0. Errors raised by getopt propagate to
    the caller.
    """
    # option spelling -> key in `options`, for options that are plain switches
    switches = {
        '-o': 'oauth', '--oauth': 'oauth',
        '-a': 'api-rate', '--api-rate': 'api-rate',
        '-v': 'favorites', '--favorites': 'favorites',
        '-f': 'follow-redirects', '--follow-redirects': 'follow-redirects',
        '-i': 'isoformat', '--isoformat': 'isoformat',
    }
    # option spelling -> key in `options`, for options that carry a value
    valued = {
        '-s': 'save-dir', '--save-dir': 'save-dir',
        '-t': 'timeline', '--timeline': 'timeline',
        '-m': 'mentions', '--mentions': 'mentions',
        '-r': 'redirect-sites', '--redirect-sites': 'redirect-sites',
        '-d': 'dms', '--dms': 'dms',
    }
    long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=', 'mentions=', 'favorites', 'follow-redirects',"redirect-sites=", 'dms=', 'isoformat']
    short_opts = "hos:at:m:vfr:d:i"

    parsed, positional = getopt(args, short_opts, long_opts)
    for flag, value in parsed:
        if flag == '-h' or flag == '--help':
            print(__doc__)
            raise SystemExit(0)
        if flag in switches:
            options[switches[flag]] = True
        elif flag in valued:
            options[valued[flag]] = value
    options['extra_args'] = positional
def load_tweets(filename):
    """Load tweets from file into dict, see save_tweets().

    Each line of the archive is "<tweet id> <tweet text>", UTF-8 encoded.

    Args:
        filename: A string representing the archive file to read.

    Returns:
        A dict mapping tweet ids (int) to the decoded tweet text. An empty
        dict is returned when the file cannot be opened (no archive yet).
        Lines that cannot be parsed are reported through err() and skipped.
    """
    try:
        # Read raw bytes and decode per line: this works the same way on
        # Python 2 and 3 and keeps one undecodable line from losing the rest.
        archive = open(filename, "rb")
    except IOError:  # no archive (yet)
        return {}

    tweets = {}
    with archive:  # closed even if an unexpected error escapes the loop
        for raw in archive:
            try:
                tid, text = raw.strip().split(b" ", 1)
                tweets[int(tid)] = text.decode("utf-8")
            except Exception as e:
                err("loading tweet %s failed due to %s"
                    % (raw.decode("utf-8", "replace"), e))
    return tweets
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, sorted by
    tweet id:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Nothing is written (and no file is created) when *tweets* is empty.
    Failures to open the file or to write a single tweet are reported
    through err() rather than raised.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    import io  # local import: io.open takes an encoding on Python 2 and 3

    if len(tweets) == 0:
        return

    try:
        archive = io.open(filename, "w", encoding="utf-8")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    with archive:  # closed even if an unexpected error escapes the loop
        for k in sorted(tweets):
            try:
                # Format as text and let the file object do the encoding;
                # formatting encoded bytes with %s would write their repr
                # (b'...') into the archive on Python 3.
                archive.write(u"%i %s\n" % (k, tweets[k]))
            except Exception as ex:
                err("archiving tweet %s failed due to %s" % (k, ex))
def format_date(utc, isoformat=False):
    """Convert Twitter's UTC date string into a local-time string.

    *utc* is parsed with the pattern '%a %b %d %H:%M:%S %Z %Y' after its
    '+0000' offset has been rewritten to 'UTC'. The result is rendered with
    datetime.isoformat() when *isoformat* is true, otherwise as
    'YYYY-MM-DD HH:MM:SS TZ'.
    """
    parsed = datetime.strptime(utc.replace('+0000', 'UTC'),
                               '%a %b %d %H:%M:%S %Z %Y')
    # Attach the UTC tzinfo to the parsed (naive) value, then shift it into
    # the Local timezone.
    local = parsed.replace(tzinfo=UTC).astimezone(Local)
    if isoformat:
        return local.isoformat()
    return local.strftime('%Y-%m-%d %H:%M:%S %Z')
def expand_format_text(hosts, text):
    """Run *text* through expand_line() for *hosts*, then flatten it to one line.

    Installed by main() as the text formatter when redirects are to be
    followed.
    """
    expanded = expand_line(text, hosts)
    return direct_format_text(expanded)
def direct_format_text(text):
    """Escape line breaks so that *text* fits on a single archive line.

    Newlines become the two characters backslash-n and carriage returns
    become backslash-r.
    """
    for raw, escaped in (('\n', '\\n'), ('\r', '\\r')):
        text = text.replace(raw, escaped)
    return text
def statuses_resolve_uids(twitter, tl):
    """Fill in missing screen names of the statuses in *tl*.

    Users (of a status or of its 'retweeted_status') that have no
    'screen_name' are resolved by id with a single lookup() call, and the
    name is written back into the status dicts in place.

    Returns a new list holding the same (now updated) status dicts.
    """
    def nameless_users(status):
        # Yield the user dicts of a status that lack a screen name:
        # the retweeted status' user first, then the status' own user.
        retweet = status.get('retweeted_status')
        if retweet and not retweet['user'].get('screen_name'):
            yield retweet['user']
        if not status['user'].get('screen_name'):
            yield status['user']

    # collect every user id that needs a lookup
    pending_ids = []
    for status in tl:
        pending_ids.extend(user['id'] for user in nameless_users(status))

    # resolve all of them at once (duplicates removed)
    names = lookup(twitter, list(set(pending_ids)))

    # write the resolved names back into the statuses
    resolved = []
    for status in tl:
        for user in nameless_users(status):
            user['screen_name'] = names[user['id']]
        resolved.append(status)
    return resolved
def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Get a portion of the statuses of a screen name.

    Args:
        twitter: API client the timeline endpoints are called on.
        screen_name: user to fetch; a false value selects the home timeline.
        max_id: when set, passed to the API as 'max_id'.
        mentions: fetch the mentions timeline instead.
        favorites: fetch the favorites list instead.
        received_dms: None for statuses; True for received direct messages,
            False for sent direct messages.
        isoformat: passed on to format_date().

    Returns:
        A dict mapping each status id to its formatted archive text,
        "<date> <<sender>> [@<recipient> ]<text>".
    """
    kwargs = dict(count=200, include_rts=1, screen_name=screen_name)
    if max_id:
        kwargs['max_id'] = max_id

    # None means "not a DM request"; True/False select received/sent DMs,
    # so this has to be an identity test rather than a truth test.
    is_dm = received_dms is not None

    if mentions:
        tl = twitter.statuses.mentions_timeline(**kwargs)
    elif favorites:
        tl = twitter.favorites.list(**kwargs)
    elif is_dm:
        if received_dms:
            tl = twitter.direct_messages(**kwargs)
        else:  # sent DMs
            tl = twitter.direct_messages.sent(**kwargs)
    elif screen_name:
        tl = twitter.statuses.user_timeline(**kwargs)
    else:  # self
        tl = twitter.statuses.home_timeline(**kwargs)

    # some tweets do not provide screen name but user id, resolve those
    # this isn't a valid operation for DMs, so special-case them
    if is_dm:
        newtl = tl
    else:
        newtl = statuses_resolve_uids(twitter, tl)

    # In the visible source the global format_text is only bound by main();
    # fall back to the plain one-line formatter so that this function also
    # works when it is called without main() having run.
    formatter = globals().get('format_text', direct_format_text)

    tweets = {}
    for t in newtl:
        text = t['text']
        rt = t.get('retweeted_status')
        if rt:
            text = "RT @%s: %s" % (rt['user']['screen_name'], rt['text'])
        when = format_date(t['created_at'], isoformat=isoformat)
        if is_dm:
            # DMs don't include mentions by default, so in order to show who
            # the recipient was, we synthesise a mention.
            tweets[t['id']] = "%s <%s> @%s %s" % (when,
                                                  t['sender_screen_name'],
                                                  t['recipient']['screen_name'],
                                                  formatter(text))
        else:
            tweets[t['id']] = "%s <%s> %s" % (when,
                                              t['user']['screen_name'],
                                              formatter(text))
    return tweets
def statuses(twitter, screen_name, tweets, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Get all the statuses for a screen name.

    Portions are fetched with statuses_portion() and merged into *tweets*
    (updated in place), walking backwards through the timeline until a
    portion contributes fewer than 190 new entries.

    Error handling per attempt:
      * 401 and 404 responses are reported and end the loop;
      * a 429 response sleeps until the rate limit resets, then retries;
      * any other TwitterError, URLError, httplib error or KeyError is
        reported and retried after a 3 second wait.
    Waits go through a Fail counter that is renewed after every successful
    portion.
    """
    max_id = None
    fail = Fail()
    # get portions of statuses, lowering max id until no new tweets appear
    while True:
        try:
            portion = statuses_portion(twitter, screen_name, max_id, mentions, favorites, received_dms, isoformat)
        except TwitterError as e:
            code = e.e.code
            if code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % code)
                break
            if code == 404:
                err("Fail: %i This profile does not exist" % code)
                break
            if code == 429:
                err("Fail: %i API rate limit exceeded" % code)
                rls = twitter.application.rate_limit_status()
                reset = _time.asctime(_time.localtime(rls.rate_limit_reset))
                delay = int(rls.rate_limit_reset - _time.time()) + 5  # avoid race
                err("Interval limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (rls.rate_limit_limit,
                                                    reset, delay))
                fail.wait(delay)
                continue
            if code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            fail.wait(3)
            continue
        except urllib2.URLError as e:
            err("Fail: urllib2.URLError %s - Retrying..." % str(e))
            fail.wait(3)
            continue
        except httplib.error as e:
            err("Fail: httplib.error %s - Retrying..." % str(e))
            fail.wait(3)
            continue
        except KeyError as e:
            err("Fail: KeyError %s - Retrying..." % str(e))
            fail.wait(3)
            continue

        # success: merge the portion and count what was actually new
        known_before = len(tweets)
        tweets.update(portion)
        new = len(tweets) - known_before
        err("Browsing %s statuses, new tweets: %i"
            % (screen_name or "home", new))
        if new < 190:
            break
        max_id = min(portion) - 1  # browse backwards
        fail = Fail()
def rate_limit_status(twitter):
    """Print the current Twitter API rate limit status.

    Queries twitter.application.rate_limit_status() and prints the remaining
    and total request counts, then the seconds left until the next reset
    together with the reset time as local time.
    """
    rls = twitter.application.rate_limit_status()
    quota = (rls.rate_limit_remaining, rls.rate_limit_limit)
    print("Remaining API requests: %i/%i (interval limit)" % quota)

    seconds_left = int(rls.rate_limit_reset - _time.time())
    reset_at = _time.asctime(_time.localtime(rls.rate_limit_reset))
    print("Next reset in %is (%s)" % (seconds_left, reset_at))
def _update_archive(twitter, screen_name, filename, label, requests):
    """Load the archive at *filename*, fetch into it, and save it back.

    statuses() is called once per keyword dict in *requests*, all against
    the same archive dict. *label* names the content in the load-error
    message. Ctrl-C during fetching exits with status 1 without saving.

    Returns a tuple (archive dict, number of entries before fetching).
    """
    archive = {}
    try:
        archive = load_tweets(filename)
    except Exception as e:
        err("Error when loading saved %s: %s - continuing without"
            % (label, str(e)))
    before = len(archive)

    try:
        for kwargs in requests:
            statuses(twitter, screen_name, archive, **kwargs)
    except KeyboardInterrupt:
        err()
        err("Interrupted")
        raise SystemExit(1)

    save_tweets(filename, archive)
    return archive, before


def main(args=sys.argv[1:]):
    """Command-line entry point of twitter-archiver.

    Parses *args*, sets up authentication (OAuth when requested), installs
    the global text formatter, then archives, in this order: the own
    timeline or mentions, the own direct messages, and the tweets (or
    favorites) of every user given on the command line or on standard input.

    Exits with status 1 on invalid options, on a request that needs
    authentication without --oauth, and on Ctrl-C.
    """
    global format_text

    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': "",
        'mentions': "",
        'dms': "",
        'favorites': False,
        'follow-redirects': False,
        'redirect-sites': None,
        'isoformat': False,
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate, or archive of timeline, mentions or DMs
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline'] or
                                          options['mentions'] or
                                          options['dms']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.environ.get('HOME',
                                         os.environ.get('USERPROFILE', ''))
                          + os.sep
                          + '.twitter-archiver_oauth')
        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1.1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    # choose how tweet text is rendered; statuses_portion() reads this global
    if options['follow-redirects'] or options['redirect-sites']:
        if options['redirect-sites']:
            hosts = parse_host_list(options['redirect-sites'])
        else:
            hosts = None
        format_text = functools.partial(expand_format_text, hosts)
    else:
        format_text = direct_format_text

    # keyword arguments shared by the timeline/mentions and per-user fetches
    tweet_request = dict(mentions=options['mentions'],
                         favorites=options['favorites'],
                         isoformat=options['isoformat'])

    # save own timeline or mentions (the user used in OAuth)
    if options['timeline'] or options['mentions']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline or mentions.")
            raise SystemExit(1)

        if options['timeline']:
            filename = options['save-dir'] + os.sep + options['timeline']
            print("* Archiving own timeline in %s" % filename)
        elif options['mentions']:
            filename = options['save-dir'] + os.sep + options['mentions']
            print("* Archiving own mentions in %s" % filename)

        tweets, _ = _update_archive(twitter, "", filename, "tweets",
                                    [tweet_request])

        if options['timeline']:
            print("Total tweets in own timeline: %i" % len(tweets))
        elif options['mentions']:
            print("Total mentions: %i" % len(tweets))

    if options['dms']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save DMs.")
            raise SystemExit(1)

        filename = options['save-dir'] + os.sep + options['dms']
        print("* Archiving own DMs in %s" % filename)

        # received first, then sent, both into the same archive
        dms, _ = _update_archive(
            twitter, "", filename, "DMs",
            [dict(received_dms=True, isoformat=options['isoformat']),
             dict(received_dms=False, isoformat=options['isoformat'])])
        print("Total DMs sent and received: %i" % len(dms))

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        # Skip blank lines: an empty user name would otherwise be treated
        # as "own home timeline" and saved to the bare save directory path.
        users = [name for name in (line.strip() for line in sys.stdin)
                 if name]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        if options['favorites']:
            filename = filename + "-favorites"
        print("* Archiving %s tweets in %s" % (user, filename))

        tweets, before = _update_archive(twitter, user, filename, "tweets",
                                         [tweet_request])

        total += len(tweets)
        new = len(tweets) - before
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))