#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""DistributedCrawling client for LastFM resources."""
__version__ = "0.4.lastfm"
__date__ = "2008-11-11"
__author__ = "Tiago Alves Macambira & Rafael Sachetto"
__copyright__ = "Copyright (c) 2006-2008 Tiago Alves Macambira"
__license__ = "X11"
from optparse import OptionParser
import os
import urllib2
import time
import logging
from socket import gethostname
from DistributedCrawler.client import upload_aux
from DistributedCrawler.client import BaseClient, getUUID, log_backtrace, log_urllib2_exception
from DistributedCrawler.client.daemonize import createDaemon, reconfigStdout
#from digg_article_retriever import ArticleRetriever, \
# __version__ as articleretriever_version
from common import FINDUSERS_VALID_GENDERS, FINDUSERS_SEPARATOR
from retrievers import FindUsersRetriver, get_user_encoded_profile, \
    retrieve_encoded_user_library_snapshot, LibrarySnapshotsRetriever, \
    PageNotFound, __version__ as articleretriever_version
######################################################################
# Crawling Clients
######################################################################
class LastFMClient(BaseClient):
    """Client for crawling LastFM resources."""

    MIN_SLEEP = 120  # Minimum sleep between server commands (presumably in seconds).

    def __init__(self, client_id, base_url, store_dir=None):
        """LastFMClient constructor."""
        # Parent class constructor
        BaseClient.__init__(self, client_id, base_url, store_dir)
        # Inform the server of our article retriever's version
        self.headers["client-arver"] = articleretriever_version
        # Register command handlers
        self.handlers["FINDUSERS"] = self.findusers
        self.handlers["GETPROFILE"] = self.getprofile
        self.handlers["GETLIBRARY"] = self.getlibrary

    def _write_to_store(self, article_id, data):
        """Write a (compressed) article to the store.

        The article_id is turned into something filesystem-safe here.
        """
        # FIXME - Must customize this function for lastfm content
        safe_id = article_id.replace('/', '_')
        safe_id += '.xml.gz'
        BaseClient._write_to_store(self, safe_id, data)
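        # For example (values illustrative): an article_id of "some/user"
        # would be stored as "some_user.xml.gz".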

    def findusers(self, params):
        """Retrieve pages from the user search and send them to the server."""
        # Retrieve the search gender (male, female) and the search page
        gender, page_num = params[0], int(params[1:])
        assert gender in FINDUSERS_VALID_GENDERS
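        # Illustrative (hypothetical) encoding: a params value of "m12" would
        # mean gender "m" and search-result page 12; the actual gender codes
        # come from FINDUSERS_VALID_GENDERS in common.py.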
        # Download the search result pages
        logging.info("FINDUSERS %s BEGIN", params)
        retriever = FindUsersRetriver()
        found_users = retriever.get_users_from_pages(gender, page_num)
        logging.info("FINDUSERS %s GOT FINDUSERS DATA", params)
        # Setup form and headers.
        # Although we reuse the "upload" code, this is a plain POST.
        upload_headers = dict(self.headers)
        form_data = {'page-id': params,
                     'page-users': FINDUSERS_SEPARATOR.join(found_users),
                     'page-users-count': str(len(found_users)),
                     'client-id': self.id}
        # Upload the results
        upload_url = self.base_url + '/findusers/' + params
        response = upload_aux.upload_form(upload_url, form_data, upload_headers)
        logging.info("FINDUSERS %s END", params)
        # OK, command handled. Now what? Do what the server told us to.
        # The command MUST be SLEEP. We will sleep for at least self.MIN_SLEEP.
        command = response.read()
        self._handleCommand(command, do_sleep=True)

    def getprofile(self, params):
        """Retrieve a user profile and send it to the server."""
        log = logging.getLogger("GETPROFILE")
        # The only parameter we get is the user name
        username = params
        # Download the user's profile
        log.info("BEGIN %s", params)
        try:
            profile, friends = get_user_encoded_profile(username)
            log.info("GOT PROFILE FOR USER %s", username)
            # Setup form and headers.
            # Although we reuse the "upload" code, this is a plain POST.
            upload_headers = dict(self.headers)
            form_data = {'username': username,
                         'profile': profile,
                         'friends-list': FINDUSERS_SEPARATOR.join(friends),
                         'friends-list-count': str(len(friends)),
                         'client-id': self.id}
            # Upload the profile
            log.info("UPLOADING TO SERVER %s", params)
            upload_url = self.base_url + '/getprofile/' + username
            response = upload_aux.upload_form(upload_url, form_data,
                                              upload_headers)
        except PageNotFound:
            response = self.report_not_found_user(username, log)
        log.info("END %s", params)
        # OK, command handled. Now what? Do what the server told us to.
        # The command MUST be SLEEP. We will sleep for at least self.MIN_SLEEP.
        command = response.read()
        self._handleCommand(command, do_sleep=True)

    def getlibrary(self, params):
        """Retrieve a user's music library.

        TODO: we should have encoded the command/job as
        "username#last_crawled_ts", but this is left as pending work for
        the next crawling...

        Encoded params:
            username, as a string
        """
        log = logging.getLogger("GETLIBRARY")
        # The only parameter we get is the user name
        username = params
        # XXX The listened_time_threshold was supposed to come encoded with
        # the username, but we would have had to change too much stuff on the
        # server side to make that happen. See the TODO above.
        listened_time_threshold = LibrarySnapshotsRetriever.DAY_ONE
        # Download the library snapshot
        log.info("BEGIN %s", params)
        try:
            result = retrieve_encoded_user_library_snapshot(
                username, listened_time_threshold)
            library, last_crawled_ts = result
            log.info("GOT LIBRARY FOR USER %s", username)
            # Setup form and headers.
            # Although we reuse the "upload" code, this is a plain POST.
            upload_headers = dict(self.headers)
            form_data = {'username': username,
                         'library': library,
                         'last-crawled-ts': str(int(last_crawled_ts)),
                         'client-id': self.id}
            # Upload the library
            log.info("UPLOADING TO SERVER %s", params)
            upload_url = self.base_url + '/getlibrary/' + username
            response = upload_aux.upload_form(upload_url, form_data,
                                              upload_headers)
        except PageNotFound:
            # Without this handler, "response" would be unbound below and we
            # would die with a NameError. Mirror getprofile() and report the
            # missing user back to the server; other errors still propagate.
            response = self.report_not_found_user(username, log)
        log.info("END %s", params)
        # OK, command handled. Now what? Do what the server told us to.
        # The command MUST be SLEEP. We will sleep for at least self.MIN_SLEEP.
        command = response.read()
        self._handleCommand(command, do_sleep=True)

    def report_not_found_user(self, username, log):
        """Tell the server that a user's page returned a 404."""
        log.info("REPORTING 404 ERROR BACK TO SERVER FOR USER '%s'", username)
        req = urllib2.Request(self.base_url + '/notfound/' + username,
                              headers=self.headers)
        response = urllib2.urlopen(req)
        return response
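
# Illustrative usage sketch (the id file name and server URL below are made
# up; real runs go through main() further down):
#
#     client_id = getUUID("./myhost.id")
#     cli = LastFMClient(client_id, base_url="http://example.invalid/lastfm")
#     cli.run()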
######################################################################
# MAIN
######################################################################
#TODO(macambira): move main out of this module or refactor it into a set of small helper functions
def main(base_url, store_dir, log_to_screen=False):
    """Set up the environment and run the client."""
    # TODO Merge as much of this code as possible with __main__
    hostname = gethostname()
    id_filename = store_dir + "/" + hostname + '.id'
    log_filename = store_dir + "/" + hostname + '.log'
    out_filename = store_dir + "/" + hostname + '.out'  # FIXME DEPRECATED!!!!
    # Setup logging
    extra_log_options = {"level": logging.DEBUG, "flushlevel": logging.NOTSET}
    if not log_to_screen:
        extra_log_options['filename'] = log_filename
    logging.basicConfig(**extra_log_options)
    # Setup client id
    client_id = getUUID(id_filename)
    cli = LastFMClient(client_id, base_url=base_url, store_dir=store_dir)
    logging.info("STARTED %s", time.asctime())
    cli.run()

def parse_command_line():
    """As the name says, parses the command line."""
    parser = OptionParser()
    parser.add_option("-d", "--outdir", dest="store_dir", default=None,
                      help=("Write log reports and other files to DIR. "
                            "Defaults to the current directory."),
                      metavar="DIR")
    parser.add_option("-f", "--foreground",
                      action="store_true", dest="foreground", default=False,
                      help="Run the client in the foreground.")
    parser.add_option("-l", "--log-to-screen", dest="toscreen", default=False,
                      action="store_true",
                      help=("Log to stdout instead of to the log file. "
                            "Defaults to False."))
    (options, args) = parser.parse_args()
    return options, args
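
# Example invocation (the output directory name is illustrative):
#
#     ./client.py --outdir crawl-data --foreground --log-to-screen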

if __name__ == '__main__':
    BASE_URL = 'http://www.speed.dcc.ufmg.br/lastfm'
    #BASE_URL = 'http://localhost:8700'
    BASE_DIR = os.getcwd()  # "." loses its meaning as soon as we daemonize
    STDOUT_REDIR_FILENAME = BASE_DIR + "/daemon.out"
    options, args = parse_command_line()
    store_dir = BASE_DIR
    if options.store_dir:
        store_dir = os.path.join(BASE_DIR, options.store_dir)
        if not os.path.isdir(store_dir):
            os.makedirs(store_dir)
    # TODO Merge as much of this code as possible with main()
    # Detach the current process from the terminal and become a daemon
    if not options.foreground:
        print "Becoming a daemon"
        res = createDaemon()
        # We closed all stdio streams and redirected them to /dev/null.
        # Just in case we need them back, reconfigure stdout and stderr.
        reconfigStdout(STDOUT_REDIR_FILENAME)
        print "Became a daemon"
    try:
        main(BASE_URL, store_dir, options.toscreen)
    except urllib2.HTTPError, e:
        log_urllib2_exception(e)
        raise
    except:
        log_backtrace()
        raise

# vim: set ai tw=80 et sw=4 sts=4 fileencoding=utf-8 :