diff --git a/scrapy/contrib/aws.py b/scrapy/contrib/aws.py index 5addcb84b30af97defab50e165913220364a5748..1f54b763473c73b47b86ad0282457b0c16b79d5d 100644 --- a/scrapy/contrib/aws.py +++ b/scrapy/contrib/aws.py @@ -1,91 +1,27 @@ +""" +A downloader middleware for signing AWS requests just before they get into the +downloader. It is important to sign as close to the downloader as possible +because Amazon Web Service use timestamps for authentication. +""" + import os -import re import time -import hmac -import base64 -import hashlib -from urlparse import urlsplit from scrapy.utils.httpobj import urlparse_cached +from scrapy.utils.aws import sign_request from scrapy.conf import settings - -METADATA_PREFIX = 'x-amz-meta-' -AMAZON_HEADER_PREFIX = 'x-amz-' - - -# generates the aws canonical string for the given parameters -def canonical_string(method, path, headers, expires=None): - interesting_headers = {} - for key in headers: - lk = key.lower() - if lk in set(['content-md5', 'content-type', 'date']) or lk.startswith(AMAZON_HEADER_PREFIX): - interesting_headers[lk] = headers[key].strip() - - # these keys get empty strings if they don't exist - interesting_headers.setdefault('content-type', '') - interesting_headers.setdefault('content-md5', '') - - # just in case someone used this. it's not necessary in this lib. - if 'x-amz-date' in interesting_headers: - interesting_headers['date'] = '' - - # if you're using expires for query string auth, then it trumps date - # (and x-amz-date) - if expires: - interesting_headers['date'] = str(expires) - - sorted_header_keys = interesting_headers.keys() - sorted_header_keys.sort() - - buf = "%s\n" % method - for key in sorted_header_keys: - if key.startswith(AMAZON_HEADER_PREFIX): - buf += "%s:%s\n" % (key, interesting_headers[key]) - else: - buf += "%s\n" % interesting_headers[key] - - # don't include anything after the first ? in the resource... - buf += "%s" % path.split('?')[0] - - # ...unless there is an acl or torrent parameter - if re.search("[&?]acl($|=|&)", path): - buf += "?acl" - elif re.search("[&?]logging($|=|&)", path): - buf += "?logging" - elif re.search("[&?]torrent($|=|&)", path): - buf += "?torrent" - elif re.search("[&?]location($|=|&)", path): - buf += "?location" - - return buf - - - -def sign_request(req, accesskey, secretkey): - if 'Date' not in req.headers: - req.headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime()) - - parsed = urlsplit(req.url) - bucket = parsed.hostname.replace('.s3.amazonaws.com','') - key = '%s?%s' % (parsed.path, parsed.query) if parsed.query else parsed.path - fqkey = '/%s%s' % (bucket, key) - - c_string = canonical_string(req.method, fqkey, req.headers) - _hmac = hmac.new(secretkey, digestmod=hashlib.sha1) - _hmac.update(c_string) - b64_hmac = base64.encodestring(_hmac.digest()).strip() - req.headers['Authorization'] = "AWS %s:%s" % (accesskey, b64_hmac) - - class AWSMiddleware(object): def __init__(self): - self.access_key = settings['AWS_ACCESS_KEY_ID'] or os.environ.get('AWS_ACCESS_KEY_ID') - self.secret_key = settings['AWS_SECRET_ACCESS_KEY'] or os.environ.get('AWS_SECRET_ACCESS_KEY') + self.access_key = settings['AWS_ACCESS_KEY_ID'] or \ + os.environ.get('AWS_ACCESS_KEY_ID') + self.secret_key = settings['AWS_SECRET_ACCESS_KEY'] or \ + os.environ.get('AWS_SECRET_ACCESS_KEY') def process_request(self, request, spider): hostname = urlparse_cached(request).hostname if spider.domain_name == 's3.amazonaws.com' \ or (hostname and hostname.endswith('s3.amazonaws.com')): - request.headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime()) + request.headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT", \ + time.gmtime()) sign_request(request, self.access_key, self.secret_key) diff --git a/scrapy/tests/test_aws.py b/scrapy/tests/test_utils_aws.py similarity index 98% rename from scrapy/tests/test_aws.py rename to scrapy/tests/test_utils_aws.py index 71fc45e5fa7cde856c57e3ca9d1bd7d08920360b..e61962f12fc8fe059a61ddc271e9b4bcb3d93f5f 100644 --- a/scrapy/tests/test_aws.py +++ b/scrapy/tests/test_utils_aws.py @@ -1,8 +1,9 @@ from unittest import TestCase, main -from scrapy.contrib import aws + +from scrapy.utils import aws from scrapy.http import Request -# keys are provided by amazon developer guide at +# just some random keys. keys are provided by amazon developer guide at # http://s3.amazonaws.com/awsdocs/S3/20060301/s3-dg-20060301.pdf # and the tests described here are the examples from that manual @@ -103,7 +104,5 @@ class ScrapyAWSTest(TestCase): 'AWS 0PN5J17HBGZHT7JJ3X82:C0FlOtU8Ylb9KDTpZqYkZPX91iI=') - - if __name__ == '__main__': main() diff --git a/scrapy/utils/aws.py b/scrapy/utils/aws.py new file mode 100644 index 0000000000000000000000000000000000000000..f20b0df39119e3ec0de5f2b2309345a227dac08e --- /dev/null +++ b/scrapy/utils/aws.py @@ -0,0 +1,72 @@ +"""Helper function for working with Amazon Web Services""" + +import re +import time +import hmac +import base64 +import hashlib +from urlparse import urlsplit + +AMAZON_HEADER_PREFIX = 'x-amz-' + +# generates the aws canonical string for the given parameters +def canonical_string(method, path, headers, expires=None): + interesting_headers = {} + for key in headers: + lk = key.lower() + if lk in set(['content-md5', 'content-type', 'date']) \ + or lk.startswith(AMAZON_HEADER_PREFIX): + interesting_headers[lk] = headers[key].strip() + + # these keys get empty strings if they don't exist + interesting_headers.setdefault('content-type', '') + interesting_headers.setdefault('content-md5', '') + + # just in case someone used this. it's not necessary in this lib. + if 'x-amz-date' in interesting_headers: + interesting_headers['date'] = '' + + # if you're using expires for query string auth, then it trumps date + # (and x-amz-date) + if expires: + interesting_headers['date'] = str(expires) + + sorted_header_keys = interesting_headers.keys() + sorted_header_keys.sort() + + buf = "%s\n" % method + for key in sorted_header_keys: + if key.startswith(AMAZON_HEADER_PREFIX): + buf += "%s:%s\n" % (key, interesting_headers[key]) + else: + buf += "%s\n" % interesting_headers[key] + + # don't include anything after the first ? in the resource... + buf += "%s" % path.split('?')[0] + + # ...unless there is an acl or torrent parameter + if re.search("[&?]acl($|=|&)", path): + buf += "?acl" + elif re.search("[&?]logging($|=|&)", path): + buf += "?logging" + elif re.search("[&?]torrent($|=|&)", path): + buf += "?torrent" + elif re.search("[&?]location($|=|&)", path): + buf += "?location" + + return buf + +def sign_request(req, accesskey, secretkey): + if 'Date' not in req.headers: + req.headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime()) + + parsed = urlsplit(req.url) + bucket = parsed.hostname.replace('.s3.amazonaws.com','') + key = '%s?%s' % (parsed.path, parsed.query) if parsed.query else parsed.path + fqkey = '/%s%s' % (bucket, key) + + c_string = canonical_string(req.method, fqkey, req.headers) + _hmac = hmac.new(secretkey, digestmod=hashlib.sha1) + _hmac.update(c_string) + b64_hmac = base64.encodestring(_hmac.digest()).strip() + req.headers['Authorization'] = "AWS %s:%s" % (accesskey, b64_hmac)