Commit 8e7d5f69 authored by Pablo Hoffman

- fixed setup.py script (closes #80)

- added .tmpl extension to project template files to prevent distutils from crashing when trying to byte-compile those files (see the sketch after this list)
- cleaned up some garbage from settings templates
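
The .tmpl template files in this diff use Python's string.Template placeholders ($project_name, ${ProjectName}). As a minimal, hedged illustration of that substitution (the sample template text below is made up for this note, not taken from the diff):

import string

sample = (
    "PROJECT_NAME = '$project_name'\n"
    "ITEM_PIPELINES = ('${project_name}.pipelines.${ProjectName}Pipeline',)\n"
)
print(string.Template(sample).substitute(project_name='googledir', ProjectName='Googledir'))
# PROJECT_NAME = 'googledir'
# ITEM_PIPELINES = ('googledir.pipelines.GoogledirPipeline',)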

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%401042
Parent 324ec076
# Define here the models for your scraped items
from scrapy.contrib.item import RobustScrapedItem
from scrapy.item import ScrapedItem
class GoogledirItem(RobustScrapedItem):
"""Directory website link"""
ATTRIBUTES = {
'guid': basestring,
'name': basestring,
'url': basestring,
'description': basestring,
}
class GoogledirItem(ScrapedItem):
pass
# - Scrapy settings for googledir -
import googledir
# ---------------------------------------------------------------------------
# - Scrapy settings for googledir -
# ---------------------------------------------------------------------------
PROJECT_NAME = 'googledir'
BOT_NAME = PROJECT_NAME
@@ -11,96 +10,13 @@ BOT_VERSION = '1.0'
SPIDER_MODULES = ['googledir.spiders']
NEWSPIDER_MODULE = 'googledir.spiders'
TEMPLATES_DIR = '%s/templates' % googledir.__path__[0]
ENABLED_SPIDERS_FILE = '%s/conf/enabled_spiders.list' % googledir.__path__[0]
DEFAULT_ITEM_CLASS = 'scrapy.item.ScrapedItem'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
# The amount of time (in secs) that the downloader should wait before
# downloading consecutive pages from the same spider. This can be used
# to throttle the crawling speed to avoid hitting servers too
# hard. Decimal numbers are supported. Example:
# DOWNLOAD_DELAY = 2.5
DOWNLOAD_TIMEOUT = 600
# use this spider class as default when no spider was found for a given url
#DEFAULT_SPIDER = 'scrapy.contrib.spiders.generic.GenericSpider'
# uncomment if you want to add your own custom scrapy commands
#COMMANDS_MODULE = 'googledir.commands'
#COMMANDS_SETTINGS_MODULE = 'googledir.conf.commands'
#Global timeout between successive downloads (can be overridden by the spider
#attribute download_timeout)
#DOWNLOAD_TIMEOUT = 0
MYSQL_CONNECTION_SETTINGS = {"charset": "utf8" }
MYSQL_CONNECTION_PING_PERIOD = 600
SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_ORDER = 'BFO' # available orders: BFO (default), DFO
#CACHE2_DIR = '/tmp/cache2' # if set, enables HTTP cache
#CACHE2_IGNORE_MISSING = 0 # ignore requests not in cache
#CACHE2_SECTORIZE = 1 # sectorize domains to distribute storage among servers
#STATS_ENABLED = 1 # enable stats
#STATS_CLEANUP = 0 # cleanup domain stats when a domain is closed (saves memory)
#STATS_DEBUG = 0 # log stats on domain closed
EXTENSIONS = (
'scrapy.management.web.WebConsole',
'scrapy.management.telnet.TelnetConsole',
)
DOWNLOADER_MIDDLEWARES = (
# Engine side
'scrapy.contrib.downloadermiddleware.errorpages.ErrorPagesMiddleware',
'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware',
'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware',
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware',
'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware',
'scrapy.contrib.downloadermiddleware.common.CommonMiddleware',
'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware',
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware',
'scrapy.contrib.downloadermiddleware.debug.CrawlDebug',
'scrapy.contrib.downloadermiddleware.cache.CacheMiddleware',
# Downloader side
)
SPIDER_MIDDLEWARES = (
# Engine side
'scrapy.contrib.spidermiddleware.limit.RequestLimitMiddleware',
'scrapy.contrib.spidermiddleware.restrict.RestrictMiddleware',
'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware',
'scrapy.contrib.spidermiddleware.referer.RefererMiddleware',
'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware',
'scrapy.contrib.spidermiddleware.depth.DepthMiddleware',
'scrapy.contrib.spidermiddleware.urlfilter.UrlFilterMiddleware',
'scrapy.contrib.spidermiddleware.duplicatesfilter.DuplicatesFilterMiddleware',
# Spider side
)
ITEM_PIPELINES = (
'googledir.pipelines.GoogledirPipeline',
)
#DEPTH_LIMIT = 10 # limit the maximum link depth to follow
#DEPTH_STATS = 1 # enable depth stats
# Limit URL length. See: http://www.boutell.com/newfaq/misc/urllength.html
URLLENGTH_LIMIT = 2083
#WEBCONSOLE_ENABLED = 1
#WEBCONSOLE_PORT = 8060 # if not set uses a dynamic port
#TELNETCONSOLE_ENABLED = 1
#TELNETCONSOLE_PORT = 2020 # if not set uses a dynamic port
# global mail sending settings
#MAIL_HOST = 'localhost'
#MAIL_FROM = 'scrapybot@localhost'
# scrapy webservice
WS_ENABLED = 0
SPIDERPROFILER_ENABLED = 0
@@ -4,8 +4,6 @@ import re
from scrapy.xpath import HtmlXPathSelector
from scrapy.link.extractors import RegexLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib_exp import adaptors
from scrapy.utils.misc import items_to_csv
from googledir.items import GoogledirItem
class GoogleDirectorySpider(CrawlSpider):
@@ -13,12 +11,11 @@ class GoogleDirectorySpider(CrawlSpider):
start_urls = ['http://www.google.com/dirhp']
rules = (
Rule(RegexLinkExtractor(allow=('google.com/[A-Z][a-zA-Z_/]+$',),),
Rule(RegexLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$'),
'parse_category',
follow=True,
),
)
csv_file = open('scraped_items.csv', 'ab+')
def parse_category(self, response):
# The selector we're going to use in order to extract data from the page
@@ -27,22 +24,12 @@ class GoogleDirectorySpider(CrawlSpider):
# The path to website links in directory page
links = hxs.x('//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font')
# The list of functions to apply to an attribute before assigning its value
adaptor_pipe = [adaptors.extract, adaptors.delist(''), adaptors.strip]
adaptor_map = {
'name': adaptor_pipe,
'url': adaptor_pipe,
'description': adaptor_pipe,
}
for link in links:
item = GoogledirItem()
item.set_adaptors(adaptor_map)
item.attribute('name', link.x('a/text()'))
item.attribute('url', link.x('a/@href'))
item.attribute('description', link.x('font[2]/text()'))
items_to_csv(self.csv_file, [item])
item.name = link.x('a/text()').extract()
item.url = link.x('a/@href').extract()
item.description = link.x('font[2]/text()')
yield item
SPIDER = GoogleDirectorySpider()
@@ -24,9 +24,9 @@ PROJECT_TEMPLATES_PATH = os.path.join(scrapy.__path__[0], 'templates/project')
# project directory.
TEMPLATES = (
'scrapy-ctl.py',
'${project_name}/settings.py',
'${project_name}/items.py',
'${project_name}/pipelines.py',
'${project_name}/settings.py.tmpl',
'${project_name}/items.py.tmpl',
'${project_name}/pipelines.py.tmpl',
)
IGNORE = ignore_patterns('*.pyc', '.svn')
......
# Define yours item pipelines here
# Define your item pipelines here
class ${ProjectName}Pipeline(object):
def process_item(self, domain, item):
......
import $project_name
# ---------------------------------------------------------------------------
# - Scrapy settings for $project_name -
# ---------------------------------------------------------------------------
PROJECT_NAME = '$project_name'
BOT_NAME = PROJECT_NAME
BOT_VERSION = '1.0'
SPIDER_MODULES = ['$project_name.spiders']
NEWSPIDER_MODULE = '$project_name.spiders'
TEMPLATES_DIR = '%s/templates' % $project_name.__path__[0]
ENABLED_SPIDERS_FILE = '%s/conf/enabled_spiders.list' % $project_name.__path__[0]
DEFAULT_ITEM_CLASS = 'scrapy.item.ScrapedItem'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
# The amount of time (in secs) that the downloader should wait before
# downloading consecutive pages from the same spider. This can be used
# to throttle the crawling speed to avoid hitting servers too
# hard. Decimal numbers are supported. Example:
# DOWNLOAD_DELAY = 2.5
DOWNLOAD_TIMEOUT = 600
# use this spider class as default when no spider was found for a given url
#DEFAULT_SPIDER = 'scrapy.contrib.spiders.generic.GenericSpider'
# uncomment if you want to add your own custom scrapy commands
#COMMANDS_MODULE = '$project_name.commands'
#COMMANDS_SETTINGS_MODULE = '$project_name.conf.commands'
#Global timeout between successive downloads (can be overridden by the spider
#attribute download_timeout)
#DOWNLOAD_TIMEOUT = 0
MYSQL_CONNECTION_SETTINGS = {"charset": "utf8" }
MYSQL_CONNECTION_PING_PERIOD = 600
SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_ORDER = 'BFO' # available orders: BFO (default), DFO
#CACHE2_DIR = '/tmp/cache2' # if set, enables HTTP cache
#CACHE2_IGNORE_MISSING = 0 # ignore requests not in cache
#CACHE2_SECTORIZE = 1 # sectorize domains to distribute storage among servers
#STATS_ENABLED = 1 # enable stats
#STATS_CLEANUP = 0 # cleanup domain stats when a domain is closed (saves memory)
#STATS_DEBUG = 0 # log stats on domain closed
EXTENSIONS = (
'scrapy.management.web.WebConsole',
'scrapy.management.telnet.TelnetConsole',
)
DOWNLOADER_MIDDLEWARES = (
# Engine side
'scrapy.contrib.downloadermiddleware.errorpages.ErrorPagesMiddleware',
'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware',
'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware',
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware',
'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware',
'scrapy.contrib.downloadermiddleware.common.CommonMiddleware',
'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware',
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware',
'scrapy.contrib.downloadermiddleware.debug.CrawlDebug',
'scrapy.contrib.downloadermiddleware.cache.CacheMiddleware',
# Downloader side
)
SPIDER_MIDDLEWARES = (
# Engine side
'scrapy.contrib.spidermiddleware.limit.RequestLimitMiddleware',
'scrapy.contrib.spidermiddleware.restrict.RestrictMiddleware',
'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware',
'scrapy.contrib.spidermiddleware.referer.RefererMiddleware',
'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware',
'scrapy.contrib.spidermiddleware.depth.DepthMiddleware',
'scrapy.contrib.spidermiddleware.urlfilter.UrlFilterMiddleware',
'scrapy.contrib.spidermiddleware.duplicatesfilter.DuplicatesFilterMiddleware',
# Spider side
)
ITEM_PIPELINES = (
'${project_name}.pipelines.${ProjectName}Pipeline',
)
#DEPTH_LIMIT = 10 # limit the maximum link depth to follow
#DEPTH_STATS = 1 # enable depth stats
# Limit URL length. See: http://www.boutell.com/newfaq/misc/urllength.html
URLLENGTH_LIMIT = 2083
#WEBCONSOLE_ENABLED = 1
#WEBCONSOLE_PORT = 8060 # if not set uses a dynamic port
#TELNETCONSOLE_ENABLED = 1
#TELNETCONSOLE_PORT = 2020 # if not set uses a dynamic port
# global mail sending settings
#MAIL_HOST = 'localhost'
#MAIL_FROM = 'scrapybot@localhost'
# scrapy webservice
WS_ENABLED = 0
SPIDERPROFILER_ENABLED = 0
# Scrapy settings for $project_name project
import $project_name
PROJECT_NAME = '$project_name'
BOT_NAME = PROJECT_NAME
BOT_VERSION = '1.0'
SPIDER_MODULES = ['$project_name.spiders']
NEWSPIDER_MODULE = '$project_name.spiders'
TEMPLATES_DIR = '%s/templates' % $project_name.__path__[0]
DEFAULT_ITEM_CLASS = '$project_name.item.${ProjectName}Item'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
# uncomment if you want to add your own custom scrapy commands
#COMMANDS_MODULE = '$project_name.commands'
#COMMANDS_SETTINGS_MODULE = '$project_name.conf.commands'
# global mail sending settings
#MAIL_HOST = 'localhost'
#MAIL_FROM = 'scrapybot@localhost'
@@ -143,8 +143,10 @@ def render_templatefile(path, **kwargs):
content = string.Template(raw).substitute(**kwargs)
with open(path, 'wb') as file:
with open(path.rstrip('.tmpl'), 'wb') as file:
file.write(content)
if path.endswith('.tmpl'):
os.remove(path)
def items_to_csv(file, items, delimiter=';', headers=None):
......
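
An editorial caveat on the render_templatefile hunk above (not part of the commit): str.rstrip strips a set of characters rather than a suffix, so path.rstrip('.tmpl') only works because every template name here ends in '.py.tmpl', where the 'y' halts the stripping. A sketch of a suffix-only alternative; strip_tmpl_suffix is a hypothetical helper, not in the codebase:

# 'settings.py.tmpl'.rstrip('.tmpl')  -> 'settings.py'  (the 'y' stops the character strip)
# 'script.tmpl'.rstrip('.tmpl')       -> 'scri'         (a name like this would be mangled)
def strip_tmpl_suffix(path):
    # Drop a single trailing '.tmpl' suffix and nothing else.
    return path[:-len('.tmpl')] if path.endswith('.tmpl') else path

assert strip_tmpl_suffix('settings.py.tmpl') == 'settings.py'
assert strip_tmpl_suffix('script.tmpl') == 'script'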
[bdist_rpm]
release = 1
doc_files = docs examples extras AUTHORS INSTALL LICENSE README
install-script = scripts/rpm-install.sh
[install]
......
from setuptools import setup, find_packages
import os, os.path, glob
# Scrapy setup.py script
#
# Most of the code here was taken from Django setup.py
def findfiles(pattern, base='.'):
matches = []
for root, _, _ in os.walk(base):
matches.extend(glob.glob(os.path.join(root, pattern)))
return matches
from distutils.core import setup
from distutils.command.install_data import install_data
from distutils.command.install import INSTALL_SCHEMES
import os
import sys
class osx_install_data(install_data):
# On MacOS, the platform-specific lib dir is /System/Library/Framework/Python/.../
# which is wrong. Python 2.5 supplied with MacOS 10.5 has an Apple-specific fix
# for this in distutils.command.install_data#306. It fixes install_lib but not
# install_data, which is why we roll our own install_data class.
def finalize_options(self):
# By the time finalize_options is called, install.install_lib is set to the
# fixed directory, so we set the installdir to install_lib. The
# install_data class uses ('install_data', 'install_dir') instead.
self.set_undefined_options('install', ('install_lib', 'install_dir'))
install_data.finalize_options(self)
if sys.platform == "darwin":
cmdclasses = {'install_data': osx_install_data}
else:
cmdclasses = {'install_data': install_data}
def fullsplit(path, result=None):
"""
Split a pathname into components (the opposite of os.path.join) in a
platform-neutral way.
"""
if result is None:
result = []
head, tail = os.path.split(path)
if head == '':
return [tail] + result
if head == path:
return result
return fullsplit(head, [tail] + result)
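# For clarity, a usage sketch of fullsplit with an illustrative path (the value is
# chosen for this note, not taken from the file):
#   fullsplit('scrapy/contrib/downloadermiddleware')
#       -> ['scrapy', 'contrib', 'downloadermiddleware']
#   '.'.join(...) then yields the dotted package name collected in packages below:
#       'scrapy.contrib.downloadermiddleware'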
# Tell distutils to put the data_files in platform-specific installation
# locations. See here for an explanation:
# http://groups.google.com/group/comp.lang.python/browse_thread/thread/35ec7b2fed36eaec/2105ee4d9e8042cb
for scheme in INSTALL_SCHEMES.values():
scheme['data'] = scheme['purelib']
# Compile the list of packages available, because distutils doesn't have
# an easy way to do this.
packages, data_files = [], []
root_dir = os.path.dirname(__file__)
if root_dir != '':
os.chdir(root_dir)
scrapy_dir = 'scrapy'
def is_not_module(filename):
return os.path.splitext(filename)[1] not in ['.py', '.pyc', '.pyo']
for dirpath, dirnames, filenames in os.walk(scrapy_dir):
# Ignore dirnames that start with '.'
for i, dirname in enumerate(dirnames):
if dirname.startswith('.'): del dirnames[i]
if '__init__.py' in filenames:
packages.append('.'.join(fullsplit(dirpath)))
data = [f for f in filenames if is_not_module(f)]
if data:
data_files.append([dirpath, [os.path.join(dirpath, f) for f in data]])
elif filenames:
data_files.append([dirpath, [os.path.join(dirpath, f) for f in filenames]])
# Small hack for working with bdist_wininst.
# See http://mail.python.org/pipermail/distutils-sig/2004-August/004134.html
if len(sys.argv) > 1 and sys.argv[1] == 'bdist_wininst':
for file_info in data_files:
file_info[0] = '\\PURELIB\\%s' % file_info[0]
# Dynamically calculate the version based on scrapy.__version__
version = __import__('scrapy').__version__
if u'SVN' in version:
version = ' '.join(version.split(' ')[:-1])
setup(
name = 'scrapy',
version = '0.8',
description = '',
long_description = '',
author = '',
author_email = '',
license = '',
name = 'Scrapy',
version = version,
url = 'http://scrapy.org',
packages = find_packages(),
package_data = {
'scrapy': ['templates/*.tmpl'],
},
download_url = "http://scrapy.org/releases/scrapy-0.7.0.tar.gz",
description = 'A high-level Python Screen Scraping framework',
long_description = 'Scrapy is a high level scraping and web crawling framework for writing spiders to crawl and parse web pages for all kinds of purposes, from information retrieval to monitoring or testing web sites.',
author = 'Scrapy developers',
author_email = '',
license = 'BSD',
packages = packages,
cmdclass = cmdclasses,
data_files = data_files,
scripts = ['scrapy/bin/scrapy-admin.py'],
)