Commit f2095857 authored by 梦想橡皮擦

Spiders for 在行 (zaih.com), 超级产品经理 (imspm.com), and 优设网 (uisdc.com)

Parent 8690fb95
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class ArticleItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
    title = scrapy.Field()  # article title
    url = scrapy.Field()  # article URL
    author = scrapy.Field()  # author
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class MyProjectSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class MyProjectDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
class TitlePipeline:
    def process_item(self, item, spider):  # strip whitespace from the title
        if item["title"]:
            item["title"] = item["title"].strip()
            return item
        else:
            # DropItem is an exception; raising it tells Scrapy to discard the item
            raise DropItem("invalid data: empty title")
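A minimal usage sketch of the pipeline above, called directly outside of a crawl with hand-made dict items; the sample values and the direct call are assumptions for illustration only:

from scrapy.exceptions import DropItem
from my_project.pipelines import TitlePipeline

pipeline = TitlePipeline()

# a title with surrounding whitespace is stripped and the item passes through
item = pipeline.process_item({"title": "  B端设计师要懂的信息架构  ", "url": "http://www.imspm.com/a", "author": "阿东"}, spider=None)
print(item["title"])  # -> "B端设计师要懂的信息架构"

# an empty title raises DropItem, so Scrapy discards the item
try:
    pipeline.process_item({"title": "", "url": "", "author": ""}, spider=None)
except DropItem as e:
    print("dropped:", e)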
# Scrapy settings for my_project project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'my_project'
SPIDER_MODULES = ['my_project.spiders']
NEWSPIDER_MODULE = 'my_project.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'my_project (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'my_project.middlewares.MyProjectSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'my_project.middlewares.MyProjectDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'my_project.pipelines.TitlePipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
from my_project.items import ArticleItem
class PmSpider(scrapy.Spider):
name = 'pm'
allowed_domains = ['www.imspm.com']
start_urls = ['http://www.imspm.com/chanpin/']
def parse(self, response):
# print(response.text)
list_item = response.css('.list-item-default')
# print(list_item)
for i in list_item:
item = ArticleItem()
            title = i.css('.title::text').extract_first()  # get the text directly
            url = i.css('.a_block::attr(href)').extract_first()  # get the attribute value
            author = i.css('.author::text').extract_first()  # get the text directly
            # print(title, url, author)
            # assign values to the item
item['title'] = title
item['url'] = url
item['author'] = author
yield item
        next_page = response.css('.nav a:nth-last-child(2)::attr(href)').extract_first()  # get the next-page link
        # print(next_page)
        # issue another request, stopping once no next-page link is found
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.parse)
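As a hedged aside, the pm spider above can be run with the usual scrapy crawl pm -o articles.csv command; a roughly equivalent programmatic sketch is shown below (the file name and the FEEDS override are assumptions, and FEEDS needs Scrapy 2.1 or newer):

# run_pm.py -- hypothetical helper script at the project root
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()                            # loads my_project/settings.py
settings.set("FEEDS", {"articles.csv": {"format": "csv"}})   # export yielded items as CSV
process = CrawlerProcess(settings)
process.crawl("pm")                                          # the spider registered as name = 'pm'
process.start()                                              # blocks until the crawl finishes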
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = my_project.settings
[deploy]
#url = http://localhost:6800/
project = my_project
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = zaihang_spider.settings
[deploy]
#url = http://localhost:6800/
project = zaihang_spider
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class ZaihangItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
    name = scrapy.Field()  # name
    city = scrapy.Field()  # city
    industry = scrapy.Field()  # industry
    price = scrapy.Field()  # price
    chat_nums = scrapy.Field()  # number of chats
    score = scrapy.Field()  # rating
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class ZaihangSpiderSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class ZaihangSpiderDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql
class ZaihangMySQLPipeline:
def __init__(self, host, port, user, password, db):
self.host = host
self.port = port
self.user = user
self.password = password
self.db = db
self.conn = None
self.cursor = None
@classmethod
def from_crawler(cls, crawler):
return cls(
host=crawler.settings.get('HOST'),
port=crawler.settings.get('PORT'),
user=crawler.settings.get('USER'),
password=crawler.settings.get('PASSWORD'),
db=crawler.settings.get('DB')
)
def open_spider(self, spider):
self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user, password=self.password, db=self.db)
def process_item(self, item, spider):
print(item)
name = item["name"]
city = item["city"]
industry = item["industry"]
price = item["price"]
chat_nums = item["chat_nums"]
score = item["score"]
sql = "insert into users(name,city,industry,price,chat_nums,score) values ('%s','%s','%s',%.1f,%d,%.1f)" % (
name, city, industry, float(price), int(chat_nums), float(score))
print(sql)
self.cursor = self.conn.cursor() # 设置游标
try:
self.cursor.execute(sql) # 执行 sql
self.conn.commit()
except Exception as e:
print(e)
self.conn.rollback()
return item
def close_spider(self, spider):
self.cursor.close()
self.conn.close()
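The pipeline above assumes a users table already exists in the zaihang database; a minimal schema sketch created through pymysql follows, where the column types are guesses inferred from the INSERT statement and are not part of the commit:

import pymysql

# connection values mirror HOST/PORT/USER/PASSWORD/DB in zaihang_spider/settings.py
conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", password="123456", db="zaihang")
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS users (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255),
            city VARCHAR(255),
            industry VARCHAR(255),
            price DECIMAL(10, 1),
            chat_nums INT,
            score DECIMAL(3, 1)
        ) DEFAULT CHARSET = utf8mb4
    """)
conn.commit()
conn.close()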
# Scrapy settings for zaihang_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'zaihang_spider'
SPIDER_MODULES = ['zaihang_spider.spiders']
NEWSPIDER_MODULE = 'zaihang_spider.spiders'
HOST = "127.0.0.1"
PORT = 3306
USER = "root"
PASSWORD = "123456"
DB = "zaihang"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'zaihang_spider (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'zaihang_spider.middlewares.ZaihangSpiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'zaihang_spider.middlewares.ZaihangSpiderDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'zaihang_spider.pipelines.ZaihangMySQLPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
from zaihang_spider.items import ZaihangItem
class ZhSpider(scrapy.Spider):
name = 'zh'
allowed_domains = ['www.zaih.com']
    page = 1  # starting page number
    url_format = 'https://www.zaih.com/falcon/mentors?first_tag_id=479&first_tag_name=%E5%BF%83%E7%90%86&page={}'  # URL template
start_urls = [url_format.format(page)]
def parse(self, response):
empty = response.css("section.empty")
if len(empty) > 0:
return
mentors = response.css(".mentor-board a")
for m in mentors:
item = ZaihangItem()
name = m.css(".mentor-card__name::text").extract_first()
city = m.css(".mentor-card__location::text").extract_first()
industry = m.css(".mentor-card__title::text").extract_first()
price = self.replace_space(m.css(".mentor-card__price::text").extract_first())
chat_nums = self.replace_space(m.css(".mentor-card__number::text").extract()[0])
score = self.replace_space(m.css(".mentor-card__number::text").extract()[1])
            # fill the formatted data into the item
item["name"] = name
item["city"] = city
item["industry"] = industry
item["price"] = price
item["chat_nums"] = chat_nums
item["score"] = score
yield item
        # issue a request for the next page
        self.page += 1
        next_url = self.url_format.format(self.page)
yield scrapy.Request(url=next_url, callback=self.parse)
def replace_space(self, in_str):
in_str = in_str.replace("\n", "").replace("\r", "").replace("¥", "")
return in_str.strip()
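A small illustration of what replace_space does to the raw card text; the sample strings are assumptions about how zaih.com renders prices and ratings:

spider = ZhSpider()
print(spider.replace_space("\n      ¥499\n    "))  # -> "499"  (newlines and the currency sign removed)
print(spider.replace_space("\r\n 4.9 "))           # -> "4.9"  (carriage returns stripped, then trimmed)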
author,tag,title
阿东,B端,8000+干货!B端设计师要懂的信息架构
陈子木,3D样机,第一波!2021年10月精选实用设计干货合集
张爱国,LOL,人气顶流的英雄联盟手游,有哪些值得关注的设计细节?
邓海贝,对比色,配色没有冲击力?来试试对比色!
土拨鼠,ui设计,这3个细节创新,大厂设计师都不一定全知道!
Reman刘斌,平面设计,民国时期的平面设计到底有多潮?穿着旗袍开机车!
米米米米米米米米,,甲方给的图片不清晰?收下这个超好用的免费无损放大软件 Topaz Gigapixel AI
土拨鼠,抄袭,设计该如何看待「抄袭」这件事?我总结了5个方面!
土拨鼠,内容优先,用一篇文章,帮你了解提高用户体验的「内容策划研究方法」
土拨鼠,,她做完这个暗黑设计后,为什么选择了离职?
陈子木,免费图库,壹周速读:假期归来先存好这些干货!
张彭彭,PPT设计,大厂进阶案例!腾讯D10晋升失败的复盘总结
酷家乐UED,业务分析,需求太碎?聊聊设计师如何在小业务中提炼价值
Ohh,交互设计,对比多个大厂产品后,总结了竖屏播放进度条的设计思考
小果,交互设计,如何实现产品的「啊哈时刻」?从4个方面展开详聊
郝小七,HMI,HMI设计必看!入局车载设计的最优路径+入门指南
土拨鼠,B端,8000字干货!B端用户「帮助体系」搭建指南
土拨鼠,医疗,腾讯实战案例!联动医疗下的用户研究
土拨鼠,Decentrafile,上传文件永不过期!免费的匿名去中心化云端空间「Decentrafile 」
ZoeYZ,交互设计,不止画图标!5 个金刚区的交互设计思考
土拨鼠,2021-2022设计趋势,腾讯 ISUX 出品!2021-2022设计趋势报告:日系潮玩篇
土拨鼠,3D渲染模板,2021年全网最新最好的样机资源被我找到了!免费打包!
S 设计研究所,产品视觉设计,从学校到五年职场,我是如何理解插画设计的
土拨鼠,Image Extractor,输入网址就能批量下载网站图片的在线神器「Image Extractor」
陈子木,干货,第四波!2021年9月精选实用设计干货合集
土拨鼠,平面设计,用一篇文章,帮你了解视觉冲击的最佳风格「达达主义」
百度MEUX,产品设计,大厂出品!如何通过游戏化设计助力直播互动?
土拨鼠,在线工具,收录超过 30 万个高质量免费图标的网站「PNG Repo」
思鱼耶,平面设计,文字太多画面没有层次感?优设标题黑主设计师的版式设计秘籍来了
土拨鼠,产品设计,6000+干货!资深总监的四条产品设计工作观(附私藏神器包)
author,tag,title
阿东,B端,新闻:8000+干货!B端设计师要懂的信息架构
陈子木,3D样机,新闻:第一波!2021年10月精选实用设计干货合集
张爱国,LOL,新闻:人气顶流的英雄联盟手游,有哪些值得关注的设计细节?
邓海贝,对比色,新闻:配色没有冲击力?来试试对比色!
土拨鼠,ui设计,新闻:这3个细节创新,大厂设计师都不一定全知道!
Reman刘斌,平面设计,新闻:民国时期的平面设计到底有多潮?穿着旗袍开机车!
米米米米米米米米,,新闻:甲方给的图片不清晰?收下这个超好用的免费无损放大软件 Topaz Gigapixel AI
土拨鼠,抄袭,新闻:设计该如何看待「抄袭」这件事?我总结了5个方面!
土拨鼠,内容优先,新闻:用一篇文章,帮你了解提高用户体验的「内容策划研究方法」
土拨鼠,,新闻:她做完这个暗黑设计后,为什么选择了离职?
陈子木,免费图库,新闻:壹周速读:假期归来先存好这些干货!
张彭彭,PPT设计,新闻:大厂进阶案例!腾讯D10晋升失败的复盘总结
酷家乐UED,业务分析,新闻:需求太碎?聊聊设计师如何在小业务中提炼价值
Ohh,交互设计,新闻:对比多个大厂产品后,总结了竖屏播放进度条的设计思考
小果,交互设计,新闻:如何实现产品的「啊哈时刻」?从4个方面展开详聊
郝小七,HMI,新闻:HMI设计必看!入局车载设计的最优路径+入门指南
土拨鼠,B端,新闻:8000字干货!B端用户「帮助体系」搭建指南
土拨鼠,医疗,新闻:腾讯实战案例!联动医疗下的用户研究
土拨鼠,Decentrafile,新闻:上传文件永不过期!免费的匿名去中心化云端空间「Decentrafile 」
ZoeYZ,交互设计,新闻:不止画图标!5 个金刚区的交互设计思考
土拨鼠,2021-2022设计趋势,新闻:腾讯 ISUX 出品!2021-2022设计趋势报告:日系潮玩篇
土拨鼠,3D渲染模板,新闻:2021年全网最新最好的样机资源被我找到了!免费打包!
S 设计研究所,产品视觉设计,新闻:从学校到五年职场,我是如何理解插画设计的
土拨鼠,Image Extractor,新闻:输入网址就能批量下载网站图片的在线神器「Image Extractor」
陈子木,干货,新闻:第四波!2021年9月精选实用设计干货合集
土拨鼠,平面设计,新闻:用一篇文章,帮你了解视觉冲击的最佳风格「达达主义」
百度MEUX,产品设计,新闻:大厂出品!如何通过游戏化设计助力直播互动?
土拨鼠,在线工具,新闻:收录超过 30 万个高质量免费图标的网站「PNG Repo」
思鱼耶,平面设计,新闻:文字太多画面没有层次感?优设标题黑主设计师的版式设计秘籍来了
土拨鼠,产品设计,新闻:6000+干货!资深总监的四条产品设计工作观(附私藏神器包)
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = uisdc.settings
[deploy]
#url = http://localhost:6800/
project = uisdc
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field
from scrapy.loader.processors import MapCompose, TakeFirst
def ext(value):
return "新闻:" + value
class UisdcItem(Item):
# define the fields for your item here like:
title = Field(
input_processor=MapCompose(ext),
output_processor=TakeFirst()
)
author = Field(output_processor=TakeFirst())
tag = Field(output_processor=TakeFirst())
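A minimal sketch of how the MapCompose(ext) input processor and the TakeFirst() output processor behave, run against an in-memory selector instead of a live response; the HTML snippet is a hypothetical stand-in for the real uisdc.com markup:

from scrapy.loader import ItemLoader
from scrapy.selector import Selector
from uisdc.items import UisdcItem

html = ('<div><h2 class="item-title"><a>B端设计师要懂的信息架构</a></h2>'
        '<h3 class="meta-name">阿东</h3><div class="meta-tag"><a>B端</a></div></div>')
loader = ItemLoader(item=UisdcItem(), selector=Selector(text=html))
loader.add_xpath('title', "//h2[@class='item-title']/a/text()")
loader.add_xpath('author', "//h3[@class='meta-name']/text()")
loader.add_xpath('tag', "//div[@class='meta-tag']/a/text()")
print(loader.load_item())
# title comes out as '新闻:B端设计师要懂的信息架构' -- MapCompose(ext) prefixes each extracted value,
# and TakeFirst() collapses the list of extracted strings to a single value per field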
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class UisdcSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class UisdcDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class UisdcPipeline:
def process_item(self, item, spider):
return item
# Scrapy settings for uisdc project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'uisdc'
SPIDER_MODULES = ['uisdc.spiders']
NEWSPIDER_MODULE = 'uisdc.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'uisdc (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'uisdc.middlewares.UisdcSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'uisdc.middlewares.UisdcDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'uisdc.pipelines.UisdcPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# LOG_LEVEL = 'WARNING'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
from scrapy.loader import ItemLoader
from uisdc.items import UisdcItem
class UiSpider(scrapy.Spider):
name = 'ui'
allowed_domains = ['www.uisdc.com']
start_urls = ['https://www.uisdc.com/archives']
custom_settings = {
"ROBOTSTXT_OBEY": False
}
def parse(self, response):
items = response.xpath('//div[@id="archive_list"]/div/div[1]/div[1]/div[contains(@class,"item-article")]')
for i in items:
            loader = ItemLoader(item=UisdcItem(), selector=i)
            loader.add_xpath('title', ".//h2[@class='item-title']/a/text()")
            loader.add_xpath('author', ".//h3[@class='meta-name']/text()")
            loader.add_xpath('tag', ".//div[@class='meta-tag']/a/text()")
            ret = loader.load_item()
# print(ret)
yield ret