diff --git a/NO54/a.txt b/NO54/a.txt new file mode 100644 index 0000000000000000000000000000000000000000..7767b9ee78ab545f6bb797a618605879f53a02cc --- /dev/null +++ b/NO54/a.txt @@ -0,0 +1,31 @@ +name:银黄胶囊name:阿胶益寿口服液name:香菊片name:舒阴洁洗剂name:灵丹草合剂name:田七痛经胶囊name:枣仁安神液name:复方庆大霉素膜name:复方穿心莲片name:橘半止咳颗粒name:银黄胶囊name:一清颗粒name:虫草洋参胶囊name:归圆口服液name:五子衍宗丸name:清气化痰丸name:藿香清胃片name:穿心莲片name:维C银翘片name:银黄颗粒name:抗脑衰胶囊name:苦胆草片name:通便灵胶囊name:复方鲜竹沥液name:强力枇杷露name:生脉饮name:复方海蛇胶囊name:宁心宝胶囊name:银黄颗粒name:六味地黄胶囊 +name:银黄胶囊 +name:阿胶益寿口服液 +name:香菊片 +name:舒阴洁洗剂 +name:灵丹草合剂 +name:田七痛经胶囊 +name:枣仁安神液 +name:复方庆大霉素膜 +name:复方穿心莲片 +name:橘半止咳颗粒 +name:银黄胶囊 +name:一清颗粒 +name:虫草洋参胶囊 +name:归圆口服液 +name:五子衍宗丸 +name:清气化痰丸 +name:藿香清胃片 +name:穿心莲片 +name:维C银翘片 +name:银黄颗粒 +name:抗脑衰胶囊 +name:苦胆草片 +name:通便灵胶囊 +name:复方鲜竹沥液 +name:强力枇杷露 +name:生脉饮 +name:复方海蛇胶囊 +name:宁心宝胶囊 +name:银黄颗粒 +name:六味地黄胶囊 \ No newline at end of file diff --git a/NO54/data/yy/2021-10-27T06-49-06.csv b/NO54/data/yy/2021-10-27T06-49-06.csv new file mode 100644 index 0000000000000000000000000000000000000000..57660c18936e453958a0ea11ddea7ac4f40b4a91 --- /dev/null +++ b/NO54/data/yy/2021-10-27T06-49-06.csv @@ -0,0 +1,31 @@ +name +银黄胶囊 +阿胶益寿口服液 +香菊片 +舒阴洁洗剂 +灵丹草合剂 +田七痛经胶囊 +枣仁安神液 +复方庆大霉素膜 +复方穿心莲片 +橘半止咳颗粒 +银黄胶囊 +一清颗粒 +虫草洋参胶囊 +归圆口服液 +五子衍宗丸 +清气化痰丸 +藿香清胃片 +穿心莲片 +维C银翘片 +银黄颗粒 +抗脑衰胶囊 +苦胆草片 +通便灵胶囊 +复方鲜竹沥液 +强力枇杷露 +生脉饮 +复方海蛇胶囊 +宁心宝胶囊 +银黄颗粒 +六味地黄胶囊 diff --git a/NO54/scrapy.cfg b/NO54/scrapy.cfg new file mode 100644 index 0000000000000000000000000000000000000000..6ab556ae1d91758c848d3320ea89acce1f4dc473 --- /dev/null +++ b/NO54/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = yiyao.settings + +[deploy] +#url = http://localhost:6800/ +project = yiyao diff --git a/NO54/yiyao/__init__.py b/NO54/yiyao/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/NO54/yiyao/__pycache__/__init__.cpython-37.pyc b/NO54/yiyao/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e28f3438a90063a81d48d7db991a8c6eaa0aa5f Binary files /dev/null and b/NO54/yiyao/__pycache__/__init__.cpython-37.pyc differ diff --git a/NO54/yiyao/__pycache__/items.cpython-37.pyc b/NO54/yiyao/__pycache__/items.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..295398b5c38927f66c3b48b2d171dc9a59268412 Binary files /dev/null and b/NO54/yiyao/__pycache__/items.cpython-37.pyc differ diff --git a/NO54/yiyao/__pycache__/my_ext.cpython-37.pyc b/NO54/yiyao/__pycache__/my_ext.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3e44f57837a210098d5f48a5f6e3cdda902107a Binary files /dev/null and b/NO54/yiyao/__pycache__/my_ext.cpython-37.pyc differ diff --git a/NO54/yiyao/__pycache__/pipelines.cpython-37.pyc b/NO54/yiyao/__pycache__/pipelines.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e81161ae4e5456cddadde0819f1868e6858add4c Binary files /dev/null and b/NO54/yiyao/__pycache__/pipelines.cpython-37.pyc differ diff --git a/NO54/yiyao/__pycache__/settings.cpython-37.pyc b/NO54/yiyao/__pycache__/settings.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8f310db9fde1bd9bedab0400e4088e3eada7119 Binary files /dev/null and b/NO54/yiyao/__pycache__/settings.cpython-37.pyc differ diff --git a/NO54/yiyao/items.py 
b/NO54/yiyao/items.py new file mode 100644 index 0000000000000000000000000000000000000000..ad8615e6bf3681e4b628752b2630a860e5189e1d --- /dev/null +++ b/NO54/yiyao/items.py @@ -0,0 +1,12 @@ +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class YiyaoItem(scrapy.Item): + # define the fields for your item here like: + name = scrapy.Field() + diff --git a/NO54/yiyao/middlewares.py b/NO54/yiyao/middlewares.py new file mode 100644 index 0000000000000000000000000000000000000000..bff82ca76e4b108f5ebf26eba36fbada2cc601ae --- /dev/null +++ b/NO54/yiyao/middlewares.py @@ -0,0 +1,103 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + +# useful for handling different item types with a single interface +from itemadapter import is_item, ItemAdapter + + +class YiyaoSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class YiyaoDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. 
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/NO54/yiyao/my_ext.py b/NO54/yiyao/my_ext.py
new file mode 100644
index 0000000000000000000000000000000000000000..795ecae64620ada90ea2d057ad616062498ff757
--- /dev/null
+++ b/NO54/yiyao/my_ext.py
@@ -0,0 +1,15 @@
+from scrapy.exporters import BaseItemExporter
+
+
+class TXTItemExporter(BaseItemExporter):
+
+    def __init__(self, file, **kwargs):
+        super().__init__(dont_fail=True, **kwargs)
+        self.file = file
+
+    def export_item(self, item):
+        # _get_serialized_fields() yields every (name, value) pair of the item; the two print() calls below are debug output
+        print(self._get_serialized_fields(item, default_value=''))
+        print(self.file)
+        for name, value in self._get_serialized_fields(item, default_value=''):
+            self.file.write(bytes("\nname:" + value, encoding="utf-8"))
diff --git a/NO54/yiyao/pipelines.py b/NO54/yiyao/pipelines.py
new file mode 100644
index 0000000000000000000000000000000000000000..60742ad8a01138a424d1827a0636e9b9e996532b
--- /dev/null
+++ b/NO54/yiyao/pipelines.py
@@ -0,0 +1,13 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class YiyaoPipeline:
+    def process_item(self, item, spider):
+        return item
diff --git a/NO54/yiyao/settings.py b/NO54/yiyao/settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..103b1225aaea495751dfc8ed9cfb959b327daa82
--- /dev/null
+++ b/NO54/yiyao/settings.py
@@ -0,0 +1,96 @@
+# Scrapy settings for yiyao project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used.
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'yiyao' + +SPIDER_MODULES = ['yiyao.spiders'] +NEWSPIDER_MODULE = 'yiyao.spiders' + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'yiyao (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +# } + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# 'yiyao.middlewares.YiyaoSpiderMiddleware': 543, +# } + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# 'yiyao.middlewares.YiyaoDownloaderMiddleware': 543, +# } + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +# } + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'yiyao.pipelines.YiyaoPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' +# FEEDS = { +# '%(batch_id)d.csv': { +# 'format': 'csv', +# 'encoding': 'utf8', +# 'batch_item_count': 2, +# }, +# } + +FEED_EXPORTERS={'txt':'yiyao.my_ext.TXTItemExporter'} diff --git a/NO54/yiyao/spiders/__init__.py b/NO54/yiyao/spiders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ebd689ac51d69c5e1dbbe80083c2b20a39f8bb79 --- /dev/null +++ b/NO54/yiyao/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to 
create and manage +# your spiders. diff --git a/NO54/yiyao/spiders/__pycache__/__init__.cpython-37.pyc b/NO54/yiyao/spiders/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4fb00387ddb396c6e475085402bb68cd500720d Binary files /dev/null and b/NO54/yiyao/spiders/__pycache__/__init__.cpython-37.pyc differ diff --git a/NO54/yiyao/spiders/__pycache__/yy.cpython-37.pyc b/NO54/yiyao/spiders/__pycache__/yy.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d1e5e034a295d35dfd802cc7ca8112e92cf1bda Binary files /dev/null and b/NO54/yiyao/spiders/__pycache__/yy.cpython-37.pyc differ diff --git a/NO54/yiyao/spiders/yy.py b/NO54/yiyao/spiders/yy.py new file mode 100644 index 0000000000000000000000000000000000000000..dfee424c6ec8f243161e186a17c0602378069121 --- /dev/null +++ b/NO54/yiyao/spiders/yy.py @@ -0,0 +1,14 @@ +import scrapy +from yiyao.items import YiyaoItem + +class YySpider(scrapy.Spider): + name = 'yy' + allowed_domains = ['pharmnet.com.cn'] + start_urls = ['http://www.pharmnet.com.cn/product/1111/1/1.html'] + + def parse(self, response): + all_items = response.css('a.green.fb.f13::text').getall() + for item in all_items: + ret = YiyaoItem() + ret["name"] = item + yield ret diff --git "a/NO55/images/\345\217\257\347\210\261\346\260\264\346\211\213\346\234\215\347\276\216\345\245\263\346\270\205\346\226\260\351\253\230\346\270\205\346\241\214\351\235\242\345\243\201\347\272\270_0.jpg" "b/NO55/images/\345\217\257\347\210\261\346\260\264\346\211\213\346\234\215\347\276\216\345\245\263\346\270\205\346\226\260\351\253\230\346\270\205\346\241\214\351\235\242\345\243\201\347\272\270_0.jpg" new file mode 100644 index 0000000000000000000000000000000000000000..ee61607d3b2e5744db0b2faa17a8084e1352c414 Binary files /dev/null and "b/NO55/images/\345\217\257\347\210\261\346\260\264\346\211\213\346\234\215\347\276\216\345\245\263\346\270\205\346\226\260\351\253\230\346\270\205\346\241\214\351\235\242\345\243\201\347\272\270_0.jpg" differ diff --git a/NO55/scrapy.cfg b/NO55/scrapy.cfg new file mode 100644 index 0000000000000000000000000000000000000000..07ca752ec899a6f203ac0d41287bbef9458ed46a --- /dev/null +++ b/NO55/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = sougou.settings + +[deploy] +#url = http://localhost:6800/ +project = sougou diff --git a/NO55/sougou/__init__.py b/NO55/sougou/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/NO55/sougou/__pycache__/__init__.cpython-37.pyc b/NO55/sougou/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42385582a9fa83199f39a0850e4b955c8d0fc0eb Binary files /dev/null and b/NO55/sougou/__pycache__/__init__.cpython-37.pyc differ diff --git a/NO55/sougou/__pycache__/pipelines.cpython-37.pyc b/NO55/sougou/__pycache__/pipelines.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..efeb234bea29114a843abf1969786bf20d0728cf Binary files /dev/null and b/NO55/sougou/__pycache__/pipelines.cpython-37.pyc differ diff --git a/NO55/sougou/__pycache__/settings.cpython-37.pyc b/NO55/sougou/__pycache__/settings.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f2cade41fab26d74611e4a34fd55483b1c1caba Binary files /dev/null and 
b/NO55/sougou/__pycache__/settings.cpython-37.pyc differ diff --git a/NO55/sougou/items.py b/NO55/sougou/items.py new file mode 100644 index 0000000000000000000000000000000000000000..88a12dce95a0f536671c4097a8e3eca958ba99d9 --- /dev/null +++ b/NO55/sougou/items.py @@ -0,0 +1,12 @@ +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class SougouItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + pass diff --git a/NO55/sougou/middlewares.py b/NO55/sougou/middlewares.py new file mode 100644 index 0000000000000000000000000000000000000000..40a5a5452ac5db76fdab675acb3a8f034ec376f0 --- /dev/null +++ b/NO55/sougou/middlewares.py @@ -0,0 +1,103 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + +# useful for handling different item types with a single interface +from itemadapter import is_item, ItemAdapter + + +class SougouSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class SougouDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. 
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/NO55/sougou/pipelines.py b/NO55/sougou/pipelines.py
new file mode 100644
index 0000000000000000000000000000000000000000..86a896ddbbee9ac3892fb515dd1c0d0469a7ffe2
--- /dev/null
+++ b/NO55/sougou/pipelines.py
@@ -0,0 +1,32 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+from scrapy.http import Request
+from scrapy.pipelines.images import ImagesPipeline
+
+class SougouPipeline:
+    def process_item(self, item, spider):
+        return item
+
+
+class SogouImgPipeline(ImagesPipeline):
+
+    def get_media_requests(self, item, info):
+        name = item["name"]
+        for index, url in enumerate(item["image_urls"]):
+            yield Request(url, meta={'name': name, 'index': index})
+
+    def file_path(self, request, response=None, info=None):
+        # image group title, carried over from the item via request meta
+        name = request.meta['name']
+        # index of this url within the group
+        index = request.meta['index']
+
+        filename = u'{0}_{1}.jpg'.format(name, index)
+        print(filename)  # debug output: the generated file name
+        return filename
diff --git a/NO55/sougou/settings.py b/NO55/sougou/settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..03aa520a48ffef1d2d7e6083f4021da5a60593ae
--- /dev/null
+++ b/NO55/sougou/settings.py
@@ -0,0 +1,91 @@
+# Scrapy settings for sougou project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used.
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'sougou' + +SPIDER_MODULES = ['sougou.spiders'] +NEWSPIDER_MODULE = 'sougou.spiders' + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +DOWNLOAD_DELAY = 1 +RANDOMIZE_DOWNLOAD_DELAY = True +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +DEFAULT_REQUEST_HEADERS = { + 'Accept': 'application/json, text/plain, */*', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'zh-CN,zh;q=0.9', + 'HOST': 'pic.sogou.com', +} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# 'sougou.middlewares.SougouSpiderMiddleware': 543, +# } + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# 'sougou.middlewares.SougouDownloaderMiddleware': 543, +# } + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +# } + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'sougou.pipelines.SogouImgPipeline': 1, +} +IMAGES_STORE = "images" + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/NO55/sougou/spiders/__init__.py b/NO55/sougou/spiders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ebd689ac51d69c5e1dbbe80083c2b20a39f8bb79 --- /dev/null +++ b/NO55/sougou/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for 
information on how to create and manage +# your spiders. diff --git a/NO55/sougou/spiders/__pycache__/__init__.cpython-37.pyc b/NO55/sougou/spiders/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9745f818ac06a20354eb3ad51ce45453a3e1b250 Binary files /dev/null and b/NO55/sougou/spiders/__pycache__/__init__.cpython-37.pyc differ diff --git a/NO55/sougou/spiders/__pycache__/sg.cpython-37.pyc b/NO55/sougou/spiders/__pycache__/sg.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..839683768768fb010e84dc91b629743ebf7cea69 Binary files /dev/null and b/NO55/sougou/spiders/__pycache__/sg.cpython-37.pyc differ diff --git a/NO55/sougou/spiders/sg.py b/NO55/sougou/spiders/sg.py new file mode 100644 index 0000000000000000000000000000000000000000..e0104078d2943cc8d21bf117c64f3229d8e79a16 --- /dev/null +++ b/NO55/sougou/spiders/sg.py @@ -0,0 +1,20 @@ +import scrapy + + +class SgSpider(scrapy.Spider): + name = 'sg' + allowed_domains = ['pic.sogou.com'] + base_url = "https://pic.sogou.com/napi/pc/recommend?key=homeFeedData&category=feed&start={}&len=10" + start_urls = [base_url.format(0)] + + def parse(self, response): + json_data = response.json() + if json_data is not None: + img_list = json_data["data"]["list"] + for img in img_list: + yield { + 'name': img[0]['title'], + 'image_urls': [_["originImage"] for _ in img[0]["picList"]], + } + else: + return None diff --git a/NO56/ca_tt.py b/NO56/ca_tt.py new file mode 100644 index 0000000000000000000000000000000000000000..c89d4d2b5412500000af9980b6f8a3c419b88f6a --- /dev/null +++ b/NO56/ca_tt.py @@ -0,0 +1,8 @@ +import browsercookie +import requests +import re +firefox_cookiejar = browsercookie.firefox() + + +# res = requests.get("https://img-home.csdnimg.cn/data_json/jsconfig/menu_path.json", cookies=firefox_cookiejar) +# print(res.text) diff --git a/NO56/csdn/__init__.py b/NO56/csdn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/NO56/csdn/__pycache__/__init__.cpython-37.pyc b/NO56/csdn/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b958de86cf86f87ad87195e11655bd75ca73b1b9 Binary files /dev/null and b/NO56/csdn/__pycache__/__init__.cpython-37.pyc differ diff --git a/NO56/csdn/__pycache__/middlewares.cpython-37.pyc b/NO56/csdn/__pycache__/middlewares.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eec54e62ee095f8358ac3291a8f0aaae6f3713fd Binary files /dev/null and b/NO56/csdn/__pycache__/middlewares.cpython-37.pyc differ diff --git a/NO56/csdn/__pycache__/settings.cpython-37.pyc b/NO56/csdn/__pycache__/settings.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b18e8cff473d7c112eaf0535a31f6dd7a21c8fd Binary files /dev/null and b/NO56/csdn/__pycache__/settings.cpython-37.pyc differ diff --git a/NO56/csdn/items.py b/NO56/csdn/items.py new file mode 100644 index 0000000000000000000000000000000000000000..ffc1ebb5d85b66d753723c02e132d3db87f1760c --- /dev/null +++ b/NO56/csdn/items.py @@ -0,0 +1,12 @@ +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class CsdnItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + pass diff --git a/NO56/csdn/middlewares.py b/NO56/csdn/middlewares.py new file mode 100644 index 
0000000000000000000000000000000000000000..ee2cb77b49ad3dc392b3cc53277495b1216b3e7c
--- /dev/null
+++ b/NO56/csdn/middlewares.py
@@ -0,0 +1,118 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+from scrapy.downloadermiddlewares.cookies import CookiesMiddleware
+import browsercookie
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class CsdnSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class CsdnDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class BrowserCookiesDownloaderMiddleware(CookiesMiddleware):
+    def __init__(self, debug=False):
+        super().__init__(debug)
+        self.load_browser_cookies()
+
+    def load_browser_cookies(self):
+        # note: the jar key here is 'firefox', matching meta={'cookiejar': 'firefox'} in the spider
+        jar = self.jars['firefox']
+        firefox_cookiejar = browsercookie.firefox()
+        for cookie in firefox_cookiejar:
+            jar.set_cookie(cookie)
diff --git a/NO56/csdn/pipelines.py b/NO56/csdn/pipelines.py
new file mode 100644
index 0000000000000000000000000000000000000000..797e40e1a425e318ed0cf34e7289b10635a17279
--- /dev/null
+++ b/NO56/csdn/pipelines.py
@@ -0,0 +1,13 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class CsdnPipeline:
+    def process_item(self, item, spider):
+        return item
diff --git a/NO56/csdn/settings.py b/NO56/csdn/settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9dbedb4af64dfe65a1e06bb7d26cb1c8c035984
--- /dev/null
+++ b/NO56/csdn/settings.py
@@ -0,0 +1,88 @@
+# Scrapy settings for csdn project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+# https://docs.scrapy.org/en/latest/topics/settings.html
+# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'csdn'
+
+SPIDER_MODULES = ['csdn.spiders']
+NEWSPIDER_MODULE = 'csdn.spiders'
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+# CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+# DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+# COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+# TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+# DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+# }
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# SPIDER_MIDDLEWARES = {
+#    'csdn.middlewares.CsdnSpiderMiddleware': 543,
+# }
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+DOWNLOADER_MIDDLEWARES = {
+    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
+    'csdn.middlewares.BrowserCookiesDownloaderMiddleware': 543,
+}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+# EXTENSIONS = {
+# 'scrapy.extensions.telnet.TelnetConsole': None, +# } + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +# ITEM_PIPELINES = { +# 'csdn.pipelines.CsdnPipeline': 300, +# } + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/NO56/csdn/spiders/__init__.py b/NO56/csdn/spiders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ebd689ac51d69c5e1dbbe80083c2b20a39f8bb79 --- /dev/null +++ b/NO56/csdn/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/NO56/csdn/spiders/__pycache__/__init__.cpython-37.pyc b/NO56/csdn/spiders/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..338302f89a6c2acdd2cacdeb095f01966edb0d13 Binary files /dev/null and b/NO56/csdn/spiders/__pycache__/__init__.cpython-37.pyc differ diff --git a/NO56/csdn/spiders/__pycache__/clike.cpython-37.pyc b/NO56/csdn/spiders/__pycache__/clike.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24a3820b93d72fad2a2164ad17c46c221e741ba8 Binary files /dev/null and b/NO56/csdn/spiders/__pycache__/clike.cpython-37.pyc differ diff --git a/NO56/csdn/spiders/clike.py b/NO56/csdn/spiders/clike.py new file mode 100644 index 0000000000000000000000000000000000000000..0aec92cf28d49420aed37a34957f96ba7709d4e7 --- /dev/null +++ b/NO56/csdn/spiders/clike.py @@ -0,0 +1,16 @@ +import scrapy + + +class ClikeSpider(scrapy.Spider): + name = 'clike' + allowed_domains = ['csdn.net'] + like_url = 'https://blog.csdn.net/phoenix/web/v1/article/like' + + def start_requests(self): + data = { + "articleId": "120845464", + } + yield scrapy.FormRequest(url=self.like_url, formdata=data, meta={'cookiejar': 'firefox'}) + + def parse(self, response): + print(response.json()) diff --git a/NO56/scrapy.cfg b/NO56/scrapy.cfg new file mode 100644 index 0000000000000000000000000000000000000000..1e233dd5b417006ad1196770702e2e12663234c0 --- /dev/null +++ b/NO56/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = csdn.settings + +[deploy] +#url = http://localhost:6800/ +project = csdn diff --git a/README.md b/README.md index 4a631b5fea19eb696be01a08fa5bb5cd5ab63b51..61f97ca7e49459cd59cc93241c1c4c6e27e178ee 100644 --- a/README.md +++ b/README.md @@ -94,5 +94,10 @@ 48. 
[程序员跨行帮朋友,python爬虫之饲料添加剂数据,采集+备份](https://dream.blog.csdn.net/article/details/121028282)
 49. [CSDN热榜、华为云博客都可用来练习Python scrapy 爬虫](https://dream.blog.csdn.net/article/details/121066927)
 50. [纯纯的爬虫知识,python scrapy 下载中间件知多少](https://dream.blog.csdn.net/article/details/121083780)
-51. [20行Python代码、爬虫、蓝桥训练营,一篇博客整合这几个关键词](https://dream.blog.csdn.net/article/details/121151700)
+51. [20行Python scrapy 代码,去采集【蓝桥】训练营](https://dream.blog.csdn.net/article/details/121151700)
+52. [Scrapy Spider中间件,你学会了吗?本篇博客有一案例](https://dream.blog.csdn.net/article/details/120969435)
+53. [通过淘宝数据学习爬虫,python scrapy requests与response对象](https://dream.blog.csdn.net/article/details/120979533)
+54. [你知道在 scrapy 中,可以定制化导出数据格式吗?scrapy 导出器学习](https://dream.blog.csdn.net/article/details/120992365)
+55. [python scrapy ,几行代码实现一个【搜狗图片】下载器](https://dream.blog.csdn.net/article/details/120996308)
+56. [Python爬虫落地应用之【自动化点赞器】,一篇游走在封禁边缘的博客](https://dream.blog.csdn.net/article/details/121000212)
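
The NO54 project (README item 54) registers its custom exporter through `FEED_EXPORTERS = {'txt': 'yiyao.my_ext.TXTItemExporter'}`, which makes a `txt` feed format available to any crawl. A quick way to sanity-check the exporter outside of Scrapy is to drive it by hand — a minimal sketch assuming nothing beyond what the diff defines (the `demo.txt` filename is illustrative):

```python
# Standalone smoke test for NO54's TXTItemExporter (run from inside NO54/).
from yiyao.items import YiyaoItem
from yiyao.my_ext import TXTItemExporter

with open("demo.txt", "wb") as f:   # the exporter writes raw bytes
    exporter = TXTItemExporter(f)
    exporter.start_exporting()      # no-op inherited from BaseItemExporter
    item = YiyaoItem()
    item["name"] = "银黄胶囊"
    exporter.export_item(item)      # appends "\nname:银黄胶囊" to the file
    exporter.finish_exporting()
```

In a real crawl, Scrapy 2.1+ can route output through the registered format with something like `scrapy crawl yy -o data/a.txt:txt`, which is presumably how `NO54/a.txt` was generated.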