From c2bae541c77e43f2f0ae11ebcb1071236400ca02 Mon Sep 17 00:00:00 2001 From: hihell Date: Mon, 25 Oct 2021 10:14:14 +0800 Subject: [PATCH] =?UTF-8?q?=E9=98=85=E6=96=87=E4=BD=9C=E5=AE=B6=E4=B8=AD?= =?UTF-8?q?=E5=BF=83=E7=88=AC=E8=99=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- NO45/scrapy.cfg | 11 ++ NO45/yuewen/__init__.py | 0 .../__pycache__/__init__.cpython-37.pyc | Bin 0 -> 147 bytes .../__pycache__/settings.cpython-37.pyc | Bin 0 -> 279 bytes NO45/yuewen/items.py | 12 ++ NO45/yuewen/middlewares.py | 103 ++++++++++++++++++ NO45/yuewen/pipelines.py | 13 +++ NO45/yuewen/settings.py | 89 +++++++++++++++ NO45/yuewen/spiders/__init__.py | 4 + .../__pycache__/__init__.cpython-37.pyc | Bin 0 -> 155 bytes .../spiders/__pycache__/yw.cpython-37.pyc | Bin 0 -> 1594 bytes NO45/yuewen/spiders/yw.py | 34 ++++++ README.md | 5 +- 13 files changed, 270 insertions(+), 1 deletion(-) create mode 100644 NO45/scrapy.cfg create mode 100644 NO45/yuewen/__init__.py create mode 100644 NO45/yuewen/__pycache__/__init__.cpython-37.pyc create mode 100644 NO45/yuewen/__pycache__/settings.cpython-37.pyc create mode 100644 NO45/yuewen/items.py create mode 100644 NO45/yuewen/middlewares.py create mode 100644 NO45/yuewen/pipelines.py create mode 100644 NO45/yuewen/settings.py create mode 100644 NO45/yuewen/spiders/__init__.py create mode 100644 NO45/yuewen/spiders/__pycache__/__init__.cpython-37.pyc create mode 100644 NO45/yuewen/spiders/__pycache__/yw.cpython-37.pyc create mode 100644 NO45/yuewen/spiders/yw.py diff --git a/NO45/scrapy.cfg b/NO45/scrapy.cfg new file mode 100644 index 0000000..7d9e63b --- /dev/null +++ b/NO45/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = yuewen.settings + +[deploy] +#url = http://localhost:6800/ +project = yuewen diff --git a/NO45/yuewen/__init__.py b/NO45/yuewen/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/NO45/yuewen/__pycache__/__init__.cpython-37.pyc b/NO45/yuewen/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..352f7e7f3a69f0aa104005137d300547e384109d GIT binary patch literal 147 zcmZ?b<>g`kg7A=tL=gQLM8E(ekl_Ht#VkM~g&~+hlhJP_LlH&ryk0@& UEe@O9{FKt1R6CHVpMjVG0FXo{5&!@I literal 0 HcmV?d00001 diff --git a/NO45/yuewen/__pycache__/settings.cpython-37.pyc b/NO45/yuewen/__pycache__/settings.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38ac59b39e4cc753817649eac329245b46d7d99d GIT binary patch literal 279 zcmZ?b<>g`kf{=>bL=GMXhQ}Zd3@`#24nSNi1td}!q8L&b(;1@}Qy8O|QkbHcQ<$Sz zQdoi+G+CqADoaz#Q}e3$Ahcd_L1s#7QL)=C_Hf4_KTkh*KTXzK98Ufr@qUiJuDAGt z13X<^gW`StT|#|agKr7=xrU<%-Qo-K2dWMZi3o}JcXExq#p&bk9`EBC=IXPOp@<#m z95C_K63BA3iYcfp$;i(OD9X=DO)iOf-m&Jz%+-cQ22b{NKby1W>6{HQCZ;hEm&FvP mmXu`Xr5EcJRNmsS$<0qG%}KQb1wkg`kg7A=t#DzfmF^B^Lj6jA15Erumi4=xl22Do4l?+87VFdBj5y*A5iYcfp z$;i(OD9X=DO)iOf-m&Jz%+-cQ22b{NKby1W>6{HQCZ;i!rK#nqc`?NWnJKA7#WC^m dnR%Hd@$q^EmA5!-a`RJ4b5iX_Po^H z%X*hJ1nfh==m!LRkd7ffb|gmV|Rj4_(z={iv_Ls-7>J%^HFB z{xpkr972A_$#H`@*@7-V1HlNRDG5+pDb1|FqByowJ97dja|4%>F9~y)`;0J`J5TMP z!ss31S6;(u((|cqewE~pb{;Dc#i|g#rR&=w8l`v2gmIyr+ryN{xc9iZFxY}F?}6YX zpo|0-qk+w=2Q+Y)4btTmUPZ6_lvP-D-!gNLor8Ie)%UG|6?m+{n$JjZjxDhk=o)LY z^PubO0=o#h!IlB%iuShe&#(m6cr}v@w zl}KV0J}x7*EAN7>GAQuI*n*STP}|UvZg~lWg6uw}lqh;&G3$_iLomC*irt&6tgWq; zqKLVaA>2Jol?2QGX2iF(TZ$xCx*}r{m1AAK&G&`?8Y;q(JRN8c+>}Kw`AVmt(bCH0hK;9NuJJ=d|)5K z1HMcSoI~=&O2`v?YE5nC+#pkXQ2=gl(#8Tg-fKi&)K_^S6UG)5>_QEiaG9HGy7KT~D;78+>MjMIrD{Sb;9_ zfCo%eXvJ<@%eLph=b3fCDv&L`GJ92>6n?gI@(UqnZkOh3^VcD;P-dvm!`obI@5U_N On@Frg5FTyNw*4=p3!9Aq literal 0 HcmV?d00001 diff --git a/NO45/yuewen/spiders/yw.py b/NO45/yuewen/spiders/yw.py new file mode 100644 index 0000000..a3205b5 --- /dev/null +++ b/NO45/yuewen/spiders/yw.py @@ -0,0 +1,34 @@ +import scrapy +from scrapy.linkextractors import LinkExtractor +from scrapy.spiders import CrawlSpider, Rule + + +class CbSpider(CrawlSpider): + name = 'yw' + allowed_domains = ['write.qq.com'] + start_urls = ['https://write.qq.com/portal/article?filterType=0&page=1'] + # URL 提取规则 + rules = ( + Rule(LinkExtractor(allow=r'.*/portal/content\?caid=\d+&feedType=2&lcid=\d+$'), callback="parse_item"), + # 寻找下一页 url 地址 + Rule(LinkExtractor(restrict_xpaths="//a[@title='下一页']"), follow=True), + ) + + def parse_start_url(self, response): + print("---process_results---") + yield scrapy.Request('https://write.qq.com/portal/article?filterType=0&page=1') + + def process_results(self, response, results): + print("---process_results---") + print(results) + + def parse_item(self, response): + print("---parse_item---") + print(response.url) + title = response.css('title::text').extract()[0].strip() + item = {} + item["title"] = title + yield item + + def parse(self): + pass \ No newline at end of file diff --git a/README.md b/README.md index 169ec4e..3ebcbc2 100644 --- a/README.md +++ b/README.md @@ -85,5 +85,8 @@ ### 📘 scrapy 库学习 - +42. [学python,怎么能不学习scrapy呢,这篇博客带你学会它](https://dream.blog.csdn.net/article/details/120899494) +43. [python scrapy 管道学习,并拿在行练手爬虫项目](https://dream.blog.csdn.net/article/details/120934425) +44. [python scrapy极细拆解,打开Spider类看内容,顺手爬了一下优设网](https://dream.blog.csdn.net/article/details/120936534) +45. [练手练到阅文集团作家中心了,python crawlspider 二维抓取学习](https://dream.blog.csdn.net/article/details/120835220) -- GitLab