diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/autoscraper_desc.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/autoscraper_desc.json" new file mode 100644 index 0000000000000000000000000000000000000000..06f87988f78620c788e67c37729598654137d147 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/autoscraper_desc.json" @@ -0,0 +1,7 @@ +{ + "source": "autoscraper_desc.md", + "depends": [], + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/autoscraper_desc.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/autoscraper_desc.md" new file mode 100644 index 0000000000000000000000000000000000000000..10701be3334d5dd341e9549ac664b57fe5bd3805 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/autoscraper_desc.md" @@ -0,0 +1,29 @@ +# autoscraper简介 + +autoscraper是一个基于python的智能、自动、快速和轻量级的网络爬虫,以下说法错误的是: + +## 答案 + +```bash +是目前解析速度最快的网络爬虫 +``` + +## 选项 + +### A + +```bash +同时提供了精确抽取的方法 +``` + +### B + +```bash +可以根据示例文本自动抽取相似的文本 +``` + +### C + +```bash +避免了手写页面抽取规则的烦恼 +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/config.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/config.json" new file mode 100644 index 0000000000000000000000000000000000000000..42d7c05a641f8498c8e3817d6c73c5e0011025e5 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/config.json" @@ -0,0 
+1,9 @@ +{ + "export": ["autoscraper_desc.json", "hello_autoscraper.json"], + "keywords": [], + "children": [], + "keywords_must": [ + "autoscraper" + ], + "keywords_forbid": [] +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.json" new file mode 100644 index 0000000000000000000000000000000000000000..855c9906222ee63ca42d022e166817f8993f8dab --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.json" @@ -0,0 +1,7 @@ +{ + "source": "hello_autoscraper.md", + "depends": [], + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.md" new file mode 100644 index 0000000000000000000000000000000000000000..7ac426dfd3c39be194aed008e4b343e4a4751373 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.md" @@ -0,0 +1,46 @@ +# autoscraper示例 + +使用 autoscraper 从stackoverflow搜索页提取相似的主题帖,代码如下: + +```python +# -*- coding: UTF-8 -*- +from autoscraper import AutoScraper + +def get_similar_result(url, wanted_list): + scraper = AutoScraper() + # TODO(You): 正确的提取代码 + return result + +url = 'https://stackoverflow.com/search?q=autoscraper&s=7b5866da-920e-4926-8c33-09fb7d32886b' +wanted_list = ["AutoScraper module not found in Python Autoscraper library"] +print(get_similar_result(url, wanted_list)) + +``` + +关于缺失代码部分,以下选项正确的是: + +## 答案 + +```python +result = 
scraper.build(url, wanted_list) +``` + +## 选项 + +### A + +```python +result = scraper.get_result_similar(url, wanted_list) +``` + +### B + +```python +result = scraper.get(url, wanted_list) +``` + +### C + +```python +result = scraper.get_result_exact(url, wanted_list) +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.py" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.py" new file mode 100644 index 0000000000000000000000000000000000000000..ba8f24e53e25af1b1cc8a6e23d0a396b56a36a68 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.py" @@ -0,0 +1,12 @@ +# -*- coding: UTF-8 -*- + +from autoscraper import AutoScraper + +def get_similar_result(url, wanted_list): + scraper = AutoScraper() + result = scraper.build(url, wanted_list) + return result + +url = 'https://stackoverflow.com/search?q=autoscraper&s=7b5866da-920e-4926-8c33-09fb7d32886b' +wanted_list = ["AutoScraper module not found in Python Autoscraper library"] +print(get_similar_result(url, wanted_list)) \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/config.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/config.json" new file mode 100644 index 0000000000000000000000000000000000000000..1d44cedf30cfcb91fc4086e0d61bad323bc04f9f --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/config.json" @@ -0,0 +1,9 @@ +{ + "export": ["selectolax_desc.json", "hello_selectolax.json"], + "keywords": [], + "children": [], + "keywords_must": [ + "selectolax" + ], + "keywords_forbid": [] +} \ No newline at end of file diff --git 
"a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.json" new file mode 100644 index 0000000000000000000000000000000000000000..4c6179e87bae1cdc1742ce1ed711058c5af4a009 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.json" @@ -0,0 +1,7 @@ +{ + "source": "hello_selectolax.md", + "depends": [], + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.md" new file mode 100644 index 0000000000000000000000000000000000000000..f9c87b8b601f254582b45d31a82cc8e83ceb324b --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.md" @@ -0,0 +1,57 @@ +# selectolax示例 + +使用 selectolax 提取页面p标签的内容,代码如下: + +```python +# -*- coding: UTF-8 -*- +from selectolax.parser import HTMLParser + +def get_p(html): + p_list = [] + for node in HTMLParser(html).css("p"): + # TODO(You): 正确的提取代码 + return p_list + +html = ''' + + + 这是一个简单的测试页面 + + +

body 元素的内容会显示在浏览器中。

+

title 元素的内容会显示在浏览器的标题栏中。

+ + + ''' + +print(get_p(html)) + +``` + +关于缺失代码部分,以下选项正确的是: + +## 答案 + +```python +p_list.append(node.text()) +``` + +## 选项 + +### A + +```python +p_list.append(node.text) +``` + +### B + +```python +p_list.append(node) +``` + +### C + +```python +p_list.append(node.get_text()) +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.py" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.py" new file mode 100644 index 0000000000000000000000000000000000000000..c55eed116ef2e67b9d7744cfe25aaf0b7e00d347 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.py" @@ -0,0 +1,24 @@ +# -*- coding: UTF-8 -*- + +from selectolax.parser import HTMLParser + + +def get_p(html): + p_list = [] + for node in HTMLParser(html).css("p"): + p_list.append(node.text()) + return p_list + +html = ''' + + + 这是一个简单的测试页面 + + +

body 元素的内容会显示在浏览器中。

+

title 元素的内容会显示在浏览器的标题栏中。

+ + + ''' + +print(get_p(html)) diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/selectolax_desc.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/selectolax_desc.json" new file mode 100644 index 0000000000000000000000000000000000000000..f05a79fd311435ca33b83a73febb4412aa6ebeb9 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/selectolax_desc.json" @@ -0,0 +1,7 @@ +{ + "source": "selectolax_desc.md", + "depends": [], + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/selectolax_desc.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/selectolax_desc.md" new file mode 100644 index 0000000000000000000000000000000000000000..0eb973de88074b9547068ce733cbbc71c04a8cd4 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/selectolax_desc.md" @@ -0,0 +1,29 @@ +# selectolax简介 + +selectolax用来高效解析网页,以下说法错误的是: + +## 答案 + +```bash +selectolax提供了下载网页功能 +``` + +## 选项 + +### A + +```bash +selectolax解析速度优于lxml +``` + +### B + +```bash +爬取大量数据,解析页面可以考虑使用selectolax +``` + +### C + +```bash +使用了Modest和Lexbor引擎 +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/config.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/config.json" new file mode 100644 index 0000000000000000000000000000000000000000..3f808504c2db64ad77cc17063f704053a6c3a889 --- /dev/null +++ 
"b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/config.json" @@ -0,0 +1,9 @@ +{ + "export": ["requests_html_desc.json", "hello_requests_html.json"], + "keywords": [], + "children": [], + "keywords_must": [ + "requests-html" + ], + "keywords_forbid": [] +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.json" new file mode 100644 index 0000000000000000000000000000000000000000..4fd4c51be19fff056bf8505d37ce8d8c13dd0c4e --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.json" @@ -0,0 +1,7 @@ +{ + "source": "hello_requests_html.md", + "depends": [], + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.md" new file mode 100644 index 0000000000000000000000000000000000000000..b8b91a23d659b8f3c88b3e345479a7f282a1800c --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.md" @@ -0,0 +1,45 @@ +# requests-html示例 + +使用 requests-html 提取页面https://www.baidu.com/上面的所有链接,代码如下: + +```python +# -*- coding: UTF-8 -*- +from requests_html import HTMLSession + +def get_url(url): + session = HTMLSession() + r = session.get(url) + # TODO(You): 正确的提取代码 + return urls + +print(get_url("https://www.baidu.com/")) + +``` + +关于缺失代码部分,以下选项正确的是: + +## 答案 + +```python +urls = 
r.html.links +``` + +## 选项 + +### A + +```python +urls = r.html.find("url") +``` + +### B + +```python +urls = r.html.find("url")[0] +``` + +### C + +```python +urls = r.html.urls +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.py" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.py" new file mode 100644 index 0000000000000000000000000000000000000000..10828f5be51fda6956dccd280da349aa2cb608a2 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.py" @@ -0,0 +1,11 @@ +# -*- coding: UTF-8 -*- + +from requests_html import HTMLSession + + +def get_url(url): + session = HTMLSession() + r = session.get(url) + return r.html.links + +print(get_url("https://www.baidu.com/")) \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/requests_html_desc.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/requests_html_desc.json" new file mode 100644 index 0000000000000000000000000000000000000000..4b51197fb67ba6923a37211086d2a571992d6929 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/requests_html_desc.json" @@ -0,0 +1,7 @@ +{ + "source": "requests_html_desc.md", + "depends": [], + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/requests_html_desc.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/requests_html_desc.md" new file mode 100644 index 
0000000000000000000000000000000000000000..badac7e9e387386f51762ce2715a2c1f08f2c9c0 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/requests_html_desc.md" @@ -0,0 +1,29 @@ +# requests-html简介 + +requests-html可以使爬虫开发人员方便地编写爬虫代码,以下说法错误的是: + +## 答案 + +```bash +支持验证码识别 +``` + +## 选项 + +### A + +```bash +requests-html不仅可以下载网页,还可以解析网页 +``` + +### B + +```bash +支持CSS和XPath选择器 +``` + +### C + +```bash +支持持久cookie和代理 +```