From b2148e6a37f767cefeb412b4766e477dfcae6387 Mon Sep 17 00:00:00 2001 From: chenlong Date: Thu, 13 Jan 2022 18:36:44 +0800 Subject: [PATCH] add questions --- .../12.autoscraper/autoscraper_desc.json" | 7 +++ .../12.autoscraper/autoscraper_desc.md" | 29 ++++++++++ .../12.autoscraper/config.json" | 9 +++ .../12.autoscraper/hello_autoscraper.json" | 7 +++ .../12.autoscraper/hello_autoscraper.md" | 46 +++++++++++++++ .../12.autoscraper/hello_autoscraper.py" | 12 ++++ .../13.selectolax/config.json" | 9 +++ .../13.selectolax/hello_selectolax.json" | 7 +++ .../13.selectolax/hello_selectolax.md" | 57 +++++++++++++++++++ .../13.selectolax/hello_selectolax.py" | 24 ++++++++ .../13.selectolax/selectolax_desc.json" | 7 +++ .../13.selectolax/selectolax_desc.md" | 29 ++++++++++ .../14.requests-html/config.json" | 9 +++ .../hello_requests_html.json" | 7 +++ .../14.requests-html/hello_requests_html.md" | 45 +++++++++++++++ .../14.requests-html/hello_requests_html.py" | 11 ++++ .../14.requests-html/requests_html_desc.json" | 7 +++ .../14.requests-html/requests_html_desc.md" | 29 ++++++++++ 18 files changed, 351 insertions(+) create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/autoscraper_desc.json" create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/autoscraper_desc.md" create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/config.json" create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.json" create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.md" create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.py" 
create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/config.json" create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.json" create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.md" create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.py" create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/selectolax_desc.json" create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/selectolax_desc.md" create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/config.json" create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.json" create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.md" create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.py" create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/requests_html_desc.json" create mode 100644 "data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/requests_html_desc.md" diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/autoscraper_desc.json" 
"b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/autoscraper_desc.json" new file mode 100644 index 0000000..06f8798 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/autoscraper_desc.json" @@ -0,0 +1,7 @@ +{ + "source": "autoscraper_desc.md", + "depends": [], + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/autoscraper_desc.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/autoscraper_desc.md" new file mode 100644 index 0000000..10701be --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/autoscraper_desc.md" @@ -0,0 +1,29 @@ +# autoscraper简介 + +autoscraper是一个基于python的智能、自动、快速和轻量级的网络爬虫,以下说法错误的是: + +## 答案 + +```bash +是目前解析速度最快的网络爬虫 +``` + +## 选项 + +### A + +```bash +同时提供了精确抽取的方法 +``` + +### B + +```bash +可以根据示例文本自动抽取相似的文本 +``` + +### C + +```bash +避免了手写页面抽取规则的烦恼 +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/config.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/config.json" new file mode 100644 index 0000000..42d7c05 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/config.json" @@ -0,0 +1,9 @@ +{ + "export": ["autoscraper_desc.json", "hello_autoscraper.json"], + "keywords": [], + "children": [], + "keywords_must": [ + "autoscraper" + ], + "keywords_forbid": [] +} \ No newline at end of file diff --git 
"a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.json" new file mode 100644 index 0000000..855c990 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.json" @@ -0,0 +1,7 @@ +{ + "source": "hello_autoscraper.md", + "depends": [], + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.md" new file mode 100644 index 0000000..7ac426d --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.md" @@ -0,0 +1,46 @@ +# autoscraper示例 + +使用 autoscraper 从stackoverflow搜索页提取相似的主题帖,代码如下: + +```python +# -*- coding: UTF-8 -*- +from autoscraper import AutoScraper + +def get_similar_result(url, wanted_list): + scraper = AutoScraper() + # TODO(You): 正确的提取代码 + return result + +url = 'https://stackoverflow.com/search?q=autoscraper&s=7b5866da-920e-4926-8c33-09fb7d32886b' +wanted_list = ["AutoScraper module not found in Python Autoscraper library"] +print(get_similar_result(url, wanted_list)) + +``` + +关于缺失代码部分,以下选项正确的是: + +## 答案 + +```python +result = scraper.build(url, wanted_list) +``` + +## 选项 + +### A + +```python +result = scraper.get_result_similar(url, wanted_list) +``` + +### B + +```python +result = scraper.get(url, wanted_list) +``` + +### C + +```python +result = scraper.get_result_exact(url, wanted_list) +``` diff --git 
"a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.py" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.py" new file mode 100644 index 0000000..ba8f24e --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/12.autoscraper/hello_autoscraper.py" @@ -0,0 +1,12 @@ +# -*- coding: UTF-8 -*- + +from autoscraper import AutoScraper + +def get_similar_result(url, wanted_list): + scraper = AutoScraper() + result = scraper.build(url, wanted_list) + return result + +url = 'https://stackoverflow.com/search?q=autoscraper&s=7b5866da-920e-4926-8c33-09fb7d32886b' +wanted_list = ["AutoScraper module not found in Python Autoscraper library"] +print(get_similar_result(url, wanted_list)) \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/config.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/config.json" new file mode 100644 index 0000000..1d44ced --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/config.json" @@ -0,0 +1,9 @@ +{ + "export": ["selectolax_desc.json", "hello_selectolax.json"], + "keywords": [], + "children": [], + "keywords_must": [ + "selectolax" + ], + "keywords_forbid": [] +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.json" new file mode 100644 index 0000000..4c6179e --- /dev/null +++ 
"b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.json" @@ -0,0 +1,7 @@ +{ + "source": "hello_selectolax.md", + "depends": [], + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.md" new file mode 100644 index 0000000..f9c87b8 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.md" @@ -0,0 +1,57 @@ +# selectolax示例 + +使用 selectolax 提取页面p标签的内容,代码如下: + +```python +# -*- coding: UTF-8 -*- +from selectolax.parser import HTMLParser + +def get_p(html): + p_list = [] + for node in HTMLParser(html).css("p"): + # TODO(You): 正确的提取代码 + return p_list + +html = ''' + + + 这是一个简单的测试页面 + + +

<p>body 元素的内容会显示在浏览器中。</p>

+

<p>title 元素的内容会显示在浏览器的标题栏中。</p>

+ + + ''' + +print(get_p(html)) + +``` + +关于缺失代码部分,以下选项正确的是: + +## 答案 + +```python +p_list.append(node.text()) +``` + +## 选项 + +### A + +```python +p_list.append(node.text) +``` + +### B + +```python +p_list.append(node) +``` + +### C + +```python +p_list.append(node.get_text()) +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.py" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.py" new file mode 100644 index 0000000..c55eed1 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/hello_selectolax.py" @@ -0,0 +1,24 @@ +# -*- coding: UTF-8 -*- + +from selectolax.parser import HTMLParser + + +def get_p(html): + p_list = [] + for node in HTMLParser(html).css("p"): + p_list.append(node.text()) + return p_list + +html = ''' + + + 这是一个简单的测试页面 + + +

<p>body 元素的内容会显示在浏览器中。</p>

+

<p>title 元素的内容会显示在浏览器的标题栏中。</p>

+ + + ''' + +print(get_p(html)) diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/selectolax_desc.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/selectolax_desc.json" new file mode 100644 index 0000000..f05a79f --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/selectolax_desc.json" @@ -0,0 +1,7 @@ +{ + "source": "selectolax_desc.md", + "depends": [], + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/selectolax_desc.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/selectolax_desc.md" new file mode 100644 index 0000000..0eb973d --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/13.selectolax/selectolax_desc.md" @@ -0,0 +1,29 @@ +# selectolax简介 + +selectolax用来高效解析网页,以下说法错误的是: + +## 答案 + +```bash +selectolax提供了下载网页功能 +``` + +## 选项 + +### A + +```bash +selectolax解析速度优于lxml +``` + +### B + +```bash +爬取大量数据,解析页面可以考虑使用selectolax +``` + +### C + +```bash +使用了Modest和Lexbor引擎 +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/config.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/config.json" new file mode 100644 index 0000000..3f80850 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/config.json" @@ -0,0 +1,9 @@ +{ + "export": ["requests_html_desc.json", "hello_requests_html.json"], + "keywords": [], + "children": [], + "keywords_must": [ + "requests-html" + ], + 
"keywords_forbid": [] +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.json" new file mode 100644 index 0000000..4fd4c51 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.json" @@ -0,0 +1,7 @@ +{ + "source": "hello_requests_html.md", + "depends": [], + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.md" new file mode 100644 index 0000000..b8b91a2 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.md" @@ -0,0 +1,45 @@ +# requests-html示例 + +使用 requests-html 提取页面https://www.baidu.com/上面的所有链接,代码如下: + +```python +# -*- coding: UTF-8 -*- +from requests_html import HTMLSession + +def get_url(url): + session = HTMLSession() + r = session.get(url) + # TODO(You): 正确的提取代码 + return urls + +print(get_url("https://www.baidu.com/")) + +``` + +关于缺失代码部分,以下选项正确的是: + +## 答案 + +```python +urls = r.html.links +``` + +## 选项 + +### A + +```python +urls = r.html.find("url") +``` + +### B + +```python +urls = r.html.find("url")[0] +``` + +### C + +```python +urls = r.html.urls +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.py" 
"b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.py" new file mode 100644 index 0000000..10828f5 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/hello_requests_html.py" @@ -0,0 +1,11 @@ +# -*- coding: UTF-8 -*- + +from requests_html import HTMLSession + + +def get_url(url): + session = HTMLSession() + r = session.get(url) + return r.html.links + +print(get_url("https://www.baidu.com/")) \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/requests_html_desc.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/requests_html_desc.json" new file mode 100644 index 0000000..4b51197 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/requests_html_desc.json" @@ -0,0 +1,7 @@ +{ + "source": "requests_html_desc.md", + "depends": [], + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/requests_html_desc.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/requests_html_desc.md" new file mode 100644 index 0000000..badac7e --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/14.requests-html/requests_html_desc.md" @@ -0,0 +1,29 @@ +# requests-html简介 + +requests-html可以使爬虫开发人员方便的编写爬虫代码,以下说法错误的是: + +## 答案 + +```bash +支持验证码识别 +``` + +## 选项 + +### A + +```bash +requests-html不仅可以下载网页,还可以解析网页 +``` + +### B + +```bash +支持CSS和XPath选择器 +``` + +### C + +```bash +支持持久cookie和代理 +``` -- GitLab