{
  "one_line": {
    "urllib.request.urlopen": [
      "urllib.request"
    ],
    "response.read()": [
      "response.readline()"
    ],
    "buff.decode(\"utf8\")": [
      "buff.encode(\"utf8\")"
    ]
  },
  "source": "get_html.md",
  "depends": [],
  "exercise_id": 198,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# urllib: Fetch a Web Page (1)

Download the page at the given url.

```python
# -*- coding: UTF-8 -*-
import urllib.request

def get_html(url):
    # TODO(You): implement the code here
    return html

if __name__ == '__main__':
    url = "http://www.baidu.com"
    html = get_html(url)
    print(html)
```

Select the option below that **correctly** implements this functionality.
## template

```python
import urllib.request

def get_html(url):
    response = urllib.request.urlopen(url)
    buff = response.read()
    html = buff.decode("utf8")
    return html

if __name__ == '__main__':
    url = "http://www.baidu.com"
    html = get_html(url)
    print(html)
```
## Answer

```python
def get_html(url):
    response = urllib.request.urlopen(url)
    buff = response.read()
    html = buff.decode("utf8")
    return html
```
## Options

### A

```python
def get_html(url):
    response = urllib.request.urlopen(url)
    buff = response.read()
    html = buff.encode("utf8")
    return html
```

### B

```python
def get_html(url):
    response = urllib.request.urlopen(url)
    buff = response.readline()
    html = buff.decode("utf8")
    return html
```

### C

```python
def get_html(url):
    response = urllib.request(url)
    buff = response.read()
    html = buff.decode("utf8")
    return html
```

### D

```python
def get_html(url):
    response = urllib.request.urlopen(url)
    buff = response.read()
    html = buff.decode()
    return html
```
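As an aside, `urlopen` returns an object that also works as a context manager, so a minimal sketch of the same fetch that closes the connection deterministically looks like this:

```python
import urllib.request

def get_html(url):
    # the with-block closes the underlying socket when done
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf8")
```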
{
  "one_line": {
    "bytes(urllib.parse.urlencode(data), encoding='utf8')": [
      "bytes(urllib.parse.urlencode(data))",
      "bytes(data, encoding='utf8')",
      "urllib.parse.urlencode(data)"
    ]
  },
  "source": "post.md",
  "depends": [],
  "exercise_id": 202,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# urllib POST Request

Make a POST request with urllib.

```python
# -*- coding: UTF-8 -*-
import urllib.request
import urllib.parse

def get_response(url, data):
    # TODO(You): write the code here
    return result

if __name__ == '__main__':
    data = {
        "key1": "value1",
        "key2": "value2"
    }
    url = "http://httpbin.org/post"
    html = get_response(url, data)
    print(html)
```

Select the option below that **correctly** implements this functionality.
## template

```python
import urllib.request
import urllib.parse

def get_response(url, data):
    data = bytes(urllib.parse.urlencode(data), encoding='utf8')
    response = urllib.request.urlopen(url, data=data)
    buff = response.read()
    result = buff.decode("utf8")
    return result

if __name__ == '__main__':
    data = {
        "key1": "value1",
        "key2": "value2"
    }
    url = "http://httpbin.org/post"
    html = get_response(url, data)
    print(html)
```
## Answer

```python
def get_response(url, data):
    data = bytes(urllib.parse.urlencode(data), encoding='utf8')
    response = urllib.request.urlopen(
        url, data=data
    )
    buff = response.read()
    result = buff.decode("utf8")
    return result
```
## Options

### A

```python
def get_response(url, data):
    data = bytes(urllib.parse.urlencode(data, encoding='utf8'))
    response = urllib.request.urlopen(
        url, data=data
    )
    buff = response.read()
    result = buff.decode("utf8")
    return result
```

### B

```python
def get_response(url, data):
    data = bytes(urllib.parse.urlencode(data), encoding='utf8')
    response = urllib.request.urlopen(
        url, data
    )
    buff = response.read()
    result = buff.decode("utf8")
    return result
```

### C

```python
def get_response(url, data):
    data = urllib.parse.urlencode(data, encoding='utf8')
    response = urllib.request.urlopen(
        url, data=data
    )
    buff = response.read()
    result = buff.decode("utf8")
    return result
```
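For reference, `urllib.parse.urlencode` turns the dict into a query string, and `urlopen` requires the POST body as bytes, which is why the correct option encodes it; a quick illustration:

```python
import urllib.parse

data = {"key1": "value1", "key2": "value2"}
encoded = urllib.parse.urlencode(data)  # 'key1=value1&key2=value2'
body = bytes(encoded, encoding='utf8')  # b'key1=value1&key2=value2'
print(encoded)
print(body)
```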
{
  "one_line": {
    "req.add_header(key, headers[key])": [
      "req.append(key, headers[key])"
    ],
    "urllib.request.urlopen(req)": [
      "urllib.request.urlopen(url)"
    ],
    "urllib.request.Request(url)": [
      "urllib.request.request(url)"
    ]
  },
  "source": "with_headers.md",
  "depends": [],
  "exercise_id": 247,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# urllib: Fetch a Web Page (2), with Headers

Download the page at the given url.

```python
# -*- coding: UTF-8 -*-
import urllib.request

def get_html(url, headers):
    # TODO(You): implement the request with header information here
    return html

if __name__ == '__main__':
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    }
    url = "http://www.baidu.com"
    html = get_html(url, headers)
    print(html)
```

Select the option below that **correctly** implements this functionality.
## template

```python
import urllib.request

def get_html(url, headers=None):
    req = urllib.request.Request(url)
    if headers is not None:
        for key in headers:
            req.add_header(key, headers[key])
    response = urllib.request.urlopen(req)
    buff = response.read()
    html = buff.decode("utf8")
    return html

if __name__ == '__main__':
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    }
    url = "http://www.baidu.com"
    html = get_html(url, headers)
    print(html)
```
## Answer

```python
def get_html(url, headers):
    req = urllib.request.Request(url)
    for key in headers:
        req.add_header(key, headers[key])
    response = urllib.request.urlopen(req)
    buff = response.read()
    html = buff.decode("utf8")
    return html
```
## Options

### A

```python
def get_html(url, headers):
    req = urllib.request.Request(url)
    for key in headers:
        urllib.request.add_header(key, headers[key])
    response = urllib.request.urlopen(req)
    buff = response.read()
    html = buff.decode("utf8")
    return html
```

### B

```python
def get_html(url, headers):
    req = urllib.request.urlopen(url)
    for key in headers:
        req.add_header(key, headers[key])
    response = urllib.request.urlopen(req)
    buff = response.read()
    html = buff.decode("utf8")
    return html
```

### C

```python
def get_html(url, headers):
    req = urllib.request.Request(url)
    for key in headers:
        req.set_header(key, headers[key])
    response = urllib.request.urlopen(req)
    buff = response.read()
    html = buff.decode("utf8")
    return html
```
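`urllib.request.Request` also accepts the whole headers dict up front, so the per-key loop can be collapsed; a minimal equivalent sketch:

```python
import urllib.request

def get_html(url, headers):
    # pass the headers mapping directly instead of calling add_header per key
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as response:
        return response.read().decode("utf8")
```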
Suppose you want to crawl a url whose page loads more content as you scroll down. Which of the following options can crawl the content of such a page:

## Answer

```python
# -*- coding: UTF-8 -*-
import time
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get(url)
time.sleep(1)

page_size = 10
for i in range(page_size):
    ...

print(page.text)
```

## Options

### A

```bash
None of the above is correct
```
{
  "export": [
    "simulate_login.json",
    "hello_simulate.json"
  ],
  "keywords": [],
  "children": [
    ......

{
  "author": "huanhuilong",
  "source": "hello_simulate.md",
  "depends": [],
  "type": "code_options",
  "notebook_enable": true,
  "exercise_id": "237d7909392a48998437fdfe58ea3db4"
}
# Simulated Login Example

Below is an example of requesting a page that requires login, using a cookie to simulate the logged-in session.

```python
# -*- coding: UTF-8 -*-
import requests
import sys
import io

if __name__ == "__main__":
    # a page that can only be visited after logging in
    url = 'http://www.csdn.net'
    # the cookie obtained from the browser after logging in
    cookie_str = r'xxx=yyy;zzz=mmm'
    # parse the cookie string into a dict for use below
    # TODO(You): prepare the cookie data correctly
    # set the request headers
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
    }
    # send the GET request with the headers and cookies
    resp = requests.get(
        url,
        headers=headers,
        cookies=cookies
    )
    print(resp.content.decode('utf-8'))
```

Which option implements the code correctly?

## Answer

```python
cookies = {}
for line in cookie_str.split(';'):
    key, value = line.split('=', 1)
    cookies[key] = value
```

## Options

### A

```python
cookies = {}
for line in cookie_str.split(';'):
    key, value = line.split('=', 1)
    cookies[key] = line
```

### B

```python
cookies = cookie_str.split(';')
```

### C

```python
cookies = []
for line in cookie_str.split(';'):
    cookies.append([key,value])
```
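The same parsing can be written as a dict comprehension, assuming the cookie string contains no stray whitespace around the pairs; a compact equivalent:

```python
cookie_str = r'xxx=yyy;zzz=mmm'
# each 'k=v' pair becomes one dict entry; split('=', 1) keeps any '=' inside values intact
cookies = dict(pair.split('=', 1) for pair in cookie_str.split(';'))
print(cookies)  # {'xxx': 'yyy', 'zzz': 'mmm'}
```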
For reference, the complete script with the cookie parsing filled in (the request call itself is left as the next exercise's TODO):

```python
# -*- coding: UTF-8 -*-
import requests
import sys
import io

if __name__ == "__main__":
    # a page that can only be visited after logging in
    url = 'csdn.net'
    # the cookie obtained from the browser after logging in
    cookie_str = r'xxx=yyy;zzz=mmm'
    # parse the cookie string into a dict for use below
    cookies = {}
    for line in cookie_str.split(';'):
        key, value = line.split('=', 1)
        cookies[key] = value
    # set the request headers
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
    }
    # send the GET request with the headers and cookies
    # TODO(You): use the cookies here to request the page
    print(resp.content.decode('utf-8'))
```
Some sites require login before the rest of their content can be browsed, so a crawler needs the ability to log in and obtain a cookie/session before it can keep collecting data. Which of the following statements is <span style="color:red">wrong</span>:

## Answer

```bash
A cookie obtained after a successful login is generally valid forever
```

## Options

### A

```bash
Simulated login requires registering an account on the site first, or registering several accounts to maintain a cookie pool
```

### B

```bash
To get the login page, the login url can be obtained from the login button
```

### C

```bash
After a successful login you obtain a cookie; other requests that carry the cookie can then access the requested page resources
```
{
  "one_line": {
    "findall": [
      "find",
      "finds",
      "find_all"
    ]
  },
  "source": "chinese01.md",
  "depends": [],
  "exercise_id": 243,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# Python Chinese Text Processing (1)

Count the number of Chinese characters.

```python
# -*- coding: UTF-8 -*-
import re

def getnum_of_cn(inputdata):
    '''Count the Chinese characters in a string'''
    # TODO(You): write the regex query here
    return len(chi)

def test():
    n = getnum_of_cn('你好,lajfldkjaklda123')
    print(n)

if __name__ == '__main__':
    test()
```

Select the option below that **correctly** implements this functionality.
## template

```python
import re

def getnum_of_cn(inputdata):
    '''Count the Chinese characters in a string'''
    chi = re.findall(r'[\u4E00-\u9FFF]', inputdata)
    return len(chi)

def test():
    n = getnum_of_cn('你好,lajfldkjaklda123')
    print(n)

if __name__ == '__main__':
    test()
```
## Answer

```python
chi = re.findall(r'[\u4E00-\u9FFF]', inputdata)
```

## Options

### A

```python
chi = re.find(r'[\u4E00-\u9FFF]', inputdata)
```

### B

```python
chi = inputdata.findall(r'[\u4E00-\u9FFF]')
```

### C

```python
chi = re.findall(r'\u4E00-\u9FFF', inputdata)
```
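The character class `[\u4E00-\u9FFF]` covers the CJK Unified Ideographs block, so each Chinese character matches once; a quick check:

```python
import re

# two Chinese characters on each side of the ASCII text
print(re.findall(r'[\u4E00-\u9FFF]', '你好, world 世界'))  # ['你', '好', '世', '界']
```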
{
  "one_line": {
    "search": [
      "searchall",
      "match",
      "find"
    ]
  },
  "source": "chinese02.md",
  "depends": [],
  "exercise_id": 219,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# Python Chinese Text Processing (2)

Use `re.search` to return the first match.

```python
# -*- coding: UTF-8 -*-
import re

def search_text(inputdata):
    '''search returns the first match'''
    # TODO(You): implement the code here
    return chi

def test():
    n = search_text('你好,nlp先生!nlp先生!')
    print(n)

if __name__ == '__main__':
    test()
```

Select the option below that **correctly** implements this functionality.
## template

```python
import re

def search_text(inputdata):
    '''search returns the first match'''
    chi = re.search('nlp', inputdata)
    return chi

def test():
    n = search_text('你好,nlp先生!nlp先生!')
    print(n)

if __name__ == '__main__':
    test()
```
## Answer

```python
chi = re.search('nlp', inputdata)
```

## Options

### A

```python
chi = re.searchAll('nlp', inputdata)
```

### B

```python
chi = re.search(inputdata, 'nlp')
```

### C

```python
chi = inputdata.search('nlp')
```
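For contrast, `re.match` only matches at the beginning of the string, so it would fail on this input; a quick demonstration:

```python
import re

text = '你好,nlp先生!nlp先生!'
print(re.search('nlp', text))  # match at span=(3, 6): search scans the whole string
print(re.match('nlp', text))   # None: 'nlp' does not start the string
```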
{
  "one_line": {
    "findall": [
      "search",
      "match",
      "sub"
    ]
  },
  "source": "find_ip_address.md",
  "depends": [],
  "exercise_id": 181,
  "type": "code_options",
  "author": "huanhuilong",
  "notebook_enable": true
}
# Regular Expressions in Practice (2)

Find all of the IPV4 and IPV6 addresses contained in a string.

```python
# -*- coding: UTF-8 -*-
import re

def find_all_ipv4(text):
    result = []
    ipv4 = r"((\b25[0-5]|\b2[0-4][0-9]|\b[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3})"
    # TODO(You): match ipv4 here
    for m in ret:
        result.append({'type': 'ipv4', 'value': m[0]})
    return result

def find_all_ipv6(text):
    result = []
    ipv6 = r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))"
    # TODO(You): match ipv6 here
    for m in ret:
        result.append({'type': 'ipv6', 'value': m[0]})
    return result

def find_all_ip(text):
    result = find_all_ipv4(text) + find_all_ipv6(text)
    return result

if __name__ == '__main__':
    input = 'IP地址有IPV4,例如:192.168.100.2,也有IPV6,例如:fe80:0000:0000:0000:0204:61ff:fe9d:f156,以及:fe80:0000:0000:0000:0204:61ff:fe9d:f156,还有 192.168.100.50'
    results = find_all_ip(input)
    for item in results:
        print('type: {}, value: {}'.format(item['type'], item['value']))
```

Select the option below that **correctly** implements the ipv4 and ipv6 regex matching.
## template

```python
import re

def find_all_ip(text):
    result = []
    ipv4 = r"((\b25[0-5]|\b2[0-4][0-9]|\b[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3})"
    ret = re.findall(ipv4, text)
    for m in ret:
        result.append({'type': 'ipv4', 'value': m[0]})
    ipv6 = r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))"
    ret = re.finditer(ipv6, text)
    for m in ret:
        result.append({'type': 'ipv6', 'value': m[0]})
    return result

if __name__ == '__main__':
    input = 'IP地址有IPV4,例如:192.168.100.2,也有IPV6,例如:fe80:0000:0000:0000:0204:61ff:fe9d:f156,以及:fe80:0000:0000:0000:0204:61ff:fe9d:f156,还有 192.168.100.50'
    results = find_all_ip(input)
    for item in results:
        print('type: {}, value: {}'.format(item['type'], item['value']))
```
## Answer

```python
def find_all_ipv4(text):
    ...
    ret = re.findall(ipv4, text)
    ...

def find_all_ipv6(text):
    ...
    ret = re.finditer(ipv6, text)
    ...
```

## Options

### A

```python
def find_all_ipv4(text):
    ...
    ret = re.findall(text, ipv4)
    ...

def find_all_ipv6(text):
    ...
    ret = re.finditer(text, ipv6)
    ...
```

### B

```python
def find_all_ipv4(text):
    ...
    ret = text.findall(ipv4)
    ...

def find_all_ipv6(text):
    ...
    ret = text.finditer(ipv6)
    ...
```

### C

```python
def find_all_ipv4(text):
    ...
    ret = re.search(ipv4, text)
    ...

def find_all_ipv6(text):
    ...
    ret = re.search(ipv6, text)
    ...
```
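Note the asymmetry the answer relies on: with capture groups in the pattern, `re.findall` returns tuples of groups (so `m[0]` is the outermost group), while `re.finditer` yields match objects (so `m[0]` is the whole match). A toy illustration:

```python
import re

pattern = r'((\d+)\.(\d+))'  # the outer group wraps the whole number
text = '1.2 and 3.4'

for m in re.findall(pattern, text):
    print(m[0])  # tuple of groups; [0] is the outer capture group

for m in re.finditer(pattern, text):
    print(m[0])  # match object; [0] is the entire match
```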
{
  "one_line": {
    "<[^>]+>": [
      "<.*>",
      "<[^>]?>"
    ],
    ", re.S": [
      ""
    ]
  },
  "source": "remove_html.md",
  "depends": [],
  "exercise_id": 182,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# Regular Expressions in Practice (1)

Strip the HTML tags from a document.

```python
# -*- coding: UTF-8 -*-
import re

def remove_html(content):
    # TODO(You): implement the code here
    return result

if __name__ == '__main__':
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    text = remove_html(html)
    print(text)
```

Select the option below that **correctly** implements this functionality.
## template

```python
import re

def remove_html(content):
    pattern = re.compile(r'<[^>]+>', re.S)
    result = pattern.sub('', content)
    return result

def test():
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    text = remove_html(html)
    print(text)

if __name__ == '__main__':
    test()
```
## Answer

```python
def remove_html(content):
    pattern = re.compile(r'<[^>]+>', re.S)
    result = pattern.sub('', content)
    return result
```

## Options

### A

```python
def remove_html(content):
    pattern = re.compile(r'<[^>]+>')
    result = pattern.sub('', content)
    return result
```

### B

```python
def remove_html(content):
    pattern = re.compile(r'<[^>]+>', re.s)
    result = pattern.sub('', content)
    return result
```

### C

```python
def remove_html(content):
    pattern = re.compile(r'<[^>]+>', re.S)
    result = re.sub(pattern, content)
    return result
```
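A quick sanity check of the answer's pattern on a one-line fragment:

```python
import re

pattern = re.compile(r'<[^>]+>', re.S)
# every tag (anything between '<' and the next '>') is deleted; text nodes remain
print(pattern.sub('', '<p class="item-0">body content</p>'))  # body content
```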
{
  "one_line": {
    "find_all": [
      "find",
      "xpath",
      "findall"
    ]
  },
  "source": "get_p.md",
  "depends": [],
  "exercise_id": 204,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# BeautifulSoup: Get All p Tags

Get the text inside every p tag.

```python
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup

def fetch_p(html):
    # TODO(You): implement the code here
    return results

if __name__ == '__main__':
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    p_text = fetch_p(html)
    print(p_text)
```

Select the option below that **correctly** implements this functionality.
## template

```python
from bs4 import BeautifulSoup

def fetch_p(html):
    soup = BeautifulSoup(html, 'lxml')
    p_list = soup.find_all("p")
    return [p.text for p in p_list]

def test():
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    p_text = fetch_p(html)
    print(p_text)

if __name__ == '__main__':
    test()
```
## Answer

```python
def fetch_p(html):
    soup = BeautifulSoup(html, 'lxml')
    p_list = soup.find_all("p")
    results = [p.text for p in p_list]
    return results
```

## Options

### A

```python
def fetch_p(html):
    soup = BeautifulSoup(html, 'lxml')
    p_list = soup.xpath("p")
    results = [p.text for p in p_list]
    return results
```

### B

```python
def fetch_p(html):
    soup = BeautifulSoup(html, 'lxml')
    p_list = soup.findAll("p")
    results = [p.text for p in p_list]
    return results
```

### C

```python
def fetch_p(html):
    soup = BeautifulSoup(html, 'lxml')
    results = soup.find_all("p")
    return results
```
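`select` with a CSS selector is an equivalent route to `find_all`; a minimal sketch:

```python
from bs4 import BeautifulSoup

def fetch_p(html):
    soup = BeautifulSoup(html, 'lxml')
    # CSS selector equivalent of soup.find_all("p")
    return [p.text for p in soup.select('p')]
```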
{
  "one_line": {
    "text": [
      "text()",
      "find_text()",
      "all_text()"
    ]
  },
  "source": "get_text.md",
  "depends": [],
  "exercise_id": 245,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# BeautifulSoup: Get text

Get the text of the whole page.

```python
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup

def fetch_text(html):
    # TODO(You): implement the code here
    return result

if __name__ == '__main__':
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    text = fetch_text(html)
    print(text)
```

Select the option below that **correctly** implements this functionality.
## template

```python
from bs4 import BeautifulSoup

def fetch_text(html):
    soup = BeautifulSoup(html, 'lxml')
    result = soup.text
    return result

def test():
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    text = fetch_text(html)
    print(text)

if __name__ == '__main__':
    test()
```
## Answer

```python
def fetch_text(html):
    soup = BeautifulSoup(html, 'lxml')
    result = soup.text
    return result
```

## Options

### A

```python
def fetch_text(html):
    soup = BeautifulSoup(html, 'lxml')
    result = soup.find_all('text')
    return result
```

### B

```python
def fetch_text(html):
    soup = BeautifulSoup(html, 'lxml')
    result = soup.find_text()
    return result
```

### C

```python
def fetch_text(html):
    soup = BeautifulSoup(html, 'lxml')
    result = soup.text()
    return result
```
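`soup.text` is a shorthand property for `get_text()`; the method form additionally accepts a separator and a strip flag:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>a</p><p>b</p>', 'lxml')
print(soup.text)                                  # 'ab'
print(soup.get_text(separator='\n', strip=True))  # 'a\nb'
```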
{
  "one_line": {
    "html.parser": [
      "html5"
    ],
    "'img'": [
      "'src'"
    ],
    "BeautifulSoup": [
      "beautifulsoup"
    ]
  },
  "source": "html_parer.md",
  "depends": [],
  "exercise_id": 226,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# BeautifulSoup

Find all the image addresses in a page.

```python
from bs4 import BeautifulSoup

def fetch_imgs(html):
    # TODO(You): implement the code here
    return imgs

def test():
    imgs = fetch_imgs(
        '<p><img src="http://example.com"/><img src="http://example.com"/></p>')
    print(imgs)

if __name__ == '__main__':
    test()
```

Select the option below that **correctly** implements this functionality.
## template

```python
from bs4 import BeautifulSoup

def fetch_imgs(html):
    soup = BeautifulSoup(html, 'html.parser')
    imgs = [tag['src'] for tag in soup.find_all('img')]
    return imgs

def test():
    imgs = fetch_imgs(
        '<p><img src="http://example.com"/><img src="http://example.com"/></p>')
    print(imgs)

if __name__ == '__main__':
    test()
```
## Answer

```python
def fetch_imgs(html):
    soup = BeautifulSoup(html, 'html.parser')
    imgs = [tag['src'] for tag in soup.find_all('img')]
    return imgs
```

## Options

### A

```python
def fetch_imgs(html):
    soup = BeautifulSoup(html, 'html.parser')
    imgs = [tag for tag in soup.find_all('img')]
    return imgs
```

### B

```python
def fetch_imgs(html):
    soup = BeautifulSoup(html, 'html.parser')
    imgs = soup.find_all('img')
    return imgs
```

### C

```python
def fetch_imgs(html):
    soup = BeautifulSoup('html.parser', html)
    imgs = [tag['src'] for tag in soup.find_all('img')]
    return imgs
```
{
  "one_line": {
    "xpath": [
      "find"
    ],
    "HTML": [
      "html",
      "Html"
    ],
    "//p[@class='item-1']/text()": [
      "//p[@class='item-1']",
      "//p[@class='item-1']/text"
    ]
  },
  "source": "get_html_appoint_p.md",
  "depends": [],
  "exercise_id": 211,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# Parsing Web Pages with lxml

Use xpath to get the text of the paragraph whose class is "item-1".

```python
# -*- coding: UTF-8 -*-
from lxml import etree

def fetch_text(html):
    # TODO(You): implement the code here
    return result

if __name__ == '__main__':
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    texts = fetch_text(html)
    print(texts)
```

Select the option below that **correctly** implements this functionality.
## template

```python
from lxml import etree

def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p[@class='item-1']/text()")
    return result

def test():
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    texts = fetch_text(html)
    print(texts)

if __name__ == '__main__':
    test()
```
## Answer

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p[@class='item-1']/text()")
    return result
```

## Options

### A

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p[@class='item-2']/text()")
    return result
```

### B

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p[class='item-1']/text()")
    return result
```

### C

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p[@class='item-1']/text")
    return result
```
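In the xpath, `@class` addresses an attribute; without the `@` (as in option B) `class` is read as a child element name and nothing matches. A compact check:

```python
from lxml import etree

root = etree.HTML('<p class="item-1">hi</p>')
print(root.xpath("//p[@class='item-1']/text()"))  # ['hi']
print(root.xpath("//p[class='item-1']/text()"))   # []: 'class' is treated as a child tag
```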
{
  "one_line": {
    "//p/text()": [
      "p/text()",
      "//p",
      "p.text"
    ]
  },
  "source": "get_html_p.md",
  "depends": [],
  "exercise_id": 191,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# Parsing Web Pages with lxml

Use xpath to get the text of all paragraphs.

```python
# -*- coding: UTF-8 -*-
from lxml import etree

def fetch_text(html):
    # TODO(You): implement the code here
    return result

if __name__ == '__main__':
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    texts = fetch_text(html)
    print(texts)
```

Select the option below that **correctly** implements this functionality.
## template

```python
from lxml import etree

def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p/text()")
    return result

def test():
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    texts = fetch_text(html)
    print(texts)

if __name__ == '__main__':
    test()
```
## Answer

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p/text()")
    return result
```

## Options

### A

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p/text")
    return result
```

### B

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("/p/text()")
    return result
```

### C

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p.text()")
    return result
```
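A leading `//` selects matching nodes anywhere in the tree, while a single `/` is an absolute path from the document root, which is why option B finds nothing; a quick check:

```python
from lxml import etree

root = etree.HTML('<html><body><p>x</p></body></html>')
print(root.xpath('//p/text()'))  # ['x']
print(root.xpath('/p/text()'))   # []: the root element is <html>, not <p>
```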
{
  "one_line": {
    "etree": [
      "tree",
      "btree"
    ],
    "//text()": [
      "text()",
      "//text",
      "/text()"
    ]
  },
  "source": "get_html_text.md",
  "depends": [],
  "exercise_id": 220,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# Parsing Web Pages with lxml

Use xpath to get all of the text.

```python
# -*- coding: UTF-8 -*-
from lxml import etree

def fetch_text(html):
    # TODO(You): implement the code here
    return result

if __name__ == '__main__':
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p>body 元素的内容会显示在浏览器中。</p>
        <p>title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    texts = fetch_text(html)
    print(texts)
```

Select the option below that **correctly** implements this functionality.
## template

```python
from lxml import etree

def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//text()")
    return result

def test():
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p>body 元素的内容会显示在浏览器中。</p>
        <p>title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    texts = fetch_text(html)
    print(texts)

if __name__ == '__main__':
    test()
```
## Answer

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//text()")
    return result
```

## Options

### A

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("text()")
    return result
```

### B

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//text")
    return result
```

### C

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("/text()")
    return result
```
{
  "one_line": {
    "get": [
      "post",
      "gets",
      "fetch"
    ]
  },
  "source": "get_html.md",
  "depends": [],
  "exercise_id": 242,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# requests: Fetch a Web Page (1)

Get the HTML of the page at the given url.

```python
# -*- coding: UTF-8 -*-
import requests

def get_html(url):
    # TODO(You): implement the code here
    return result

if __name__ == '__main__':
    url = "http://www.baidu.com"
    html = get_html(url)
    print(html)
```

Select the option below that **correctly** implements this functionality.
## template

```python
import requests

def get_html(url):
    response = requests.get(url=url)
    return response.text

def test():
    url = "http://www.baidu.com"
    html = get_html(url)
    print(html)

if __name__ == '__main__':
    test()
```
## Answer

```python
def get_html(url):
    response = requests.get(url=url)
    result = response.text
    return result
```
## Options

### A

```python
def get_html(url):
    response = requests.gets(url=url)
    result = response.text
    return result
```

### B

```python
def get_html(url):
    result = requests.get(url=url)
    return result
```

### C

```python
def get_html(url):
    response = requests.get(url=url)
    result = response.html
    return result
```
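`response.text` is the body decoded to `str` using the encoding requests detects, while `response.content` is the raw bytes; a small sketch:

```python
import requests

response = requests.get("http://www.baidu.com")
print(response.status_code)    # e.g. 200
print(type(response.content))  # <class 'bytes'>: raw body
print(type(response.text))     # <class 'str'>: decoded using response.encoding
```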
{
  "one_line": {
    "post": [
      "get",
      "posts"
    ],
    "response = requests.post(url, data, headers)": [
      "response = requests.post(url, headers, data)",
      "response = requests.post(data, url, headers)"
    ]
  },
  "source": "post.md",
  "depends": [],
  "exercise_id": 186,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# requests POST Request

Make a POST request with requests.

```python
# -*- coding: UTF-8 -*-
import requests

def get_response(url, data, headers=None):
    # TODO(You): implement the code here
    return result

if __name__ == '__main__':
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    }
    data = {
        "key1": "value1",
        "key2": "value2"
    }
    url = "http://httpbin.org/post"
    html = get_response(url, data, headers)
    print(html)
```

Select the option below that **correctly** implements this functionality.
## template

```python
import requests

def get_response(url, data, headers=None):
    response = requests.post(url, data, headers)
    return response.text

def test():
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    }
    data = {
        "key1": "value1",
        "key2": "value2"
    }
    url = "http://httpbin.org/post"
    html = get_response(url, data, headers)
    print(html)

if __name__ == '__main__':
    test()
```
## Answer

```python
def get_response(url, data, headers=None):
    response = requests.post(url, data, headers)
    result = response.text
    return result
```
## Options

### A

```python
def get_response(url, data, headers=None):
    response = requests.get(url, data, headers)
    result = response.text
    return result
```

### B

```python
def get_response(url, data, headers=None):
    result = requests.post(url, data, headers)
    return result
```

### C

```python
def get_response(url, data, headers=None):
    response = requests.post(url, data, headers)
    result = response.text()
    return result
```
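One caveat worth knowing: after `url`, the positional parameters of `requests.post` are `data` and then `json`, not `headers`, so in code outside this quiz it is safer to pass the headers by keyword; a sketch:

```python
import requests

def get_response(url, data, headers=None):
    # keyword arguments make sure `headers` does not land in the `json` slot
    response = requests.post(url, data=data, headers=headers)
    return response.text
```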
{
  "one_line": {
    "response.text": [
      "response.text()",
      "response.gettext()",
      "response.get_text()",
      "response"
    ]
  },
  "source": "with_headers.md",
  "depends": [],
  "exercise_id": 210,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# requests: Fetch a Web Page (2), with Headers

Download the page at the given url.

```python
# -*- coding: UTF-8 -*-
import requests

def get_html(url, headers=None):
    response = requests.get(url=url, headers=headers)
    return response.text

if __name__ == '__main__':
    # TODO(You): write the headers correctly
    headers = ...
    url = "http://www.baidu.com"
    html = get_html(url, headers)
    print(html)
```

Select the option below that **correctly** implements this functionality.
## template

```python
import requests

def get_html(url, headers=None):
    response = requests.get(url=url, headers=headers)
    return response.text

def test():
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    }
    url = "http://www.baidu.com"
    html = get_html(url, headers)
    print(html)

if __name__ == '__main__':
    test()
```
## Answer

```python
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
}
```

## Options

### A

```python
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
}
```

### B

```python
headers = {
    "useragent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
}
```

### C

```python
headers = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
]
```
{
  "export": [
    "selenium.json",
    "hello_selenium.json"
  ],
  "keywords": [],
  "children": [
    ......

{
  "author": "huanhuilong",
  "source": "hello_selenium.md",
  "depends": [],
  "type": "code_options",
  "notebook_enable": false,
  "exercise_id": "8b4b78b2b9f84b5f8cd6fbb7fe85c3d0"
}
# A selenium Test Case

Selenium is a suite of web automation testing tools; a crawler can use it to collect dynamically rendered page resources. Follow these steps:

1. Install the Python Selenium package: `pip install selenium`
2. Install the Chrome driver: `https://npm.taobao.org/mirrors/chromedriver/`; if you use a different browser, download that browser's driver instead
3. Write a python unittest test that uses selenium to perform the automation

The automated page test performs these operations:

1. Open the CSDN home page with selenium's Chrome driver; this opens a Chrome browser test page
2. Verify that the string "CSDN" is in the page title
3. Find the search box on the page
4. Type "OpenCV技能树"
5. Press Enter to search
6. Wait 10 seconds, then exit

The code skeleton is as follows:

```python
# -*- coding: UTF-8 -*-
import unittest
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

class PythonOrgSearch(unittest.TestCase):

    def setUp(self):
        self.driver = webdriver.Chrome()

    def test_search_in_python_org(self):
        # TODO(You): implement the browser automation requirements correctly
        time.sleep(10)

    def tearDown(self):
        self.driver.close()

if __name__ == "__main__":
    unittest.main()
```

Which of the following implementations is correct?
## template

```python
import unittest
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

class PythonOrgSearch(unittest.TestCase):

    def setUp(self):
        self.driver = webdriver.Chrome()

    def test_search_in_python_org(self):
        driver = self.driver
        driver.get("https://www.csdn.net/")
        self.assertIn("CSDN", driver.title)
        elem = driver.find_element_by_id("toolbar-search-input")
        elem.send_keys("OpenCV 技能树")
        elem.send_keys(Keys.RETURN)
        assert "No results found." not in driver.page_source
        time.sleep(10)

    def tearDown(self):
        self.driver.close()

if __name__ == "__main__":
    unittest.main()
```
## Answer

```python
def test_search_in_python_org(self):
    driver = self.driver
    driver.get("https://www.csdn.net/")
    self.assertIn("CSDN", driver.title)
    elem = driver.find_element_by_id("toolbar-search-input")
    elem.send_keys("OpenCV 技能树")
    elem.send_keys(Keys.RETURN)
    assert "No results found." not in driver.page_source
    time.sleep(10)
```
## Options

### A

```python
def test_search_in_python_org(self):
    driver = self.driver
    driver.get("https://www.csdn.net/")
    self.assertIn("CSDN", driver.title)
    elem = driver.find_element_by_name("toolbar-search-input")
    elem.send_keys("OpenCV 技能树")
    elem.send_keys(Keys.RETURN)
    assert "No results found." not in driver.page_source
    time.sleep(10)
```

### B

```python
def test_search_in_python_org(self):
    driver = self.driver
    driver.get("https://www.csdn.net/")
    self.assertIn("CSDN", driver.title)
    elem = driver.find_element_by_id("toolbar-search-input")
    elem.send_keys("OpenCV 技能树")
    assert "No results found." not in driver.page_source
    time.sleep(10)
```

### C

```python
def test_search_in_python_org(self):
    driver = self.driver
    driver.get("https://www.csdn.net/")
    self.assertIn("CSDN", driver.title)
    elem = driver.find_element_by_id("toolbar-search-input")
    elem.send_keys(Keys.RETURN)
    assert "No results found." not in driver.page_source
    time.sleep(10)
```
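One compatibility note: recent Selenium 4 releases removed the `find_element_by_id` family in favor of `find_element(By.ID, ...)`; on Selenium 4 the search step would look like this sketch (assuming a `driver` already created as in the template):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Selenium 4 style locator: same element, new API
elem = driver.find_element(By.ID, "toolbar-search-input")
elem.send_keys("OpenCV 技能树")
elem.send_keys(Keys.RETURN)
```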
Selenium is a suite of web automation testing tools, and crawlers can use it to collect dynamically rendered page resources. Which statement about it is <span style="color:red">wrong</span>:

## Answer

```bash
selenium and requests can both be used to collect data, at the same speed
```

## Options

### A

```bash
Content that only appears after the page executes js can be collected with the help of selenium
```

### B

```bash
selenium essentially drives a browser to send requests, simulating browser behavior
```

### C

```bash
After a request it is usually necessary to wait a while for resources to finish loading and rendering
```
{
  "one_line": {
    "if self.page_count < self.total_pages:": [
      "if self.page_count <= self.total_pages:"
    ],
    "callback=self.parse": [
      "callback=parse"
    ],
    "yield": [
      "return"
    ]
  },
  "source": "so_tag_spider.md",
  "depends": [
    "tag_pipeline.py"
  ],
  "exercise_id": 206,
  "type": "code_options",
  "author": "huanhuilong",
  "notebook_enable": true
}
# Python Crawler

Crawl stackoverflow tags.

```python
# -*- coding: UTF-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

BASE_DIR = __loader__.name

class StackOverflowTagSpider(scrapy.Spider):
    # spider name
    name = "stackoverflow_tags"
    # domains the spider is allowed to crawl
    allowed_domains = ["stackoverflow.com"]
    # the first page the spider fetches
    start_urls = ['https://stackoverflow.com/tags/synonyms?page=1']
    # spider settings; ITEM_PIPELINES specifies the handler class for each item
    custom_settings = {
        'ITEM_PIPELINES': {f'{BASE_DIR}.TagPipeline': 301},
        'LOG_LEVEL': 'INFO'
    }

    def __init__(self):
        self.total_pages = 45
        self.page_count = 0

    def parse(self, response):
        # count the visited page, then query the tag texts with a CSS selector
        self.page_count += 1
        tags = response.css('.post-tag::text')
        for tag in tags:
            yield {'name': tag.get()}
        # find the page numbers at the bottom of the page and visit the next page
        # TODO(You): implement visiting the next page correctly

if __name__ == "__main__":
    settings = Settings()
    process = CrawlerProcess()
    process.crawl(StackOverflowTagSpider)
    process.start()
```

Select the option below that **correctly** implements this functionality.
## template

```python
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

BASE_DIR = __loader__.name

class StackOverflowTagSpider(scrapy.Spider):
    name = "stackoverflow_tags"
    allowed_domains = ["stackoverflow.com"]
    start_urls = ['https://stackoverflow.com/tags/synonyms?page=1']
    custom_settings = {
        'ITEM_PIPELINES': {f'{BASE_DIR}.TagPipeline': 301},
        'LOG_LEVEL': 'INFO'
    }

    def __init__(self):
        self.total_pages = 45
        self.page_count = 0

    def parse(self, response):
        self.page_count += 1
        tags = response.css('.post-tag::text')
        for tag in tags:
            yield {'name': tag.get()}
        if self.page_count < self.total_pages:
            next_page_list = response.css('a.js-pagination-item::attr(href)')
            if len(next_page_list) > 0:
                next_page_item = next_page_list[len(next_page_list)-1]
                next_page = next_page_item.get()
                print('next_page:', next_page)
                yield response.follow(next_page, callback=self.parse, dont_filter=True)

if __name__ == "__main__":
    settings = Settings()
    process = CrawlerProcess()
    process.crawl(StackOverflowTagSpider)
    process.start()
```
## Answer

```python
if self.page_count < self.total_pages:
    next_page_list = response.css('a.js-pagination-item::attr(href)')
    if len(next_page_list) > 0:
        next_page = next_page_list[len(next_page_list)-1].get()
        yield response.follow(next_page, callback=self.parse, dont_filter=True)
```

## Options

### A

```python
if self.page_count < self.total_pages:
    next_page_list = response.css('a.js-pagination-item::attr(href)')
    if len(next_page_list) > 0:
        next_page = next_page_list[len(next_page_list)-1].get()
        return response.follow(next_page, callback=self.parse, dont_filter=True)
```

### B

```python
if self.page_count < self.total_pages:
    next_page_list = response.css('a.js-pagination-item::attr(href)')
    if len(next_page_list) > 0:
        next_page = next_page_list[len(next_page_list)-1]
        yield response.follow(next_page, callback=self.parse, dont_filter=True)
```

### C

```python
if self.page_count <= self.total_pages:
    next_page_list = response.css('a.js-pagination-item::attr(href)')
    if len(next_page_list) > 0:
        next_page = next_page_list[len(next_page_list)-1].get()
        yield response.follow(next_page, callback=self.parse, dont_filter=True)
```

### D

```python
next_page_list = response.css('a.js-pagination-item::attr(href)')
if len(next_page_list) > 0:
    next_page = next_page_list[len(next_page_list)-1].get()
    yield response.follow(next_page, callback=self.parse, dont_filter=True)
```

### E

```python
if self.page_count < self.total_pages:
    next_page_list = response.xpath('a.js-pagination-item::attr(href)')
    if len(next_page_list) > 0:
        next_page = next_page_list[len(next_page_list)-1].get()
        yield response.follow(next_page, callback=self.parse, dont_filter=True)
```

### F

```python
if self.page_count < self.total_pages:
    next_page_list = response.css('a.js-pagination-item::attr(href)')
    if len(next_page_list) > 0:
        next_page = next_page_list[len(next_page_list)-1].get()
        yield response.next(next_page, callback=self.parse, dont_filter=True)
```
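`response.follow` resolves the (possibly relative) href against the current page url and schedules a new request whose response is fed back into `parse`, so the spider walks the pagination recursively; `dont_filter=True` bypasses scrapy's duplicate-request filter. A roughly equivalent sketch using `scrapy.Request` directly:

```python
# equivalent to: yield response.follow(next_page, callback=self.parse, dont_filter=True)
yield scrapy.Request(
    response.urljoin(next_page),  # make the relative href absolute
    callback=self.parse,
    dont_filter=True,             # allow urls the dedup filter has already seen
)
```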
{
  "one_line": {
    "if self.count > 0:": [
      "if self.count >= 0:"
    ],
    "process_item(self, item, spider)": [
      "process_item(self, spider, item)"
    ],
    "self.file.close()": [
      ""
    ],
    ", 'w')": [
      ", 'r')"
    ]
  },
  "source": "tag_pipeline.md",
  "depends": [],
  "exercise_id": 187,
  "type": "code_options",
  "author": "huanhuilong",
  "notebook_enable": true
}
# Python Crawler (1)

Implement a pipeline class that saves stackoverflow tag data during scrapy crawling. A crawler pipeline class must implement three methods:

1. `open_spider(self, spider)`
2. `process_item(self, item, spider)`
3. `close_spider(self, spider)`

For the json file format this example finally outputs, see [stackoverflow.tag.json](https://codechina.csdn.net/csdn/csdn-tags/-/blob/master/src/dataset/stackoverflow.tag.json).

```python
# -*- coding: UTF-8 -*-
import json

class StackOverflowTagPipeline(object):

    def open_spider(self, spider):
        '''open the file and write '[\n' to the json file'''
        self.file = open('/tmp/stackoverflow.tag.json', 'w')
        self.file.write('[\n')
        self.count = 0
        self.tags = {}

    def process_item(self, item, spider):
        '''write one element of the form {"name":xxx}, taking care of the comma separator'''
        # deduplicate
        if self.tags.get(item['name']) is not None:
            return
        self.tags[item['name']] = True
        # TODO(You): implement the json concatenation to be written
        result = ...
        # write the concatenated text
        self.file.write(result)
        self.count += 1

    def close_spider(self, spider):
        '''write '\n]' and close the file'''
        self.file.write('\n]')
        self.file.close()

if __name__ == "__main__":
    t = StackOverflowTagPipeline()
    t.open_spider(None)
    t.process_item({'name': 'c++'}, None)
    t.close_spider(None)
```

Which of the following correctly implements the json concatenation and write?
## template

```python
import json

class StackOverflowTagPipeline(object):

    def open_spider(self, spider):
        '''open the file and write '[\n' to the json file'''
        self.file = open('/tmp/stackoverflow.tag.json', 'w')
        self.file.write('[\n')
        self.count = 0
        self.tags = {}

    def process_item(self, item, spider):
        '''write one element of the form {"name":xxx}, taking care of the comma separator'''
        if self.tags.get(item['name']) is not None:
            return
        self.tags[item['name']] = True
        words = []
        if self.count > 0:
            words.append(',\n')
        words.append(' ')
        words.append(json.dumps(item, ensure_ascii=False).strip())
        result = ''.join(words)
        self.file.write(result)
        self.count += 1

    def close_spider(self, spider):
        '''write '\n]' and close the file'''
        self.file.write('\n]')
        self.file.close()

if __name__ == "__main__":
    t = StackOverflowTagPipeline()
    t.open_spider(None)
    t.process_item({'name': 'c++'}, None)
    t.close_spider(None)
```
## Answer

```python
words = []
if self.count > 0:
    words.append(',\n')
words.append(' ')
words.append(json.dumps(item, ensure_ascii=False).strip())
result = ''.join(words)
```
## Options

### A

```python
words = []
if self.count > 0:
    words.append(',\n')
words.append(' ')
words.append(item)
result = ''.join(words)
```

### B

```python
words = []
words.append(',\n')
words.append(' ')
words.append(json.dumps(item, ensure_ascii=False).strip())
result = ''.join(words)
```

### C

```python
words = []
if self.count > 0:
    words.append(',\n')
words.append(' ')
words.append(json.dumps(item, ensure_ascii=False).strip())
result = words
```
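The pipeline writes '[', comma-separated items, and ']' by hand so the file stays valid json while streaming; if all items fit in memory, the same file could instead be produced in one call, e.g.:

```python
import json

items = [{'name': 'c++'}, {'name': 'python'}]
with open('/tmp/stackoverflow.tag.json', 'w') as f:
    # one-shot alternative to the streaming pipeline
    json.dump(items, f, ensure_ascii=False, indent=1)
```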
{
  "export": [
    "pyspider.json",
    "hello_pyspider.json"
  ],
  "keywords": [],
  "children": [
    ......

{
  "author": "huanhuilong",
  "source": "hello_pyspider.md",
  "depends": [],
  "type": "code_options",
  "notebook_enable": false,
  "exercise_id": "ed92d5e3360a4dabb6dfa3b408768083"
}
# pyspider Example

Below is a sample PySpider script.

```python
# -*- coding: UTF-8 -*-
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('http://scrapy.org/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    def detail_page(self, response):
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
```

Which statements about the code above are correct?
## Answer

```bash
All of them are correct
```

## Options

### A

```python
def on_start(self):
    '''entry point; it is invoked when the pyspider run command starts the spider'''
```

### B

```python
# add a crawl task to PySpider;
# the callback invokes the self.index_page member method
self.crawl(url, callback=self.index_page)
```

### C

```python
def index_page(self, response):
    '''the response parameter is a Response* object.
    It provides a set of jQuery-like APIs for querying and extracting page data.
    '''
```

### D

```python
def detail_page(self, response):
    '''returns a dict object. The return value is captured by resultdb.'''
```
# pyspider

Pyspider and Scrapy can both be used to crawl data. Which statement about them is <span style="color:red">wrong</span>:

## Answer

```bash
Scrapy provides a web UI that can be used for debugging and deployment
```

## Options

### A

```bash
Pyspider provides a web UI for visual debugging
```

### B

```bash
For a beginner who wants to get started quickly crawling a news site, Pyspider is the recommended choice
```

### C

```bash
Scrapy is more extensible and is mainly used for complex crawling scenarios
```
![code.png](data/2.python中阶/3.网络爬虫/9.验证码处理/code.png)
{
  "export": [
    "verification_code.json",
    "hello_paddle.json"
  ],
  "keywords": [],
  "children": [
    ......

{
  "author": "huanhuilong",
  "source": "hello_paddle.md",
  "depends": [],
  "type": "code_options",
  "notebook_enable": false,
  "exercise_id": "d925c57963714c1da1268ab4e4680f98"
}
# Captcha Recognition for Crawlers

Baidu's paddle ocr library can be used to recognize captchas:

1. Install paddle: `pip install paddlepaddle==2.1.0`
2. Install paddle ocr: `pip install paddleocr==2.0.6`
3. Write the code

```python
# -*- coding: UTF-8 -*-
import re
from paddleocr import PaddleOCR

if __name__ == "__main__":
    ocr_client = PaddleOCR(
        use_angle_cls=True,
        lang="ch",
        use_space_char=True,
        use_zero_copy_run=True,
        use_mp=True,
        total_process_num=16,
        ir_optim=True,
        enable_mkldnn=True,
        rec_batch_num=1,
        max_batch_size=1
    )
    result = ocr_client.ocr('code.png', det=True, rec=True, cls=True)
    code_text = []
    for line in result:
        print(line)
        # TODO(You): extract the text correctly
        text = ...
        code_text.append(text)
    print(code_text)
```

A printed `line` looks like this:

```bash
[[[881.0, 77.0], [1128.0, 56.0], [1161.0, 439.0], [914.0, 460.0]], ('6', 0.97982866)]
```

Which of the following extracts `text` correctly?
## Answer

```python
text = line[1][0]
```

## Options

### A

```python
text = line[0][1]
```

### B

```python
text = line[0][0]
```

### C

```python
text = line[1][1]
```
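Reading off the sample output, each `line` is `[box, (text, confidence)]`: `line[0]` holds the four corner points of the detected box and `line[1]` is a `(text, score)` tuple, so `line[1][0]` is the recognized text:

```python
# structure of one ocr result line: [box, (text, confidence)]
line = [[[881.0, 77.0], [1128.0, 56.0], [1161.0, 439.0], [914.0, 460.0]], ('6', 0.97982866)]
box, (text, confidence) = line
print(text)        # '6'
print(confidence)  # 0.97982866
```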
Captchas are a way to distinguish humans from machines. Which of the following statements about captchas is <span style="color:red">wrong</span>:

## Answer

```bash
Captcha recognition is an old topic, and a 100% recognition rate has been achieved
```

## Options

### A

```bash
Captchas come in many varieties, including mixed Chinese/English characters, click selection, sliders, and more
```

### B

```bash
Captcha recognition uses OCR (Optical Character Recognition) technology
```

### C

```bash
For difficult captchas, you can use a captcha-solving platform or a third-party recognition service
```
```python
if __name__ == '__main__':
    walker = TreeWalker("data", "python", "python")
    walker.walk()
    # md = MDWalker('data/2.python中阶/3.网络爬虫')
    # md.walk()
```