{
  "one_line": {
    "urllib.request.urlopen": [
      "urllib.request"
    ],
    "response.read()": [
      "response.readline()"
    ],
    "buff.decode(\"utf8\")": [
      "buff.encode(\"utf8\")"
    ]
  },
  "source": "get_html.md",
  "depends": [],
  "exercise_id": 198,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# urllib: Fetch a Web Page (1)

Download the page at the given url.

```python
# -*- coding: UTF-8 -*-
import urllib.request

def get_html(url):
    # TODO(You): implement the code here
    return html

if __name__ == '__main__':
    url = "http://www.baidu.com"
    html = get_html(url)
    print(html)
```

Select the option below that **correctly** implements this functionality.
## template

```python
import urllib.request

def get_html(url):
    response = urllib.request.urlopen(url)
    buff = response.read()
    html = buff.decode("utf8")
    return html

if __name__ == '__main__':
    url = "http://www.baidu.com"
    html = get_html(url)
    print(html)
```
## Answer

```python
def get_html(url):
    response = urllib.request.urlopen(url)
    buff = response.read()
    html = buff.decode("utf8")
    return html
```
## Options

### A

```python
def get_html(url):
    response = urllib.request.urlopen(url)
    buff = response.read()
    html = buff.encode("utf8")
    return html
```

### B

```python
def get_html(url):
    response = urllib.request.urlopen(url)
    buff = response.readline()
    html = buff.decode("utf8")
    return html
```

### C

```python
def get_html(url):
    response = urllib.request(url)
    buff = response.read()
    html = buff.decode("utf8")
    return html
```

### D

```python
def get_html(url):
    response = urllib.request.urlopen(url)
    buff = response.read()
    html = buff.decode()
    return html
```
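As an aside, `urlopen` returns an object that also works as a context manager, so a minimal sketch of the same fetch that closes the connection deterministically looks like this:

```python
import urllib.request

def get_html(url):
    # the with-block closes the underlying socket when done
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf8")
```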
{
  "one_line": {
    "bytes(urllib.parse.urlencode(data), encoding='utf8')": [
      "bytes(urllib.parse.urlencode(data))",
      "bytes(data, encoding='utf8')",
      "urllib.parse.urlencode(data)"
    ]
  },
  "source": "post.md",
  "depends": [],
  "exercise_id": 202,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# urllib POST Request

Make a POST request with urllib.

```python
# -*- coding: UTF-8 -*-
import urllib.request
import urllib.parse

def get_response(url, data):
    # TODO(You): write the code here
    return result

if __name__ == '__main__':
    data = {
        "key1": "value1",
        "key2": "value2"
    }
    url = "http://httpbin.org/post"
    html = get_response(url, data)
    print(html)
```

Select the option below that **correctly** implements this functionality.
## template

```python
import urllib.request
import urllib.parse

def get_response(url, data):
    data = bytes(urllib.parse.urlencode(data), encoding='utf8')
    response = urllib.request.urlopen(url, data=data)
    buff = response.read()
    result = buff.decode("utf8")
    return result

if __name__ == '__main__':
    data = {
        "key1": "value1",
        "key2": "value2"
    }
    url = "http://httpbin.org/post"
    html = get_response(url, data)
    print(html)
```
## Answer

```python
def get_response(url, data):
    data = bytes(urllib.parse.urlencode(data), encoding='utf8')
    response = urllib.request.urlopen(
        url, data=data
    )
    buff = response.read()
    result = buff.decode("utf8")
    return result
```
## Options

### A

```python
def get_response(url, data):
    data = bytes(urllib.parse.urlencode(data, encoding='utf8'))
    response = urllib.request.urlopen(
        url, data=data
    )
    buff = response.read()
    result = buff.decode("utf8")
    return result
```

### B

```python
def get_response(url, data):
    data = bytes(urllib.parse.urlencode(data), encoding='utf8')
    response = urllib.request.urlopen(
        url, data
    )
    buff = response.read()
    result = buff.decode("utf8")
    return result
```

### C

```python
def get_response(url, data):
    data = urllib.parse.urlencode(data, encoding='utf8')
    response = urllib.request.urlopen(
        url, data=data
    )
    buff = response.read()
    result = buff.decode("utf8")
    return result
```
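For reference, `urllib.parse.urlencode` turns the dict into a query string, and `urlopen` requires the POST body as bytes, which is why the correct option encodes it; a quick illustration:

```python
import urllib.parse

data = {"key1": "value1", "key2": "value2"}
encoded = urllib.parse.urlencode(data)  # 'key1=value1&key2=value2'
body = bytes(encoded, encoding='utf8')  # b'key1=value1&key2=value2'
print(encoded)
print(body)
```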
{
  "one_line": {
    "req.add_header(key, headers[key])": [
      "req.append(key, headers[key])"
    ],
    "urllib.request.urlopen(req)": [
      "urllib.request.urlopen(url)"
    ],
    "urllib.request.Request(url)": [
      "urllib.request.request(url)"
    ]
  },
  "source": "with_headers.md",
  "depends": [],
  "exercise_id": 247,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# urllib: Fetch a Web Page (2), with Headers

Download the page at the given url.

```python
# -*- coding: UTF-8 -*-
import urllib.request

def get_html(url, headers):
    # TODO(You): implement the request with header information here
    return html

if __name__ == '__main__':
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    }
    url = "http://www.baidu.com"
    html = get_html(url, headers)
    print(html)
```

Select the option below that **correctly** implements this functionality.
## template

```python
import urllib.request

def get_html(url, headers=None):
    req = urllib.request.Request(url)
    if headers is not None:
        for key in headers:
            req.add_header(key, headers[key])
    response = urllib.request.urlopen(req)
    buff = response.read()
    html = buff.decode("utf8")
    return html

if __name__ == '__main__':
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    }
    url = "http://www.baidu.com"
    html = get_html(url, headers)
    print(html)
```
## Answer

```python
def get_html(url, headers):
    req = urllib.request.Request(url)
    for key in headers:
        req.add_header(key, headers[key])
    response = urllib.request.urlopen(req)
    buff = response.read()
    html = buff.decode("utf8")
    return html
```
## Options

### A

```python
def get_html(url, headers):
    req = urllib.request.Request(url)
    for key in headers:
        urllib.request.add_header(key, headers[key])
    response = urllib.request.urlopen(req)
    buff = response.read()
    html = buff.decode("utf8")
    return html
```

### B

```python
def get_html(url, headers):
    req = urllib.request.urlopen(url)
    for key in headers:
        req.add_header(key, headers[key])
    response = urllib.request.urlopen(req)
    buff = response.read()
    html = buff.decode("utf8")
    return html
```

### C

```python
def get_html(url, headers):
    req = urllib.request.Request(url)
    for key in headers:
        req.set_header(key, headers[key])
    response = urllib.request.urlopen(req)
    buff = response.read()
    html = buff.decode("utf8")
    return html
```
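`urllib.request.Request` also accepts the whole headers dict up front, so the per-key loop can be collapsed; a minimal equivalent sketch:

```python
import urllib.request

def get_html(url, headers):
    # pass the headers mapping directly instead of calling add_header per key
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as response:
        return response.read().decode("utf8")
```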
Suppose you want to crawl a url whose page loads more content as you scroll down. Which of the following options can crawl the content of such a page:

## Answer

```python
# -*- coding: UTF-8 -*-
import time
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get(url)
time.sleep(1)

page_size = 10
for i in range(page_size):
    ...

print(page.text)
```

## Options

### A

```bash
None of the above is correct
```
{
  "export": [
    "simulate_login.json",
    "hello_simulate.json"
  ],
  "keywords": [],
  "children": [
    ......

{
  "author": "huanhuilong",
  "source": "hello_simulate.md",
  "depends": [],
  "type": "code_options",
  "notebook_enable": true,
  "exercise_id": "237d7909392a48998437fdfe58ea3db4"
}
# Simulated Login Example

Below is an example of requesting a page that requires login, using a cookie to simulate the logged-in session.

```python
# -*- coding: UTF-8 -*-
import requests
import sys
import io

if __name__ == "__main__":
    # a page that can only be visited after logging in
    url = 'http://www.csdn.net'
    # the cookie obtained from the browser after logging in
    cookie_str = r'xxx=yyy;zzz=mmm'
    # parse the cookie string into a dict for use below
    # TODO(You): prepare the cookie data correctly
    # set the request headers
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
    }
    # send the GET request with the headers and cookies
    resp = requests.get(
        url,
        headers=headers,
        cookies=cookies
    )
    print(resp.content.decode('utf-8'))
```

Which option implements the code correctly?

## Answer

```python
cookies = {}
for line in cookie_str.split(';'):
    key, value = line.split('=', 1)
    cookies[key] = value
```

## Options

### A

```python
cookies = {}
for line in cookie_str.split(';'):
    key, value = line.split('=', 1)
    cookies[key] = line
```

### B

```python
cookies = cookie_str.split(';')
```

### C

```python
cookies = []
for line in cookie_str.split(';'):
    cookies.append([key,value])
```
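The same parsing can be written as a dict comprehension, assuming the cookie string contains no stray whitespace around the pairs; a compact equivalent:

```python
cookie_str = r'xxx=yyy;zzz=mmm'
# each 'k=v' pair becomes one dict entry; split('=', 1) keeps any '=' inside values intact
cookies = dict(pair.split('=', 1) for pair in cookie_str.split(';'))
print(cookies)  # {'xxx': 'yyy', 'zzz': 'mmm'}
```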
For reference, the complete script with the cookie parsing filled in (the request call itself is left as the next exercise's TODO):

```python
# -*- coding: UTF-8 -*-
import requests
import sys
import io

if __name__ == "__main__":
    # a page that can only be visited after logging in
    url = 'csdn.net'
    # the cookie obtained from the browser after logging in
    cookie_str = r'xxx=yyy;zzz=mmm'
    # parse the cookie string into a dict for use below
    cookies = {}
    for line in cookie_str.split(';'):
        key, value = line.split('=', 1)
        cookies[key] = value
    # set the request headers
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
    }
    # send the GET request with the headers and cookies
    # TODO(You): use the cookies here to request the page
    print(resp.content.decode('utf-8'))
```
Some sites require login before the rest of their content can be browsed, so a crawler needs the ability to log in and obtain a cookie/session before it can keep collecting data. Which of the following statements is <span style="color:red">wrong</span>:

## Answer

```bash
A cookie obtained after a successful login is generally valid forever
```

## Options

### A

```bash
Simulated login requires registering an account on the site first, or registering several accounts to maintain a cookie pool
```

### B

```bash
To get the login page, the login url can be obtained from the login button
```

### C

```bash
After a successful login you obtain a cookie; other requests that carry the cookie can then access the requested page resources
```
{
  "one_line": {
    "findall": [
      "find",
      "finds",
      "find_all"
    ]
  },
  "source": "chinese01.md",
  "depends": [],
  "exercise_id": 243,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# Python Chinese Text Processing (1)

Count the number of Chinese characters.

```python
# -*- coding: UTF-8 -*-
import re

def getnum_of_cn(inputdata):
    '''Count the Chinese characters in a string'''
    # TODO(You): write the regex query here
    return len(chi)

def test():
    n = getnum_of_cn('你好,lajfldkjaklda123')
    print(n)

if __name__ == '__main__':
    test()
```

Select the option below that **correctly** implements this functionality.
## template

```python
import re

def getnum_of_cn(inputdata):
    '''Count the Chinese characters in a string'''
    chi = re.findall(r'[\u4E00-\u9FFF]', inputdata)
    return len(chi)

def test():
    n = getnum_of_cn('你好,lajfldkjaklda123')
    print(n)

if __name__ == '__main__':
    test()
```
## Answer

```python
chi = re.findall(r'[\u4E00-\u9FFF]', inputdata)
```

## Options

### A

```python
chi = re.find(r'[\u4E00-\u9FFF]', inputdata)
```

### B

```python
chi = inputdata.findall(r'[\u4E00-\u9FFF]')
```

### C

```python
chi = re.findall(r'\u4E00-\u9FFF', inputdata)
```
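The character class `[\u4E00-\u9FFF]` covers the CJK Unified Ideographs block, so each Chinese character matches once; a quick check:

```python
import re

# two Chinese characters on each side of the ASCII text
print(re.findall(r'[\u4E00-\u9FFF]', '你好, world 世界'))  # ['你', '好', '世', '界']
```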
{
  "one_line": {
    "search": [
      "searchall",
      "match",
      "find"
    ]
  },
  "source": "chinese02.md",
  "depends": [],
  "exercise_id": 219,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# Python Chinese Text Processing (2)

Use `re.search` to return the first match.

```python
# -*- coding: UTF-8 -*-
import re

def search_text(inputdata):
    '''search returns the first match'''
    # TODO(You): implement the code here
    return chi

def test():
    n = search_text('你好,nlp先生!nlp先生!')
    print(n)

if __name__ == '__main__':
    test()
```

Select the option below that **correctly** implements this functionality.
## template

```python
import re

def search_text(inputdata):
    '''search returns the first match'''
    chi = re.search('nlp', inputdata)
    return chi

def test():
    n = search_text('你好,nlp先生!nlp先生!')
    print(n)

if __name__ == '__main__':
    test()
```
## Answer

```python
chi = re.search('nlp', inputdata)
```

## Options

### A

```python
chi = re.searchAll('nlp', inputdata)
```

### B

```python
chi = re.search(inputdata, 'nlp')
```

### C

```python
chi = inputdata.search('nlp')
```
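For contrast, `re.match` only matches at the beginning of the string, so it would fail on this input; a quick demonstration:

```python
import re

text = '你好,nlp先生!nlp先生!'
print(re.search('nlp', text))  # match at span=(3, 6): search scans the whole string
print(re.match('nlp', text))   # None: 'nlp' does not start the string
```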
{
  "one_line": {
    "findall": [
      "search",
      "match",
      "sub"
    ]
  },
  "source": "find_ip_address.md",
  "depends": [],
  "exercise_id": 181,
  "type": "code_options",
  "author": "huanhuilong",
  "notebook_enable": true
}
# Regular Expressions in Practice (2)

Find all of the IPV4 and IPV6 addresses contained in a string.

```python
# -*- coding: UTF-8 -*-
import re

def find_all_ipv4(text):
    result = []
    ipv4 = r"((\b25[0-5]|\b2[0-4][0-9]|\b[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3})"
    # TODO(You): match ipv4 here
    for m in ret:
        result.append({'type': 'ipv4', 'value': m[0]})
    return result

def find_all_ipv6(text):
    result = []
    ipv6 = r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))"
    # TODO(You): match ipv6 here
    for m in ret:
        result.append({'type': 'ipv6', 'value': m[0]})
    return result

def find_all_ip(text):
    result = find_all_ipv4(text) + find_all_ipv6(text)
    return result

if __name__ == '__main__':
    input = 'IP地址有IPV4,例如:192.168.100.2,也有IPV6,例如:fe80:0000:0000:0000:0204:61ff:fe9d:f156,以及:fe80:0000:0000:0000:0204:61ff:fe9d:f156,还有 192.168.100.50'
    results = find_all_ip(input)
    for item in results:
        print('type: {}, value: {}'.format(item['type'], item['value']))
```

Select the option below that **correctly** implements the ipv4 and ipv6 regex matching.
## template

```python
import re

def find_all_ip(text):
    result = []
    ipv4 = r"((\b25[0-5]|\b2[0-4][0-9]|\b[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3})"
    ret = re.findall(ipv4, text)
    for m in ret:
        result.append({'type': 'ipv4', 'value': m[0]})
    ipv6 = r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))"
    ret = re.finditer(ipv6, text)
    for m in ret:
        result.append({'type': 'ipv6', 'value': m[0]})
    return result

if __name__ == '__main__':
    input = 'IP地址有IPV4,例如:192.168.100.2,也有IPV6,例如:fe80:0000:0000:0000:0204:61ff:fe9d:f156,以及:fe80:0000:0000:0000:0204:61ff:fe9d:f156,还有 192.168.100.50'
    results = find_all_ip(input)
    for item in results:
        print('type: {}, value: {}'.format(item['type'], item['value']))
```
## Answer

```python
def find_all_ipv4(text):
    ...
    ret = re.findall(ipv4, text)
    ...

def find_all_ipv6(text):
    ...
    ret = re.finditer(ipv6, text)
    ...
```

## Options

### A

```python
def find_all_ipv4(text):
    ...
    ret = re.findall(text, ipv4)
    ...

def find_all_ipv6(text):
    ...
    ret = re.finditer(text, ipv6)
    ...
```

### B

```python
def find_all_ipv4(text):
    ...
    ret = text.findall(ipv4)
    ...

def find_all_ipv6(text):
    ...
    ret = text.finditer(ipv6)
    ...
```

### C

```python
def find_all_ipv4(text):
    ...
    ret = re.search(ipv4, text)
    ...

def find_all_ipv6(text):
    ...
    ret = re.search(ipv6, text)
    ...
```
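Note the asymmetry the answer relies on: with capture groups in the pattern, `re.findall` returns tuples of groups (so `m[0]` is the outermost group), while `re.finditer` yields match objects (so `m[0]` is the whole match). A toy illustration:

```python
import re

pattern = r'((\d+)\.(\d+))'  # the outer group wraps the whole number
text = '1.2 and 3.4'

for m in re.findall(pattern, text):
    print(m[0])  # tuple of groups; [0] is the outer capture group

for m in re.finditer(pattern, text):
    print(m[0])  # match object; [0] is the entire match
```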
{
  "one_line": {
    "<[^>]+>": [
      "<.*>",
      "<[^>]?>"
    ],
    ", re.S": [
      ""
    ]
  },
  "source": "remove_html.md",
  "depends": [],
  "exercise_id": 182,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# Regular Expressions in Practice (1)

Strip the HTML tags from a document.

```python
# -*- coding: UTF-8 -*-
import re

def remove_html(content):
    # TODO(You): implement the code here
    return result

if __name__ == '__main__':
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    text = remove_html(html)
    print(text)
```

Select the option below that **correctly** implements this functionality.
## template

```python
import re

def remove_html(content):
    pattern = re.compile(r'<[^>]+>', re.S)
    result = pattern.sub('', content)
    return result

def test():
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    text = remove_html(html)
    print(text)

if __name__ == '__main__':
    test()
```
## Answer

```python
def remove_html(content):
    pattern = re.compile(r'<[^>]+>', re.S)
    result = pattern.sub('', content)
    return result
```

## Options

### A

```python
def remove_html(content):
    pattern = re.compile(r'<[^>]+>')
    result = pattern.sub('', content)
    return result
```

### B

```python
def remove_html(content):
    pattern = re.compile(r'<[^>]+>', re.s)
    result = pattern.sub('', content)
    return result
```

### C

```python
def remove_html(content):
    pattern = re.compile(r'<[^>]+>', re.S)
    result = re.sub(pattern, content)
    return result
```
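A quick sanity check of the answer's pattern on a one-line fragment:

```python
import re

pattern = re.compile(r'<[^>]+>', re.S)
# every tag (anything between '<' and the next '>') is deleted; text nodes remain
print(pattern.sub('', '<p class="item-0">body content</p>'))  # body content
```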
{
  "one_line": {
    "find_all": [
      "find",
      "xpath",
      "findall"
    ]
  },
  "source": "get_p.md",
  "depends": [],
  "exercise_id": 204,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# BeautifulSoup: Get All p Tags

Get the text inside every p tag.

```python
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup

def fetch_p(html):
    # TODO(You): implement the code here
    return results

if __name__ == '__main__':
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    p_text = fetch_p(html)
    print(p_text)
```

Select the option below that **correctly** implements this functionality.
## template

```python
from bs4 import BeautifulSoup

def fetch_p(html):
    soup = BeautifulSoup(html, 'lxml')
    p_list = soup.find_all("p")
    return [p.text for p in p_list]

def test():
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    p_text = fetch_p(html)
    print(p_text)

if __name__ == '__main__':
    test()
```
## Answer

```python
def fetch_p(html):
    soup = BeautifulSoup(html, 'lxml')
    p_list = soup.find_all("p")
    results = [p.text for p in p_list]
    return results
```

## Options

### A

```python
def fetch_p(html):
    soup = BeautifulSoup(html, 'lxml')
    p_list = soup.xpath("p")
    results = [p.text for p in p_list]
    return results
```

### B

```python
def fetch_p(html):
    soup = BeautifulSoup(html, 'lxml')
    p_list = soup.findAll("p")
    results = [p.text for p in p_list]
    return results
```

### C

```python
def fetch_p(html):
    soup = BeautifulSoup(html, 'lxml')
    results = soup.find_all("p")
    return results
```
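`select` with a CSS selector is an equivalent route to `find_all`; a minimal sketch:

```python
from bs4 import BeautifulSoup

def fetch_p(html):
    soup = BeautifulSoup(html, 'lxml')
    # CSS selector equivalent of soup.find_all("p")
    return [p.text for p in soup.select('p')]
```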
{
  "one_line": {
    "text": [
      "text()",
      "find_text()",
      "all_text()"
    ]
  },
  "source": "get_text.md",
  "depends": [],
  "exercise_id": 245,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# BeautifulSoup: Get text

Get the text of the whole page.

```python
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup

def fetch_text(html):
    # TODO(You): implement the code here
    return result

if __name__ == '__main__':
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    text = fetch_text(html)
    print(text)
```

Select the option below that **correctly** implements this functionality.
## template

```python
from bs4 import BeautifulSoup

def fetch_text(html):
    soup = BeautifulSoup(html, 'lxml')
    result = soup.text
    return result

def test():
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    text = fetch_text(html)
    print(text)

if __name__ == '__main__':
    test()
```
## Answer

```python
def fetch_text(html):
    soup = BeautifulSoup(html, 'lxml')
    result = soup.text
    return result
```

## Options

### A

```python
def fetch_text(html):
    soup = BeautifulSoup(html, 'lxml')
    result = soup.find_all('text')
    return result
```

### B

```python
def fetch_text(html):
    soup = BeautifulSoup(html, 'lxml')
    result = soup.find_text()
    return result
```

### C

```python
def fetch_text(html):
    soup = BeautifulSoup(html, 'lxml')
    result = soup.text()
    return result
```
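`soup.text` is a shorthand property for `get_text()`; the method form additionally accepts a separator and a strip flag:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>a</p><p>b</p>', 'lxml')
print(soup.text)                                  # 'ab'
print(soup.get_text(separator='\n', strip=True))  # 'a\nb'
```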
{
  "one_line": {
    "html.parser": [
      "html5"
    ],
    "'img'": [
      "'src'"
    ],
    "BeautifulSoup": [
      "beautifulsoup"
    ]
  },
  "source": "html_parer.md",
  "depends": [],
  "exercise_id": 226,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# BeautifulSoup

Find all the image addresses in a page.

```python
from bs4 import BeautifulSoup

def fetch_imgs(html):
    # TODO(You): implement the code here
    return imgs

def test():
    imgs = fetch_imgs(
        '<p><img src="http://example.com"/><img src="http://example.com"/></p>')
    print(imgs)

if __name__ == '__main__':
    test()
```

Select the option below that **correctly** implements this functionality.
## template

```python
from bs4 import BeautifulSoup

def fetch_imgs(html):
    soup = BeautifulSoup(html, 'html.parser')
    imgs = [tag['src'] for tag in soup.find_all('img')]
    return imgs

def test():
    imgs = fetch_imgs(
        '<p><img src="http://example.com"/><img src="http://example.com"/></p>')
    print(imgs)

if __name__ == '__main__':
    test()
```
## Answer

```python
def fetch_imgs(html):
    soup = BeautifulSoup(html, 'html.parser')
    imgs = [tag['src'] for tag in soup.find_all('img')]
    return imgs
```

## Options

### A

```python
def fetch_imgs(html):
    soup = BeautifulSoup(html, 'html.parser')
    imgs = [tag for tag in soup.find_all('img')]
    return imgs
```

### B

```python
def fetch_imgs(html):
    soup = BeautifulSoup(html, 'html.parser')
    imgs = soup.find_all('img')
    return imgs
```

### C

```python
def fetch_imgs(html):
    soup = BeautifulSoup('html.parser', html)
    imgs = [tag['src'] for tag in soup.find_all('img')]
    return imgs
```
{
  "one_line": {
    "xpath": [
      "find"
    ],
    "HTML": [
      "html",
      "Html"
    ],
    "//p[@class='item-1']/text()": [
      "//p[@class='item-1']",
      "//p[@class='item-1']/text"
    ]
  },
  "source": "get_html_appoint_p.md",
  "depends": [],
  "exercise_id": 211,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# Parsing Web Pages with lxml

Use xpath to get the text of the paragraph whose class is "item-1".

```python
# -*- coding: UTF-8 -*-
from lxml import etree

def fetch_text(html):
    # TODO(You): implement the code here
    return result

if __name__ == '__main__':
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    texts = fetch_text(html)
    print(texts)
```

Select the option below that **correctly** implements this functionality.
## template

```python
from lxml import etree

def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p[@class='item-1']/text()")
    return result

def test():
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    texts = fetch_text(html)
    print(texts)

if __name__ == '__main__':
    test()
```
## Answer

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p[@class='item-1']/text()")
    return result
```

## Options

### A

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p[@class='item-2']/text()")
    return result
```

### B

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p[class='item-1']/text()")
    return result
```

### C

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p[@class='item-1']/text")
    return result
```
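In the xpath, `@class` addresses an attribute; without the `@` (as in option B) `class` is read as a child element name and nothing matches. A compact check:

```python
from lxml import etree

root = etree.HTML('<p class="item-1">hi</p>')
print(root.xpath("//p[@class='item-1']/text()"))  # ['hi']
print(root.xpath("//p[class='item-1']/text()"))   # []: 'class' is treated as a child tag
```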
{
  "one_line": {
    "//p/text()": [
      "p/text()",
      "//p",
      "p.text"
    ]
  },
  "source": "get_html_p.md",
  "depends": [],
  "exercise_id": 191,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# Parsing Web Pages with lxml

Use xpath to get the text of all paragraphs.

```python
# -*- coding: UTF-8 -*-
from lxml import etree

def fetch_text(html):
    # TODO(You): implement the code here
    return result

if __name__ == '__main__':
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    texts = fetch_text(html)
    print(texts)
```

Select the option below that **correctly** implements this functionality.
## template

```python
from lxml import etree

def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p/text()")
    return result

def test():
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p class="item-0">body 元素的内容会显示在浏览器中。</p>
        <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    texts = fetch_text(html)
    print(texts)

if __name__ == '__main__':
    test()
```
## Answer

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p/text()")
    return result
```

## Options

### A

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p/text")
    return result
```

### B

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("/p/text()")
    return result
```

### C

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//p.text()")
    return result
```
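A leading `//` selects matching nodes anywhere in the tree, while a single `/` is an absolute path from the document root, which is why option B finds nothing; a quick check:

```python
from lxml import etree

root = etree.HTML('<html><body><p>x</p></body></html>')
print(root.xpath('//p/text()'))  # ['x']
print(root.xpath('/p/text()'))   # []: the root element is <html>, not <p>
```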
{
  "one_line": {
    "etree": [
      "tree",
      "btree"
    ],
    "//text()": [
      "text()",
      "//text",
      "/text()"
    ]
  },
  "source": "get_html_text.md",
  "depends": [],
  "exercise_id": 220,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# Parsing Web Pages with lxml

Use xpath to get all of the text.

```python
# -*- coding: UTF-8 -*-
from lxml import etree

def fetch_text(html):
    # TODO(You): implement the code here
    return result

if __name__ == '__main__':
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p>body 元素的内容会显示在浏览器中。</p>
        <p>title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    texts = fetch_text(html)
    print(texts)
```

Select the option below that **correctly** implements this functionality.
## template

```python
from lxml import etree

def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//text()")
    return result

def test():
    html = '''
    <html>
      <head>
        <title>这是一个简单的测试页面</title>
      </head>
      <body>
        <p>body 元素的内容会显示在浏览器中。</p>
        <p>title 元素的内容会显示在浏览器的标题栏中。</p>
      </body>
    </html>
    '''
    texts = fetch_text(html)
    print(texts)

if __name__ == '__main__':
    test()
```
## Answer

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//text()")
    return result
```

## Options

### A

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("text()")
    return result
```

### B

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("//text")
    return result
```

### C

```python
def fetch_text(html):
    html = etree.HTML(html)
    result = html.xpath("/text()")
    return result
```
{
  "one_line": {
    "get": [
      "post",
      "gets",
      "fetch"
    ]
  },
  "source": "get_html.md",
  "depends": [],
  "exercise_id": 242,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# requests: Fetch a Web Page (1)

Get the HTML of the page at the given url.

```python
# -*- coding: UTF-8 -*-
import requests

def get_html(url):
    # TODO(You): implement the code here
    return result

if __name__ == '__main__':
    url = "http://www.baidu.com"
    html = get_html(url)
    print(html)
```

Select the option below that **correctly** implements this functionality.
## template

```python
import requests

def get_html(url):
    response = requests.get(url=url)
    return response.text

def test():
    url = "http://www.baidu.com"
    html = get_html(url)
    print(html)

if __name__ == '__main__':
    test()
```
## Answer

```python
def get_html(url):
    response = requests.get(url=url)
    result = response.text
    return result
```
## Options

### A

```python
def get_html(url):
    response = requests.gets(url=url)
    result = response.text
    return result
```

### B

```python
def get_html(url):
    result = requests.get(url=url)
    return result
```

### C

```python
def get_html(url):
    response = requests.get(url=url)
    result = response.html
    return result
```
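`response.text` is the body decoded to `str` using the encoding requests detects, while `response.content` is the raw bytes; a small sketch:

```python
import requests

response = requests.get("http://www.baidu.com")
print(response.status_code)    # e.g. 200
print(type(response.content))  # <class 'bytes'>: raw body
print(type(response.text))     # <class 'str'>: decoded using response.encoding
```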
{
  "one_line": {
    "post": [
      "get",
      "posts"
    ],
    "response = requests.post(url, data, headers)": [
      "response = requests.post(url, headers, data)",
      "response = requests.post(data, url, headers)"
    ]
  },
  "source": "post.md",
  "depends": [],
  "exercise_id": 186,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# requests POST Request

Make a POST request with requests.

```python
# -*- coding: UTF-8 -*-
import requests

def get_response(url, data, headers=None):
    # TODO(You): implement the code here
    return result

if __name__ == '__main__':
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    }
    data = {
        "key1": "value1",
        "key2": "value2"
    }
    url = "http://httpbin.org/post"
    html = get_response(url, data, headers)
    print(html)
```

Select the option below that **correctly** implements this functionality.
## template

```python
import requests

def get_response(url, data, headers=None):
    response = requests.post(url, data, headers)
    return response.text

def test():
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    }
    data = {
        "key1": "value1",
        "key2": "value2"
    }
    url = "http://httpbin.org/post"
    html = get_response(url, data, headers)
    print(html)

if __name__ == '__main__':
    test()
```
## Answer

```python
def get_response(url, data, headers=None):
    response = requests.post(url, data, headers)
    result = response.text
    return result
```
## Options

### A

```python
def get_response(url, data, headers=None):
    response = requests.get(url, data, headers)
    result = response.text
    return result
```

### B

```python
def get_response(url, data, headers=None):
    result = requests.post(url, data, headers)
    return result
```

### C

```python
def get_response(url, data, headers=None):
    response = requests.post(url, data, headers)
    result = response.text()
    return result
```
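One caveat worth knowing: after `url`, the positional parameters of `requests.post` are `data` and then `json`, not `headers`, so in code outside this quiz it is safer to pass the headers by keyword; a sketch:

```python
import requests

def get_response(url, data, headers=None):
    # keyword arguments make sure `headers` does not land in the `json` slot
    response = requests.post(url, data=data, headers=headers)
    return response.text
```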
{
  "one_line": {
    "response.text": [
      "response.text()",
      "response.gettext()",
      "response.get_text()",
      "response"
    ]
  },
  "source": "with_headers.md",
  "depends": [],
  "exercise_id": 210,
  "type": "code_options",
  "author": "zxm2015",
  "notebook_enable": true
}
# requests: Fetch a Web Page (2), with Headers

Download the page at the given url.

```python
# -*- coding: UTF-8 -*-
import requests

def get_html(url, headers=None):
    response = requests.get(url=url, headers=headers)
    return response.text

if __name__ == '__main__':
    # TODO(You): write the headers correctly
    headers = ...
    url = "http://www.baidu.com"
    html = get_html(url, headers)
    print(html)
```

Select the option below that **correctly** implements this functionality.
## template

```python
import requests

def get_html(url, headers=None):
    response = requests.get(url=url, headers=headers)
    return response.text

def test():
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    }
    url = "http://www.baidu.com"
    html = get_html(url, headers)
    print(html)

if __name__ == '__main__':
    test()
```
## Answer

```python
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
}
```

## Options

### A

```python
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
}
```

### B

```python
headers = {
    "useragent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
}
```

### C

```python
headers = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
]
```
{
  "export": [
    "selenium.json",
    "hello_selenium.json"
  ],
  "keywords": [],
  "children": [
    ......

{
  "author": "huanhuilong",
  "source": "hello_selenium.md",
  "depends": [],
  "type": "code_options",
  "notebook_enable": false,
  "exercise_id": "8b4b78b2b9f84b5f8cd6fbb7fe85c3d0"
}
# A selenium Test Case

Selenium is a suite of web automation testing tools; a crawler can use it to collect dynamically rendered page resources. Follow these steps:

1. Install the Python Selenium package: `pip install selenium`
2. Install the Chrome driver: `https://npm.taobao.org/mirrors/chromedriver/`; if you use a different browser, download that browser's driver instead
3. Write a python unittest test that uses selenium to perform the automation

The automated page test performs these operations:

1. Open the CSDN home page with selenium's Chrome driver; this opens a Chrome browser test page
2. Verify that the string "CSDN" is in the page title
3. Find the search box on the page
4. Type "OpenCV技能树"
5. Press Enter to search
6. Wait 10 seconds, then exit

The code skeleton is as follows:

```python
# -*- coding: UTF-8 -*-
import unittest
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

class PythonOrgSearch(unittest.TestCase):

    def setUp(self):
        self.driver = webdriver.Chrome()

    def test_search_in_python_org(self):
        # TODO(You): implement the browser automation requirements correctly
        time.sleep(10)

    def tearDown(self):
        self.driver.close()

if __name__ == "__main__":
    unittest.main()
```

Which of the following implementations is correct?
## template

```python
import unittest
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

class PythonOrgSearch(unittest.TestCase):

    def setUp(self):
        self.driver = webdriver.Chrome()

    def test_search_in_python_org(self):
        driver = self.driver
        driver.get("https://www.csdn.net/")
        self.assertIn("CSDN", driver.title)
        elem = driver.find_element_by_id("toolbar-search-input")
        elem.send_keys("OpenCV 技能树")
        elem.send_keys(Keys.RETURN)
        assert "No results found." not in driver.page_source
        time.sleep(10)

    def tearDown(self):
        self.driver.close()

if __name__ == "__main__":
    unittest.main()
```
## Answer

```python
def test_search_in_python_org(self):
    driver = self.driver
    driver.get("https://www.csdn.net/")
    self.assertIn("CSDN", driver.title)
    elem = driver.find_element_by_id("toolbar-search-input")
    elem.send_keys("OpenCV 技能树")
    elem.send_keys(Keys.RETURN)
    assert "No results found." not in driver.page_source
    time.sleep(10)
```
## Options

### A

```python
def test_search_in_python_org(self):
    driver = self.driver
    driver.get("https://www.csdn.net/")
    self.assertIn("CSDN", driver.title)
    elem = driver.find_element_by_name("toolbar-search-input")
    elem.send_keys("OpenCV 技能树")
    elem.send_keys(Keys.RETURN)
    assert "No results found." not in driver.page_source
    time.sleep(10)
```

### B

```python
def test_search_in_python_org(self):
    driver = self.driver
    driver.get("https://www.csdn.net/")
    self.assertIn("CSDN", driver.title)
    elem = driver.find_element_by_id("toolbar-search-input")
    elem.send_keys("OpenCV 技能树")
    assert "No results found." not in driver.page_source
    time.sleep(10)
```

### C

```python
def test_search_in_python_org(self):
    driver = self.driver
    driver.get("https://www.csdn.net/")
    self.assertIn("CSDN", driver.title)
    elem = driver.find_element_by_id("toolbar-search-input")
    elem.send_keys(Keys.RETURN)
    assert "No results found." not in driver.page_source
    time.sleep(10)
```
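One compatibility note: recent Selenium 4 releases removed the `find_element_by_id` family in favor of `find_element(By.ID, ...)`; on Selenium 4 the search step would look like this sketch (assuming a `driver` already created as in the template):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Selenium 4 style locator: same element, new API
elem = driver.find_element(By.ID, "toolbar-search-input")
elem.send_keys("OpenCV 技能树")
elem.send_keys(Keys.RETURN)
```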
Selenium is a suite of web automation testing tools, and crawlers can use it to collect dynamically rendered page resources. Which statement about it is <span style="color:red">wrong</span>:

## Answer

```bash
selenium and requests can both be used to collect data, at the same speed
```

## Options

### A

```bash
Content that only appears after the page executes js can be collected with the help of selenium
```

### B

```bash
selenium essentially drives a browser to send requests, simulating browser behavior
```

### C

```bash
After a request it is usually necessary to wait a while for resources to finish loading and rendering
```
{
  "one_line": {
    "if self.page_count < self.total_pages:": [
      "if self.page_count <= self.total_pages:"
    ],
    "callback=self.parse": [
      "callback=parse"
    ],
    "yield": [
      "return"
    ]
  },
  "source": "so_tag_spider.md",
  "depends": [
    "tag_pipeline.py"
  ],
  "exercise_id": 206,
  "type": "code_options",
  "author": "huanhuilong",
  "notebook_enable": true
}
# Python Crawler

Crawl stackoverflow tags.

```python
# -*- coding: UTF-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

BASE_DIR = __loader__.name

class StackOverflowTagSpider(scrapy.Spider):
    # spider name
    name = "stackoverflow_tags"
    # domains the spider is allowed to crawl
    allowed_domains = ["stackoverflow.com"]
    # the first page the spider fetches
    start_urls = ['https://stackoverflow.com/tags/synonyms?page=1']
    # spider settings; ITEM_PIPELINES specifies the handler class for each item
    custom_settings = {
        'ITEM_PIPELINES': {f'{BASE_DIR}.TagPipeline': 301},
        'LOG_LEVEL': 'INFO'
    }

    def __init__(self):
        self.total_pages = 45
        self.page_count = 0

    def parse(self, response):
        # count the visited page, then query the tag texts with a CSS selector
        self.page_count += 1
        tags = response.css('.post-tag::text')
        for tag in tags:
            yield {'name': tag.get()}
        # find the page numbers at the bottom of the page and visit the next page
        # TODO(You): implement visiting the next page correctly

if __name__ == "__main__":
    settings = Settings()
    process = CrawlerProcess()
    process.crawl(StackOverflowTagSpider)
    process.start()
```

Select the option below that **correctly** implements this functionality.
## template

```python
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

BASE_DIR = __loader__.name

class StackOverflowTagSpider(scrapy.Spider):
    name = "stackoverflow_tags"
    allowed_domains = ["stackoverflow.com"]
    start_urls = ['https://stackoverflow.com/tags/synonyms?page=1']
    custom_settings = {
        'ITEM_PIPELINES': {f'{BASE_DIR}.TagPipeline': 301},
        'LOG_LEVEL': 'INFO'
    }

    def __init__(self):
        self.total_pages = 45
        self.page_count = 0

    def parse(self, response):
        self.page_count += 1
        tags = response.css('.post-tag::text')
        for tag in tags:
            yield {'name': tag.get()}
        if self.page_count < self.total_pages:
            next_page_list = response.css('a.js-pagination-item::attr(href)')
            if len(next_page_list) > 0:
                next_page_item = next_page_list[len(next_page_list)-1]
                next_page = next_page_item.get()
                print('next_page:', next_page)
                yield response.follow(next_page, callback=self.parse, dont_filter=True)

if __name__ == "__main__":
    settings = Settings()
    process = CrawlerProcess()
    process.crawl(StackOverflowTagSpider)
    process.start()
```
## Answer

```python
if self.page_count < self.total_pages:
    next_page_list = response.css('a.js-pagination-item::attr(href)')
    if len(next_page_list) > 0:
        next_page = next_page_list[len(next_page_list)-1].get()
        yield response.follow(next_page, callback=self.parse, dont_filter=True)
```

## Options

### A

```python
if self.page_count < self.total_pages:
    next_page_list = response.css('a.js-pagination-item::attr(href)')
    if len(next_page_list) > 0:
        next_page = next_page_list[len(next_page_list)-1].get()
        return response.follow(next_page, callback=self.parse, dont_filter=True)
```

### B

```python
if self.page_count < self.total_pages:
    next_page_list = response.css('a.js-pagination-item::attr(href)')
    if len(next_page_list) > 0:
        next_page = next_page_list[len(next_page_list)-1]
        yield response.follow(next_page, callback=self.parse, dont_filter=True)
```

### C

```python
if self.page_count <= self.total_pages:
    next_page_list = response.css('a.js-pagination-item::attr(href)')
    if len(next_page_list) > 0:
        next_page = next_page_list[len(next_page_list)-1].get()
        yield response.follow(next_page, callback=self.parse, dont_filter=True)
```

### D

```python
next_page_list = response.css('a.js-pagination-item::attr(href)')
if len(next_page_list) > 0:
    next_page = next_page_list[len(next_page_list)-1].get()
    yield response.follow(next_page, callback=self.parse, dont_filter=True)
```

### E

```python
if self.page_count < self.total_pages:
    next_page_list = response.xpath('a.js-pagination-item::attr(href)')
    if len(next_page_list) > 0:
        next_page = next_page_list[len(next_page_list)-1].get()
        yield response.follow(next_page, callback=self.parse, dont_filter=True)
```

### F

```python
if self.page_count < self.total_pages:
    next_page_list = response.css('a.js-pagination-item::attr(href)')
    if len(next_page_list) > 0:
        next_page = next_page_list[len(next_page_list)-1].get()
        yield response.next(next_page, callback=self.parse, dont_filter=True)
```
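`response.follow` resolves the (possibly relative) href against the current page url and schedules a new request whose response is fed back into `parse`, so the spider walks the pagination recursively; `dont_filter=True` bypasses scrapy's duplicate-request filter. A roughly equivalent sketch using `scrapy.Request` directly:

```python
# equivalent to: yield response.follow(next_page, callback=self.parse, dont_filter=True)
yield scrapy.Request(
    response.urljoin(next_page),  # make the relative href absolute
    callback=self.parse,
    dont_filter=True,             # allow urls the dedup filter has already seen
)
```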
{
  "one_line": {
    "if self.count > 0:": [
      "if self.count >= 0:"
    ],
    "process_item(self, item, spider)": [
      "process_item(self, spider, item)"
    ],
    "self.file.close()": [
      ""
    ],
    ", 'w')": [
      ", 'r')"
    ]
  },
  "source": "tag_pipeline.md",
  "depends": [],
  "exercise_id": 187,
  "type": "code_options",
  "author": "huanhuilong",
  "notebook_enable": true
}
# Python Crawler (1)

Implement a pipeline class that saves stackoverflow tag data during scrapy crawling. A crawler pipeline class must implement three methods:

1. `open_spider(self, spider)`
2. `process_item(self, item, spider)`
3. `close_spider(self, spider)`

For the json file format this example finally outputs, see [stackoverflow.tag.json](https://codechina.csdn.net/csdn/csdn-tags/-/blob/master/src/dataset/stackoverflow.tag.json).

```python
# -*- coding: UTF-8 -*-
import json

class StackOverflowTagPipeline(object):

    def open_spider(self, spider):
        '''open the file and write '[\n' to the json file'''
        self.file = open('/tmp/stackoverflow.tag.json', 'w')
        self.file.write('[\n')
        self.count = 0
        self.tags = {}

    def process_item(self, item, spider):
        '''write one element of the form {"name":xxx}, taking care of the comma separator'''
        # deduplicate
        if self.tags.get(item['name']) is not None:
            return
        self.tags[item['name']] = True
        # TODO(You): implement the json concatenation to be written
        result = ...
        # write the concatenated text
        self.file.write(result)
        self.count += 1

    def close_spider(self, spider):
        '''write '\n]' and close the file'''
        self.file.write('\n]')
        self.file.close()

if __name__ == "__main__":
    t = StackOverflowTagPipeline()
    t.open_spider(None)
    t.process_item({'name': 'c++'}, None)
    t.close_spider(None)
```

Which of the following correctly implements the json concatenation and write?
## template

```python
import json

class StackOverflowTagPipeline(object):

    def open_spider(self, spider):
        '''open the file and write '[\n' to the json file'''
        self.file = open('/tmp/stackoverflow.tag.json', 'w')
        self.file.write('[\n')
        self.count = 0
        self.tags = {}

    def process_item(self, item, spider):
        '''write one element of the form {"name":xxx}, taking care of the comma separator'''
        if self.tags.get(item['name']) is not None:
            return
        self.tags[item['name']] = True
        words = []
        if self.count > 0:
            words.append(',\n')
        words.append(' ')
        words.append(json.dumps(item, ensure_ascii=False).strip())
        result = ''.join(words)
        self.file.write(result)
        self.count += 1

    def close_spider(self, spider):
        '''write '\n]' and close the file'''
        self.file.write('\n]')
        self.file.close()

if __name__ == "__main__":
    t = StackOverflowTagPipeline()
    t.open_spider(None)
    t.process_item({'name': 'c++'}, None)
    t.close_spider(None)
```
## Answer

```python
words = []
if self.count > 0:
    words.append(',\n')
words.append(' ')
words.append(json.dumps(item, ensure_ascii=False).strip())
result = ''.join(words)
```
## Options

### A

```python
words = []
if self.count > 0:
    words.append(',\n')
words.append(' ')
words.append(item)
result = ''.join(words)
```

### B

```python
words = []
words.append(',\n')
words.append(' ')
words.append(json.dumps(item, ensure_ascii=False).strip())
result = ''.join(words)
```

### C

```python
words = []
if self.count > 0:
    words.append(',\n')
words.append(' ')
words.append(json.dumps(item, ensure_ascii=False).strip())
result = words
```
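The pipeline writes '[', comma-separated items, and ']' by hand so the file stays valid json while streaming; if all items fit in memory, the same file could instead be produced in one call, e.g.:

```python
import json

items = [{'name': 'c++'}, {'name': 'python'}]
with open('/tmp/stackoverflow.tag.json', 'w') as f:
    # one-shot alternative to the streaming pipeline
    json.dump(items, f, ensure_ascii=False, indent=1)
```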
{
  "export": [
    "pyspider.json",
    "hello_pyspider.json"
  ],
  "keywords": [],
  "children": [
    ......

{
  "author": "huanhuilong",
  "source": "hello_pyspider.md",
  "depends": [],
  "type": "code_options",
  "notebook_enable": false,
  "exercise_id": "ed92d5e3360a4dabb6dfa3b408768083"
}
# pyspider Example

Below is a sample PySpider script.

```python
# -*- coding: UTF-8 -*-
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('http://scrapy.org/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    def detail_page(self, response):
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
```

Which statements about the code above are correct?
## Answer

```bash
All of them are correct
```

## Options

### A

```python
def on_start(self):
    '''entry point; it is invoked when the pyspider run command starts the spider'''
```

### B

```python
# add a crawl task to PySpider;
# the callback invokes the self.index_page member method
self.crawl(url, callback=self.index_page)
```

### C

```python
def index_page(self, response):
    '''the response parameter is a Response* object.
    It provides a set of jQuery-like APIs for querying and extracting page data.
    '''
```

### D

```python
def detail_page(self, response):
    '''returns a dict object. The return value is captured by resultdb.'''
```
# pyspider

Pyspider and Scrapy can both be used to crawl data. Which statement about them is <span style="color:red">wrong</span>:

## Answer

```bash
Scrapy provides a web UI that can be used for debugging and deployment
```

## Options

### A

```bash
Pyspider provides a web UI for visual debugging
```

### B

```bash
For a beginner who wants to get started quickly crawling a news site, Pyspider is the recommended choice
```

### C

```bash
Scrapy is more extensible and is mainly used for complex crawling scenarios
```
![code.png](data/2.python中阶/3.网络爬虫/9.验证码处理/code.png)
{
  "export": [
    "verification_code.json",
    "hello_paddle.json"
  ],
  "keywords": [],
  "children": [
    ......

{
  "author": "huanhuilong",
  "source": "hello_paddle.md",
  "depends": [],
  "type": "code_options",
  "notebook_enable": false,
  "exercise_id": "d925c57963714c1da1268ab4e4680f98"
}
# Captcha Recognition for Crawlers

Baidu's paddle ocr library can be used to recognize captchas:

1. Install paddle: `pip install paddlepaddle==2.1.0`
2. Install paddle ocr: `pip install paddleocr==2.0.6`
3. Write the code

```python
# -*- coding: UTF-8 -*-
import re
from paddleocr import PaddleOCR

if __name__ == "__main__":
    ocr_client = PaddleOCR(
        use_angle_cls=True,
        lang="ch",
        use_space_char=True,
        use_zero_copy_run=True,
        use_mp=True,
        total_process_num=16,
        ir_optim=True,
        enable_mkldnn=True,
        rec_batch_num=1,
        max_batch_size=1
    )
    result = ocr_client.ocr('code.png', det=True, rec=True, cls=True)
    code_text = []
    for line in result:
        print(line)
        # TODO(You): extract the text correctly
        text = ...
        code_text.append(text)
    print(code_text)
```

A printed `line` looks like this:

```bash
[[[881.0, 77.0], [1128.0, 56.0], [1161.0, 439.0], [914.0, 460.0]], ('6', 0.97982866)]
```

Which of the following extracts `text` correctly?
## Answer

```python
text = line[1][0]
```

## Options

### A

```python
text = line[0][1]
```

### B

```python
text = line[0][0]
```

### C

```python
text = line[1][1]
```
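Reading off the sample output, each `line` is `[box, (text, confidence)]`: `line[0]` holds the four corner points of the detected box and `line[1]` is a `(text, score)` tuple, so `line[1][0]` is the recognized text:

```python
# structure of one ocr result line: [box, (text, confidence)]
line = [[[881.0, 77.0], [1128.0, 56.0], [1161.0, 439.0], [914.0, 460.0]], ('6', 0.97982866)]
box, (text, confidence) = line
print(text)        # '6'
print(confidence)  # 0.97982866
```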
Captchas are a way to distinguish humans from machines. Which of the following statements about captchas is <span style="color:red">wrong</span>:

## Answer

```bash
Captcha recognition is an old topic, and a 100% recognition rate has been achieved
```

## Options

### A

```bash
Captchas come in many varieties, including mixed Chinese/English characters, click selection, sliders, and more
```

### B

```bash
Captcha recognition uses OCR (Optical Character Recognition) technology
```

### C

```bash
For difficult captchas, you can use a captcha-solving platform or a third-party recognition service
```
```python
if __name__ == '__main__':
    walker = TreeWalker("data", "python", "python")
    walker.walk()
    # md = MDWalker('data/2.python中阶/3.网络爬虫')
    # md.walk()
```