Commit ef5bfca9 authored by: R root

Sat Jun 28 16:05:00 CST 2025 inscode

Parent 0a575cc3
run = "pip install -r requirements.txt;python main.py"
language = "python"
[packager]
AUTO_PIP = true
[env]
VIRTUAL_ENV = "/root/${PROJECT_DIR}/venv"
PATH = "${VIRTUAL_ENV}/bin:${PATH}"
PYTHONPATH = "$PYTHONHOME/lib/python3.10:${VIRTUAL_ENV}/lib/python3.10/site-packages"
REPLIT_POETRY_PYPI_REPOSITORY = "http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/simple"
MPLBACKEND = "TkAgg"
POETRY_CACHE_DIR = "/root/${PROJECT_DIR}/.cache/pypoetry"
[debugger]
program = "main.py"
run = "pip install -r requirements.txt && python main.py"
is_gui = false
is_resident = true
is_html = false
This diff has been collapsed.
print('欢迎来到 InsCode')
\ No newline at end of file
import requests
from lxml import etree
import re
import pymysql
from time import sleep
from concurrent.futures import ThreadPoolExecutor
def get_conn():
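    # Open a connection to the local MySQL database "novels" and return (conn, cursor)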
    # Create the connection
conn = pymysql.connect(host="127.0.0.1",
user="root",
password="root",
db="novels",
charset="utf8")
    # Create a cursor
cursor = conn.cursor()
return conn, cursor
def close_conn(conn, cursor):
cursor.close()
conn.close()
def get_xpath_resp(url):
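    # Fetch a page with a desktop User-Agent and return (lxml tree, response); (None, None) on failure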
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
try:
resp = requests.get(url, headers=headers, timeout=10)
print(f"响应状态码: {resp.status_code}")
print(f"网页内容长度: {len(resp.text)}")
with open("debug.html", "w", encoding="utf-8") as f:
f.write(resp.text)
tree = etree.HTML(resp.text)
return tree,resp
except Exception as e:
print(f"请求失败: {str(e)}")
return None, None
def get_chapters(url):
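    # Scrape the catalogue page: return the chapter titles, chapter URLs and the novel name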
    tree, _ = get_xpath_resp(url)
    if tree is None:
        # The catalogue page could not be fetched; return empty results instead of crashing
        return [], [], "未知小说"
    # Get the novel name
novel_name_elements = tree.xpath('//*[@id="info"]/h1/text()')
if not novel_name_elements:
novel_name = "未知小说"
else:
novel_name = novel_name_elements[0]
    # Get the chapter list nodes - use a more general selector
dds = tree.xpath('//dl[contains(@class,"chapterlist")]/dd') or tree.xpath('//div[@class="listmain"]//dd')
title_list = []
link_list = []
for d in dds[:15]:
        title = d.xpath('./a/text()')[0]  # Chapter title
title_list.append(title)
        link = d.xpath('./a/@href')[0]  # Chapter link (relative path)
        chapter_url = url + link  # Build the full chapter URL
link_list.append(chapter_url)
return title_list,link_list,novel_name
def get_content(novel_name,title,url):
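    # Download one chapter, clean up the HTML and insert it into the novel table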
try:
cursor = None
conn = None
conn, cursor = get_conn()
        # SQL statement for inserting one chapter
sql = 'INSERT INTO novel(novel_name,chapter_name,content) VALUES(%s,%s,%s)'
tree,resp = get_xpath_resp(url)
        # Extract the chapter content
content = re.findall('<div id="content">(.*?)</div>',resp.text)[0]
        # Clean up the content
content = content.replace('<br />','\n').replace('&nbsp;',' ').replace('全本小说网 www.qb5.tw,最快更新<a href="https://www.qb5.tw/book_116659/">宇宙职业选手</a>最新章节!<br><br>','')
print(title,content)
        cursor.execute(sql, [novel_name, title, content])  # Insert the row
        conn.commit()  # Commit the transaction to persist the data
    except Exception as e:
        # Report the failure instead of silently swallowing it
        print(f"Failed to save chapter {title}: {e}")
    finally:
        sleep(2)
        if conn is not None and cursor is not None:
            close_conn(conn, cursor)  # Close the database connection
if __name__ == '__main__':
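    # Entry point: fetch the chapter list, then download and store chapters with a small thread pool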
    # Get the novel name, chapter titles and chapter links
title_list, link_list, novel_name = get_chapters('https://www.qb5.tw/book_116659/')
    with ThreadPoolExecutor(5) as t:  # Thread pool with 5 worker threads
for title,link in zip(title_list,link_list):
            t.submit(get_content, novel_name, title, link)  # Submit one chapter download task
\ No newline at end of file
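main.py assumes a local MySQL server with a novels database and a novel table containing novel_name, chapter_name and content columns; the table itself is not part of this commit. The following is a minimal one-time setup sketch (the column types, sizes and the helper name create_novel_table are assumptions, not taken from this repository):

# setup_db.py - one-time helper sketch, not part of this commit; adjust credentials as needed
import pymysql

def create_novel_table():
    # Database, table and column names match what get_conn()/get_content() in main.py expect
    conn = pymysql.connect(host="127.0.0.1", user="root", password="root", charset="utf8")
    cursor = conn.cursor()
    cursor.execute("CREATE DATABASE IF NOT EXISTS novels CHARACTER SET utf8")
    cursor.execute("USE novels")
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS novel (
            id INT AUTO_INCREMENT PRIMARY KEY,
            novel_name VARCHAR(255),
            chapter_name VARCHAR(255),
            content LONGTEXT
        ) CHARACTER SET utf8
    """)
    conn.commit()
    cursor.close()
    conn.close()

if __name__ == "__main__":
    create_novel_table()

Running it once before main.py ensures the INSERT statements in get_content() have a table to write into.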
requests==2.31.0
lxml==4.9.4
pymysql==1.1.0
\ No newline at end of file