"""Scrape the first chapters of a novel from qb5.tw and store them in MySQL.

The script downloads a book's index page, collects chapter titles and links,
then fetches each chapter on a small thread pool and inserts its text into
the ``novel`` table of the ``novels`` database.
"""
import re
from concurrent.futures import ThreadPoolExecutor
from time import sleep
from urllib.parse import urljoin

import pymysql
import requests
from lxml import etree

# Index page of the book scraped when the file is run as a script.
BOOK_URL = 'https://www.qb5.tw/book_116659/'

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}

INSERT_SQL = 'INSERT INTO novel(novel_name,chapter_name,content) VALUES(%s,%s,%s)'

# NOTE(review): in the copy of this file under review, the pattern and the
# replacement strings below contain only line breaks and spaces in places
# where HTML markup (tags / entities) would be expected, which suggests the
# markup was stripped from the source at some point. They are reproduced
# exactly as they appear; as written, the first two replacements are no-ops.
# Confirm all of them against the site's real chapter HTML.
CONTENT_PATTERN = re.compile('\n(.*?)\n')
CONTENT_REPLACEMENTS = (
    ('\n', '\n'),
    (' ', ' '),
    ('全本小说网 www.qb5.tw,最快更新宇宙职业选手最新章节!\n\n', ''),
)


def get_conn():
    """Open a MySQL connection and return ``(conn, cursor)``.

    If creating the cursor fails, the connection is closed before the
    exception is re-raised so that it does not leak.
    """
    conn = pymysql.connect(host="127.0.0.1", user="root", password="root", db="novels", charset="utf8")
    try:
        cursor = conn.cursor()
    except Exception:
        conn.close()
        raise
    return conn, cursor


def close_conn(conn, cursor):
    """Close *cursor* and then *conn*; *conn* is closed even if the cursor fails."""
    try:
        cursor.close()
    finally:
        conn.close()


def get_xpath_resp(url):
    """Fetch *url* and return ``(tree, resp)``.

    ``tree`` is the page parsed with ``etree.HTML`` and ``resp`` is the
    ``requests`` response. On any failure the error is printed and
    ``(None, None)`` is returned, so callers must check for ``None``.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        print(f"响应状态码: {resp.status_code}")
        print(f"网页内容长度: {len(resp.text)}")
        # Debug aid: the file is opened in "w" mode, so every call overwrites
        # the previous dump.
        with open("debug.html", "w", encoding="utf-8") as f:
            f.write(resp.text)
        tree = etree.HTML(resp.text)
        return tree, resp
    except Exception as e:
        print(f"请求失败: {str(e)}")
        return None, None


def get_chapters(url, max_chapters=15):
    """Return ``(title_list, link_list, novel_name)`` for the book at *url*.

    Args:
        url: Address of the book's index page.
        max_chapters: Upper bound on the number of ``<dd>`` entries examined
            (the original hard-coded limit of 15 is the default).

    Returns:
        Two parallel lists (chapter titles, absolute chapter URLs) and the
        novel's name. If the index page cannot be fetched or parsed, both
        lists are empty and the name is the "未知小说" placeholder.
    """
    tree, _ = get_xpath_resp(url)
    if tree is None:
        # get_xpath_resp already reported the underlying error.
        print(f"无法获取章节目录: {url}")
        return [], [], "未知小说"

    # Novel name, falling back to a placeholder when the header is missing.
    novel_name_elements = tree.xpath('//*[@id="info"]/h1/text()')
    novel_name = novel_name_elements[0] if novel_name_elements else "未知小说"

    # Chapter nodes: try the "chapterlist" layout first, then "listmain".
    dds = tree.xpath('//dl[contains(@class,"chapterlist")]/dd') or tree.xpath('//div[@class="listmain"]//dd')

    title_list = []
    link_list = []
    for d in dds[:max_chapters]:
        titles = d.xpath('./a/text()')
        hrefs = d.xpath('./a/@href')
        if not titles or not hrefs:
            # A <dd> without a usable link would otherwise raise IndexError.
            continue
        title_list.append(titles[0])
        # urljoin also copes with root-relative and absolute hrefs, which
        # plain string concatenation would turn into invalid URLs.
        link_list.append(urljoin(url, hrefs[0]))
    return title_list, link_list, novel_name


def _clean_content(content):
    """Apply every ``CONTENT_REPLACEMENTS`` pair to *content* in order."""
    for old, new in CONTENT_REPLACEMENTS:
        content = content.replace(old, new)
    return content


def get_content(novel_name, title, url):
    """Download one chapter and insert it into the ``novel`` table.

    Failures are reported on stdout instead of being raised, so one bad
    chapter does not stop the other worker threads. The function always
    sleeps two seconds before returning.
    """
    conn = None
    cursor = None
    try:
        _, resp = get_xpath_resp(url)
        if resp is None:
            print(f"章节下载失败: {title} ({url})")
            return

        matches = CONTENT_PATTERN.findall(resp.text)
        if not matches:
            print(f"未找到正文内容: {title} ({url})")
            return

        content = _clean_content(matches[0])
        print(title, content)

        # Connect only once there is something to store, so no connection is
        # held open while waiting on the network.
        conn, cursor = get_conn()
        cursor.execute(INSERT_SQL, [novel_name, title, content])
        conn.commit()
    except Exception as e:
        # Boundary of a worker-thread task: report the error rather than
        # silently discarding it, but keep the best-effort behaviour.
        print(f"章节保存失败: {title}: {e}")
    finally:
        sleep(2)
        if conn is not None:
            close_conn(conn, cursor)


def main():
    """Scrape ``BOOK_URL`` using five worker threads."""
    title_list, link_list, novel_name = get_chapters(BOOK_URL)
    with ThreadPoolExecutor(5) as t:
        for title, link in zip(title_list, link_list):
            t.submit(get_content, novel_name, title, link)


if __name__ == '__main__':
    main()