# NOTE(review): in the original file every newline had been stripped — the whole
# script was collapsed onto one physical line and was not valid Python.  The code
# below restores conventional formatting.  The source was ALSO truncated in the
# middle of get_content() (right after ``content = re.findall('``); everything in
# that function from the regex onward is a best-effort reconstruction and must be
# verified against the original.

import re
from concurrent.futures import ThreadPoolExecutor
from time import sleep

import pymysql
import requests
from lxml import etree


def get_conn():
    """Open a connection to the local ``novels`` MySQL database.

    Returns:
        (connection, cursor) tuple; the caller is responsible for closing
        both via :func:`close_conn`.
    """
    conn = pymysql.connect(host="127.0.0.1", user="root", password="root",
                           db="novels", charset="utf8")
    cursor = conn.cursor()
    return conn, cursor


def close_conn(conn, cursor):
    """Close the cursor first, then the connection."""
    cursor.close()
    conn.close()


def get_xpath_resp(url):
    """Fetch *url* and parse the body into an lxml element tree.

    Also dumps the raw HTML to ``debug.html`` for offline inspection.

    Returns:
        (tree, response) on success, or (None, None) if the request raised.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        print(f"响应状态码: {resp.status_code}")
        print(f"网页内容长度: {len(resp.text)}")
        # Keep a copy of the page on disk for debugging selector problems.
        with open("debug.html", "w", encoding="utf-8") as f:
            f.write(resp.text)
        tree = etree.HTML(resp.text)
        return tree, resp
    except Exception as e:
        print(f"请求失败: {str(e)}")
        return None, None


def get_chapters(url):
    """Scrape the chapter index page at *url*.

    Returns:
        (title_list, link_list, novel_name) for at most the first 15
        chapters listed on the page.
    """
    tree, _ = get_xpath_resp(url)
    # Novel title lives in the #info header; fall back to a placeholder.
    novel_name_elements = tree.xpath('//*[@id="info"]/h1/text()')
    if not novel_name_elements:
        novel_name = "未知小说"
    else:
        novel_name = novel_name_elements[0]
    # Chapter <dd> nodes — two common page layouts are tried in order.
    dds = tree.xpath('//dl[contains(@class,"chapterlist")]/dd') or tree.xpath('//div[@class="listmain"]//dd')
    title_list = []
    link_list = []
    for d in dds[:15]:
        title = d.xpath('./a/text()')[0]   # chapter title
        title_list.append(title)
        link = d.xpath('./a/@href')[0]     # chapter href
        # NOTE(review): assumes href is relative to *url*; an absolute or
        # site-rooted href would produce a broken link — confirm against
        # the target site's markup.
        chapter_url = url + link
        link_list.append(chapter_url)
    return title_list, link_list, novel_name


def get_content(novel_name, title, url):
    """Download one chapter at *url* and insert it into the ``novel`` table.

    NOTE(review): the original source was truncated inside this function,
    immediately after ``content = re.findall('``.  Everything from that
    point onward is reconstructed from the surrounding code (the SQL
    statement, the ``conn``/``cursor`` pre-initialization implying a
    ``finally`` cleanup) and must be checked against the original.
    """
    try:
        # Pre-initialize so the finally-block can safely test them even if
        # get_conn() itself raises.
        cursor = None
        conn = None
        conn, cursor = get_conn()
        # Parameterized insert — values are bound by the driver, not
        # string-formatted into the SQL.
        sql = 'INSERT INTO novel(novel_name,chapter_name,content) VALUES(%s,%s,%s)'
        tree, resp = get_xpath_resp(url)
        # --- reconstructed from here on (original truncated) ---
        content = re.findall(r'<div id="content">(.*?)</div>', resp.text, re.S)[0]
        cursor.execute(sql, [novel_name, title, content])
        conn.commit()
    except Exception as e:
        print(f"下载失败: {str(e)}")
    finally:
        # Only close handles that were actually opened.
        if conn is not None and cursor is not None:
            close_conn(conn, cursor)