import re
from concurrent.futures import ThreadPoolExecutor
from time import sleep
from urllib.parse import urljoin

import pymysql
import requests
from lxml import etree
 
def get_conn():
    """Open a connection to the local ``novels`` MySQL database.

    Returns:
        tuple: ``(conn, cursor)`` - the pymysql connection and a cursor
        created from it. The caller is responsible for closing both.
    """
    settings = {
        "host": "127.0.0.1",
        "user": "root",
        "password": "root",
        "db": "novels",
        "charset": "utf8",
    }
    connection = pymysql.connect(**settings)
    return connection, connection.cursor()
 
def close_conn(conn, cursor):
    """Close a cursor and its connection, tolerating missing handles.

    Either argument may be ``None`` (for example when opening the
    connection failed before the handles were assigned); such handles are
    simply skipped.

    Args:
        conn: Connection object exposing ``close()``, or ``None``.
        cursor: Cursor object exposing ``close()``, or ``None``.
    """
    try:
        if cursor is not None:
            cursor.close()
    finally:
        # Release the connection even if closing the cursor raised.
        if conn is not None:
            conn.close()
 
def get_xpath_resp(url):
    """Download *url* and parse the body into an lxml HTML tree.

    The status code and body length are printed, and every response body
    received is written to ``debug.html`` in the current working
    directory, overwriting whatever an earlier call left there.

    Args:
        url: Address of the page to download.

    Returns:
        tuple: ``(tree, resp)`` on success, or ``(None, None)`` when any
        step (request, file dump, parsing) raised an exception.
    """
    request_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/106.0.0.0 Safari/537.36'
        )
    }
    try:
        # The 10 second timeout keeps a stalled server from hanging the caller.
        response = requests.get(url, headers=request_headers, timeout=10)
        print(f"响应状态码: {response.status_code}")
        html = response.text
        print(f"网页内容长度: {len(html)}")
        with open("debug.html", "w", encoding="utf-8") as dump:
            dump.write(html)
        return etree.HTML(html), response
    except Exception as err:
        print(f"请求失败: {str(err)}")
        return None, None
 
def get_chapters(url):
    """Fetch a novel's index page and collect its first chapters.

    Args:
        url: Address of the novel's table-of-contents page. Chapter links
            found on the page are resolved against it.

    Returns:
        tuple: ``(title_list, link_list, novel_name)``. The two lists are
        parallel and hold at most 15 entries. When ``get_xpath_resp``
        yields no tree, both lists are empty and the name is the
        placeholder used for an unknown novel.
    """
    title_list = []
    link_list = []
    novel_name = "未知小说"

    tree, _ = get_xpath_resp(url)
    if tree is None:
        # The fetch helper already reported the failure; return empty
        # results instead of crashing on ``None.xpath``.
        return title_list, link_list, novel_name

    # Novel name; the placeholder stays when the heading is missing.
    novel_name_elements = tree.xpath('//*[@id="info"]/h1/text()')
    if novel_name_elements:
        novel_name = novel_name_elements[0]

    # Chapter nodes: try the chapter-list layout first, then fall back
    # to the "listmain" layout.
    dds = (tree.xpath('//dl[contains(@class,"chapterlist")]/dd')
           or tree.xpath('//div[@class="listmain"]//dd'))
    for d in dds[:15]:
        titles = d.xpath('./a/text()')
        hrefs = d.xpath('./a/@href')
        if not titles or not hrefs:
            # Skip <dd> nodes that carry no usable anchor.
            continue
        title_list.append(titles[0])
        # urljoin handles relative, root-relative and absolute hrefs,
        # where plain concatenation only works for the first kind.
        link_list.append(urljoin(url, hrefs[0]))
    return title_list, link_list, novel_name
 
def get_content(novel_name, title, url):
    """Download one chapter, clean its text and store it in the database.

    Failures are reported on stdout rather than raised, so one bad chapter
    does not affect the others. The function always pauses for two seconds
    before returning, which throttles requests to the site.

    Args:
        novel_name: Name of the novel, stored with the chapter.
        title: Chapter title, stored with the chapter.
        url: Address of the chapter page.

    Returns:
        None.
    """
    conn = None
    cursor = None
    try:
        # Fetch first so that no database connection is held open while
        # waiting on the network, and none is opened for a failed fetch.
        _, resp = get_xpath_resp(url)
        if resp is None:
            print(f"章节抓取失败: {title} ({url})")
            return

        # re.S lets the body match even when the div spans several lines.
        matches = re.findall('<div id="content">(.*?)</div>', resp.text, re.S)
        if not matches:
            print(f"未找到章节正文: {title} ({url})")
            return

        # Clean the markup: line breaks, non-breaking spaces, site banner.
        content = (
            matches[0]
            .replace('<br />', '\n')
            .replace('&nbsp;', ' ')
            .replace('全本小说网 www.qb5.tw，最快更新<a href="https://www.qb5.tw/book_116659/">宇宙职业选手</a>最新章节！<br><br>', '')
        )
        print(title, content)

        sql = 'INSERT INTO novel(novel_name,chapter_name,content) VALUES(%s,%s,%s)'
        conn, cursor = get_conn()
        cursor.execute(sql, [novel_name, title, content])
        conn.commit()  # persist the inserted row
    except Exception as e:
        # Best-effort per chapter: report the problem instead of hiding it.
        print(f"章节保存失败: {title} ({url}): {e}")
    finally:
        sleep(2)
        # conn stays None when the fetch failed or get_conn raised.
        if conn is not None:
            close_conn(conn, cursor)
 
 
if __name__ == '__main__':
    # Collect the novel name plus parallel lists of chapter titles and links.
    title_list, link_list, novel_name = get_chapters('https://www.qb5.tw/book_116659/')
    with ThreadPoolExecutor(5) as t:  # pool of 5 worker threads
        futures = [
            t.submit(get_content, novel_name, title, link)
            for title, link in zip(title_list, link_list)
        ]
        # A future keeps a worker's exception to itself until asked, so
        # query each one; otherwise failures would vanish silently.
        for title, future in zip(title_list, futures):
            error = future.exception()
            if error is not None:
                print(f"章节处理失败: {title}: {error}")