多线程版本.py 2.0 KB
Newer Older
梦想橡皮擦's avatar
梦想橡皮擦 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
import requests_html
import threading
import time
import fcntl


class MyThread(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        global page, lock, page_size
        while True:
            lock.acquire(True)
            if page >= page_size:
                lock.release()
                break
            else:
                page += 1
                lock.release()
                requests_html.DEFAULT_ENCODING = "gb18030"
                session = requests_html.HTMLSession()

                print("正在采集第{}页".format(page), "*" * 50)
                try:
                    page_url = f'http://www.world68.com/top.asp?t=5star&page={page}'
                    world = session.get(page_url, timeout=10)
                    print("正在采集数据", world.url)
                    # print(world.html)
                    title_a = world.html.find('dl>dt>a')
                    print(title_a)
                    my_str = ""

                    for item in title_a:
                        name = item.text
                        url = item.attrs['href']
                        my_str += f"{name.encode('utf-8').decode('utf-8')},{url}\n"

                    with open('thread_webs.txt', "a+", encoding="utf-8") as f:
                        fcntl.flock(f.fileno(), fcntl.LOCK_EX)  # 文件加锁
                        f.write(f"{my_str}")

                except Exception as e:
                    print(e, page_url)


if "__main__" == __name__:
    page_size = int(input("请输入总页码:"))
    page = 0
    thread_list = []

    # 获取开始时间
    start = time.perf_counter()

    lock = threading.Lock()
    for i in range(1, 5):
        t = MyThread()
        thread_list.append(t)
    for t in thread_list:
        t.start()
    for t in thread_list:
        t.join()
    # 获取时间间隔
    elapsed = (time.perf_counter() - start)
    print("程序运行完毕,总耗时为:", elapsed)