"""Scrape complaint listings from the yglz.tousu.hebnews.cn "shss" pages.

For every listing page the script downloads the HTML, locates each
``div`` with class ``listcon`` and prints one dict per row (handling unit,
complaint content, time and status) so the records can later be stored,
for example in MongoDB.

NOTE(review): the original record also carried a ``"type"`` key, but no
value was ever extracted for it, so it held the Python builtin ``type``.
The key is omitted here until the correct XPath for it is known.
"""
import random

import requests
from lxml import etree  # import etree from lxml for HTML parsing / XPath

# Pool of desktop browser User-Agent strings; one is picked per request.
ua = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
]

PAGE_URL = "http://yglz.tousu.hebnews.cn/shss-{page}.html"
FIRST_PAGE = 1
LAST_PAGE = 9  # the original loop was range(1, 10), i.e. pages 1..9
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests may block forever


def fetch_page(page):
    """Download listing page number *page* and return its HTML as text.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-2xx HTTP status.
    """
    # random.choice covers the whole pool; randint(0, 2) never reached the
    # last entry.
    headers = {'user-agent': random.choice(ua)}
    response = requests.get(
        PAGE_URL.format(page=page),
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return response.content.decode("utf-8")


def parse_row(div):
    """Build one record dict from a ``listcon`` row element.

    All XPath lookups are relative to *div*.

    Raises:
        IndexError: if any of the expected nodes is missing from the row.
    """
    shouli = div.xpath('span[1]/p/a/text()')[0]  # handling unit
    content = div.xpath('span[2]/p/a/text()')[0]  # complaint content
    posted_at = div.xpath('span[3]/p/text()')[0].replace("\n", "")  # time
    status = div.xpath('span[5]/p/text()')[0].replace("\n", "")  # status
    return {
        "shouli": shouli,
        "content": content,
        "datetime": posted_at,
        "status": status,
    }


def main():
    """Fetch every listing page and print the records found on it."""
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        try:
            html = fetch_page(page)
        except requests.RequestException as exc:
            # One unreachable page should not abort the remaining pages.
            print(f"页面请求失败: {PAGE_URL.format(page=page)} ({exc})")
            continue
        print("*" * 200)

        tree = etree.HTML(html)  # parse the HTML
        divs = tree.xpath('//div[@class="listcon"]')  # rows of the list area
        for div in divs:
            try:
                one_data = parse_row(div)
            except IndexError:
                # A row without the expected structure: report and skip it.
                print("内部数据报错")
                print(div)
                continue
            print(one_data)  # printed so it can be stored in MongoDB later


if __name__ == "__main__":
    main()