Commit d3d0a9b3 authored by Corley

V1&2

Parent
'''
Codemart (码市) order crawler (V1)
'''
import time
from datetime import datetime
import requests
from openpyxl import Workbook
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
from config import codemart_headers
from utils import get_contact
from sender import send_message
def get_one_page(url, start_time):  # start_time is only used by the commented-out freshness filter below
result_list = []
try:
response = requests.get(url, headers=codemart_headers)
if response.status_code == 200:
data = response.json()
try:
rewards = data['rewards']
for reward in rewards:
# pub_time = float(reward['pubTime']) / 1000
# if start_time - pub_time < 7200:
# data_dict = {
# 'id': reward['id'],
# 'name': reward['name'],
# 'description': reward['description'],
# 'duration': reward['duration'],
# }
# result_list.append(data_dict)
# continue
# else:
# return result_list
data_dict = {
'id': reward['id'],
'name': reward['name'],
'description': reward['description'],
'duration': reward['duration'],
}
result_list.append(data_dict)
return result_list
except Exception as e:
return None, e.args[0]
else:
return None, response.status_code
except Exception as e:
return None, e.args[0]
def main(wb):
print('开始爬取码市订单')
start_time = time.time()
sheet = wb['Sheet']
sheet.title = '码市'
sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
count = 1
for i in range(10):
url = 'https://codemart.com/api/project?page={}'.format(i + 1)
result = get_one_page(url, start_time)
if isinstance(result, list):
for r in result:
desc = ILLEGAL_CHARACTERS_RE.sub(r'', r['description'])
contact = get_contact(desc)
sheet.append([count, desc, 'https://codemart.com/project/{}'.format(r['id']), contact])
count += 1
elif isinstance(result, tuple):
message = '码市爬取出错:%s' % result[1]
print(message)
send_message(message)
print('结束爬取码市订单')
if __name__ == '__main__':
wb = Workbook()
main(wb)
now = datetime.now()
wb.save(r'%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Configuration (V1)
'''
# User-Agent pool for request headers
user_agents = [
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
"Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
"Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
"Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
"UCWEB7.0.2.37/28/999",
"NOKIA5700/ UCWEB7.0.2.37/28/999",
"Openwave/ UCWEB7.0.2.37/28/999",
"Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
"Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]
# Codemart request headers
codemart_headers = {'Accept': 'application/json'}
# OSChina request headers (note: this aliases codemart_headers, so updates affect both)
oschina_headers = codemart_headers
# Contact regex: matches an email address or a mainland-China mobile number
contact_regex = r'([\w\.-]+@[\w\.-]+\.[\w\.]+)|(1[3-9]\d{9})'
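# Illustrative example (not part of the original config): with the two
# alternative groups, re.findall returns one (email, phone) tuple per match:
#   re.findall(contact_regex, '联系 dev@example.com 或 13812345678')
#   -> [('dev@example.com', ''), ('', '13812345678')]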
# WeChat Work (企业微信) webhook bot keys
upload_wechat_key = r'909exxxx-0be4-4856-ae83-f67ac153xxxx'
notify_wechat_key = r'e481xxxx-cea3-4b95-a0de-04310162xxxx'
'''
Crawler main program (V1)
'''
import time
from datetime import datetime
from openpyxl import Workbook
from apscheduler.schedulers.blocking import BlockingScheduler
import shixian_crawler, rrkf_crawler, wywaibao_crawler, codemart_crawler, yuanjisong_crawler, oschina_crawler
from sender import get_media_id, send_file,send_message
sched = BlockingScheduler()
def crawl_save_upload():
    '''Crawl all sites, save the workbook, and upload the data file'''
print('-----数据抓取开始-----')
wb = Workbook()
codemart_crawler.main(wb)
oschina_crawler.main(wb)
rrkf_crawler.main(wb)
wywaibao_crawler.main(wb)
yuanjisong_crawler.main(wb)
shixian_crawler.main(wb)
print('-----数据抓取结束-----')
print('-----文件保存开始-----')
now = datetime.now()
file = r'data/%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S")
wb.save(file)
time.sleep(3)
print('-----文件保存结束-----')
print('-----文件上传开始-----')
media_id = get_media_id(file)
if isinstance(media_id, str):
upload_result = send_file(media_id)
if upload_result == True:
print('文件上传成功:%s' % file)
else:
message = '文件上传失败:%s' % upload_result[1]
print(message)
send_message(message)
else:
message = '获取media_id失败:%s' % media_id[1]
print(message)
send_message(message)
print('-----文件上传结束-----')
@sched.scheduled_job('interval', seconds=1800)
def schedule():
    '''Scheduled job: only crawl between hours 8 and 22'''
now = datetime.now()
print('当前时间:%s' % now.strftime("%Y-%m-%d %H:%M:%S"))
hour = now.hour
if hour >= 8 and hour <= 22:
print('程序执行开始')
crawl_save_upload()
print('程序执行结束\n')
else:
pass
if __name__ == '__main__':
    '''Entry point'''
sched.start()
'''
OSChina (开源中国) order crawler (V1)
'''
from datetime import datetime
import html2text
import requests
from openpyxl import Workbook
from config import oschina_headers
from utils import get_contact, get_ua
from sender import send_message
def get_id(url):
try:
        # dict.update() returns None, so update the headers first, then pass them
        oschina_headers.update({'User-Agent': get_ua()})
        response = requests.get(url, headers=oschina_headers)
if response.status_code == 200:
data = response.json()
try:
datas = data['data']['data']
id_list = [d['id'] for d in datas]
return id_list
except Exception as e:
return None, e.args[0]
else:
return None, response.status_code
except Exception as e:
return None, e.args[0]
def get_one_page(url):
try:
response = requests.get(url, headers=oschina_headers)
if response.status_code == 200:
data = response.json()
try:
description = data['data']['prd']
return description
except Exception as e:
return None, e.args[0]
else:
return None, response.status_code
except Exception as e:
return None, e.args[0]
def main(wb):
print('开始爬取开源中国订单')
sheet = wb.create_sheet('开源中国', 1)
sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
count = 1
for i in range(10):
        url = 'https://zb.oschina.net/project/contractor-browse-project-and-reward?applicationAreas=&moneyMinByYuan=&moneyMaxByYuan=&sortBy=30&currentTime=&pageSize=20&currentPage={}'.format(i + 1)
id_list = get_id(url)
if isinstance(id_list, list):
for id in id_list:
url = 'https://zb.oschina.net/project/detail?id=%s' % id
desc = get_one_page(url)
if isinstance(desc, str):
desc = html2text.html2text(desc).strip()
contact = get_contact(desc)
sheet.append([count, desc, url, contact])
count += 1
elif isinstance(desc, tuple):
print('开源中国详情爬取出错:%s' % desc[1])
elif isinstance(id_list, tuple):
message = '开源中国爬取出错:%s' % id_list[1]
print(message)
send_message(message)
print('结束爬取开源中国订单')
if __name__ == '__main__':
wb = Workbook()
main(wb)
now = datetime.now()
wb.save(r'%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
html2text
openpyxl
fake_useragent
pymysql
mysql-connector-python
SQLAlchemy
APScheduler
lxml==
requests==
'''
rrkf.com (人人开发) order crawler (V1)
'''
from datetime import datetime
import requests
from lxml import etree
from openpyxl import Workbook
from utils import get_contact, get_ua
from sender import send_message
def get_info(url):
info_list = []
try:
text = requests.get(url, headers={'User-Agent':get_ua()}).text
html = etree.HTML(text)
orders = html.xpath('//*[@id="r-list-wrapper"]/div[2]/div')
for order in orders:
info = {}
link = 'http://www.rrkf.com' + order.xpath('./div[1]/div/h4/a/@href')[0]
desc = order.xpath('./div[1]/div/p/text()')[0]
info['link'] = link
info['desc'] = desc.strip()
info_list.append(info)
return info_list
except Exception as e:
return None, e.args[0]
def main(wb):
print('开始爬取人人开发订单')
sheet = wb.create_sheet('人人开发', 2)
sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
count = 1
for i in range(10):
url = 'http://www.rrkf.com/serv/request?&currentPage={}'.format(i + 1)
info_list = get_info(url)
if isinstance(info_list, list):
for info in info_list:
desc = info['desc']
contact = get_contact(desc)
sheet.append([count, desc, info['link'], contact])
count += 1
elif isinstance(info_list, tuple):
message = '人人开发爬取出错:%s' % info_list[1]
print(message)
send_message(message)
print('结束爬取人人开发订单')
if __name__ == '__main__':
wb = Workbook()
main(wb)
now = datetime.now()
wb.save(r'%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Data file upload and delivery (V1)
'''
import os
import requests
from config import upload_wechat_key, notify_wechat_key
def get_media_id(filename):
    '''Upload a file and return its media_id'''
    try:
        send_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/upload_media?key={}&type=file'.format(upload_wechat_key)
        # Let requests build the multipart body (and its boundary) itself; the
        # upload_media API expects the file under the form field 'media'.
        files = {
            'media': (filename, open(filename, 'rb')),
        }
        res = requests.post(url=send_url, files=files).json()
media_id = res['media_id']
return media_id
except Exception as e:
return None, e.args[0]
def send_file(media_id):
    '''Send the uploaded file to the WeChat group'''
try:
headers = {'Content-Type': 'application/json'}
data = {
"msgtype": "file",
"file": {
"media_id": media_id
}
}
send_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}'.format(upload_wechat_key)
r = requests.post(url=send_url, headers=headers, json=data).json()
if r['errcode'] == 0:
return True
else:
return None, r['errmsg']
except Exception as e:
return None, e.args[0]
def send_message(message):
    '''Send an error notification message'''
try:
headers = {'Content-Type': 'application/json'}
data = {
"msgtype": "text",
"text": {
"content": message,
"mentioned_mobile_list": ["15682210532"]
}
}
send_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}'.format(notify_wechat_key)
r = requests.post(url=send_url, headers=headers, json=data).json()
if r['errcode'] == 0:
return True
else:
return None, r['errmsg']
except Exception as e:
return None, e.args[0]
if __name__ == '__main__':
    # Timestamped filenames sort chronologically; os.listdir() order is arbitrary
    files = sorted(f for f in os.listdir('./data') if f.endswith('xlsx'))
    latest_file = files[-1]
media_id = get_media_id('data/' + latest_file)
upload_result = send_file(media_id)
if upload_result == True:
print('上传成功')
else:
send_message('上传失败:'+upload_result[1])
'''
shixian.com (实现) order crawler (V1)
'''
from datetime import datetime
import requests
from lxml import etree
from openpyxl import Workbook
from utils import get_contact, get_ua
from sender import send_message
def get_info(url):
info_list = []
try:
text = requests.get(url, headers={'User-Agent':get_ua()}).text
html = etree.HTML(text)
orders = html.xpath('//*[@class="job"]')
for order in orders:
info = {}
link = 'http://www.shixian.com' + order.xpath('./div[1]/a/@href')[0]
desc = order.xpath('./div[1]/a/p/text()')[0]
# release_time = order.xpath('./div[1]/div/div/span/text()')[0]
# if '1 天前发布' in release_time or '小时' in release_time:
# info['link'] = link
# info['desc'] = desc.strip()
# info_list.append(info)
# else:
# continue
info['link'] = link
info['desc'] = desc.strip()
info_list.append(info)
return info_list
except Exception as e:
return None, e.args[0]
def main(wb):
print('开始爬取实现订单')
sheet = wb.create_sheet('实现', 5)
sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
count = 1
for i in range(10):
url = 'https://shixian.com/job/all?page={}&sort_arrow=down'.format(i + 1)
info_list = get_info(url)
if isinstance(info_list, list):
for info in info_list:
desc = info['desc']
contact = get_contact(desc)
sheet.append([count, desc, info['link'], contact])
count += 1
elif isinstance(info_list, tuple):
message = '实现爬取出错:%s' % info_list[1]
print(message)
send_message(message)
print('结束爬取实现订单')
if __name__ == '__main__':
wb = Workbook()
main(wb)
now = datetime.now()
wb.save(r'%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Utility functions (V1)
'''
import re
from fake_useragent import UserAgent
from config import contact_regex
def get_ua():
    '''Return a random User-Agent string'''
    try:
        return UserAgent().chrome
    except Exception:
        # fake_useragent occasionally fails to load its data; retry recursively
        return get_ua()
def get_contact(desc):
    '''Extract contact info (email addresses / mobile numbers) from a string'''
contact_group = re.findall(contact_regex, desc, re.VERBOSE)
if len(contact_group):
contact_group = [e for t in contact_group for e in t if e != '']
contact_group = list(set(contact_group))
return '|'.join(contact_group)
return ''
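
if __name__ == '__main__':
    # Illustrative smoke test (added for demonstration, not in the original project):
    # prints something like 'dev@example.com|13912345678' (order varies, set() is unordered)
    print(get_contact('需求详谈,联系 dev@example.com 或 13912345678'))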
'''
51waibao.net (51外包) order crawler (V1)
'''
from datetime import datetime
import requests
from lxml import etree
from openpyxl import Workbook
from utils import get_contact, get_ua
from sender import send_message
def get_info(url):
info_list = []
try:
text = requests.get(url, headers={'User-Agent':get_ua()}).text
html = etree.HTML(text)
orders = html.xpath('//*[@class="xiangmu_item"]')
for order in orders:
info = {}
link = 'http://www.51waibao.net/' + order.xpath('./div[1]/div[1]/a/@href')[0]
desc = order.xpath('./div[2]/text()')[0]
info['link'] = link
info['desc'] = desc.strip()
info_list.append(info)
return info_list
except Exception as e:
return None, e.args[0]
def main(wb):
print('开始爬取51外包订单')
sheet = wb.create_sheet('51外包', 3)
sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
count = 1
for i in range(10):
url = 'http://www.51waibao.net/Project.html?page={}'.format(i + 1)
info_list = get_info(url)
if isinstance(info_list, list):
for info in info_list:
desc = info['desc']
contact = get_contact(desc)
sheet.append([count, desc, info['link'], contact])
count += 1
elif isinstance(info_list, tuple):
message = '51外包爬取出错:%s' % info_list[1]
print(message)
send_message(message)
print('结束爬取51外包订单')
if __name__ == '__main__':
wb = Workbook()
main(wb)
now = datetime.now()
wb.save(r'%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Yuanjisong (猿急送) order crawler (V1)
'''
from datetime import datetime
import requests
from lxml import etree
from openpyxl import Workbook
from utils import get_contact, get_ua
from sender import send_message
def get_info(url):
info_list = []
try:
text = requests.get(url, headers={'User-Agent':get_ua()}).text
html = etree.HTML(text)
orders = html.xpath('//*[@id="db_adapt_id"]/div[position()>2]')
for order in orders:
info = {}
link = order.xpath('./div[1]/div[2]/a/@href')[0]
desc = order.xpath('./div[1]/div[1]/div/a/p/text()')[0]
info['link'] = link
info['desc'] = desc.strip()
info_list.append(info)
return info_list
except Exception as e:
return None, e.args[0]
def main(wb):
print('开始爬取猿急送订单')
sheet = wb.create_sheet('猿急送', 4)
sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
count = 1
for i in range(10):
url = 'https://www.yuanjisong.com/job/allcity/page{}'.format(i + 1)
info_list = get_info(url)
if isinstance(info_list, list):
for info in info_list:
desc = info['desc']
contact = get_contact(desc)
sheet.append([count, desc, info['link'], contact])
count += 1
elif isinstance(info_list, tuple):
message = '猿急送爬取出错:%s' % info_list[1]
print(message)
send_message(message)
print('结束爬取猿急送订单')
if __name__ == '__main__':
wb = Workbook()
main(wb)
now = datetime.now()
wb.save(r'%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Codemart (码市) order crawler (V2)

On failure, the crawl functions return a (source line number, error message)
tuple built from the exception traceback instead of the result list.
'''
import time
from datetime import datetime
import requests
from openpyxl import Workbook
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
from config import codemart_headers, time_point
from utils import get_contact, get_mysql_connection, create_table, add_default_data
from sender import send_message
def get_one_page(url):
result_list = []
try:
response = requests.get(url, headers=codemart_headers)
if response.status_code == 200:
data = response.json()
rewards = data['rewards']
for reward in rewards:
data_dict = {
'id': reward['id'],
'name': reward['name'],
'description': reward['description'],
'duration': reward['duration'],
'cate': reward['typeText'],
'status': reward['statusText'],
'pubtime': reward['pubTime']
}
result_list.append(data_dict)
return result_list
else:
return None, response.status_code
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def main(wb, session, OrderModel, WebsiteModel):
print('开始爬取码市订单')
sheet = wb['Sheet']
sheet.title = '码市'
sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
count = 1
website = session.query(WebsiteModel).get(1)
for i in range(10, 0, -1):
url = 'https://codemart.com/api/project?page=%d' % i
result = get_one_page(url)
if isinstance(result, list):
for r in result:
time_stamp = int(r['pubtime']) / 1000
publish_time = datetime.fromtimestamp(time_stamp)
if publish_time < time_point:
continue
desc = ILLEGAL_CHARACTERS_RE.sub(r'', r['description'])
cid = 'cm-{}'.format(r['id'])
contact = get_contact(desc)
link = 'https://codemart.com/project/{}'.format(r['id'])
is_valid = True if r['status'] == '招募中' else False
order_query = session.query(OrderModel).get(cid)
if order_query:
is_valided = order_query.is_valid
order_query.is_valid = is_valid
if is_valid == True:
sheet.append([count, desc, link, publish_time, contact, ''])
count += 1
if is_valided == False:
order_query.is_delete = False
if is_valided == True and is_valid == False:
order_query.is_delete = True
else:
order = OrderModel(id=cid, desc=desc, link=link, contact=contact, category=r['cate'],
pub_time=publish_time, is_valid=is_valid, is_delete=False if is_valid else True)
order.website = website
session.add(order)
if is_valid == True:
sheet.append([count, desc, link, publish_time, contact, ''])
count += 1
session.commit()
elif isinstance(result, tuple):
message = '码市爬取第%d行出错:%s' % (result[0], result[1])
print(message)
send_message(message)
print('结束爬取码市订单')
if __name__ == '__main__':
wb = Workbook()
engine, Base, session = get_mysql_connection()
Order, Website = create_table(engine, Base)
add_default_data(session, Website)
main(wb, session, Order, Website)
now = datetime.now()
wb.save(r'data/%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Configuration (V2)
'''
import re
from datetime import datetime, timedelta
# User-Agent pool for request headers
user_agents = [
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
"Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
"Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
"Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
"UCWEB7.0.2.37/28/999",
"NOKIA5700/ UCWEB7.0.2.37/28/999",
"Openwave/ UCWEB7.0.2.37/28/999",
"Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
"Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]
# Codemart request headers
codemart_headers = {'Accept': 'application/json'}
# OSChina request headers (note: this aliases codemart_headers, so updates affect both)
oschina_headers = codemart_headers
# Contact regex: matches an email address or a mainland-China mobile number
contact_regex = r'([\w\.-]+@[\w\.-]+\.[\w\.]+)|(1[3-9]\d{9})'
# WeChat Work (企业微信) webhook bot keys
upload_wechat_key = r'909exxxx-0be4-4856-ae83-f67ac153xxxx'
notify_wechat_key = r'e481xxxx-cea3-4b95-a0de-04310162xxxx'
# Database connection settings
HOSTNAME = '127.0.0.1'
PORT = 3306
USERNAME = 'root'
PASSWORD = 'root'
DATABASE = 'it_outsource'
DB_URL = 'mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8'.format(USERNAME, PASSWORD, HOSTNAME, PORT, DATABASE)
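# -> 'mysql+pymysql://root:root@127.0.0.1:3306/it_outsource?charset=utf8'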
# Platform names and home pages (seed data for the website table)
web_name_list = ['码市', '开源中国', '人人开发', '实现', '51外包', '猿急送']
web_url_list = ['https://codemart.com', 'https://zb.oschina.net/', 'http://www.rrkf.com', 'http://www.shixian.com', 'http://www.51waibao.net', 'https://www.yuanjisong.com']
# Time threshold: orders published before this point are skipped
time_point = datetime.now() - timedelta(days=60)
# Emoji regex, used to replace emoji in scraped text
try:
# Wide UCS-4 build
emoji_regex = re.compile(u'['
u'\U0001F300-\U0001F64F'
u'\U0001F680-\U0001F6FF'
u'\u2600-\u2B55]+',
re.UNICODE)
except re.error:
# Narrow UCS-2 build
emoji_regex = re.compile(u'('
u'\ud83c[\udf00-\udfff]|'
u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
u'[\u2600-\u2B55])+',
re.UNICODE)
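# Illustrative example (not part of the original config):
#   emoji_regex.sub('[Emoji]', '项目加急\U0001F680') -> '项目加急[Emoji]'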
# Number of most recent data files to keep in ./data
reserve_file_count = 56
'''
Crawler main program (V2)
'''
import time
from datetime import datetime
from openpyxl import Workbook
from apscheduler.schedulers.blocking import BlockingScheduler
import shixian_crawler, rrkf_crawler, wywaibao_crawler, codemart_crawler, yuanjisong_crawler, oschina_crawler
from sender import get_media_id, send_file,send_message
from utils import get_mysql_connection, create_table, add_default_data, delete_data
sched = BlockingScheduler()
def crawl_save_upload():
    '''Crawl all sites, save the workbook, and upload the data file'''
print('-----数据抓取开始-----')
wb = Workbook()
engine, Base, session = get_mysql_connection()
Order, Website = create_table(engine, Base)
add_default_data(session, Website)
codemart_crawler.main(wb, session, Order, Website)
oschina_crawler.main(wb, session, Order, Website)
rrkf_crawler.main(wb, session, Order, Website)
shixian_crawler.main(wb, session, Order, Website)
wywaibao_crawler.main(wb, session, Order, Website)
yuanjisong_crawler.main(wb, session, Order, Website)
print('-----数据抓取结束-----')
print('-----文件保存开始-----')
delete_data()
now = datetime.now()
file = r'data/%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S")
wb.save(file)
time.sleep(3)
print('-----文件保存结束-----')
print('-----文件上传开始-----')
media_id = get_media_id(file)
if isinstance(media_id, str):
upload_result = send_file(media_id)
if upload_result == True:
print('文件上传成功:%s' % file)
else:
message = '文件上传失败:%s' % upload_result[1]
print(message)
send_message(message)
else:
message = '获取media_id失败:%s' % media_id[1]
print(message)
send_message(message)
print('-----文件上传结束-----')
@sched.scheduled_job('interval', seconds=7200)
def schedule():
    '''Scheduled job: only crawl between hours 8 and 22'''
now = datetime.now()
print('当前时间:%s' % now.strftime("%Y-%m-%d %H:%M:%S"))
hour = now.hour
if hour >= 8 and hour <= 22:
print('程序执行开始')
crawl_save_upload()
print('程序执行结束\n')
else:
pass
if __name__ == '__main__':
    '''Entry point'''
sched.start()
'''
OSChina (开源中国) order crawler (V2)
'''
from datetime import datetime
import html2text
import requests
from openpyxl import Workbook
from config import oschina_headers, time_point
from utils import get_contact, get_ua, get_mysql_connection, create_table, add_default_data
from sender import send_message
def get_id(url):
try:
        # dict.update() returns None, so update the headers first, then pass them
        oschina_headers.update({'User-Agent': get_ua()})
        response = requests.get(url, headers=oschina_headers)
if response.status_code == 200:
data = response.json()
datas = data['data']['data']
id_list = [(d['id'], d['type']) for d in datas]
return id_list
else:
return 19, response.status_code
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def get_one_page(url):
try:
response = requests.get(url, headers=oschina_headers)
if response.status_code == 200:
data = response.json()
data = data['data']
description = data['prd']
status = data['status']
app = data['application']
time_str = data['publishTime']
tmp_str = data['statusLastTime']
pub_time = datetime.strptime(time_str if time_str else tmp_str, "%Y-%m-%d %H:%M:%S")
return [description, status, app, pub_time]
else:
return 33, response.status_code
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def main(wb, session, OrderModel, WebsiteModel):
print('开始爬取开源中国订单')
sheet = wb.create_sheet('开源中国', 1)
sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
count = 1
website = session.query(WebsiteModel).get(2)
for i in range(10, 0, -1):
url = 'https://zb.oschina.net/project/contractor-browse-project-and-reward?applicationAreas=&moneyMinByYuan=&moneyMaxByYuan=&sortBy=30&currentTime=&pageSize=20&currentPage=%d' % i
id_list = get_id(url)
if isinstance(id_list, list):
for oid, otype in id_list:
if otype == 2:
url = 'https://zb.oschina.net/reward/detail?id=%d' % oid
link = 'https://zb.oschina.net/reward/detail.html?id=%s' % oid
else:
url = 'https://zb.oschina.net/project/detail?id=%s' % oid
link = 'https://zb.oschina.net/project/detail.html?id=%s' % oid
result = get_one_page(url)
if isinstance(result, list):
publish_time = result[3]
if publish_time < time_point:
continue
desc = html2text.html2text(result[0]).strip()
is_valid = True if result[1] == 3 else False
contact = get_contact(desc)
oid = 'oc-{}'.format(oid//10)
order_query = session.query(OrderModel).filter_by(desc=desc, pub_time=publish_time).first()
if order_query:
is_valided = order_query.is_valid
order_query.is_valid = is_valid
if is_valid == True:
sheet.append([count, desc, link, publish_time, contact, ''])
count += 1
if is_valided == False:
order_query.is_delete = False
if is_valided == True and is_valid == False:
order_query.is_delete = True
else:
order = OrderModel(id=oid, desc=desc, link=link, contact=contact, category=result[2], pub_time=publish_time, is_valid=is_valid, is_delete=False if is_valid else True)
order.website = website
session.add(order)
if is_valid == True:
sheet.append([count, desc, link, publish_time, contact, ''])
count += 1
elif isinstance(result, tuple):
message = '开源中国详情爬取第%d行出错:%s' % (result[0], result[1])
print(message)
send_message(message)
session.commit()
elif isinstance(id_list, tuple):
message = '开源中国爬取第%d行出错:%s' % (id_list[0], id_list[1])
print(message)
send_message(message)
print('结束爬取开源中国订单')
if __name__ == '__main__':
wb = Workbook()
engine, Base, session = get_mysql_connection()
Order, Website = create_table(engine, Base)
add_default_data(session, Website)
main(wb, session, Order, Website)
now = datetime.now()
wb.save(r'data/%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
html2text
openpyxl
fake_useragent
pymysql
mysql-connector-python
SQLAlchemy
APScheduler
lxml
requests
'''
rrkf.com (人人开发) order crawler (V2)
'''
from datetime import datetime
import requests
from lxml import etree
from openpyxl import Workbook
from utils import get_contact, get_ua, get_mysql_connection, create_table, add_default_data
from sender import send_message
def get_info(url):
info_list = []
try:
text = requests.get(url, headers={'User-Agent':get_ua()}).text
html = etree.HTML(text)
orders = html.xpath('//*[@id="r-list-wrapper"]/div[2]/div')
for order in orders:
info = {}
link = 'http://www.rrkf.com' + order.xpath('./div[1]/div/h4/a/@href')[0]
desc = order.xpath('./div[1]/div/p/text()')[0]
info['link'] = link
info['desc'] = desc.strip()
info_list.append(info)
return info_list
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def get_detail(url):
try:
text = requests.get(url, headers={'User-Agent': get_ua()}).text
html = etree.HTML(text)
status_str = html.xpath('//*[@id="step-box"]/ul/li[1]/span/span/text()')
status = status_str[0] if status_str else '定标及以后'
pub_date = html.xpath('//*[@id="step-box"]/ul/li[1]/div/span[2]/text()')
pub_time = pub_date[0] if pub_date else None
return [status, pub_time]
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def main(wb, session, OrderModel, WebsiteModel):
print('开始爬取人人开发订单')
sheet = wb.create_sheet('人人开发', 2)
sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
count = 1
website = session.query(WebsiteModel).get(3)
for i in range(10, 0, -1):
url = 'http://www.rrkf.com/serv/request?&currentPage=%d' % i
info_list = get_info(url)
if isinstance(info_list, list):
for info in info_list:
desc = info['desc']
link = info['link']
details = get_detail(link)
if isinstance(details, list):
rid = 'rr-{}'.format(link.split('=')[1])
contact = get_contact(desc)
is_valid = True if '剩余' in details[0] else False
pub_time = datetime.strptime(details[1], "%Y-%m-%d %H:%M:%S") if details[1] else None
order_query = session.query(OrderModel).get(rid)
if order_query:
is_valided = order_query.is_valid
order_query.is_valid = is_valid
if is_valid == True:
sheet.append([count, desc, link, pub_time, contact, ''])
count += 1
if is_valided == False:
order_query.is_delete = False
if is_valided == True and is_valid == False:
order_query.is_delete = True
else:
order = OrderModel(id=rid, desc=desc, link=link, contact=contact, category='',
pub_time=pub_time, is_valid=is_valid, is_delete=False if is_valid else True)
order.website = website
session.add(order)
if is_valid == True:
sheet.append([count, desc, link, pub_time, contact, ''])
count += 1
else:
message = '人人开发详情爬取第%d行出错:%s' % (details[0], details[1])
print(message)
send_message(message)
session.commit()
elif isinstance(info_list, tuple):
message = '人人开发爬取第%d行出错:%s' % (info_list[0], info_list[1])
print(message)
send_message(message)
print('结束爬取人人开发订单')
if __name__ == '__main__':
wb = Workbook()
engine, Base, session = get_mysql_connection()
Order, Website = create_table(engine, Base)
add_default_data(session, Website)
main(wb, session, Order, Website)
now = datetime.now()
wb.save(r'data/%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Data file upload and delivery (V2)
'''
import os
import requests
from config import upload_wechat_key, notify_wechat_key
def get_media_id(filename):
    '''Upload a file and return its media_id'''
    try:
        send_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/upload_media?key={}&type=file'.format(upload_wechat_key)
        # Let requests build the multipart body (and its boundary) itself; the
        # upload_media API expects the file under the form field 'media'.
        files = {
            'media': (filename, open(filename, 'rb')),
        }
        res = requests.post(url=send_url, files=files).json()
media_id = res['media_id']
return media_id
except Exception as e:
return None, e.args[0]
def send_file(media_id):
    '''Send the uploaded file to the WeChat group'''
try:
headers = {'Content-Type': 'application/json'}
data = {
"msgtype": "file",
"file": {
"media_id": media_id
}
}
send_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}'.format(upload_wechat_key)
r = requests.post(url=send_url, headers=headers, json=data).json()
if r['errcode'] == 0:
return True
else:
return None, r['errmsg']
except Exception as e:
return None, e.args[0]
def send_message(message):
    '''Send an error notification message'''
try:
headers = {'Content-Type': 'application/json'}
data = {
"msgtype": "text",
"text": {
"content": message,
"mentioned_mobile_list": ["15682210532"]
}
}
send_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}'.format(notify_wechat_key)
r = requests.post(url=send_url, headers=headers, json=data).json()
if r['errcode'] == 0:
return True
else:
return None, r['errmsg']
except Exception as e:
return None, e.args[0]
if __name__ == '__main__':
    # Timestamped filenames sort chronologically; os.listdir() order is arbitrary
    files = sorted(f for f in os.listdir('./data') if f.endswith('xlsx'))
    latest_file = files[-1]
media_id = get_media_id('data/' + latest_file)
upload_result = send_file(media_id)
if upload_result == True:
print('上传成功')
else:
send_message('上传失败:'+upload_result[1])
'''
shixian.com (实现) order crawler (V2)
'''
import re
import time
import random
from datetime import datetime
import requests
from lxml import etree
from openpyxl import Workbook
from utils import get_contact, get_ua, get_mysql_connection, create_table, add_default_data
from sender import send_message
from config import emoji_regex
def get_info(url):
info_list = []
try:
text = requests.get(url, headers={'User-Agent':get_ua()}).text
orders = re.findall(r'<div class="job">(.*?)<div class="clearfix"></div>', text, re.S | re.M)
for order in orders:
info = {}
link = 'http://www.shixian.com' + re.search(r'<a target="_blank" href="(.+?)">', order).groups()[0]
desc_str = re.search(r'<p class="describe text-inline-limit">(.*?)</p>', order, re.S | re.M).groups()[0]
desc = emoji_regex.sub('[Emoji]', desc_str)
start_time = re.search(r'.*?(\d{4}-\d{2}-\d{2}).*?', order, re.S | re.M).groups()[0]
info['link'] = str(link)
info['desc'] = desc.strip()
info['start_time'] = start_time + ' 23:59:59'
info_list.append(info)
return info_list
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def get_category(url):
try:
text = requests.get(url, headers={'User-Agent': get_ua()}).text
html = etree.HTML(text)
cate_temp = html.xpath('/html/body/div[3]/div[1]/article/section[1]/dl/dd[1]/span/text()')
cate = cate_temp[0] if cate_temp else ''
return cate
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def main(wb, session, OrderModel, WebsiteModel):
print('开始爬取实现订单')
sheet = wb.create_sheet('实现', 3)
sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
count = 1
website = session.query(WebsiteModel).get(4)
for i in range(10, 0, -1):
url = 'https://shixian.com/job/all?page=%d&sort_arrow=down' % i
info_list = get_info(url)
if isinstance(info_list, list):
for info in info_list:
desc = info['desc']
link = info['link']
contact = get_contact(desc)
dl_time = datetime.strptime(info['start_time'], "%Y-%m-%d %H:%M:%S")
is_valid = True if datetime.now() <= dl_time else False
                sid = 'sx-' + link.split('/')[-1]
cate = get_category(link)
if isinstance(cate, str):
order_query = session.query(OrderModel).get(sid)
if order_query:
is_valided = order_query.is_valid
order_query.is_valid = is_valid
if is_valid == True:
sheet.append([count, desc, link, '', contact, ''])
count += 1
if is_valided == False:
order_query.is_delete = False
if is_valided == True and is_valid == False:
order_query.is_delete = True
else:
order = OrderModel(id=sid, desc=desc, link=link, contact=contact, category=cate,
pub_time=None, is_valid=is_valid, is_delete=False if is_valid else True)
order.website = website
session.add(order)
if is_valid == True:
sheet.append([count, desc, link, '', contact, ''])
count += 1
else:
message = '实现详情爬取第%d行出错:%s' % (cate[0], cate[1])
print(message)
send_message(message)
time.sleep(random.random()/10)
session.commit()
elif isinstance(info_list, tuple):
message = '实现爬取第%d行出错:%s' % (info_list[0], info_list[1])
print(message)
send_message(message)
print('结束爬取实现订单')
if __name__ == '__main__':
wb = Workbook()
engine, Base, session = get_mysql_connection()
Order, Website = create_table(engine, Base)
add_default_data(session, Website)
main(wb, session, Order, Website)
now = datetime.now()
wb.save(r'data/%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Utility functions (V2)
'''
import re
import os
from datetime import datetime
from fake_useragent import UserAgent
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String, Text, DateTime, ForeignKey, Boolean
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.ext.declarative import declarative_base
from config import contact_regex, DB_URL, web_name_list, web_url_list, reserve_file_count
from sender import send_message
def get_ua():
    '''Return a random User-Agent string'''
    try:
        return UserAgent().chrome
    except Exception:
        # fake_useragent occasionally fails to load its data; retry recursively
        return get_ua()
def get_contact(desc):
    '''Extract contact info (email addresses / mobile numbers) from a string'''
contact_group = re.findall(contact_regex, desc, re.VERBOSE)
if len(contact_group):
contact_group = [e for t in contact_group for e in t if e != '']
contact_group = list(set(contact_group))
return '|'.join(contact_group)
return ''
def delete_data():
    '''Delete the oldest data files, keeping the newest reserve_file_count ones'''
    # Timestamped filenames sort chronologically; os.listdir() order is arbitrary
    files = sorted(os.listdir('./data'))
file_count = len(files)
if file_count > reserve_file_count:
delete_count = file_count-reserve_file_count
delete_files = files[:delete_count]
for file in delete_files:
os.remove('data/' + file)
message = '已删除过期文件%d个' % delete_count
print(message)
send_message(message)
def get_mysql_connection():
    '''Connect to MySQL and return the engine, declarative base, and session'''
engine = create_engine(DB_URL)
Base = declarative_base(engine)
session = sessionmaker(bind=engine)()
return engine, Base, session
def create_table(engine, Base):
    '''Define the Website/Order models and create the tables if they are missing'''
class Website(Base):
__tablename__ = 'website'
id = Column(Integer, primary_key=True, autoincrement=True)
name = Column(String(10), nullable=False)
link = Column(String(40), nullable=False)
orders = relationship('Order', backref='website')
class Order(Base):
__tablename__ = 'order'
id = Column(String(50), primary_key=True)
desc = Column(Text, nullable=False)
link = Column(String(80), nullable=False)
contact = Column(String(30))
category = Column(String(15), nullable=True)
pub_time = Column(DateTime, nullable=True)
is_valid = Column(Boolean, nullable=False)
add_time = Column(DateTime, default=datetime.now)
wid = Column(Integer, ForeignKey('website.id'), nullable=False)
is_delete = Column(Boolean, default=False)
if (not engine.dialect.has_table(engine, 'website')) or (not engine.dialect.has_table(engine, 'order')):
Base.metadata.create_all()
print('表创建成功')
return Order, Website
def add_default_data(session, WebsiteModel):
    '''Seed the website table with the six platforms when it is not fully populated'''
origin_data = session.query(WebsiteModel).all()
if len(origin_data) != 6:
for data in origin_data:
session.delete(data)
session.commit()
for name, url in zip(web_name_list, web_url_list):
website = WebsiteModel(name=name, link=url)
session.add(website)
session.commit()
print('插入数据成功')
if __name__ == '__main__':
engine, Base, session = get_mysql_connection()
Order, Website = create_table(engine, Base)
add_default_data(session, Website)
'''
51waibao.net (51外包) order crawler (V2)
'''
import random
import time
from datetime import datetime
import requests
from lxml import etree
from openpyxl import Workbook
from config import time_point
from sender import send_message
from utils import get_contact, get_ua, get_mysql_connection, create_table, add_default_data
def get_links(url):
link_list = []
try:
text = requests.get(url, headers={'User-Agent':get_ua()}).text
html = etree.HTML(text)
orders = html.xpath('//*[@class="xiangmu_item"]')
for order in orders:
link = 'http://www.51waibao.net/' + order.xpath('./div[1]/div[1]/a/@href')[0]
link_list.append(link)
return link_list
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def get_detail(url):
try:
text = requests.get(url, headers={'User-Agent': get_ua()}).text
html = etree.HTML(text)
info = html.xpath('//*[@id="form1"]/div[6]/div[3]')[0]
wid = info.xpath('./div[1]/div[1]/ul/li[1]/text()')[0].split('waibao')[1]
cate = info.xpath('./div[1]/div[1]/ul/li[2]/text()')[0][6:]
status = info.xpath('./div[1]/div[1]/ul/li[6]/text()')[0]
pub_time = info.xpath('./div[1]/div[1]/ul/li[7]/text()')[0][6:]
desc_list = info.xpath('./div[2]/div[2]//text()')
desc = '\n'.join([dl.strip() for dl in desc_list])
return [wid, cate, status, pub_time, desc]
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def main(wb, session, OrderModel, WebsiteModel):
    print('开始爬取51外包订单')
    sheet = wb.create_sheet('51外包', 4)
    sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
count = 1
website = session.query(WebsiteModel).get(5)
for i in range(10, 0, -1):
url = 'http://www.51waibao.net/Project.html?page=%d' % i
link_list = get_links(url)
if isinstance(link_list, list):
for link in link_list:
result = get_detail(link)
if isinstance(result, list):
date_str = result[3]
publish_time = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
if publish_time < time_point:
continue
desc = result[4]
contact = get_contact(desc)
wid = 'wy-' + result[0]
is_valid = False if '项目已过期' in result[2] else True
                    order_query = session.query(OrderModel).get(wid)
if order_query:
is_valided = order_query.is_valid
order_query.is_valid = is_valid
if is_valid == True:
sheet.append([count, desc, link, publish_time, contact, ''])
count += 1
if is_valided == False:
order_query.is_delete = False
if is_valided == True and is_valid == False:
order_query.is_delete = True
else:
                        order = OrderModel(id=wid, desc=desc, link=link, contact=contact, category=result[1], pub_time=publish_time, is_valid=is_valid, is_delete=False if is_valid else True)
order.website = website
session.add(order)
if is_valid == True:
                            sheet.append([count, desc, link, publish_time, contact, ''])
count += 1
else:
message = '51外包详情爬取第%d行出错:%s' % (result[0], result[1])
print(message)
send_message(message)
time.sleep(random.random() / 10)
session.commit()
elif isinstance(link_list, tuple):
message = '51外包爬取第%d行出错:%s' % (link_list[0], link_list[1])
print(message)
send_message(message)
print('结束爬取51外包订单')
if __name__ == '__main__':
wb = Workbook()
engine, Base, session = get_mysql_connection()
Order, Website = create_table(engine, Base)
add_default_data(session, Website)
main(wb, session, Order, Website)
now = datetime.now()
wb.save(r'data/%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Yuanjisong (猿急送) order crawler (V2)
'''
from datetime import datetime
import requests
from lxml import etree
from openpyxl import Workbook
from utils import get_contact, get_ua, get_mysql_connection, create_table, add_default_data
from sender import send_message
from config import emoji_regex
def get_info(url):
info_list = []
try:
text = requests.get(url, headers={'User-Agent':get_ua()}).text
html = etree.HTML(text)
orders = html.xpath('//*[@id="db_adapt_id"]/div[position()>2]')
for order in orders:
info = {}
link = str(order.xpath('./a/@href')[0])
desc_str = order.xpath('./div[1]/div[1]/div/a/p/text()')[0]
desc = emoji_regex.sub('[Emoji]', desc_str)
status = order.xpath('./div[2]/a/text()')[0]
info['link'] = link
info['desc'] = desc.strip()
info['status'] = status
info_list.append(info)
return info_list
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def main(wb, session, OrderModel, WebsiteModel):
print('开始爬取猿急送订单')
sheet = wb.create_sheet('猿急送', 5)
sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
count = 1
website = session.query(WebsiteModel).get(6)
for i in range(10, 0, -1):
url = 'https://www.yuanjisong.com/job/allcity/page%d' % i
info_list = get_info(url)
if isinstance(info_list, list):
for info in info_list:
desc = info['desc']
link = info['link']
contact = get_contact(desc)
is_valid = True if info['status'] == '投递职位' else False
yid = 'yj-{}'.format(int(link.split('/')[-1]))
order_query = session.query(OrderModel).get(yid)
if order_query:
is_valided = order_query.is_valid
order_query.is_valid = is_valid
# if is_valided == False and is_valid == True:
# sheet.append([count, desc, link, contact])
# count += 1
# order_query.is_delete = False
if is_valid == True:
sheet.append([count, desc, link, '', contact, ''])
count += 1
if is_valided == False:
order_query.is_delete = False
if is_valided == True and is_valid == False:
order_query.is_delete = True
else:
order = OrderModel(id=yid, desc=desc, link=link, contact=contact, category='',
pub_time=None, is_valid=is_valid, is_delete=False if is_valid else True)
order.website = website
session.add(order)
if is_valid == True:
sheet.append([count, desc, link, '', contact, ''])
count += 1
session.commit()
elif isinstance(info_list, tuple):
message = '猿急送爬取第%d行出错:%s' % (info_list[0], info_list[1])
print(message)
send_message(message)
print('结束爬取猿急送订单')
if __name__ == '__main__':
wb = Workbook()
engine, Base, session = get_mysql_connection()
Order, Website = create_table(engine, Base)
add_default_data(session, Website)
main(wb, session, Order, Website)
now = datetime.now()
wb.save(r'data/%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
## 1. Project Overview
This project periodically crawls and stores orders from six freelance-outsourcing platforms: Codemart (码市), OSChina (开源中国), rrkf.com (人人开发), shixian.com (实现), 51waibao.net (51外包), and Yuanjisong (猿急送). Two versions were built in turn:
### Version 1.0
Version 1.0 is the initial release. It crawls the order listings of all six sites into six sheets of a single Excel workbook, runs on a schedule (once every half hour), and uploads the file to a WeChat Work group for the staff who pick up orders.
### Version 2.0
Version 2.0 builds on 1.0 by saving every order to a MySQL database and tracking each order's validity (whether it has expired or already been taken). Valid orders are written to the Excel workbook and sent to the WeChat Work group, and any exception is reported to a test group so developers can troubleshoot.
## 2. Project Configuration
All configuration lives in each version's config.py and can be modified as needed.
### Installing dependencies
After downloading or cloning the project, install the dependencies from each version's requirements.txt by running `pip install -r requirements.txt -i https://pypi.douban.com/simple` in that version's directory.
### Editing the configuration
The WeChat Work bot keys, database settings, and so on in config.py must be adapted to your own environment, and a database named it_outsource (or any name you prefer) must be created beforehand.
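
A minimal one-time setup sketch (assuming the root/root MySQL defaults from config.py; this script is not shipped with the project):

```python
# Create the target database once before the first run
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='root')
with conn.cursor() as cur:
    cur.execute("CREATE DATABASE IF NOT EXISTS it_outsource DEFAULT CHARACTER SET utf8")
conn.close()
```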
## 3. Running the Project
Run `python crawler.py` in either version's directory to start the project and begin collecting orders.