Commit d3d0a9b3 authored by Corley

V1&2

Parent
'''
Codemart (码市) order crawler (V1)
'''
import time
from datetime import datetime
import requests
from openpyxl import Workbook
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
from config import codemart_headers
from utils import get_contact
from sender import send_message
def get_one_page(url, start_time):  # start_time is only used by the commented-out freshness filter below
result_list = []
try:
response = requests.get(url, headers=codemart_headers)
if response.status_code == 200:
data = response.json()
try:
rewards = data['rewards']
for reward in rewards:
# pub_time = float(reward['pubTime']) / 1000
# if start_time - pub_time < 7200:
# data_dict = {
# 'id': reward['id'],
# 'name': reward['name'],
# 'description': reward['description'],
# 'duration': reward['duration'],
# }
# result_list.append(data_dict)
# continue
# else:
# return result_list
data_dict = {
'id': reward['id'],
'name': reward['name'],
'description': reward['description'],
'duration': reward['duration'],
}
result_list.append(data_dict)
return result_list
except Exception as e:
return None, e.args[0]
else:
return None, response.status_code
except Exception as e:
return None, e.args[0]
def main(wb):
print('开始爬取码市订单')
start_time = time.time()
sheet = wb['Sheet']
sheet.title = '码市'
sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
count = 1
for i in range(10):
url = 'https://codemart.com/api/project?page={}'.format(i + 1)
result = get_one_page(url, start_time)
if isinstance(result, list):
for r in result:
desc = ILLEGAL_CHARACTERS_RE.sub(r'', r['description'])
contact = get_contact(desc)
sheet.append([count, desc, 'https://codemart.com/project/{}'.format(r['id']), contact])
count += 1
elif isinstance(result, tuple):
message = '码市爬取出错:%s' % result[1]
print(message)
send_message(message)
print('结束爬取码市订单')
if __name__ == '__main__':
wb = Workbook()
main(wb)
now = datetime.now()
wb.save(r'%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Configuration (V1)
'''
# User-Agent pool for request headers
user_agents = [
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
"Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
"Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
"Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
"UCWEB7.0.2.37/28/999",
"NOKIA5700/ UCWEB7.0.2.37/28/999",
"Openwave/ UCWEB7.0.2.37/28/999",
"Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
"Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]
# Codemart request headers
codemart_headers = {'Accept': 'application/json'}
# OSChina request headers (note: this aliases codemart_headers, so updates affect both)
oschina_headers = codemart_headers
# Contact regex: matches an email address or a mainland-China mobile number
contact_regex = r'([\w\.-]+@[\w\.-]+\.[\w\.]+)|(1[3-9]\d{9})'
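# Illustrative example (not part of the original config): with the two
# alternative groups, re.findall returns one (email, phone) tuple per match:
#   re.findall(contact_regex, '联系 dev@example.com 或 13812345678')
#   -> [('dev@example.com', ''), ('', '13812345678')]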
# WeChat Work (企业微信) webhook bot keys
upload_wechat_key = r'909exxxx-0be4-4856-ae83-f67ac153xxxx'
notify_wechat_key = r'e481xxxx-cea3-4b95-a0de-04310162xxxx'
'''
Crawler main program (V1)
'''
import time
from datetime import datetime
from openpyxl import Workbook
from apscheduler.schedulers.blocking import BlockingScheduler
import shixian_crawler, rrkf_crawler, wywaibao_crawler, codemart_crawler, yuanjisong_crawler, oschina_crawler
from sender import get_media_id, send_file,send_message
sched = BlockingScheduler()
def crawl_save_upload():
    '''Crawl all sites, save the workbook, and upload the data file'''
print('-----数据抓取开始-----')
wb = Workbook()
codemart_crawler.main(wb)
oschina_crawler.main(wb)
rrkf_crawler.main(wb)
wywaibao_crawler.main(wb)
yuanjisong_crawler.main(wb)
shixian_crawler.main(wb)
print('-----数据抓取结束-----')
print('-----文件保存开始-----')
now = datetime.now()
file = r'data/%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S")
wb.save(file)
time.sleep(3)
print('-----文件保存结束-----')
print('-----文件上传开始-----')
media_id = get_media_id(file)
if isinstance(media_id, str):
upload_result = send_file(media_id)
if upload_result == True:
print('文件上传成功:%s' % file)
else:
message = '文件上传失败:%s' % upload_result[1]
print(message)
send_message(message)
else:
message = '获取media_id失败:%s' % media_id[1]
print(message)
send_message(message)
print('-----文件上传结束-----')
@sched.scheduled_job('interval', seconds=1800)
def schedule():
    '''Scheduled job: only crawl between hours 8 and 22'''
now = datetime.now()
print('当前时间:%s' % now.strftime("%Y-%m-%d %H:%M:%S"))
hour = now.hour
if hour >= 8 and hour <= 22:
print('程序执行开始')
crawl_save_upload()
print('程序执行结束\n')
else:
pass
if __name__ == '__main__':
    '''Entry point'''
sched.start()
'''
OSChina (开源中国) order crawler (V1)
'''
from datetime import datetime
import html2text
import requests
from openpyxl import Workbook
from config import oschina_headers
from utils import get_contact, get_ua
from sender import send_message
def get_id(url):
try:
        # dict.update() returns None, so update the headers first, then pass them
        oschina_headers.update({'User-Agent': get_ua()})
        response = requests.get(url, headers=oschina_headers)
if response.status_code == 200:
data = response.json()
try:
datas = data['data']['data']
id_list = [d['id'] for d in datas]
return id_list
except Exception as e:
return None, e.args[0]
else:
return None, response.status_code
except Exception as e:
return None, e.args[0]
def get_one_page(url):
try:
response = requests.get(url, headers=oschina_headers)
if response.status_code == 200:
data = response.json()
try:
description = data['data']['prd']
return description
except Exception as e:
return None, e.args[0]
else:
return None, response.status_code
except Exception as e:
return None, e.args[0]
def main(wb):
print('开始爬取开源中国订单')
sheet = wb.create_sheet('开源中国', 1)
sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
count = 1
for i in range(10):
        url = 'https://zb.oschina.net/project/contractor-browse-project-and-reward?applicationAreas=&moneyMinByYuan=&moneyMaxByYuan=&sortBy=30&currentTime=&pageSize=20&currentPage={}'.format(i + 1)
id_list = get_id(url)
if isinstance(id_list, list):
for id in id_list:
url = 'https://zb.oschina.net/project/detail?id=%s' % id
desc = get_one_page(url)
if isinstance(desc, str):
desc = html2text.html2text(desc).strip()
contact = get_contact(desc)
sheet.append([count, desc, url, contact])
count += 1
elif isinstance(desc, tuple):
print('开源中国详情爬取出错:%s' % desc[1])
elif isinstance(id_list, tuple):
message = '开源中国爬取出错:%s' % id_list[1]
print(message)
send_message(message)
print('结束爬取开源中国订单')
if __name__ == '__main__':
wb = Workbook()
main(wb)
now = datetime.now()
wb.save(r'%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
html2text
openpyxl
fake_useragent
pymysql
mysql-connector-python
SQLAlchemy
APScheduler
lxml==
requests==
'''
rrkf.com (人人开发) order crawler (V1)
'''
from datetime import datetime
import requests
from lxml import etree
from openpyxl import Workbook
from utils import get_contact, get_ua
from sender import send_message
def get_info(url):
info_list = []
try:
text = requests.get(url, headers={'User-Agent':get_ua()}).text
html = etree.HTML(text)
orders = html.xpath('//*[@id="r-list-wrapper"]/div[2]/div')
for order in orders:
info = {}
link = 'http://www.rrkf.com' + order.xpath('./div[1]/div/h4/a/@href')[0]
desc = order.xpath('./div[1]/div/p/text()')[0]
info['link'] = link
info['desc'] = desc.strip()
info_list.append(info)
return info_list
except Exception as e:
return None, e.args[0]
def main(wb):
print('开始爬取人人开发订单')
sheet = wb.create_sheet('人人开发', 2)
sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
count = 1
for i in range(10):
url = 'http://www.rrkf.com/serv/request?&currentPage={}'.format(i + 1)
info_list = get_info(url)
if isinstance(info_list, list):
for info in info_list:
desc = info['desc']
contact = get_contact(desc)
sheet.append([count, desc, info['link'], contact])
count += 1
elif isinstance(info_list, tuple):
message = '人人开发爬取出错:%s' % info_list[1]
print(message)
send_message(message)
print('结束爬取人人开发订单')
if __name__ == '__main__':
wb = Workbook()
main(wb)
now = datetime.now()
wb.save(r'%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Data file upload and delivery (V1)
'''
import os
import requests
from config import upload_wechat_key, notify_wechat_key
def get_media_id(filename):
    '''Upload a file and return its media_id'''
    try:
        send_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/upload_media?key={}&type=file'.format(upload_wechat_key)
        # Let requests build the multipart body (and its boundary) itself; the
        # upload_media API expects the file under the form field 'media'.
        files = {
            'media': (filename, open(filename, 'rb')),
        }
        res = requests.post(url=send_url, files=files).json()
media_id = res['media_id']
return media_id
except Exception as e:
return None, e.args[0]
def send_file(media_id):
    '''Send the uploaded file to the WeChat group'''
try:
headers = {'Content-Type': 'application/json'}
data = {
"msgtype": "file",
"file": {
"media_id": media_id
}
}
send_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}'.format(upload_wechat_key)
r = requests.post(url=send_url, headers=headers, json=data).json()
if r['errcode'] == 0:
return True
else:
return None, r['errmsg']
except Exception as e:
return None, e.args[0]
def send_message(message):
    '''Send an error notification message'''
try:
headers = {'Content-Type': 'application/json'}
data = {
"msgtype": "text",
"text": {
"content": message,
"mentioned_mobile_list": ["15682210532"]
}
}
send_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}'.format(notify_wechat_key)
r = requests.post(url=send_url, headers=headers, json=data).json()
if r['errcode'] == 0:
return True
else:
return None, r['errmsg']
except Exception as e:
return None, e.args[0]
if __name__ == '__main__':
    # Timestamped filenames sort chronologically; os.listdir() order is arbitrary
    files = sorted(f for f in os.listdir('./data') if f.endswith('xlsx'))
    latest_file = files[-1]
media_id = get_media_id('data/' + latest_file)
upload_result = send_file(media_id)
if upload_result == True:
print('上传成功')
else:
send_message('上传失败:'+upload_result[1])
'''
shixian.com (实现) order crawler (V1)
'''
from datetime import datetime
import requests
from lxml import etree
from openpyxl import Workbook
from utils import get_contact, get_ua
from sender import send_message
def get_info(url):
info_list = []
try:
text = requests.get(url, headers={'User-Agent':get_ua()}).text
html = etree.HTML(text)
orders = html.xpath('//*[@class="job"]')
for order in orders:
info = {}
link = 'http://www.shixian.com' + order.xpath('./div[1]/a/@href')[0]
desc = order.xpath('./div[1]/a/p/text()')[0]
# release_time = order.xpath('./div[1]/div/div/span/text()')[0]
# if '1 天前发布' in release_time or '小时' in release_time:
# info['link'] = link
# info['desc'] = desc.strip()
# info_list.append(info)
# else:
# continue
info['link'] = link
info['desc'] = desc.strip()
info_list.append(info)
return info_list
except Exception as e:
return None, e.args[0]
def main(wb):
print('开始爬取实现订单')
sheet = wb.create_sheet('实现', 5)
sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
count = 1
for i in range(10):
url = 'https://shixian.com/job/all?page={}&sort_arrow=down'.format(i + 1)
info_list = get_info(url)
if isinstance(info_list, list):
for info in info_list:
desc = info['desc']
contact = get_contact(desc)
sheet.append([count, desc, info['link'], contact])
count += 1
elif isinstance(info_list, tuple):
message = '实现爬取出错:%s' % info_list[1]
print(message)
send_message(message)
print('结束爬取实现订单')
if __name__ == '__main__':
wb = Workbook()
main(wb)
now = datetime.now()
wb.save(r'%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Utility functions (V1)
'''
import re
from fake_useragent import UserAgent
from config import contact_regex
def get_ua():
    '''Return a random User-Agent string'''
    try:
        return UserAgent().chrome
    except Exception:
        # fake_useragent occasionally fails to load its data; retry recursively
        return get_ua()
def get_contact(desc):
    '''Extract contact info (email addresses / mobile numbers) from a string'''
contact_group = re.findall(contact_regex, desc, re.VERBOSE)
if len(contact_group):
contact_group = [e for t in contact_group for e in t if e != '']
contact_group = list(set(contact_group))
return '|'.join(contact_group)
return ''
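
if __name__ == '__main__':
    # Illustrative smoke test (added for demonstration, not in the original project):
    # prints something like 'dev@example.com|13912345678' (order varies, set() is unordered)
    print(get_contact('需求详谈,联系 dev@example.com 或 13912345678'))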
'''
51waibao.net (51外包) order crawler (V1)
'''
from datetime import datetime
import requests
from lxml import etree
from openpyxl import Workbook
from utils import get_contact, get_ua
from sender import send_message
def get_info(url):
info_list = []
try:
text = requests.get(url, headers={'User-Agent':get_ua()}).text
html = etree.HTML(text)
orders = html.xpath('//*[@class="xiangmu_item"]')
for order in orders:
info = {}
link = 'http://www.51waibao.net/' + order.xpath('./div[1]/div[1]/a/@href')[0]
desc = order.xpath('./div[2]/text()')[0]
info['link'] = link
info['desc'] = desc.strip()
info_list.append(info)
return info_list
except Exception as e:
return None, e.args[0]
def main(wb):
print('开始爬取51外包订单')
sheet = wb.create_sheet('51外包', 3)
sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
count = 1
for i in range(10):
url = 'http://www.51waibao.net/Project.html?page={}'.format(i + 1)
info_list = get_info(url)
if isinstance(info_list, list):
for info in info_list:
desc = info['desc']
contact = get_contact(desc)
sheet.append([count, desc, info['link'], contact])
count += 1
elif isinstance(info_list, tuple):
message = '51外包爬取出错:%s' % info_list[1]
print(message)
send_message(message)
print('结束爬取51外包订单')
if __name__ == '__main__':
wb = Workbook()
main(wb)
now = datetime.now()
wb.save(r'%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Yuanjisong (猿急送) order crawler (V1)
'''
from datetime import datetime
import requests
from lxml import etree
from openpyxl import Workbook
from utils import get_contact, get_ua
from sender import send_message
def get_info(url):
info_list = []
try:
text = requests.get(url, headers={'User-Agent':get_ua()}).text
html = etree.HTML(text)
orders = html.xpath('//*[@id="db_adapt_id"]/div[position()>2]')
for order in orders:
info = {}
link = order.xpath('./div[1]/div[2]/a/@href')[0]
desc = order.xpath('./div[1]/div[1]/div/a/p/text()')[0]
info['link'] = link
info['desc'] = desc.strip()
info_list.append(info)
return info_list
except Exception as e:
return None, e.args[0]
def main(wb):
print('开始爬取猿急送订单')
sheet = wb.create_sheet('猿急送', 4)
sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
count = 1
for i in range(10):
url = 'https://www.yuanjisong.com/job/allcity/page{}'.format(i + 1)
info_list = get_info(url)
if isinstance(info_list, list):
for info in info_list:
desc = info['desc']
contact = get_contact(desc)
sheet.append([count, desc, info['link'], contact])
count += 1
elif isinstance(info_list, tuple):
message = '猿急送爬取出错:%s' % info_list[1]
print(message)
send_message(message)
print('结束爬取猿急送订单')
if __name__ == '__main__':
wb = Workbook()
main(wb)
now = datetime.now()
wb.save(r'%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Codemart (码市) order crawler (V2)

On failure, the crawl functions return a (source line number, error message)
tuple built from the exception traceback instead of the result list.
'''
import time
from datetime import datetime
import requests
from openpyxl import Workbook
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
from config import codemart_headers, time_point
from utils import get_contact, get_mysql_connection, create_table, add_default_data
from sender import send_message
def get_one_page(url):
result_list = []
try:
response = requests.get(url, headers=codemart_headers)
if response.status_code == 200:
data = response.json()
rewards = data['rewards']
for reward in rewards:
data_dict = {
'id': reward['id'],
'name': reward['name'],
'description': reward['description'],
'duration': reward['duration'],
'cate': reward['typeText'],
'status': reward['statusText'],
'pubtime': reward['pubTime']
}
result_list.append(data_dict)
return result_list
else:
return None, response.status_code
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def main(wb, session, OrderModel, WebsiteModel):
print('开始爬取码市订单')
sheet = wb['Sheet']
sheet.title = '码市'
sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
count = 1
website = session.query(WebsiteModel).get(1)
for i in range(10, 0, -1):
url = 'https://codemart.com/api/project?page=%d' % i
result = get_one_page(url)
if isinstance(result, list):
for r in result:
time_stamp = int(r['pubtime']) / 1000
publish_time = datetime.fromtimestamp(time_stamp)
if publish_time < time_point:
continue
desc = ILLEGAL_CHARACTERS_RE.sub(r'', r['description'])
cid = 'cm-{}'.format(r['id'])
contact = get_contact(desc)
link = 'https://codemart.com/project/{}'.format(r['id'])
is_valid = True if r['status'] == '招募中' else False
order_query = session.query(OrderModel).get(cid)
if order_query:
is_valided = order_query.is_valid
order_query.is_valid = is_valid
if is_valid == True:
sheet.append([count, desc, link, publish_time, contact, ''])
count += 1
if is_valided == False:
order_query.is_delete = False
if is_valided == True and is_valid == False:
order_query.is_delete = True
else:
order = OrderModel(id=cid, desc=desc, link=link, contact=contact, category=r['cate'],
pub_time=publish_time, is_valid=is_valid, is_delete=False if is_valid else True)
order.website = website
session.add(order)
if is_valid == True:
sheet.append([count, desc, link, publish_time, contact, ''])
count += 1
session.commit()
elif isinstance(result, tuple):
message = '码市爬取第%d行出错:%s' % (result[0], result[1])
print(message)
send_message(message)
print('结束爬取码市订单')
if __name__ == '__main__':
wb = Workbook()
engine, Base, session = get_mysql_connection()
Order, Website = create_table(engine, Base)
add_default_data(session, Website)
main(wb, session, Order, Website)
now = datetime.now()
wb.save(r'data/%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Configuration (V2)
'''
import re
from datetime import datetime, timedelta
# User-Agent pool for request headers
user_agents = [
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
"Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
"Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
"Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
"UCWEB7.0.2.37/28/999",
"NOKIA5700/ UCWEB7.0.2.37/28/999",
"Openwave/ UCWEB7.0.2.37/28/999",
"Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
"Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]
# Codemart request headers
codemart_headers = {'Accept': 'application/json'}
# OSChina request headers (note: this aliases codemart_headers, so updates affect both)
oschina_headers = codemart_headers
# Contact regex: matches an email address or a mainland-China mobile number
contact_regex = r'([\w\.-]+@[\w\.-]+\.[\w\.]+)|(1[3-9]\d{9})'
# WeChat Work (企业微信) webhook bot keys
upload_wechat_key = r'909exxxx-0be4-4856-ae83-f67ac153xxxx'
notify_wechat_key = r'e481xxxx-cea3-4b95-a0de-04310162xxxx'
# Database connection settings
HOSTNAME = '127.0.0.1'
PORT = 3306
USERNAME = 'root'
PASSWORD = 'root'
DATABASE = 'it_outsource'
DB_URL = 'mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8'.format(USERNAME, PASSWORD, HOSTNAME, PORT, DATABASE)
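# -> 'mysql+pymysql://root:root@127.0.0.1:3306/it_outsource?charset=utf8'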
# Platform names and home pages (seed data for the website table)
web_name_list = ['码市', '开源中国', '人人开发', '实现', '51外包', '猿急送']
web_url_list = ['https://codemart.com', 'https://zb.oschina.net/', 'http://www.rrkf.com', 'http://www.shixian.com', 'http://www.51waibao.net', 'https://www.yuanjisong.com']
# Time threshold: orders published before this point are skipped
time_point = datetime.now() - timedelta(days=60)
# Emoji regex, used to replace emoji in scraped text
try:
# Wide UCS-4 build
emoji_regex = re.compile(u'['
u'\U0001F300-\U0001F64F'
u'\U0001F680-\U0001F6FF'
u'\u2600-\u2B55]+',
re.UNICODE)
except re.error:
# Narrow UCS-2 build
emoji_regex = re.compile(u'('
u'\ud83c[\udf00-\udfff]|'
u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
u'[\u2600-\u2B55])+',
re.UNICODE)
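# Illustrative example (not part of the original config):
#   emoji_regex.sub('[Emoji]', '项目加急\U0001F680') -> '项目加急[Emoji]'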
# Number of most recent data files to keep in ./data
reserve_file_count = 56
'''
Crawler main program (V2)
'''
import time
from datetime import datetime
from openpyxl import Workbook
from apscheduler.schedulers.blocking import BlockingScheduler
import shixian_crawler, rrkf_crawler, wywaibao_crawler, codemart_crawler, yuanjisong_crawler, oschina_crawler
from sender import get_media_id, send_file,send_message
from utils import get_mysql_connection, create_table, add_default_data, delete_data
sched = BlockingScheduler()
def crawl_save_upload():
    '''Crawl all sites, save the workbook, and upload the data file'''
print('-----数据抓取开始-----')
wb = Workbook()
engine, Base, session = get_mysql_connection()
Order, Website = create_table(engine, Base)
add_default_data(session, Website)
codemart_crawler.main(wb, session, Order, Website)
oschina_crawler.main(wb, session, Order, Website)
rrkf_crawler.main(wb, session, Order, Website)
shixian_crawler.main(wb, session, Order, Website)
wywaibao_crawler.main(wb, session, Order, Website)
yuanjisong_crawler.main(wb, session, Order, Website)
print('-----数据抓取结束-----')
print('-----文件保存开始-----')
delete_data()
now = datetime.now()
file = r'data/%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S")
wb.save(file)
time.sleep(3)
print('-----文件保存结束-----')
print('-----文件上传开始-----')
media_id = get_media_id(file)
if isinstance(media_id, str):
upload_result = send_file(media_id)
if upload_result == True:
print('文件上传成功:%s' % file)
else:
message = '文件上传失败:%s' % upload_result[1]
print(message)
send_message(message)
else:
message = '获取media_id失败:%s' % media_id[1]
print(message)
send_message(message)
print('-----文件上传结束-----')
@sched.scheduled_job('interval', seconds=7200)
def schedule():
    '''Scheduled job: only crawl between hours 8 and 22'''
now = datetime.now()
print('当前时间:%s' % now.strftime("%Y-%m-%d %H:%M:%S"))
hour = now.hour
if hour >= 8 and hour <= 22:
print('程序执行开始')
crawl_save_upload()
print('程序执行结束\n')
else:
pass
if __name__ == '__main__':
    '''Entry point'''
sched.start()
'''
OSChina (开源中国) order crawler (V2)
'''
from datetime import datetime
import html2text
import requests
from openpyxl import Workbook
from config import oschina_headers, time_point
from utils import get_contact, get_ua, get_mysql_connection, create_table, add_default_data
from sender import send_message
def get_id(url):
try:
        # dict.update() returns None, so update the headers first, then pass them
        oschina_headers.update({'User-Agent': get_ua()})
        response = requests.get(url, headers=oschina_headers)
if response.status_code == 200:
data = response.json()
datas = data['data']['data']
id_list = [(d['id'], d['type']) for d in datas]
return id_list
else:
return 19, response.status_code
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def get_one_page(url):
try:
response = requests.get(url, headers=oschina_headers)
if response.status_code == 200:
data = response.json()
data = data['data']
description = data['prd']
status = data['status']
app = data['application']
time_str = data['publishTime']
tmp_str = data['statusLastTime']
pub_time = datetime.strptime(time_str if time_str else tmp_str, "%Y-%m-%d %H:%M:%S")
return [description, status, app, pub_time]
else:
return 33, response.status_code
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def main(wb, session, OrderModel, WebsiteModel):
print('开始爬取开源中国订单')
sheet = wb.create_sheet('开源中国', 1)
sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
count = 1
website = session.query(WebsiteModel).get(2)
for i in range(10, 0, -1):
url = 'https://zb.oschina.net/project/contractor-browse-project-and-reward?applicationAreas=&moneyMinByYuan=&moneyMaxByYuan=&sortBy=30&currentTime=&pageSize=20&currentPage=%d' % i
id_list = get_id(url)
if isinstance(id_list, list):
for oid, otype in id_list:
if otype == 2:
url = 'https://zb.oschina.net/reward/detail?id=%d' % oid
link = 'https://zb.oschina.net/reward/detail.html?id=%s' % oid
else:
url = 'https://zb.oschina.net/project/detail?id=%s' % oid
link = 'https://zb.oschina.net/project/detail.html?id=%s' % oid
result = get_one_page(url)
if isinstance(result, list):
publish_time = result[3]
if publish_time < time_point:
continue
desc = html2text.html2text(result[0]).strip()
is_valid = True if result[1] == 3 else False
contact = get_contact(desc)
oid = 'oc-{}'.format(oid//10)
order_query = session.query(OrderModel).filter_by(desc=desc, pub_time=publish_time).first()
if order_query:
is_valided = order_query.is_valid
order_query.is_valid = is_valid
if is_valid == True:
sheet.append([count, desc, link, publish_time, contact, ''])
count += 1
if is_valided == False:
order_query.is_delete = False
if is_valided == True and is_valid == False:
order_query.is_delete = True
else:
order = OrderModel(id=oid, desc=desc, link=link, contact=contact, category=result[2], pub_time=publish_time, is_valid=is_valid, is_delete=False if is_valid else True)
order.website = website
session.add(order)
if is_valid == True:
sheet.append([count, desc, link, publish_time, contact, ''])
count += 1
elif isinstance(result, tuple):
message = '开源中国详情爬取第%d行出错:%s' % (result[0], result[1])
print(message)
send_message(message)
session.commit()
elif isinstance(id_list, tuple):
message = '开源中国爬取第%d行出错:%s' % (id_list[0], id_list[1])
print(message)
send_message(message)
print('结束爬取开源中国订单')
if __name__ == '__main__':
wb = Workbook()
engine, Base, session = get_mysql_connection()
Order, Website = create_table(engine, Base)
add_default_data(session, Website)
main(wb, session, Order, Website)
now = datetime.now()
wb.save(r'data/%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
html2text
openpyxl
fake_useragent
pymysql
mysql-connector-python
SQLAlchemy
APScheduler
lxml
requests
'''
rrkf.com (人人开发) order crawler (V2)
'''
from datetime import datetime
import requests
from lxml import etree
from openpyxl import Workbook
from utils import get_contact, get_ua, get_mysql_connection, create_table, add_default_data
from sender import send_message
def get_info(url):
info_list = []
try:
text = requests.get(url, headers={'User-Agent':get_ua()}).text
html = etree.HTML(text)
orders = html.xpath('//*[@id="r-list-wrapper"]/div[2]/div')
for order in orders:
info = {}
link = 'http://www.rrkf.com' + order.xpath('./div[1]/div/h4/a/@href')[0]
desc = order.xpath('./div[1]/div/p/text()')[0]
info['link'] = link
info['desc'] = desc.strip()
info_list.append(info)
return info_list
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def get_detail(url):
try:
text = requests.get(url, headers={'User-Agent': get_ua()}).text
html = etree.HTML(text)
status_str = html.xpath('//*[@id="step-box"]/ul/li[1]/span/span/text()')
status = status_str[0] if status_str else '定标及以后'
pub_date = html.xpath('//*[@id="step-box"]/ul/li[1]/div/span[2]/text()')
pub_time = pub_date[0] if pub_date else None
return [status, pub_time]
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def main(wb, session, OrderModel, WebsiteModel):
print('开始爬取人人开发订单')
sheet = wb.create_sheet('人人开发', 2)
sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
count = 1
website = session.query(WebsiteModel).get(3)
for i in range(10, 0, -1):
url = 'http://www.rrkf.com/serv/request?&currentPage=%d' % i
info_list = get_info(url)
if isinstance(info_list, list):
for info in info_list:
desc = info['desc']
link = info['link']
details = get_detail(link)
if isinstance(details, list):
rid = 'rr-{}'.format(link.split('=')[1])
contact = get_contact(desc)
is_valid = True if '剩余' in details[0] else False
pub_time = datetime.strptime(details[1], "%Y-%m-%d %H:%M:%S") if details[1] else None
order_query = session.query(OrderModel).get(rid)
if order_query:
is_valided = order_query.is_valid
order_query.is_valid = is_valid
if is_valid == True:
sheet.append([count, desc, link, pub_time, contact, ''])
count += 1
if is_valided == False:
order_query.is_delete = False
if is_valided == True and is_valid == False:
order_query.is_delete = True
else:
order = OrderModel(id=rid, desc=desc, link=link, contact=contact, category='',
pub_time=pub_time, is_valid=is_valid, is_delete=False if is_valid else True)
order.website = website
session.add(order)
if is_valid == True:
sheet.append([count, desc, link, pub_time, contact, ''])
count += 1
else:
message = '人人开发详情爬取第%d行出错:%s' % (details[0], details[1])
print(message)
send_message(message)
session.commit()
elif isinstance(info_list, tuple):
message = '人人开发爬取第%d行出错:%s' % (info_list[0], info_list[1])
print(message)
send_message(message)
print('结束爬取人人开发订单')
if __name__ == '__main__':
wb = Workbook()
engine, Base, session = get_mysql_connection()
Order, Website = create_table(engine, Base)
add_default_data(session, Website)
main(wb, session, Order, Website)
now = datetime.now()
wb.save(r'data/%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Data file upload and delivery (V2)
'''
import os
import requests
from config import upload_wechat_key, notify_wechat_key
def get_media_id(filename):
    '''Upload a file and return its media_id'''
    try:
        send_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/upload_media?key={}&type=file'.format(upload_wechat_key)
        # Let requests build the multipart body (and its boundary) itself; the
        # upload_media API expects the file under the form field 'media'.
        files = {
            'media': (filename, open(filename, 'rb')),
        }
        res = requests.post(url=send_url, files=files).json()
media_id = res['media_id']
return media_id
except Exception as e:
return None, e.args[0]
def send_file(media_id):
    '''Send the uploaded file to the WeChat group'''
try:
headers = {'Content-Type': 'application/json'}
data = {
"msgtype": "file",
"file": {
"media_id": media_id
}
}
send_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}'.format(upload_wechat_key)
r = requests.post(url=send_url, headers=headers, json=data).json()
if r['errcode'] == 0:
return True
else:
return None, r['errmsg']
except Exception as e:
return None, e.args[0]
def send_message(message):
    '''Send an error notification message'''
try:
headers = {'Content-Type': 'application/json'}
data = {
"msgtype": "text",
"text": {
"content": message,
"mentioned_mobile_list": ["15682210532"]
}
}
send_url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}'.format(notify_wechat_key)
r = requests.post(url=send_url, headers=headers, json=data).json()
if r['errcode'] == 0:
return True
else:
return None, r['errmsg']
except Exception as e:
return None, e.args[0]
if __name__ == '__main__':
    # Timestamped filenames sort chronologically; os.listdir() order is arbitrary
    files = sorted(f for f in os.listdir('./data') if f.endswith('xlsx'))
    latest_file = files[-1]
media_id = get_media_id('data/' + latest_file)
upload_result = send_file(media_id)
if upload_result == True:
print('上传成功')
else:
send_message('上传失败:'+upload_result[1])
'''
shixian.com (实现) order crawler (V2)
'''
import re
import time
import random
from datetime import datetime
import requests
from lxml import etree
from openpyxl import Workbook
from utils import get_contact, get_ua, get_mysql_connection, create_table, add_default_data
from sender import send_message
from config import emoji_regex
def get_info(url):
info_list = []
try:
text = requests.get(url, headers={'User-Agent':get_ua()}).text
orders = re.findall(r'<div class="job">(.*?)<div class="clearfix"></div>', text, re.S | re.M)
for order in orders:
info = {}
link = 'http://www.shixian.com' + re.search(r'<a target="_blank" href="(.+?)">', order).groups()[0]
desc_str = re.search(r'<p class="describe text-inline-limit">(.*?)</p>', order, re.S | re.M).groups()[0]
desc = emoji_regex.sub('[Emoji]', desc_str)
start_time = re.search(r'.*?(\d{4}-\d{2}-\d{2}).*?', order, re.S | re.M).groups()[0]
info['link'] = str(link)
info['desc'] = desc.strip()
info['start_time'] = start_time + ' 23:59:59'
info_list.append(info)
return info_list
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def get_category(url):
try:
text = requests.get(url, headers={'User-Agent': get_ua()}).text
html = etree.HTML(text)
cate_temp = html.xpath('/html/body/div[3]/div[1]/article/section[1]/dl/dd[1]/span/text()')
cate = cate_temp[0] if cate_temp else ''
return cate
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def main(wb, session, OrderModel, WebsiteModel):
print('开始爬取实现订单')
sheet = wb.create_sheet('实现', 3)
sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
count = 1
website = session.query(WebsiteModel).get(4)
for i in range(10, 0, -1):
url = 'https://shixian.com/job/all?page=%d&sort_arrow=down' % i
info_list = get_info(url)
if isinstance(info_list, list):
for info in info_list:
desc = info['desc']
link = info['link']
contact = get_contact(desc)
dl_time = datetime.strptime(info['start_time'], "%Y-%m-%d %H:%M:%S")
is_valid = True if datetime.now() <= dl_time else False
                sid = 'sx-' + link.split('/')[-1]
cate = get_category(link)
if isinstance(cate, str):
order_query = session.query(OrderModel).get(sid)
if order_query:
is_valided = order_query.is_valid
order_query.is_valid = is_valid
if is_valid == True:
sheet.append([count, desc, link, '', contact, ''])
count += 1
if is_valided == False:
order_query.is_delete = False
if is_valided == True and is_valid == False:
order_query.is_delete = True
else:
order = OrderModel(id=sid, desc=desc, link=link, contact=contact, category=cate,
pub_time=None, is_valid=is_valid, is_delete=False if is_valid else True)
order.website = website
session.add(order)
if is_valid == True:
sheet.append([count, desc, link, '', contact, ''])
count += 1
else:
message = '实现详情爬取第%d行出错:%s' % (cate[0], cate[1])
print(message)
send_message(message)
time.sleep(random.random()/10)
session.commit()
elif isinstance(info_list, tuple):
message = '实现爬取第%d行出错:%s' % (info_list[0], info_list[1])
print(message)
send_message(message)
print('结束爬取实现订单')
if __name__ == '__main__':
wb = Workbook()
engine, Base, session = get_mysql_connection()
Order, Website = create_table(engine, Base)
add_default_data(session, Website)
main(wb, session, Order, Website)
now = datetime.now()
wb.save(r'data/%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Utility functions (V2)
'''
import re
import os
from datetime import datetime
from fake_useragent import UserAgent
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String, Text, DateTime, ForeignKey, Boolean
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.ext.declarative import declarative_base
from config import contact_regex, DB_URL, web_name_list, web_url_list, reserve_file_count
from sender import send_message
def get_ua():
    '''Return a random User-Agent string'''
    try:
        return UserAgent().chrome
    except Exception:
        # fake_useragent occasionally fails to load its data; retry recursively
        return get_ua()
def get_contact(desc):
    '''Extract contact info (email addresses / mobile numbers) from a string'''
contact_group = re.findall(contact_regex, desc, re.VERBOSE)
if len(contact_group):
contact_group = [e for t in contact_group for e in t if e != '']
contact_group = list(set(contact_group))
return '|'.join(contact_group)
return ''
def delete_data():
    '''Delete the oldest data files, keeping the newest reserve_file_count ones'''
    # Timestamped filenames sort chronologically; os.listdir() order is arbitrary
    files = sorted(os.listdir('./data'))
file_count = len(files)
if file_count > reserve_file_count:
delete_count = file_count-reserve_file_count
delete_files = files[:delete_count]
for file in delete_files:
os.remove('data/' + file)
message = '已删除过期文件%d个' % delete_count
print(message)
send_message(message)
def get_mysql_connection():
    '''Connect to MySQL and return the engine, declarative base, and session'''
engine = create_engine(DB_URL)
Base = declarative_base(engine)
session = sessionmaker(bind=engine)()
return engine, Base, session
def create_table(engine, Base):
    '''Define the Website/Order models and create the tables if they are missing'''
class Website(Base):
__tablename__ = 'website'
id = Column(Integer, primary_key=True, autoincrement=True)
name = Column(String(10), nullable=False)
link = Column(String(40), nullable=False)
orders = relationship('Order', backref='website')
class Order(Base):
__tablename__ = 'order'
id = Column(String(50), primary_key=True)
desc = Column(Text, nullable=False)
link = Column(String(80), nullable=False)
contact = Column(String(30))
category = Column(String(15), nullable=True)
pub_time = Column(DateTime, nullable=True)
is_valid = Column(Boolean, nullable=False)
add_time = Column(DateTime, default=datetime.now)
wid = Column(Integer, ForeignKey('website.id'), nullable=False)
is_delete = Column(Boolean, default=False)
if (not engine.dialect.has_table(engine, 'website')) or (not engine.dialect.has_table(engine, 'order')):
Base.metadata.create_all()
print('表创建成功')
return Order, Website
def add_default_data(session, WebsiteModel):
    '''Seed the website table with the six platforms when it is not fully populated'''
origin_data = session.query(WebsiteModel).all()
if len(origin_data) != 6:
for data in origin_data:
session.delete(data)
session.commit()
for name, url in zip(web_name_list, web_url_list):
website = WebsiteModel(name=name, link=url)
session.add(website)
session.commit()
print('插入数据成功')
if __name__ == '__main__':
engine, Base, session = get_mysql_connection()
Order, Website = create_table(engine, Base)
add_default_data(session, Website)
'''
51waibao.net (51外包) order crawler (V2)
'''
import random
import time
from datetime import datetime
import requests
from lxml import etree
from openpyxl import Workbook
from config import time_point
from sender import send_message
from utils import get_contact, get_ua, get_mysql_connection, create_table, add_default_data
def get_links(url):
link_list = []
try:
text = requests.get(url, headers={'User-Agent':get_ua()}).text
html = etree.HTML(text)
orders = html.xpath('//*[@class="xiangmu_item"]')
for order in orders:
link = 'http://www.51waibao.net/' + order.xpath('./div[1]/div[1]/a/@href')[0]
link_list.append(link)
return link_list
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def get_detail(url):
try:
text = requests.get(url, headers={'User-Agent': get_ua()}).text
html = etree.HTML(text)
info = html.xpath('//*[@id="form1"]/div[6]/div[3]')[0]
wid = info.xpath('./div[1]/div[1]/ul/li[1]/text()')[0].split('waibao')[1]
cate = info.xpath('./div[1]/div[1]/ul/li[2]/text()')[0][6:]
status = info.xpath('./div[1]/div[1]/ul/li[6]/text()')[0]
pub_time = info.xpath('./div[1]/div[1]/ul/li[7]/text()')[0][6:]
desc_list = info.xpath('./div[2]/div[2]//text()')
desc = '\n'.join([dl.strip() for dl in desc_list])
return [wid, cate, status, pub_time, desc]
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def main(wb, session, OrderModel, WebsiteModel):
    print('开始爬取51外包订单')
    sheet = wb.create_sheet('51外包', 4)
    sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
count = 1
website = session.query(WebsiteModel).get(5)
for i in range(10, 0, -1):
url = 'http://www.51waibao.net/Project.html?page=%d' % i
link_list = get_links(url)
if isinstance(link_list, list):
for link in link_list:
result = get_detail(link)
if isinstance(result, list):
date_str = result[3]
publish_time = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
if publish_time < time_point:
continue
desc = result[4]
contact = get_contact(desc)
wid = 'wy-' + result[0]
is_valid = False if '项目已过期' in result[2] else True
                    order_query = session.query(OrderModel).get(wid)
if order_query:
is_valided = order_query.is_valid
order_query.is_valid = is_valid
if is_valid == True:
sheet.append([count, desc, link, publish_time, contact, ''])
count += 1
if is_valided == False:
order_query.is_delete = False
if is_valided == True and is_valid == False:
order_query.is_delete = True
else:
                        order = OrderModel(id=wid, desc=desc, link=link, contact=contact, category=result[1], pub_time=publish_time, is_valid=is_valid, is_delete=False if is_valid else True)
order.website = website
session.add(order)
if is_valid == True:
                            sheet.append([count, desc, link, publish_time, contact, ''])
count += 1
else:
message = '51外包详情爬取第%d行出错:%s' % (result[0], result[1])
print(message)
send_message(message)
time.sleep(random.random() / 10)
session.commit()
elif isinstance(link_list, tuple):
message = '51外包爬取第%d行出错:%s' % (link_list[0], link_list[1])
print(message)
send_message(message)
print('结束爬取51外包订单')
if __name__ == '__main__':
wb = Workbook()
engine, Base, session = get_mysql_connection()
Order, Website = create_table(engine, Base)
add_default_data(session, Website)
main(wb, session, Order, Website)
now = datetime.now()
wb.save(r'data/%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
'''
Yuanjisong (猿急送) order crawler (V2)
'''
from datetime import datetime
import requests
from lxml import etree
from openpyxl import Workbook
from utils import get_contact, get_ua, get_mysql_connection, create_table, add_default_data
from sender import send_message
from config import emoji_regex
def get_info(url):
info_list = []
try:
text = requests.get(url, headers={'User-Agent':get_ua()}).text
html = etree.HTML(text)
orders = html.xpath('//*[@id="db_adapt_id"]/div[position()>2]')
for order in orders:
info = {}
link = str(order.xpath('./a/@href')[0])
desc_str = order.xpath('./div[1]/div[1]/div/a/p/text()')[0]
desc = emoji_regex.sub('[Emoji]', desc_str)
status = order.xpath('./div[2]/a/text()')[0]
info['link'] = link
info['desc'] = desc.strip()
info['status'] = status
info_list.append(info)
return info_list
except Exception as e:
return e.__traceback__.tb_lineno, e.args[0]
def main(wb, session, OrderModel, WebsiteModel):
print('开始爬取猿急送订单')
sheet = wb.create_sheet('猿急送', 5)
sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
count = 1
website = session.query(WebsiteModel).get(6)
for i in range(10, 0, -1):
url = 'https://www.yuanjisong.com/job/allcity/page%d' % i
info_list = get_info(url)
if isinstance(info_list, list):
for info in info_list:
desc = info['desc']
link = info['link']
contact = get_contact(desc)
is_valid = True if info['status'] == '投递职位' else False
yid = 'yj-{}'.format(int(link.split('/')[-1]))
order_query = session.query(OrderModel).get(yid)
if order_query:
is_valided = order_query.is_valid
order_query.is_valid = is_valid
# if is_valided == False and is_valid == True:
# sheet.append([count, desc, link, contact])
# count += 1
# order_query.is_delete = False
if is_valid == True:
sheet.append([count, desc, link, '', contact, ''])
count += 1
if is_valided == False:
order_query.is_delete = False
if is_valided == True and is_valid == False:
order_query.is_delete = True
else:
order = OrderModel(id=yid, desc=desc, link=link, contact=contact, category='',
pub_time=None, is_valid=is_valid, is_delete=False if is_valid else True)
order.website = website
session.add(order)
if is_valid == True:
sheet.append([count, desc, link, '', contact, ''])
count += 1
session.commit()
elif isinstance(info_list, tuple):
message = '猿急送爬取第%d行出错:%s' % (info_list[0], info_list[1])
print(message)
send_message(message)
print('结束爬取猿急送订单')
if __name__ == '__main__':
wb = Workbook()
engine, Base, session = get_mysql_connection()
Order, Website = create_table(engine, Base)
add_default_data(session, Website)
main(wb, session, Order, Website)
now = datetime.now()
wb.save(r'data/%s.xlsx' % now.strftime("%Y-%m-%d %H-%M-%S"))
## 1. Project Overview
This project periodically crawls and stores orders from six freelance-outsourcing platforms: Codemart (码市), OSChina (开源中国), rrkf.com (人人开发), shixian.com (实现), 51waibao.net (51外包), and Yuanjisong (猿急送). Two versions were built in turn:
### Version 1.0
Version 1.0 is the initial release. It crawls the order listings of all six sites into six sheets of a single Excel workbook, runs on a schedule (once every half hour), and uploads the file to a WeChat Work group for the staff who pick up orders.
### Version 2.0
Version 2.0 builds on 1.0 by saving every order to a MySQL database and tracking each order's validity (whether it has expired or already been taken). Valid orders are written to the Excel workbook and sent to the WeChat Work group, and any exception is reported to a test group so developers can troubleshoot.
## 2. Project Configuration
All configuration lives in each version's config.py and can be modified as needed.
### Installing dependencies
After downloading or cloning the project, install the dependencies from each version's requirements.txt by running `pip install -r requirements.txt -i https://pypi.douban.com/simple` in that version's directory.
### Editing the configuration
The WeChat Work bot keys, database settings, and so on in config.py must be adapted to your own environment, and a database named it_outsource (or any name you prefer) must be created beforehand.
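
A minimal one-time setup sketch (assuming the root/root MySQL defaults from config.py; this script is not shipped with the project):

```python
# Create the target database once before the first run
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='root')
with conn.cursor() as cur:
    cur.execute("CREATE DATABASE IF NOT EXISTS it_outsource DEFAULT CHARACTER SET utf8")
conn.close()
```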
## 3. Running the Project
Run `python crawler.py` in either version's directory to start the project and begin collecting orders.