clean_data.py 2.6 KB
Newer Older
CSDN-Ada助手's avatar
CSDN-Ada助手 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2023/2/21 16:09
# @Author  : clong
# @File    : clean_data.py

import base64
import html
import json
import re
import json



def decode_base64(context):
    """Decode a Base64-encoded string into UTF-8 text.

    Args:
        context: The Base64 payload as a ``str``. Any other type
            (``None``, ``bytes``, numbers, ...) is treated as "no content".

    Returns:
        The decoded text, or ``""`` when ``context`` is not a ``str``.

    Raises:
        ValueError: If ``context`` is not valid Base64 (``binascii.Error``)
            or the decoded bytes are not valid UTF-8
            (``UnicodeDecodeError``).
    """
    if isinstance(context, str):
        raw_bytes = base64.b64decode(context)
        return raw_bytes.decode(encoding="utf-8")
    return ""


# Matches HTML/XML tags and well-formed numeric character references
# (decimal ``&#160;`` or hexadecimal ``&#xA0;``; the trailing ``;`` is
# optional, mirroring what ``html.unescape`` recognises).
_MARKUP_RE = re.compile(r'<[^>]+>|&#(?:[0-9]+|[xX][0-9a-fA-F]+);?')


def clean_text(text):
    """Strip markup from ``text`` and decode the remaining HTML entities.

    Tags (``<...>``) and numeric character references are removed; named
    entities such as ``&amp;`` are then decoded with ``html.unescape``.

    Only well-formed numeric references are removed. A looser pattern such
    as ``&#.*?;`` would delete everything between a stray ``&#`` and the
    next semicolon, possibly across several lines, silently dropping real
    content.

    Args:
        text: The raw text, or ``None``.

    Returns:
        The cleaned text; ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""
    stripped = _MARKUP_RE.sub('', text)
    # Unescape only after tag removal so that escaped markup such as
    # ``&lt;b&gt;`` survives as literal text instead of being stripped.
    return html.unescape(stripped)


def clean_blog_data():
    """Convert the raw blog dump into a JSON Lines corpus.

    Reads ``./data/raw_data.txt``, where each line holds six tab-separated
    fields (article id, Base64-encoded content, title, tags, username,
    create time). The content field is Base64-decoded and cleaned, then
    written to ``./data/data.json`` as one JSON object per line in the form
    ``{"meta": {"ID": <n>}, "text": <cleaned content>}``.

    ``ID`` is a 1-based counter over successfully converted records only.
    A record that cannot be parsed, decoded or serialised is reported on
    stdout and skipped. Failures while writing the output file are not
    swallowed and propagate to the caller.
    """
    # The data is read from ODPS.
    raw_data_path = "./data/raw_data.txt"
    data_path = "./data/data.json"
    # Explicit UTF-8 keeps the result independent of the platform locale.
    # Only the (ASCII) Base64 field is consumed, so undecodable bytes in the
    # other fields are replaced instead of aborting the run: a decode error
    # raised while iterating the file would surface outside the ``try``.
    with open(raw_data_path, encoding="utf-8", errors="replace") as file_r, \
            open(data_path, "w", encoding="utf-8") as file_w:
        index = 1
        for line in file_r:
            try:
                # Unpacking doubles as validation: a line without exactly
                # six fields raises ValueError and is skipped.
                (_articleid, content, _title,
                 _tags, _username, _createtime) = line.split("\t")
                content = clean_text(decode_base64(content))
                record = {"meta": {"ID": index}, "text": content}
                serialized = json.dumps(record, check_circular=False)
            except Exception as e:
                # Best effort: report the bad record and keep going.
                print(str(e))
                continue
            file_w.write(serialized + "\n")
            index += 1


def clean_ask_data():
    """Convert the Q&A CSV dump into a JSON Lines corpus.

    Reads ``./data/ask.csv`` with pandas and, for every row, joins the
    title, question body and answer body with newlines, cleans the result
    with ``clean_text`` and writes it to ``./data/ask.jsonl`` as
    ``{"meta": {"ID": <n>}, "text": <cleaned content>}``, one object per
    line. ``ID`` is the 1-based row number.

    Empty CSV cells contribute an empty string rather than the literal
    text ``"nan"``.
    """
    # The data is read from ODPS.
    import pandas as pd
    raw_data_path = "./data/ask.csv"
    data_path = "./data/ask.jsonl"

    def _as_text(value):
        # pandas represents empty cells as NaN; str(NaN) would inject the
        # literal "nan" into the corpus text.
        return "" if pd.isna(value) else str(value)

    df = pd.read_csv(raw_data_path)
    with open(data_path, "w", encoding="utf-8") as file_w:
        # start=1 keeps the 1-based "ID" numbering.
        for index, row in enumerate(df.itertuples(), start=1):
            # row[0] is the DataFrame index, so positions 2-4 are the
            # second to fourth CSV columns.
            # NOTE(review): assumes those columns are title, question body
            # and answer body - confirm against the export schema.
            title = row[2]
            question_body = row[3]
            answer_body = row[4]
            content = "\n".join(
                _as_text(part) for part in (title, question_body, answer_body)
            )
            content = clean_text(content)
            meta = {"ID": index}
            ss = json.dumps({"meta": meta, "text": content}, check_circular=False)
            file_w.write(ss + "\n")
CSDN-Ada助手's avatar
CSDN-Ada助手 已提交
71 72


CSDN-Ada助手's avatar
readme  
CSDN-Ada助手 已提交
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
def belle_to_csv():
    """Export the ``input``/``target`` pairs of the Belle dataset to CSV.

    Reads ``./data/Belle.train.json`` as JSON Lines (one JSON object per
    line), keeps the ``"input"`` and ``"target"`` fields of every record,
    echoes each input to stdout, and writes the pairs to
    ``./data/prompts.csv`` with the header ``input,target`` and no index
    column.

    Blank lines in the input file are ignored.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``"input"`` or ``"target"`` key.
    """
    import json
    import pandas as pd
    belle_data_path = "./data/Belle.train.json"
    prompts_path = "./data/prompts.csv"

    data_list = []
    # Explicit UTF-8: JSON text is UTF-8, and relying on the locale default
    # garbles or rejects non-ASCII prompts on some platforms.
    with open(belle_data_path, encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line is not a record.
                continue
            json_obj = json.loads(line)
            data_list.append({
                "input": json_obj["input"],
                "target": json_obj["target"]
            })
            print(json_obj["input"])

    pf = pd.DataFrame(data_list, columns=["input", "target"])
    pf.to_csv(prompts_path, index=False)