#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2023/2/21 16:09
# @Author  : clong
# @File    : clean_data.py

import base64
import html
import json
import re


def decode_base64(context):
    """Decode a base64-encoded UTF-8 string; return "" for non-string input."""
    if not isinstance(context, str):
        return ""
    return base64.b64decode(context).decode(encoding="utf-8")


def clean_text(text):
    """Clean content: strip HTML tags and numeric entities, then unescape what remains."""
    if text is None:
        return ""
    pattern = re.compile(r'<[^>]+>|&#.*?;', re.S)
    result = pattern.sub('', text)
    result = html.unescape(result)
    return result


def clean_blog_data():
    # Raw data is exported from ODPS: one tab-separated record per line.
    raw_data_path = "./data/raw_data.txt"
    data_path = "./data/data.json"  # written as JSON Lines, one object per line
    with open(raw_data_path) as file_r:
        with open(data_path, "w") as file_w:
            index = 1
            for line in file_r:
                try:
                    articleid, content, title, tags, username, createtime = line.rstrip("\n").split("\t")
                    content = decode_base64(content)
                    content = clean_text(content)
                    meta = {"ID": index}
                    ss = json.dumps({"meta": meta, "text": content}, check_circular=False)
                    file_w.write(ss + "\n")
                    index += 1
                except Exception as e:
                    # Skip malformed lines instead of aborting the whole export.
                    print(str(e))
                    continue


def clean_ask_data():
    # Raw data is exported from ODPS as a CSV file.
    import pandas as pd
    raw_data_path = "./data/ask.csv"
    data_path = "./data/ask.jsonl"
    df = pd.read_csv(raw_data_path)
    with open(data_path, "w") as file_w:
        index = 1
        for row in df.itertuples():
            # Columns addressed by position: 2 = title, 3 = question body, 4 = answer body.
            title = row[2]
            question_body = row[3]
            answer_body = row[4]
            content = str(title) + "\n" + str(question_body) + "\n" + str(answer_body)
            content = clean_text(content)
            meta = {"ID": index}
            ss = json.dumps({"meta": meta, "text": content}, check_circular=False)
            file_w.write(ss + "\n")
            index += 1


def belle_to_csv():
    # Convert the Belle instruction dataset (one JSON object per line) to a prompts CSV.
    import pandas as pd
    belle_data_path = "./data/Belle.train.json"
    prompts_path = "./data/prompts.csv"
    data_list = []
    with open(belle_data_path) as file:
        for line in file:
            json_obj = json.loads(line.strip())
            data_list.append({
                "input": json_obj["input"],
                "target": json_obj["target"]
            })
            print(json_obj["input"])
    pf = pd.DataFrame(data_list, columns=["input", "target"])
    pf.to_csv(prompts_path, index=False)
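

# A minimal entry point, sketched as an assumption: the original script only defines
# the three cleaners and never calls them. Each function is independent, so a run
# script could simply invoke whichever step is needed; the paths are the ones
# hard-coded above and are assumed to exist relative to the working directory.
if __name__ == "__main__":
    clean_blog_data()   # blog dump        -> ./data/data.json (JSON Lines)
    clean_ask_data()    # Q&A CSV          -> ./data/ask.jsonl
    belle_to_csv()      # Belle.train.json -> ./data/prompts.csv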