#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/2/21 16:09
# @Author : clong
# @File : clean_data.py
"""Clean raw blog / Q&A data (exported from ODPS) into JSON-lines files."""
import base64
import binascii
import html
import json
import re


def decode_base64(context):
    """Decode a base64-encoded string to UTF-8 text.

    Returns "" for non-string input or undecodable data instead of raising,
    so a single bad record cannot abort a whole cleaning run.
    """
    if not isinstance(context, str):
        return ""
    try:
        return base64.b64decode(context).decode(encoding="utf-8")
    except (binascii.Error, UnicodeDecodeError, ValueError):
        return ""


# Compiled once at module level (was recompiled on every clean_text call):
# matches HTML tags and numeric character references.
_TAG_OR_CHARREF = re.compile(r'<[^>]+>|&#.*?;', re.S)


def clean_text(text):
    """Strip HTML tags / numeric character references, then unescape entities.

    Returns "" when *text* is None.
    """
    if text is None:
        return ""
    result = _TAG_OR_CHARREF.sub('', text)
    return html.unescape(result)


def clean_blog_data():
    """Convert the tab-separated blog dump (read from ODPS) to JSON lines.

    Reads ./data/raw_data.txt, writes one {"meta": {"ID": n}, "text": ...}
    JSON object per line to ./data/data.json. Malformed lines are logged
    and skipped so one bad record does not stop the run.
    """
    raw_data_path = "./data/raw_data.txt"
    data_path = "./data/data.json"
    # Explicit UTF-8: the default encoding is locale-dependent.
    with open(raw_data_path, encoding="utf-8") as file_r, \
            open(data_path, "w", encoding="utf-8") as file_w:
        index = 1
        for line in file_r:
            try:
                # Columns: articleid, content (base64), title, tags,
                # username, createtime — only content is kept.
                articleid, content, title, tags, username, createtime = line.split("\t")
                content = clean_text(decode_base64(content))
                meta = {"ID": index}
                ss = json.dumps({"meta": meta, "text": content}, check_circular=False)
                file_w.write(ss + "\n")
                index += 1
            except Exception as e:
                # Deliberately broad: this is a best-effort bulk cleaner;
                # log the bad line (e.g. wrong column count) and move on.
                print(str(e))
                continue


def clean_ask_data():
    """Convert the Q&A CSV (read from ODPS) to JSON lines.

    Reads ./data/ask.csv; positional columns 2-4 of each row are the
    title, question body and answer body. Writes one
    {"meta": {"ID": n}, "text": ...} JSON object per line to
    ./data/ask.jsonl.
    """
    import pandas as pd  # local import: pandas is only needed for this path

    raw_data_path = "./data/ask.csv"
    data_path = "./data/ask.jsonl"
    df = pd.read_csv(raw_data_path)
    with open(data_path, "w", encoding="utf-8") as file_w:
        for index, row in enumerate(df.itertuples(), start=1):
            # row[0] is the DataFrame index; row[1] is intentionally unused.
            content = str(row[2]) + "\n" + str(row[3]) + "\n" + str(row[4])
            content = clean_text(content)
            meta = {"ID": index}
            ss = json.dumps({"meta": meta, "text": content}, check_circular=False)
            file_w.write(ss + "\n")


if __name__ == "__main__":
    # Guarded so importing this module (e.g. to reuse clean_text) has no
    # side effects; previously clean_ask_data() ran unconditionally.
    clean_ask_data()