#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/2/21 16:09
# @Author : clong
# @File : clean_data.py
"""Clean raw blog / Q&A data (exported from ODPS) into JSON-lines files."""
import base64
import binascii
import html
import json
import re


def decode_base64(context):
    """Decode a base64-encoded string to UTF-8 text.

    Returns "" for non-string input or undecodable data instead of raising,
    so a single bad record cannot abort a whole cleaning run.
    """
    if not isinstance(context, str):
        return ""
    try:
        return base64.b64decode(context).decode(encoding="utf-8")
    except (binascii.Error, UnicodeDecodeError, ValueError):
        return ""


# Compiled once at module level (was recompiled on every clean_text call):
# matches HTML tags and numeric character references.
_TAG_OR_CHARREF = re.compile(r'<[^>]+>|&#.*?;', re.S)


def clean_text(text):
    """Strip HTML tags / numeric character references, then unescape entities.

    Returns "" when *text* is None.
    """
    if text is None:
        return ""
    result = _TAG_OR_CHARREF.sub('', text)
    return html.unescape(result)


def clean_blog_data():
    """Convert the tab-separated blog dump (read from ODPS) to JSON lines.

    Reads ./data/raw_data.txt, writes one {"meta": {"ID": n}, "text": ...}
    JSON object per line to ./data/data.json. Malformed lines are logged
    and skipped so one bad record does not stop the run.
    """
    raw_data_path = "./data/raw_data.txt"
    data_path = "./data/data.json"
    # Explicit UTF-8: the default encoding is locale-dependent.
    with open(raw_data_path, encoding="utf-8") as file_r, \
            open(data_path, "w", encoding="utf-8") as file_w:
        index = 1
        for line in file_r:
            try:
                # Columns: articleid, content (base64), title, tags,
                # username, createtime — only content is kept.
                articleid, content, title, tags, username, createtime = line.split("\t")
                content = clean_text(decode_base64(content))
                meta = {"ID": index}
                ss = json.dumps({"meta": meta, "text": content}, check_circular=False)
                file_w.write(ss + "\n")
                index += 1
            except Exception as e:
                # Deliberately broad: this is a best-effort bulk cleaner;
                # log the bad line (e.g. wrong column count) and move on.
                print(str(e))
                continue


def clean_ask_data():
    """Convert the Q&A CSV (read from ODPS) to JSON lines.

    Reads ./data/ask.csv; positional columns 2-4 of each row are the
    title, question body and answer body. Writes one
    {"meta": {"ID": n}, "text": ...} JSON object per line to
    ./data/ask.jsonl.
    """
    import pandas as pd  # local import: pandas is only needed for this path

    raw_data_path = "./data/ask.csv"
    data_path = "./data/ask.jsonl"
    df = pd.read_csv(raw_data_path)
    with open(data_path, "w", encoding="utf-8") as file_w:
        for index, row in enumerate(df.itertuples(), start=1):
            # row[0] is the DataFrame index; row[1] is intentionally unused.
            content = str(row[2]) + "\n" + str(row[3]) + "\n" + str(row[4])
            content = clean_text(content)
            meta = {"ID": index}
            ss = json.dumps({"meta": meta, "text": content}, check_circular=False)
            file_w.write(ss + "\n")


if __name__ == "__main__":
    # Guarded so importing this module (e.g. to reuse clean_text) has no
    # side effects; previously clean_ask_data() ran unconditionally.
    clean_ask_data()