# -*- coding: utf-8 -*-
'''
@File    :   node_knowledge_mapping.py
@Time    :   2022/05/30 16:21:40
@Author  :   Lu Xin 
@Contact :   luxin@csdn.net
'''

# here put the import lib
# Standard library
import os
import re

# Third-party
import ipdb  # NOTE(review): debugger import, not referenced by the code below -- confirm it is still needed
import pandas as pd
from treelib import Node
from treelib import Tree

# Project-local helpers
from path import get_index_dir
from path import get_tree_dir
from utils import load_json
from utils import load_markdown

class NodeKnowledgeMapping():
    """Map index sections onto leaf nodes of a knowledge tree and export them as CSV.

    Typical use: create an instance, call :meth:`load` to read the tree JSON
    and the markdown index, then call :meth:`get_node_knowledge_mapping` to
    write the CSV.
    """

    def __init__(self, category="blog") -> None:
        """Create an empty mapping; nothing is read from disk until :meth:`load`.

        Args:
            category: Value written verbatim into the ``category`` CSV column.
        """
        # Lower-cased first key of the tree JSON; filled in by load().
        self.tree_name = None
        self.category = category
        self.tree = Tree()
        # "-"-joined leaf path text -> leaf node id; filled in by load().
        self.text_id_dict = None
        # Index section title -> normalised path text; filled in by load().
        self.section_text_dict = None

    def load(self):
        """Load the tree first, then the index (the index needs ``tree_name``)."""
        self.__load_tree()
        self.__load_index()

    def __construct_tree(self, tree_dict, parent):
        """Recursively add the entries of ``tree_dict`` to ``self.tree``.

        Args:
            tree_dict: Mapping of node text to an info dict that provides
                ``"node_id"`` and ``"children"`` (a list of mappings of the
                same form).
            parent: Identifier of the parent node, or ``None`` for the root.
        """
        for node_text, node_info in tree_dict.items():
            node_id = node_info["node_id"]
            subtree_list = node_info["children"]
            self.tree.add_node(
                Node(tag=node_text, identifier=node_id), parent=parent)
            for subtree_dict in subtree_list:
                self.__construct_tree(subtree_dict, node_id)

    def __load_tree(self):
        """Build ``self.tree`` from the tree JSON and index every leaf path.

        Each root-to-leaf path is turned into a key by joining the tags of
        all nodes except the root with "-", after removing spaces and
        lower-casing; the value is the leaf's node id.
        """
        self.text_id_dict = {}
        # Start from an empty tree: the lookup dict is rebuilt on every load,
        # and re-adding the root to an already populated tree is rejected
        # by treelib, so a second load() would otherwise fail.
        self.tree = Tree()

        tree_dict = load_json(get_tree_dir())
        self.tree_name = list(tree_dict.keys())[0].lower()
        self.__construct_tree(tree_dict, None)

        for path in self.tree.paths_to_leaves():
            # path[0] is the root, which is deliberately left out of the key.
            text = "-".join(
                self.tree.get_node(node_id).tag.replace(" ", "").lower()
                for node_id in path[1:])
            leaf_id = path[-1]
            self.text_id_dict[text] = leaf_id

    @staticmethod
    def _parse_index_lines(mk_list, tree_name):
        """Extract ``{section title: path text}`` from the index lines.

        A line starting with "##" that is followed by a line NOT starting
        with "##" is treated as a section heading plus its path line.
        The heading loses its leading "#" marks and a leading "[...]" tag;
        the path line loses its spaces, is lower-cased and gets ``tree_name``
        prepended when it does not already start with it. Paths containing
        "不采纳" are skipped.

        Args:
            mk_list: Sequence of index lines.
            tree_name: Prefix every returned path text must start with.

        Returns:
            dict mapping section title to normalised path text.
        """
        # NOTE(review): only spaces are removed from the path line; this
        # assumes the lines carry no trailing newline -- confirm against
        # load_markdown.
        section_text = {}
        total = len(mk_list)
        pos = 0
        # Stop one short of the end: every step looks at the following line.
        while pos < total - 1:
            line = mk_list[pos]
            line_next = mk_list[pos + 1]

            if line.startswith("##") and not line_next.startswith("##"):
                section = re.sub(r"^#{1,10} {1,5}", "", line)
                section = re.sub(r"^\[.*?\]", "", section).strip()
                text = line_next.replace(" ", "").lower()
                if not text.startswith(tree_name):
                    text = tree_name + text
                if "不采纳" not in text:
                    section_text[section] = text
                # The path line has been consumed together with its heading.
                pos += 2
            else:
                pos += 1
        return section_text

    def __load_index(self):
        """Read the markdown index and fill ``self.section_text_dict``."""
        self.section_text_dict = self._parse_index_lines(
            load_markdown(get_index_dir()), self.tree_name)

    def get_node_knowledge_mapping(self, file_name, section_xx_dict):
        """Write the section-to-node mapping to a CSV next to ``file_name``.

        One row is written per loaded section whose path text exists in the
        tree; sections with an unknown path are reported on stdout and
        skipped. The output path is ``file_name`` with its extension
        replaced by ".csv".

        Args:
            file_name: Path whose stem determines the CSV file name.
            section_xx_dict: Mapping of section title to the value stored in
                the ``xx`` column. Numeric values are written as integer
                strings; missing sections yield an empty cell.

        Raises:
            RuntimeError: If :meth:`load` has not been called yet.
        """
        if self.section_text_dict is None or self.text_id_dict is None:
            raise RuntimeError(
                "load() must be called before get_node_knowledge_mapping()")

        columns = ["node_id", "text", "book_text", "xx", "tree_name", "category"]
        contents = []
        for section, text in self.section_text_dict.items():
            if text not in self.text_id_dict:
                print("路径 \"{}\" 不存在!".format(text))
                continue
            node_id = self.text_id_dict[text]

            xx = section_xx_dict.get(section, None)
            # Numbers are normalised to an integer string (e.g. 12.0 -> "12").
            if isinstance(xx, (int, float)):
                xx = str(int(xx))
            contents.append(
                [node_id, text, section, xx, self.tree_name, self.category])

        output_file_name = os.path.splitext(file_name)[0] + ".csv"
        df = pd.DataFrame(contents, columns=columns)
        df.to_csv(output_file_name, index=False)

def main(file_name="./data/chapter_to_article.json"):
    """Load the tree and index, then export the mapping for ``file_name``.

    Args:
        file_name: JSON file whose content is passed as the section -> value
            mapping; the CSV is written next to it with a ".csv" extension.
    """
    nkm = NodeKnowledgeMapping()
    nkm.load()

    section_xx_dict = load_json(file_name)
    nkm.get_node_knowledge_mapping(file_name, section_xx_dict)


# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()