From c85ee21e037ad4baf18d9ad52f6f4fcd821ea2ce Mon Sep 17 00:00:00 2001
From: jiangzhonglian <jiang-s@163.com>
Date: Fri, 4 Sep 2020 18:54:01 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=95=B0=E6=8D=AE=E6=A0=87?=
 =?UTF-8?q?=E6=B3=A8=E7=9A=84=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tutorials/keras/brat_tag.py | 121 ++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)
 create mode 100644 tutorials/keras/brat_tag.py

diff --git a/tutorials/keras/brat_tag.py b/tutorials/keras/brat_tag.py
new file mode 100644
index 0000000..65aa888
--- /dev/null
+++ b/tutorials/keras/brat_tag.py
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+
+"""
+Data format conversion: brat annotations -> per-character B-/I-/O tag files.
+"""
+import os
+import emoji
+from middleware.utils import get_catalog_files
+from config.setting import Config
+
+tag_dic = {"实体对象": "ORG",
+           "正向观点": "Po_VIEW",
+           "中性观点": "Mi_VIEW",
+           "负向观点": "Ne_VIEW"}
+
+
+# 转换成可训练的格式，最后以"END O"结尾
+def from_ann2dic(r_ann_path, r_txt_path, w_path):
+    q_dic = {}
+    print("开始读取文件:%s" % r_ann_path)
+    with open(r_ann_path, "r", encoding="utf-8") as f:
+        lines = f.readlines()
+        for line in lines:
+            line_arr = line.split()
+            # print(">>> ", line_arr)
+            cls = tag_dic[line_arr[1]]
+            start_index = int(line_arr[2])
+            end_index = int(line_arr[3])
+            length = end_index - start_index
+            for r in range(length):
+                q_dic[start_index+r] = ("B-%s" % cls) if r == 0 else ("I-%s" % cls)
+
+    # 存储坐标和对应的列名:  {23: 'B-Ne_VIEW', 24: 'I-Ne_VIEW', 46: 'B-ORG', 47: 'I-ORG'}
+    print("q_dic: ", q_dic)
+
+    print("开始读取文件内容: %s" % r_txt_path)
+    with open(r_txt_path, "r", encoding="utf-8") as f:
+        content_str = f.read()
+
+    print("开始写入文本%s" % w_path)
+    with open(w_path, "w", encoding="utf-8") as w:
+        for i, strA in enumerate(content_str):
+            # print(">>> %s-%s" % (i, strA))
+            if strA == "\n":
+                w.write("\n")
+            else:
+                if i in q_dic:
+                    tag = q_dic[i]
+                else:
+                    tag = "O"  # 大写字母O
+                w.write('%s %s\n' % (strA, tag))
+        w.write('%s\n' % "END O")
+
+
+# 生成train.txt、dev.txt、test.txt
+# 除8，9-new.txt分别用于dev和test外,剩下的合并成train.txt
+def create_train_data(data_root_dir, w_path):
+    if os.path.exists(w_path):
+        os.remove(w_path)
+    for file in os.listdir(data_root_dir):
+        path = os.path.join(data_root_dir, file)
+        if file.endswith("8-new.txt"):
+            # 重命名为dev.txt
+            os.rename(path, os.path.join(data_root_dir, "dev.txt"))
+            continue
+        if file.endswith("9-new.txt"):
+            # 重命名为test.txt
+            os.rename(path, os.path.join(data_root_dir, "test.txt"))
+            continue
+        q_list = []
+        print("开始读取文件:%s" % file)
+        with open(path, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+            for line in lines:
+                line = line.rstrip()
+                if line == "END O":
+                    break
+                q_list.append(line)
+
+        # 获取list 列表: ['美 O', '！ O', '气 O', '质 O', '特 O', '别 O', '好 O', '', '造 O', '型 O', '独 O', '特 O', '， O', '尺 B-ORG', '码 I-ORG', '偏 B-Ne_VIEW', '大 I-Ne_VIEW', '， O']
+        # print("q_list: ", q_list)
+        print("开始写入文本: %s" % w_path)
+        with open(w_path, "a", encoding="utf-8") as f:
+            for item in q_list:
+                f.write('%s\n' % item)
+
+
+def brat_1_format_origin(catalog):
+    """
+    格式化原始文件（去除表情符号的影响，brat占2个字符，但是python占1个字符）
+    """
+    with open('%s/origin/origin.txt' % path_root, "r", encoding="utf-8") as f:
+        lines = f.readlines()
+    with open('%s/tag_befer/befer.txt' % path_root, "w", encoding="utf-8") as f:
+        # 转换原始文件
+        for line in lines:
+            text = emoji.demojize(line)
+            f.write('%s' % text)
+        # 创建标注的新文件
+        with open('%s/tag_befer/befer.ann' % path_root, "w", encoding="utf-8") as f:
+            pass
+
+def brat_2_create_train_data(catalog):
+    file_list = get_catalog_files("%s/tag_after" % catalog, status=-1, str1=".DS_Store")
+    file_list = list(set([i.split("/")[-1].split(".")[0] for i in file_list]))
+    print(file_list)
+    for filename in file_list:
+        r_ann_path = os.path.join(catalog, "tag_after/%s.ann" % filename)
+        r_txt_path = os.path.join(catalog, "tag_after/%s.txt" % filename)
+        w_path = os.path.join(catalog,  "new/%s-new.txt" % filename)
+        print("filename", r_ann_path, r_txt_path, w_path)
+        from_ann2dic(r_ann_path, r_txt_path, w_path)
+    # 生成train.txt、dev.txt、test.txt
+    create_train_data("%s/new" % catalog, "%s/new/train.txt" % catalog)
+
+
+def main():
+    catalog = Config.nlp_ner.path_root
+
+    # brat_1_format_origin(catalog)
+    brat_2_create_train_data(catalog)
-- 
GitLab