2021.9.2.15.07

600d14a4 · 王道之 · 600d14a4 · 600d14a4 · 600d14a4 · 600d14a4
15 changed files
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ b/.idea/inspectionProfiles/profiles_settings.xml
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
\ No newline at end of file
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/pythonCut_Words.iml" filepath="$PROJECT_DIR$/.idea/pythonCut_Words.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
--- a/.idea/pythonCut_Words.iml
+++ b/.idea/pythonCut_Words.iml
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/venv" />
+    </content>
+    <orderEntry type="jdk" jdkName="Python 3.8" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
\ No newline at end of file
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
--- a/cut_words.py
+++ b/cut_words.py
+# # This is a sample Python script.
+#
+# # Press Shift+F10 to execute it or replace it with your code.
+# # Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
+#
+#
+# def print_hi(name):
+#     # Use a breakpoint in the code line below to debug your script.
+#     print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.
+#
+#
+# # Press the green button in the gutter to run the script.
+# if __name__ == '__main__':
+#     print_hi('PyCharm')
+#
+# # See PyCharm help at https://www.jetbrains.com/help/pycharm/
+
+
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+import os
+import jieba
+import jieba.analyse # 导入提取关键词的库
+# 对训练集 测试集文本都进行切词处理,对测试集数据打上主题标签
+#存至文件
+def save_file(save_path, content):
+    with open(save_path, "a",encoding= 'utf-8',errors='ignore') as fp:
+        fp.write(content)
+# 读取文件
+def read_file(file_path):
+    with open(file_path, "r",encoding= 'utf-8',errors='ignore') as fp:
+        content = fp.readlines()
+        # print(content)
+    return str(content)
+# 抽取测试集的主题关键词
+def extract_theme(content):
+    themes = []
+    tags = jieba.analyse.extract_tags(content, topK=3, withWeight=True, allowPOS=['n','ns','v','vn'],withFlag=True)
+    for i in tags:
+        themes.append(i[0].word)
+    return str(themes)
+def cast_words(origin_path, save_path, theme_tag):
+    '''
+    train_words_path: 原始文本路径
+    train_save_path: 切词后文本路径
+    :return:
+    '''
+    file_lists = os.listdir(origin_path) #原文档所在路径
+
+    print('\n'+'file_lists:')
+    print(file_lists)
+    print('\n'+'origin_path:')
+    print(origin_path)
+
+    for dir_1 in file_lists: #找到文件夹
+        file_path = origin_path + dir_1 + "/" #原始文件路径
+
+        print('\n' + 'dir_1:')
+        print(dir_1)
+
+        print('\n' + 'file_path:')
+        print(file_path)
+
+        seg_path = save_path + dir_1 + "/" #切词后文件路径
+
+        print('\n' + 'save_path:')
+        print(save_path)
+
+        print('\n' + 'seg_path:')
+        print(seg_path)
+
+        if not os.path.exists(seg_path):
+            os.makedirs(seg_path)
+        detail_paths = os.listdir(file_path)
+
+        print('\n' + 'detail_paths:')
+        print(detail_paths)
+
+        for detail_path in detail_paths: #找到文件夹下具体文件路径
+            full_path = file_path + detail_path #原始文件下每个文档路径
+
+            print('\n' + 'detail_path:')
+            print(detail_path)
+
+            print('\n' + 'full_path:')
+            print(full_path)
+
+            file_content = read_file(full_path)
+
+            print('\n' + 'file_content:')
+            print(file_content)
+
+            file_content = file_content.strip() # replace("\r\n", " ")
+                                                # 删除换行
+            print('\n' + 'file_content.strip():')
+            print(file_content)
+
+            file_content = file_content.replace("\'", "")
+
+            print('\n' + 'file_content.replace("\'", ""):')
+            print(file_content)
+
+            file_content = file_content.replace("\\n", "")
+
+            print('\n' + 'file_content.replace("\\n", ""):')
+            print(file_content)
+
+            content_seg1 = jieba.cut(file_content) # 为文件内容分词
+            content_seg2 = jieba.cut(file_content)  # 为文件内容分词
+            # for tip in content_seg:
+            #     print('这是关键词：' + tip)
+
+
+            if theme_tag is not None:
+                print("文件路径:{} ".format(theme_tag + detail_path))
+                theme = extract_theme(" ".join(content_seg1)) #theme为该文章主题关键词
+
+                # for tip in theme:
+                #     print('这是关键词：' + tip)
+                print("文章主题关键词:{} ".format(theme))
+                save_file(theme_tag + detail_path, theme) # 将训练集文章的主题关键词存到标签存储路径
+
+            save_file(seg_path + detail_path, " ".join(content_seg2)) # 将处理后的文件保存到分词后语料目录
+
+if __name__ == "__main__":
+    # 对训练集进行分词
+    train_words_path = './train_segments/'  #./
+    train_save_path = './train_words/'   #./
+    cast_words(train_words_path,train_save_path,theme_tag=None)
+    # 对测试集进行分词 抽取文章主题标签
+    train_words_path = './test_segments/'  #
+    train_save_path = './test_words/'  #
+    theme_tag_path = './theme_tag/' #存放测试集文章主题标签路径  theme_tag/
+    cast_words(train_words_path, train_save_path, theme_tag=theme_tag_path)
--- a/test_segments/kong_test/网络.txt
+++ b/test_segments/kong_test/网络.txt
+安全、防止水合物和段塞生成的重要措施之一。因此，针对未来还上油田开发技术，我们预先开展了水深1500米管道式油气水分离器的概念设计。通过该研究，提出适合海洋环境的体积小、重量轻、分离效率高、便于操作和维护的新型油气水三相分离器，使其成为海洋深水油气田开
--- a/test_words/kong_test/网络.txt
+++ b/test_words/kong_test/网络.txt
+[ 安全 、 防止 水合物 和 段 塞 生成 的 重要 措施 之一 。 因此 ， 针对 未来 还 上 油田 开发技术 ， 我们 预先 开展 了 水深 1500 米 管道 式 油气 水 分离器 的 概念设计 。 通过 该 研究 ， 提出 适合 海洋环境 的 体积小 、 重量轻 、 分离 效率高 、 便于 操作 和 维护 的 新型 油气 水 三相 分离器 ， 使 其 成为 海洋 深水 油气田 开 ]
\ No newline at end of file
--- a/theme_tag/kong_test.txt
+++ b/theme_tag/kong_test.txt
+[][][]
\ No newline at end of file
--- a/theme_tag/网络.txt
+++ b/theme_tag/网络.txt
+['分离器', '油气', '概念设计']['分离器', '油气', '概念设计']['分离器', '油气', '概念设计']['分离器', '油气', '概念设计']
\ No newline at end of file
--- a/train_segments/kong_train/kong_train.txt
+++ b/train_segments/kong_train/kong_train.txt
+这位朋友你是个大帅哥吧？
--- a/train_words/kong_train/kong_train.txt
+++ b/train_words/kong_train/kong_train.txt
+[ 这位 朋友 你 是 个 大 帅哥 吧 ？ ]
\ No newline at end of file
--- a/venv/Scripts/python.exe
+++ b/venv/Scripts/python.exe
--- a/word_to_bunch.py
+++ b/word_to_bunch.py