Commit 5c886ced authored by W wangmeng28

Implement training data generation and preprocessing for Chinese poetry

Parent fb18316b
#!/bin/bash
# Fetch the chinese-poetry corpus and keep only the Tang poetry JSON files.
git clone https://github.com/chinese-poetry/chinese-poetry.git
if [ ! -d raw ]
then
    mkdir raw
fi
mv chinese-poetry/json/poet.tang.* raw/
rm -rf chinese-poetry
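
For orientation, here is a minimal sketch (not part of this commit) that inspects one record from the fetched corpus; the file name poet.tang.0.json is an assumed example of the chinese-poetry repository's naming, and the fields shown are the ones preprocess.py reads below.

# Sketch only: peek at one raw record to see the fields preprocess.py consumes.
import io
import json

# File name is an assumption based on the chinese-poetry repository layout.
with io.open("raw/poet.tang.0.json", "r", encoding="utf8") as f:
    poems = json.load(f)

poem = poems[0]
# Each record carries 'title', 'author', and 'paragraphs' (a list of
# sentence strings); any other fields are ignored downstream.
print(poem["title"], poem["author"])
print(u"".join(poem["paragraphs"]))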
# -*- coding: utf-8 -*-
import os
import io
import re
import json
import click
import collections


def build_vocabulary(dataset, cutoff=0):
    # Count character frequencies over every paragraph, drop characters seen
    # fewer than `cutoff` times, then sort by descending frequency (ties
    # broken by the character itself) so ids are assigned most-frequent first.
    dictionary = collections.defaultdict(int)
    for data in dataset:
        for sent in data[2]:
            for char in sent:
                dictionary[char] += 1
    dictionary = [item for item in dictionary.items() if item[1] >= cutoff]
    dictionary = sorted(dictionary, key=lambda x: (-x[1], x[0]))
    vocab, _ = list(zip(*dictionary))
    # Reserve the special tokens: unknown, start-of-sequence, end-of-sequence.
    return (u"<unk>", u"<s>", u"<e>") + vocab


@click.command("preprocess")
@click.option("--datadir", type=str, help="Path to raw data")
@click.option("--outfile", type=str, help="Path to save the training data")
@click.option("--dictfile", type=str, help="Path to save the dictionary file")
def preprocess(datadir, outfile, dictfile):
    dataset = []
    # Editorial notes in the corpus are wrapped in full-width punctuation;
    # these patterns strip the notes and normalize leftover punctuation.
    note_pattern1 = re.compile(u"（.*?）", re.U)
    note_pattern2 = re.compile(u"〖.*?〗", re.U)
    note_pattern3 = re.compile(u"-.*?-。?", re.U)
    note_pattern4 = re.compile(u"（.*$", re.U)  # unclosed note running to the end
    note_pattern5 = re.compile(u"。。.*）$", re.U)
    note_pattern6 = re.compile(u"。。", re.U)  # doubled full stops
    note_pattern7 = re.compile(u"[《》「」\[\]]", re.U)
    print("Loading raw data...")
    for fn in os.listdir(datadir):
        with io.open(os.path.join(datadir, fn), "r", encoding="utf8") as f:
            for data in json.load(f):
                title = data['title']
                author = data['author']
                # Join all sentences and strip whitespace before cleaning.
                p = "".join(data['paragraphs'])
                p = "".join(p.split())
                p = note_pattern1.sub(u"", p)
                p = note_pattern2.sub(u"", p)
                p = note_pattern3.sub(u"", p)
                p = note_pattern4.sub(u"", p)
                p = note_pattern5.sub(u"。", p)
                p = note_pattern6.sub(u"。", p)
                p = note_pattern7.sub(u"", p)
                # Discard poems that are empty or still contain corrupt or
                # unhandled punctuation after cleaning.
                if (p == u"" or u"{" in p or u"}" in p or u"｛" in p or
                        u"｝" in p or u"、" in p or u"：" in p or u"；" in p or
                        u"！" in p or u"？" in p or u"●" in p or u"□" in p or
                        u"囗" in p or u"）" in p):
                    continue
                paragraphs = p.split(u"。")
                paragraphs = [para for para in paragraphs if len(para)]
                if len(paragraphs) > 1:
                    dataset.append((title, author, paragraphs))
    print("Finished...")
    print("Constructing vocabularies...")
    vocab = build_vocabulary(dataset, cutoff=10)
    with io.open(dictfile, "w", encoding="utf8") as f:
        for v in vocab:
            f.write(v + "\n")
    print("Finished...")
    print("Writing processed data...")
    with io.open(outfile, "w", encoding="utf8") as f:
        for data in dataset:
            title = data[0]
            author = data[1]
            paragraphs = ".".join(data[2])
            f.write("\t".join((title, author, paragraphs)) + "\n")
    print("Finished...")


if __name__ == "__main__":
    preprocess()
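
A usage sketch of the pipeline, assuming the shell script above is saved as fetch_data.sh and with placeholder output paths (none of these names are fixed by the script): the dictionary file holds one vocabulary entry per line with <unk>, <s>, <e> first, and the training file holds one poem per line as tab-separated title, author, and "."-joined paragraphs. Reading them back:

# Sketch only. First run, e.g.:
#   bash fetch_data.sh
#   python preprocess.py --datadir raw --outfile poems.txt --dictfile poems.dict
import io

# dictfile: one vocabulary entry per line, specials first.
with io.open("poems.dict", "r", encoding="utf8") as f:
    word_dict = {w.strip(): i for i, w in enumerate(f)}

# outfile: one poem per line, "title \t author \t paragraphs",
# with paragraphs joined by an ASCII ".".
with io.open("poems.txt", "r", encoding="utf8") as f:
    for line in f:
        title, author, paragraphs = line.strip("\n").split("\t")
        for sent in paragraphs.split("."):
            # Map each character to its id, falling back to <unk>.
            ids = [word_dict.get(ch, word_dict[u"<unk>"]) for ch in sent]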
@@ -44,7 +44,7 @@ def load_initial_model(model_path, parameters):
 @click.option(
     "--decoder_depth",
     default=3,
-    help="The number of stacked LSTM layers in encoder.")
+    help="The number of stacked LSTM layers in decoder.")
 @click.option(
     "--train_data_path", required=True, help="The path of trainning data.")
 @click.option(