diff --git a/generate_sequence_by_rnn_lm/.gitignore b/generate_sequence_by_rnn_lm/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..203ec9a67426fee99e6228716433bb1bec8ff14f
--- /dev/null
+++ b/generate_sequence_by_rnn_lm/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+*.tar.gz
+models
diff --git a/generate_sequence_by_rnn_lm/README.md b/generate_sequence_by_rnn_lm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c749bc4cc5c6895a1291db86ab27565e57d27304
--- /dev/null
+++ b/generate_sequence_by_rnn_lm/README.md
@@ -0,0 +1,162 @@
+# Generating Text with a Recurrent Neural Network Language Model
+
+A language model is a probability distribution model: put simply, a model that computes the probability of a sentence. With it we can decide which of several word sequences is more likely, or, given a few words, predict the most probable next word. The language model is an important basic model in natural language processing.
+
+## Application scenarios
+**Language models are used in many areas**, for example:
+
+* **Automatic writing**: a language model can generate the next word given the preceding text; applied recursively, it can generate a whole sentence, paragraph, or article.
+* **QA**: a language model can generate an Answer conditioned on a Question.
+* **Machine translation**: most mainstream machine translation models follow the Encoder-Decoder paradigm, in which the Decoder is a conditional language model used to generate the target language.
+* **Spell checking**: a language model computes the probability of a word sequence; the probability usually drops sharply at a misspelling, so it can be used to detect spelling errors and propose correction candidates.
+* **Part-of-speech tagging, parsing, speech recognition...**
+
+## About this example
+This example implements an RNN-based language model and uses it to generate text. The directory layout is as follows:
+
+```text
+.
+├── data
+│   └── train_data_examples.txt        # sample data; format your own data the same way
+├── config.py    # configuration file, covering data, training, and inference settings
+├── generate.py  # inference script, i.e. text generation
+├── beam_search.py    # implementation of the beam search algorithm
+├── network_conf.py   # all network structures used in this example are defined here; to change the model structure, modify this file
+├── reader.py    # data-reading interface
+├── README.md
+├── train.py     # training script
+└── utils.py     # common utility functions, e.g. building and loading the dictionary
+```
+
+## RNN language model
+### Introduction
+
+An RNN is a sequence model. The basic idea is: at time $t$, the hidden-layer output of the previous time step $t-1$ is fed into the hidden layer together with the word vector of time $t$ to obtain the feature representation at time $t$, which is then used to produce the prediction at time $t$; this recursion continues along the time dimension. RNNs are therefore good at exploiting preceding context and historical knowledge, giving them a kind of "memory". In theory an RNN can capture long-range dependencies (i.e. use knowledge from far in the past), but in practice this does not work well, so variants such as LSTM and GRU were proposed; they introduce gating mechanisms that improve the memory cell of the vanilla RNN and overcome its difficulty with long sequences. This example uses LSTM or GRU, selectable via configuration. The figure below illustrates the "recurrent" idea of an RNN language model (broadly including LSTM, GRU, etc.):
+
+<p align="center"><img src="images/rnn.png"/></p>
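+
+In symbols, one step of this recursion can be sketched as follows (a generic vanilla-RNN sketch for illustration only; the LSTM and GRU used in this example replace the hidden-state update with gated variants):
+
+$$h_t = \sigma(W_h h_{t-1} + W_x x_t + b)$$
+
+$$P(w_{t+1} \mid w_{1:t}) = \mathrm{softmax}(W_o h_t + b_o)$$
+
+where $x_t$ is the word vector of the $t$-th word, $h_t$ is the hidden feature representation at time $t$, $\sigma$ is a nonlinearity, and the $W$ and $b$ terms are learnable parameters.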

+
+### Model implementation
+
+The RNN language model in this example is implemented as follows:
+
+- **Model parameters**: the model's parameter variables are defined in `config.py`.
+- **Model structure**: the **function** `rnn_lm` in `network_conf.py` defines the model **structure**:
+    - Input layer: maps the input word (or character) sequence to vectors, i.e. the word-embedding layer `embedding`.
+    - Hidden layers: RNN layers built according to the configuration, taking the `embedding` vector sequence from the previous step as input.
+    - Output layer: uses `softmax` normalization to compute word probabilities.
+    - Loss: multi-class cross-entropy is used as the model's loss function.
+- **Training**: the `main` function in `train.py` implements training:
+    - Prepare the input data: build and save the dictionary, and construct the readers for the train and test data.
+    - Initialize the model: both structure and parameters.
+    - Build the trainer: this demo uses the Adam optimization algorithm.
+    - Define the callback: an `event_handler` tracks the loss during training and saves the model parameters at the end of every pass.
+    - Train: train the model with the trainer.
+
+- **Text generation**: `generate.py` implements generation:
+    - Load the trained model and the dictionary file.
+    - Read `gen_file`, each line of which is a sentence prefix, and generate text from each prefix with [Beam Search](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md#柱搜索算法).
+    - Save the generated text together with its prefix to the file `gen_result`.
+
+## Usage
+
+To run this example:
+
+* 1. Run `python train.py` to train the model (an RNN by default), and wait for training to finish.
+* 2. Run `python generate.py` to generate text. (The input text defaults to `data/train_data_examples.txt`; the generated text is saved to `data/gen_result.txt` by default.)
+
+
+**To use your own corpus or customize the model, edit the settings in `config.py`; the details and adaptation steps are as follows:**
+
+
+### Adapting the corpus
+
+* Clean the corpus: remove spaces, tabs, and garbled characters, and, as needed, digits, punctuation, and special symbols.
+* Format: one sentence per line; words within a line separated by a single space.
+* Set the following parameters in `config.py` as needed:
+
+    ```python
+    train_file = "data/train_data_examples.txt"
+    test_file = ""
+
+    vocab_file = "data/word_vocab.txt"
+    model_save_dir = "models"
+    ```
+    1. `train_file`: path of the training data, which **must be tokenized in advance**.
+    2. `test_file`: path of the test data; if non-empty, the model is evaluated on it at the end of every training `pass`.
+    3. `vocab_file`: path of the dictionary; if this file does not exist, a dictionary is built from word-frequency statistics over the training corpus.
+    4. `vocab_file`'s companion `model_save_dir`: directory for saving models; created automatically if it does not exist.
+
+### Dictionary-building strategy
+- When the specified dictionary file does not exist, word frequencies are counted over the training data and a dictionary is built automatically. Two parameters in `config.py` control this:
+
+    ```python
+    max_word_num = 51200 - 2
+    cutoff_word_fre = 0
+    ```
+    1. `max_word_num`: how many words the dictionary may contain.
+    2. `cutoff_word_fre`: the minimum frequency in the training corpus for a word to enter the dictionary.
+- For example, if `max_word_num = 5000` and `cutoff_word_fre = 10`, but only 3000 words occur more than 10 times in the training corpus, the final dictionary will contain those 3000 words.
+- Two special tokens are always added when building the dictionary:
+    1. `<unk>`: words that do not appear in the dictionary
+    2. `<e>`: the end-of-sentence marker
+
+    *Note: a larger dictionary yields richer generations but longer training. After Chinese word segmentation, a corpus typically contains tens of thousands to hundreds of thousands of distinct words: too small a `max_word_num` makes `<unk>` dominate, while too large a value slows training severely (and also affects accuracy). An alternative is to train "by character", treating each Chinese character as a word; with only a few thousand common characters the dictionary stays small and little information is lost, but since the same character can mean very different things in different words, model quality sometimes suffers. Experiment and choose between "by word" and "by character" according to your situation. A hands-on sketch of building the dictionary is shown below.*
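+
+The automatic construction described above is implemented by `build_dict` in `utils.py`. As a minimal sketch of building (or rebuilding) the vocabulary by hand before training, reusing the default settings from `config.py`:
+
+```python
+import config as conf
+from utils import build_dict, load_dict
+
+# count word frequencies in the training corpus and write the dictionary;
+# the special tokens <unk> and <e> are prepended automatically
+build_dict(conf.train_file, conf.vocab_file, conf.max_word_num,
+           conf.cutoff_word_fre)
+
+word_dict = load_dict(conf.vocab_file)  # {word: id}
+print("dictionary size = %d" % len(word_dict))
+```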
+
+### Adapting and training the model
+
+* Adjust the following settings in `config.py` to change the RNN language model's network structure:
+
+    ```python
+    rnn_type = "lstm"  # "gru" or "lstm"
+    emb_dim = 256
+    hidden_size = 256
+    stacked_rnn_num = 2
+    ```
+    1. `rnn_type`: either "gru" or "lstm", selecting which RNN unit to use.
+    2. `emb_dim`: dimension of the word vectors.
+    3. `hidden_size`: size of the RNN unit's hidden layer.
+    4. `stacked_rnn_num`: number of stacked RNN units, forming a deeper model.
+
+* Run `python train.py` to train the model; models are saved to the directory given by `model_save_dir`.
+
+### Generating text on demand
+
+* Adjust the following variables in `config.py`:
+
+    ```python
+    gen_file = "data/train_data_examples.txt"
+    gen_result = "data/gen_result.txt"
+    max_gen_len = 25  # the max number of words to generate
+    beam_size = 5
+    model_path = "models/rnn_lm_pass_00000.tar.gz"
+    ```
+    1. `gen_file`: the input data file, one sentence prefix per line, **tokenized in advance**.
+    2. `gen_result`: the output file path, to which generation results are written.
+    3. `max_gen_len`: the maximum length of each generated sentence; if the model fails to produce `<e>`, generation stops automatically after `max_gen_len` words.
+    4. `beam_size`: the expansion width of each Beam Search step.
+    5. `model_path`: path of the trained model.
+
+    `gen_file` holds the text prefixes to complete, one prefix per line, for example:
+
+    ```text
+    若隐若现 地像 幽灵 , 像 死神
+    ```
+    Save the prefixes you want completed in this format;
+
+* Run `python generate.py` to generate text for the input prefixes with beam search. Sample model output:
+
+    ```text
+    81    若隐若现 地像 幽灵 , 像 死神
+    -12.2542    一样 。 他 是 个 怪物 <e>
+    -12.6889    一样 。 他 是 个 英雄 <e>
+    -13.9877    一样 。 他 是 我 的 敌人 <e>
+    -14.2741    一样 。 他 是 我 的 <e>
+    -14.6250    一样 。 他 是 我 的 朋友 <e>
+    ```
+    Here:
+    1. The first line, `81    若隐若现 地像 幽灵 , 像 死神`, is `\t`-separated into two columns:
+        - the first column is the index of the input prefix in the sample set;
+        - the second column is the input prefix itself.
+    2. Lines 2 through `beam_size + 1` are the generation results, likewise `\t`-separated into two columns:
+        - the first column is the log probability of the generated sequence;
+        - the second column is the generated word sequence; a normal result ends with the token `<e>`, and a result not ending with `<e>` means the maximum sequence length was reached and generation was cut off.
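+
+For programmatic use, the `BeamSearch` class in `beam_search.py` can also be driven directly instead of going through `generate.py`. The following is a sketch only, assuming a trained model archive and a built vocabulary already exist at the default paths in `config.py`:
+
+```python
+import gzip
+
+import paddle.v2 as paddle
+
+import config as conf
+from beam_search import BeamSearch
+from network_conf import rnn_lm
+from utils import load_dict
+
+paddle.init(use_gpu=conf.use_gpu, trainer_count=conf.trainer_count)
+
+# build the inference network and load the trained parameters
+word_2_ids = load_dict(conf.vocab_file)
+pred_words = rnn_lm(len(word_2_ids), conf.emb_dim, conf.hidden_size,
+                    conf.stacked_rnn_num, conf.rnn_type, is_infer=True)
+parameters = paddle.parameters.Parameters.from_tar(
+    gzip.open(conf.model_path, "r"))
+inferer = paddle.inference.Inference(
+    output_layer=pred_words, parameters=parameters)
+
+generator = BeamSearch(inferer, conf.vocab_file, conf.beam_size,
+                       conf.max_gen_len)
+# gen_a_sentence takes the prefix as a list of word ids and returns up to
+# beam_size strings of the form "log_prob\tgenerated words"
+prefix = "若隐若现 地像 幽灵 , 像 死神"
+ids = [word_2_ids.get(w, word_2_ids["<unk>"]) for w in prefix.split()]
+for res in generator.gen_a_sentence(ids):
+    print(res)
+```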
diff --git a/generate_sequence_by_rnn_lm/beam_search.py b/generate_sequence_by_rnn_lm/beam_search.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0bfa2b3c3a814454ad2f847347ead3848d13ec2
--- /dev/null
+++ b/generate_sequence_by_rnn_lm/beam_search.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+# coding=utf-8
+import os
+import math
+import numpy as np
+
+import paddle.v2 as paddle
+
+from utils import logger, load_reverse_dict
+
+__all__ = ["BeamSearch"]
+
+
+class BeamSearch(object):
+    """
+    Generating sequence by beam search
+    NOTE: this class only implements generating one sentence at a time.
+    """
+
+    def __init__(self, inferer, word_dict_file, beam_size=1, max_gen_len=100):
+        """
+        constructor method.
+
+        :param inferer: object of paddle.Inference that represents the entire
+            network to forward compute the test batch
+        :type inferer: paddle.Inference
+        :param word_dict_file: path of word dictionary file
+        :type word_dict_file: str
+        :param beam_size: expansion width in each iteration
+        :type beam_size: int
+        :param max_gen_len: the maximum number of iterations
+        :type max_gen_len: int
+        """
+        self.inferer = inferer
+        self.beam_size = beam_size
+        self.max_gen_len = max_gen_len
+        self.ids_2_word = load_reverse_dict(word_dict_file)
+        logger.info("dictionary len = %d" % (len(self.ids_2_word)))
+
+        try:
+            self.eos_id = next(x[0] for x in self.ids_2_word.iteritems()
+                               if x[1] == "<e>")
+            self.unk_id = next(x[0] for x in self.ids_2_word.iteritems()
+                               if x[1] == "<unk>")
+        except StopIteration:
+            logger.fatal(("the word dictionary must contain an ending mark "
+                          "<e> in the text generation task."))
+
+        self.candidate_paths = []
+        self.final_paths = []
+
+    def _top_k(self, softmax_out, k):
+        """
+        get indices of the words with the k highest probabilities.
+        NOTE: <unk> will be excluded if it is among the top k words; the word
+        with the (k + 1)th highest probability will be returned instead.
+
+        :param softmax_out: probability over the dictionary
+        :type softmax_out: ndarray
+        :param k: number of word indices to return
+        :type k: int
+        :return: indices of the k words with the highest probabilities.
+        :rtype: list
+        """
+        # sort word ids by probability in descending order, then drop <unk>
+        ids = softmax_out.argsort()[::-1]
+        return ids[ids != self.unk_id][:k]
+
+    def _forward_batch(self, batch):
+        """
+        forward a test batch.
+
+        :params batch: the input data batch
+        :type batch: list
+        :return: probabilities of the predicted word
+        :rtype: ndarray
+        """
+        return self.inferer.infer(input=batch, field=["value"])
+
+    def _beam_expand(self, next_word_prob):
+        """
+        In every iteration step, the model predicts the possible next words.
+        For each input sentence, the top k words are appended to the end of
+        the original sentence to form new generated sentences.
+
+        :param next_word_prob: probabilities of the next words
+        :type next_word_prob: ndarray
+        :return: the expanded new sentences.
+        :rtype: list
+        """
+        assert len(next_word_prob) == len(self.candidate_paths), (
+            "Wrong forward computing results!")
+        top_beam_words = np.apply_along_axis(self._top_k, 1, next_word_prob,
+                                             self.beam_size)
+        new_paths = []
+        for i, words in enumerate(top_beam_words):
+            old_path = self.candidate_paths[i]
+            for w in words:
+                log_prob = old_path["log_prob"] + math.log(next_word_prob[i][w])
+                gen_ids = old_path["ids"] + [w]
+                # a path ending with <e> is complete and moves to final_paths;
+                # all other paths remain candidates for further expansion
+                if w == self.eos_id:
+                    self.final_paths.append({
+                        "log_prob": log_prob,
+                        "ids": gen_ids
+                    })
+                else:
+                    new_paths.append({"log_prob": log_prob, "ids": gen_ids})
+        return new_paths
+
+    def _beam_shrink(self, new_paths):
+        """
+        to retain the top beam_size generated sequences with the highest
+        probabilities at the end of every generation iteration.
+
+        :param new_paths: all possible generated sentences
+        :type new_paths: list
+        :return: a state flag to indicate whether to stop beam search
+        :rtype: bool
+        """
+
+        if len(self.final_paths) >= self.beam_size:
+            max_candidate_log_prob = max(
+                new_paths, key=lambda x: x["log_prob"])["log_prob"]
+            min_complete_path_log_prob = min(
+                self.final_paths, key=lambda x: x["log_prob"])["log_prob"]
+            # stop once no remaining candidate can beat the completed paths
+            if min_complete_path_log_prob >= max_candidate_log_prob:
+                return True
+
+        new_paths.sort(key=lambda x: x["log_prob"], reverse=True)
+        self.candidate_paths = new_paths[:self.beam_size]
+        return False
+
+    def gen_a_sentence(self, input_sentence):
+        """
+        generating sequence for a given input
+
+        :param input_sentence: one input_sentence
+        :type input_sentence: list
+        :return: the generated word sequences
+        :rtype: list
+        """
+        self.candidate_paths = [{"log_prob": 0., "ids": input_sentence}]
+        input_len = len(input_sentence)
+
+        for i in range(self.max_gen_len):
+            next_word_prob = self._forward_batch(
+                [[x["ids"]] for x in self.candidate_paths])
+            new_paths = self._beam_expand(next_word_prob)
+
+            min_candidate_log_prob = min(
+                new_paths, key=lambda x: x["log_prob"])["log_prob"]
+
+            path_to_remove = [
+                path for path in self.final_paths
+                if path["log_prob"] < min_candidate_log_prob
+            ]
+            for p in path_to_remove:
+                self.final_paths.remove(p)
+
+            if self._beam_shrink(new_paths):
+                self.candidate_paths = []
+                break
+
+        gen_ids = sorted(
+            self.final_paths + self.candidate_paths,
+            key=lambda x: x["log_prob"],
+            reverse=True)[:self.beam_size]
+        self.final_paths = []
+
+        def _to_str(x):
+            text = " ".join(self.ids_2_word[idx]
+                            for idx in x["ids"][input_len:])
+            return "%.4f\t%s" % (x["log_prob"], text)
+
+        return map(_to_str, gen_ids)
diff --git a/generate_sequence_by_rnn_lm/config.py b/generate_sequence_by_rnn_lm/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..2085b7ba5f7448fbad26794e30e004b42543ca9a
--- /dev/null
+++ b/generate_sequence_by_rnn_lm/config.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+# coding=utf-8
+import os
+
+################## for building word dictionary  ##################
+
+max_word_num = 51200 - 2
+cutoff_word_fre = 0
+
+################## for training task  #########################
+# path of training data
+train_file = "data/train_data_examples.txt"
+# path of testing data, if testing file does not exist,
+# testing will not be performed at the end of each training pass
+test_file = ""
+# path of word dictionary, if this file does not exist,
+# word dictionary will be built from training data.
+vocab_file = "data/word_vocab.txt" +# directory to save the trained model +# create a new directory if the directoy does not exist +model_save_dir = "models" + +batch_size = 32 # the number of training examples in one forward/backward pass +num_passes = 20 # how many passes to train the model + +log_period = 50 +save_period_by_batches = 50 + +use_gpu = True # to use gpu or not +trainer_count = 1 # number of trainer + +################## for model configuration ################## +rnn_type = "lstm" # "gru" or "lstm" +emb_dim = 256 +hidden_size = 256 +stacked_rnn_num = 2 + +################## for text generation ################## +gen_file = "data/train_data_examples.txt" +gen_result = "data/gen_result.txt" +max_gen_len = 25 # the max number of words to generate +beam_size = 5 +model_path = "models/rnn_lm_pass_00000.tar.gz" + +if not os.path.exists(model_save_dir): + os.mkdir(model_save_dir) diff --git a/generate_sequence_by_rnn_lm/data/train_data_examples.txt b/generate_sequence_by_rnn_lm/data/train_data_examples.txt new file mode 100644 index 0000000000000000000000000000000000000000..db1ad611b0eb6882aac617baeebeea7c029eff7c --- /dev/null +++ b/generate_sequence_by_rnn_lm/data/train_data_examples.txt @@ -0,0 +1,5 @@ +我们 不会 伤害 你 的 。 他们 也 这么 说 。 +你 拥有 你 父亲 皇室 的 血统 。 是 合法 的 继承人 。 +叫 什么 你 可以 告诉 我 。 +你 并 没有 留言 说 要 去 哪里 。 是 的 , 因为 我 必须 要 去 完成 这件 事 。 +你 查出 是 谁 住 在 隔壁 房间 吗 ? diff --git a/generate_sequence_by_rnn_lm/generate.py b/generate_sequence_by_rnn_lm/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..672ef2e187e25bd6766ac57aa808616a5a7a5a04 --- /dev/null +++ b/generate_sequence_by_rnn_lm/generate.py @@ -0,0 +1,75 @@ +# coding=utf-8 +import os +import gzip +import numpy as np + +import paddle.v2 as paddle + +from utils import logger, load_dict +from beam_search import BeamSearch +import config as conf +from network_conf import rnn_lm + + +def rnn_generate(gen_input_file, model_path, max_gen_len, beam_size, + word_dict_file): + """ + use RNN model to generate sequences. + + :param word_id_dict: vocab. + :type word_id_dict: dictionary with content of "{word, id}", + "word" is string type , "id" is int type. + :param num_words: the number of the words to generate. + :type num_words: int + :param beam_size: beam width. + :type beam_size: int + :return: save prediction results to output_file + """ + + assert os.path.exists(gen_input_file), "test file does not exist!" + assert os.path.exists(model_path), "trained model does not exist!" + assert os.path.exists( + word_dict_file), "word dictionary file does not exist!" 
+
+    # load word dictionary
+    word_2_ids = load_dict(word_dict_file)
+    try:
+        UNK_ID = word_2_ids["<unk>"]
+    except KeyError:
+        logger.fatal("the word dictionary must contain a <unk> token!")
+        sys.exit(-1)
+
+    # initialize paddle
+    paddle.init(use_gpu=conf.use_gpu, trainer_count=conf.trainer_count)
+
+    # load the trained model
+    pred_words = rnn_lm(
+        len(word_2_ids),
+        conf.emb_dim,
+        conf.hidden_size,
+        conf.stacked_rnn_num,
+        conf.rnn_type,
+        is_infer=True)
+
+    parameters = paddle.parameters.Parameters.from_tar(
+        gzip.open(model_path, "r"))
+
+    inferer = paddle.inference.Inference(
+        output_layer=pred_words, parameters=parameters)
+
+    generator = BeamSearch(inferer, word_dict_file, beam_size, max_gen_len)
+    # generate text
+    with open(conf.gen_file, "r") as fin, open(conf.gen_result, "w") as fout:
+        for idx, line in enumerate(fin):
+            fout.write("%d\t%s" % (idx, line))
+            for gen_res in generator.gen_a_sentence([
+                    word_2_ids.get(w, UNK_ID)
+                    for w in line.lower().strip().split()
+            ]):
+                fout.write("%s\n" % gen_res)
+            fout.write("\n")
+
+
+if __name__ == "__main__":
+    rnn_generate(conf.gen_file, conf.model_path, conf.max_gen_len,
+                 conf.beam_size, conf.vocab_file)
diff --git a/language_model/images/ngram.png b/generate_sequence_by_rnn_lm/images/ngram.png
similarity index 100%
rename from language_model/images/ngram.png
rename to generate_sequence_by_rnn_lm/images/ngram.png
diff --git a/language_model/images/rnn.png b/generate_sequence_by_rnn_lm/images/rnn.png
similarity index 100%
rename from language_model/images/rnn.png
rename to generate_sequence_by_rnn_lm/images/rnn.png
diff --git a/generate_sequence_by_rnn_lm/index.html b/generate_sequence_by_rnn_lm/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..17e81d9e52494efc18948f8e6e70c5d253614fbc
--- /dev/null
+++ b/generate_sequence_by_rnn_lm/index.html
@@ -0,0 +1,226 @@
diff --git a/generate_sequence_by_rnn_lm/network_conf.py b/generate_sequence_by_rnn_lm/network_conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..7306337bf7515ddfd4df137c3ee81f8aa4fa7b90
--- /dev/null
+++ b/generate_sequence_by_rnn_lm/network_conf.py
@@ -0,0 +1,62 @@
+# coding=utf-8
+
+import paddle.v2 as paddle
+
+
+def rnn_lm(vocab_dim,
+           emb_dim,
+           hidden_size,
+           stacked_rnn_num,
+           rnn_type="lstm",
+           is_infer=False):
+    """
+    RNN language model definition.
+
+    :param vocab_dim: size of vocabulary.
+    :type vocab_dim: int
+    :param emb_dim: dimension of the embedding vector
+    :type emb_dim: int
+    :param hidden_size: number of hidden units.
+    :type hidden_size: int
+    :param stacked_rnn_num: number of stacked rnn cells.
+    :type stacked_rnn_num: int
+    :param rnn_type: the type of RNN cell, "lstm" or "gru".
+    :type rnn_type: str
+    :param is_infer: whether to build the network for inference.
+    :type is_infer: bool
+    :return: the cost layer in training mode, or the prediction of the
+        last word in inference mode.
+    :rtype: LayerOutput
+    """
+
+    # input layers
+    input = paddle.layer.data(
+        name="input", type=paddle.data_type.integer_value_sequence(vocab_dim))
+    if not is_infer:
+        target = paddle.layer.data(
+            name="target",
+            type=paddle.data_type.integer_value_sequence(vocab_dim))
+
+    # embedding layer
+    input_emb = paddle.layer.embedding(input=input, size=emb_dim)
+
+    # rnn layer: the first cell reads the embeddings, every following cell
+    # stacks on top of the previous one
+    if rnn_type == "lstm":
+        for i in range(stacked_rnn_num):
+            rnn_cell = paddle.networks.simple_lstm(
+                input=rnn_cell if i else input_emb, size=hidden_size)
+    elif rnn_type == "gru":
+        for i in range(stacked_rnn_num):
+            rnn_cell = paddle.networks.simple_gru(
+                input=rnn_cell if i else input_emb, size=hidden_size)
+    else:
+        raise Exception("rnn_type must be \"lstm\" or \"gru\"!")
+
+    # fc (fully connected) and output layer
+    output = paddle.layer.fc(
+        input=[rnn_cell], size=vocab_dim, act=paddle.activation.Softmax())
+
+    if is_infer:
+        last_word = paddle.layer.last_seq(input=output)
+        return last_word
+    else:
+        cost = paddle.layer.classification_cost(input=output, label=target)
+
+        return cost
+ """ + + def reader(): + UNK_ID = word_dict[''] + with open(file_name) as file: + for line in file: + words = line.strip().lower().split() + if len(words) < MIN_LEN or len(words) > MAX_LEN: + continue + ids = [word_dict.get(w, UNK_ID) + for w in words] + [word_dict['']] + yield ids[:-1], ids[1:] + + return reader diff --git a/generate_sequence_by_rnn_lm/train.py b/generate_sequence_by_rnn_lm/train.py new file mode 100644 index 0000000000000000000000000000000000000000..2958592748c0fe982972717017dede11c04ebb7e --- /dev/null +++ b/generate_sequence_by_rnn_lm/train.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python +# coding=utf-8 +import os +import sys +import gzip +import pdb + +import paddle.v2 as paddle +import config as conf +import reader +from network_conf import rnn_lm +from utils import logger, build_dict, load_dict + + +def train(topology, + train_reader, + test_reader, + model_save_dir="models", + num_passes=10): + """ + train model. + + :param topology: cost layer of the model to train. + :type topology: LayerOuput + :param train_reader: train data reader. + :type trainer_reader: collections.Iterable + :param test_reader: test data reader. + :type test_reader: collections.Iterable + :param model_save_dir: path to save the trained model + :type model_save_dir: str + :param num_passes: number of epoch + :type num_passes: int + """ + if not os.path.exists(model_save_dir): + os.mkdir(model_save_dir) + + # initialize PaddlePaddle + paddle.init( + use_gpu=conf.use_gpu, gpu_id=3, trainer_count=conf.trainer_count) + + # create optimizer + adam_optimizer = paddle.optimizer.Adam( + learning_rate=1e-3, + regularization=paddle.optimizer.L2Regularization(rate=1e-3), + model_average=paddle.optimizer.ModelAverage( + average_window=0.5, max_average_window=10000)) + + # create parameters + parameters = paddle.parameters.create(topology) + # create trainer + trainer = paddle.trainer.SGD( + cost=topology, parameters=parameters, update_equation=adam_optimizer) + + # define the event_handler callback + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if not event.batch_id % conf.log_period: + logger.info("Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics)) + + if (not event.batch_id % + conf.save_period_by_batches) and event.batch_id: + save_name = os.path.join(model_save_dir, + "rnn_lm_pass_%05d_batch_%03d.tar.gz" % + (event.pass_id, event.batch_id)) + with gzip.open(save_name, "w") as f: + parameters.to_tar(f) + + if isinstance(event, paddle.event.EndPass): + if test_reader is not None: + result = trainer.test(reader=test_reader) + logger.info("Test with Pass %d, %s" % + (event.pass_id, result.metrics)) + save_name = os.path.join(model_save_dir, "rnn_lm_pass_%05d.tar.gz" % + (event.pass_id)) + with gzip.open(save_name, "w") as f: + parameters.to_tar(f) + + logger.info("start training...") + trainer.train( + reader=train_reader, event_handler=event_handler, num_passes=num_passes) + + logger.info("Training is finished.") + + +def main(): + # prepare vocab + if not (os.path.exists(conf.vocab_file) and + os.path.getsize(conf.vocab_file)): + logger.info(("word dictionary does not exist, " + "build it from the training data")) + build_dict(conf.train_file, conf.vocab_file, conf.max_word_num, + conf.cutoff_word_fre) + logger.info("load word dictionary.") + word_dict = load_dict(conf.vocab_file) + logger.info("dictionay size = %d" % (len(word_dict))) + + cost = rnn_lm( + len(word_dict), conf.emb_dim, conf.hidden_size, conf.stacked_rnn_num, 
+
+
+def main():
+    # prepare vocab
+    if not (os.path.exists(conf.vocab_file) and
+            os.path.getsize(conf.vocab_file)):
+        logger.info(("word dictionary does not exist, "
+                     "build it from the training data"))
+        build_dict(conf.train_file, conf.vocab_file, conf.max_word_num,
+                   conf.cutoff_word_fre)
+    logger.info("load word dictionary.")
+    word_dict = load_dict(conf.vocab_file)
+    logger.info("dictionary size = %d" % (len(word_dict)))
+
+    cost = rnn_lm(
+        len(word_dict), conf.emb_dim, conf.hidden_size, conf.stacked_rnn_num,
+        conf.rnn_type)
+
+    # define reader
+    reader_args = {
+        "file_name": conf.train_file,
+        "word_dict": word_dict,
+    }
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            reader.rnn_reader(**reader_args), buf_size=102400),
+        batch_size=conf.batch_size)
+    test_reader = None
+    if os.path.exists(conf.test_file) and os.path.getsize(conf.test_file):
+        test_reader = paddle.batch(
+            paddle.reader.shuffle(
+                reader.rnn_reader(**reader_args), buf_size=65536),
+            batch_size=conf.batch_size)
+
+    train(
+        topology=cost,
+        train_reader=train_reader,
+        test_reader=test_reader,
+        model_save_dir=conf.model_save_dir,
+        num_passes=conf.num_passes)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/generate_sequence_by_rnn_lm/utils.py b/generate_sequence_by_rnn_lm/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..179b68f8234caa776c03729aff7bfab22b8e5592
--- /dev/null
+++ b/generate_sequence_by_rnn_lm/utils.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+# coding=utf-8
+import os
+import logging
+from collections import defaultdict
+
+__all__ = ["build_dict", "load_dict"]
+
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.DEBUG)
+
+
+def build_dict(data_file,
+               save_path,
+               max_word_num,
+               cutoff_word_fre=5,
+               insert_extra_words=["<unk>", "<e>"]):
+    """
+    build the word dictionary from the training corpus.
+
+    :param data_file: path of data file
+    :type data_file: str
+    :param save_path: path to save the word dictionary
+    :type save_path: str
+    :param max_word_num: at most the top max_word_num words will be added
+        into the word vocabulary
+    :type max_word_num: int
+    :param cutoff_word_fre: words whose frequencies are less than
+        cutoff_word_fre will not be added into the word vocabulary
+    :type cutoff_word_fre: int
+    :param insert_extra_words: extra keys defined by users that are added
+        into the word dictionary, usually these keys include <unk> and the
+        start and ending marks
+    :type insert_extra_words: list
+    """
+    word_count = defaultdict(int)
+    with open(data_file, "r") as f:
+        for idx, line in enumerate(f):
+            if not (idx + 1) % 100000:
+                logger.debug("processing %d lines ... " % (idx + 1))
+            words = line.strip().lower().split()
+            for w in words:
+                word_count[w] += 1
+
+    sorted_words = sorted(
+        word_count.iteritems(), key=lambda x: x[1], reverse=True)
+
+    # cut the dictionary off at the first word rarer than cutoff_word_fre
+    stop_pos = len(sorted_words) if sorted_words[-1][
+        1] > cutoff_word_fre else next(idx for idx, v in enumerate(sorted_words)
+                                       if v[1] < cutoff_word_fre)
+
+    stop_pos = min(max_word_num, stop_pos)
+    with open(save_path, "w") as fdict:
+        for w in insert_extra_words:
+            fdict.write("%s\t-1\n" % (w))
+        for idx, info in enumerate(sorted_words):
+            if idx == stop_pos: break
+            fdict.write("%s\t%d\n" % (info[0], info[-1]))
+
+
+def load_dict(dict_path):
+    """
+    load word dictionary from the given file. Each line of the given file is
+    a word in the word dictionary. The first column of the line, separated by
+    TAB, is the key, while the line index is the value.
+
+    :param dict_path: path of word dictionary
+    :type dict_path: str
+    :return: the dictionary
+    :rtype: dict
+    """
+    return dict((line.strip().split("\t")[0], idx)
+                for idx, line in enumerate(open(dict_path, "r").readlines()))
+
+
+def load_reverse_dict(dict_path):
+    """
+    load word dictionary from the given file. Each line of the given file is
+    a word in the word dictionary. The line index is the key, while the first
+    column of the line, separated by TAB, is the value.
+ + :param dict_path: path of word dictionary + :type dict_path: str + :return: the dictionary + :rtype: dict + """ + return dict((idx, line.strip().split("\t")[0]) + for idx, line in enumerate(open(dict_path, "r").readlines())) diff --git a/hsigmoid/infer.py b/hsigmoid/infer.py index ff080ad7a1f4915518e317d5efc4a7a7aec49d0f..8645d00d20047f620d1beef17c60bb5b69996ff9 100644 --- a/hsigmoid/infer.py +++ b/hsigmoid/infer.py @@ -11,7 +11,7 @@ logger = logging.getLogger("paddle") logger.setLevel(logging.WARNING) -def decode_res(infer_res, dict_size): +def decode_result(infer_res, dict_size): """ Inferring probabilities are orginized as a complete binary tree. The actual labels are leaves (indices are counted from class number). @@ -41,10 +41,10 @@ def decode_res(infer_res, dict_size): return predict_lbls -def predict(batch_ins, idx_word_dict, dict_size, inferer): +def infer_a_batch(batch_ins, idx_word_dict, dict_size, inferer): infer_res = inferer.infer(input=batch_ins) - predict_lbls = decode_res(infer_res, dict_size) + predict_lbls = decode_result(infer_res, dict_size) predict_words = [idx_word_dict[lbl] for lbl in predict_lbls] # map to word # Ouput format: word1 word2 word3 word4 -> predict label @@ -53,7 +53,7 @@ def predict(batch_ins, idx_word_dict, dict_size, inferer): for w in ins]) + " -> " + predict_words[i]) -def main(model_path): +def infer(model_path, batch_size): assert os.path.exists(model_path), "trained model does not exist." paddle.init(use_gpu=False, trainer_count=1) @@ -68,19 +68,17 @@ def main(model_path): inferer = paddle.inference.Inference( output_layer=prediction_layer, parameters=parameters) idx_word_dict = dict((v, k) for k, v in word_dict.items()) - batch_size = 64 - batch_ins = [] - ins_iter = paddle.dataset.imikolov.test(word_dict, 5) - for ins in ins_iter(): + batch_ins = [] + for ins in paddle.dataset.imikolov.test(word_dict, 5)(): batch_ins.append(ins[:-1]) if len(batch_ins) == batch_size: - predict(batch_ins, idx_word_dict, dict_size, inferer) + infer_a_batch(batch_ins, idx_word_dict, dict_size, inferer) batch_ins = [] if len(batch_ins) > 0: - predict(batch_ins, idx_word_dict, dict_size, inferer) + infer_a_batch(batch_ins, idx_word_dict, dict_size, inferer) if __name__ == "__main__": - main("models/hsigmoid_batch_00010.tar.gz") + infer("models/hsigmoid_batch_00010.tar.gz", 20) diff --git a/language_model/README.md b/language_model/README.md deleted file mode 100644 index 75c3417ef2308acb15641f570d1c63f9f1366299..0000000000000000000000000000000000000000 --- a/language_model/README.md +++ /dev/null @@ -1,200 +0,0 @@ -# 语言模型 - -## 简介 -语言模型即 Language Model,简称LM。它是一个概率分布模型,简单来说,就是用来计算一个句子的概率的模型。利用它可以确定哪个词序列的可能性更大,或者给定若干个词,可以预测下一个最可能出现的词。语言模型是自然语言处理领域里一个重要的基础模型。 - -## 应用场景 -**语言模型被应用在很多领域**,如: - -* **自动写作**:语言模型可以根据上文生成下一个词,递归下去可以生成整个句子、段落、篇章。 -* **QA**:语言模型可以根据Question生成Answer。 -* **机器翻译**:当前主流的机器翻译模型大多基于Encoder-Decoder模式,其中Decoder就是一个语言模型,用来生成目标语言。 -* **拼写检查**:语言模型可以计算出词序列的概率,一般在拼写错误处序列的概率会骤减,可以用来识别拼写错误并提供改正候选集。 -* **词性标注、句法分析、语音识别......** - -## 关于本例 -Language Model 常见的实现方式有 N-Gram、RNN、seq2seq。本例中实现了基于N-Gram、RNN的语言模型。**本例的文件结构如下**(`images` 文件夹与使用无关可不关心): - - -```text -. 
-├── data # toy、demo数据,用户可据此格式化自己的数据 -│ ├── chinese.test.txt # test用的数据demo -| ├── chinese.train.txt # train用的数据demo -│ └── input.txt # infer用的输入数据demo -├── config.py # 配置文件,包括data、train、infer相关配置 -├── infer.py # 预测任务脚本,即生成文本 -├── network_conf.py # 本例中涉及的各种网络结构均定义在此文件中,希望进一步修改模型结构,请修改此文件 -├── reader.py # 读取数据接口 -├── README.md # 文档 -├── train.py # 训练任务脚本 -└── utils.py # 定义通用的函数,例如:构建字典、加载字典等 -``` - -**注:一般情况下基于N-Gram的语言模型不如基于RNN的语言模型效果好,所以实际使用时建议使用基于RNN的语言模型,本例中也将着重介绍基于RNN的模型,简略介绍基于N-Gram的模型。** - -## RNN 语言模型 -### 简介 - -RNN是一个序列模型,基本思路是:在时刻t,将前一时刻t-1的隐藏层输出和t时刻的词向量一起输入到隐藏层从而得到时刻t的特征表示,然后用这个特征表示得到t时刻的预测输出,如此在时间维上递归下去。可以看出RNN善于使用上文信息、历史知识,具有“记忆”功能。理论上RNN能实现“长依赖”(即利用很久之前的知识),但在实际应用中发现效果并不理想,于是出现了很多RNN的变种,如常用的LSTM和GRU,它们对传统RNN的cell进行了改进,弥补了传统RNN的不足,本例中即使用了LSTM、GRU。下图是RNN(广义上包含了LSTM、GRU等)语言模型“循环”思想的示意图: - -

- -### 模型实现 - -本例中RNN语言模型的实现简介如下: - -* **定义模型参数**:`config.py`中的`Config_rnn`**类**中定义了模型的参数变量。 -* **定义模型结构**:`network_conf.py`中的`rnn_lm`**函数**中定义了模型的**结构**,如下: - * 输入层:将输入的词(或字)序列映射成向量,即embedding。 - * 中间层:根据配置实现RNN层,将上一步得到的embedding向量序列作为输入。 - * 输出层:使用softmax归一化计算单词的概率,将output结果返回 - * loss:定义模型的cost为多类交叉熵损失函数。 -* **训练模型**:`train.py`中的`main`方法实现了模型的训练,实现流程如下: - * 准备输入数据:建立并保存词典、构建train和test数据的reader。 - * 初始化模型:包括模型的结构、参数。 - * 构建训练器:demo中使用的是Adam优化算法。 - * 定义回调函数:构建`event_handler`来跟踪训练过程中loss的变化,并在每轮训练结束时保存模型的参数。 - * 训练:使用trainer训练模型。 - -* **生成文本**:`infer.py`中的`main`方法实现了文本的生成,实现流程如下: - * 根据配置选择生成方法:RNN模型 or N-Gram模型。 - * 加载train好的模型和词典文件。 - * 读取`input_file`文件(每行为一个sentence的前缀),用启发式图搜索算法`beam_search`根据各sentence的前缀生成文本。 - * 将生成的文本及其前缀保存到文件`output_file`。 - - -## N-Gram 语言模型 - -### 简介 -N-Gram模型也称为N-1阶马尔科夫模型,它有一个有限历史假设:当前词的出现概率仅仅与前面N-1个词相关。一般采用最大似然估计(Maximum Likelihood Estimation,MLE)方法对模型的参数进行估计。当N取1、2、3时,N-Gram模型分别称为unigram、bigram和trigram语言模型。一般情况下,N越大、训练语料的规模越大,参数估计的结果越可靠,但由于模型较简单、表达能力不强以及数据稀疏等问题。一般情况下用N-Gram实现的语言模型不如RNN、seq2seq效果好。下图是基于神经网络的N-Gram语言模型结构示意图: - -

- -### 模型实现 - -本例中N-Gram语言模型的实现简介如下: - -* **定义模型参数**:`config.py`中的`Config_ngram`**类**中定义了模型的参数变量。 -* **定义模型结构**:`network_conf.py`中的`ngram_lm`**函数**中定义了模型的**结构**,如下: - * 输入层:本例中N取5,将前四个词分别做embedding,然后连接起来作为输入。 - * 中间层:根据配置实现DNN层,将上一步得到的embedding向量序列作为输入。 - * 输出层:使用softmax归一化计算单词的概率,将output结果返回 - * loss:定义模型的cost为多类交叉熵损失函数。 -* **训练模型**:`train.py`中的`main`方法实现了模型的训练,实现流程与上文中RNN语言模型基本一致。 -* **生成文本**:`infer.py`中的`main`方法实现了文本的生成,实现流程与上文中RNN语言模型基本一致,区别在于构建input时本例会取每个前缀的最后4(N-1)个词作为输入。 - -## 使用说明 - -运行本例的方法如下: - -* 1,运行`python train.py`命令,开始train模型(默认使用RNN),待训练结束。 -* 2,运行`python infer.py`命令做prediction。(输入的文本默认为`data/input.txt`,生成的文本默认保存到`data/output.txt`中。) - - -**如果用户需要使用自己的语料、定制模型,需要修改的地方主要是`语料`和`config.py`中的配置,需要注意的细节和适配工作详情如下:** - - -### 语料适配 - -* 清洗语料:去除原文中空格、tab、乱码,按需去除数字、标点符号、特殊符号等。 -* 编码格式:utf-8,本例中已经对中文做了适配。 -* 内容格式:每个句子占一行;每行中的各词之间使用一个空格符分开。 -* 按需要配置`config.py`中对于data的配置: - - ```python - # -- config : data -- - - train_file = 'data/chinese.train.txt' - test_file = 'data/chinese.test.txt' - vocab_file = 'data/vocab_cn.txt' # the file to save vocab - - build_vocab_method = 'fixed_size' # 'frequency' or 'fixed_size' - vocab_max_size = 3000 # when build_vocab_method = 'fixed_size' - unk_threshold = 1 # # when build_vocab_method = 'frequency' - - min_sentence_length = 3 - max_sentence_length = 60 - ``` - - 其中,`build_vocab_method `指定了构建词典的方法:**1,按词频**,即将出现次数小于`unk_threshold `的词视为``;**2,按词典长度**,`vocab_max_size`定义了词典的最大长度,如果语料中出现的不同词的个数大于这个值,则根据各词的词频倒序排,取`top(vocab_max_size)`个词纳入词典。 - - 其中`min_sentence_length`和`max_sentence_length `分别指定了句子的最小和最大长度,小于最小长度的和大于最大长度的句子将被过滤掉、不参与训练。 - - *注:需要注意的是词典越大生成的内容越丰富但训练耗时越久,一般中文分词之后,语料中不同的词能有几万乃至几十万,如果vocab\_max\_size取值过小则导致\占比过高,如果vocab\_max\_size取值较大则严重影响训练速度(对精度也有影响),所以也有“按字”训练模型的方式,即:把每个汉字当做一个词,常用汉字也就几千个,使得字典的大小不会太大、不会丢失太多信息,但汉语中同一个字在不同词中语义相差很大,有时导致模型效果不理想。建议用户多试试、根据实际情况选择是“按词训练”还是“按字训练”。* - -### 模型适配、训练 - -* 按需调整`config.py`中对于模型的配置,详解如下: - - ```python - # -- config : train -- - - use_which_model = 'rnn' # must be: 'rnn' or 'ngram' - use_gpu = False # whether to use gpu - trainer_count = 1 # number of trainer - - - class Config_rnn(object): - """ - config for RNN language model - """ - rnn_type = 'gru' # or 'lstm' - emb_dim = 200 - hidden_size = 200 - num_layer = 2 - num_passs = 2 - batch_size = 32 - model_file_name_prefix = 'lm_' + rnn_type + '_params_pass_' - - - class Config_ngram(object): - """ - config for N-Gram language model - """ - emb_dim = 200 - hidden_size = 200 - num_layer = 2 - N = 5 - num_passs = 2 - batch_size = 32 - model_file_name_prefix = 'lm_ngram_pass_' - ``` - - 其中,`use_which_model`指定了要train的模型,如果使用RNN语言模型则设置为'rnn',如果使用N-Gram语言模型则设置为'ngram';`use_gpu`指定了train的时候是否使用gpu;`trainer_count`指定了并行度、用几个trainer去train模型;`rnn_type` 用于配置rnn cell类型,可以取‘lstm’或‘gru’;`hidden_size`配置unit个数;`num_layer`配置RNN的层数;`num_passs`配置训练的轮数;`emb_dim`配置embedding的dimension;`batch_size `配置了train model时每个batch的大小;`model_file_name_prefix `配置了要保存的模型的名字前缀。 - -* 运行`python train.py`命令训练模型,模型将被保存到当前目录。 - -### 按需生成文本 - -* 按需调整`config.py`中对于infer的配置,详解如下: - - ```python - # -- config : infer -- - - input_file = 'data/input.txt' # input file contains sentence prefix each line - output_file = 'data/output.txt' # the file to save results - num_words = 10 # the max number of words need to generate - beam_size = 5 # beam_width, the number of the prediction sentence for each prefix - ``` - - 其中,`input_file`中保存的是待生成的文本前缀,utf-8编码,每个前缀占一行,形如: - - ```text - 我 - 我 是 - ``` - 用户将需要生成的文本前缀按此格式存入文件即可; - `num_words`指定了要生成多少个单词(实际生成过程中遇到结束符会停止生成,所以实际生成的词个数可能会比此值小);`beam_size`指定了beam 
search方法的width,即每个前缀生成多少个候选词序列;`output_file`指定了生成结果的存放位置。 - -* 运行`python infer.py`命令生成文本,生成的结果格式如下: - - ```text - 我 - 我 0.107702672482 - 我 爱 。我 中国 中国 0.000177299271939 - 我 爱 中国 。我 是 中国 4.51695544709e-05 - 我 爱 中国 中国 0.000910127729821 - 我 爱 中国 。我 是 0.00015957862922 - ``` - 其中,‘我’是前缀,其下方的五个句子时补全的结果,每个句子末尾的浮点数表示此句子的生成概率。 diff --git a/language_model/config.py b/language_model/config.py deleted file mode 100644 index b78cb836d7587af8acde8801d745d066f9930337..0000000000000000000000000000000000000000 --- a/language_model/config.py +++ /dev/null @@ -1,54 +0,0 @@ -# coding=utf-8 - -# -- config : data -- - -train_file = 'data/chinese.train.txt' -test_file = 'data/chinese.test.txt' -vocab_file = 'data/vocab_cn.txt' # the file to save vocab - -build_vocab_method = 'fixed_size' # 'frequency' or 'fixed_size' -vocab_max_size = 3000 # when build_vocab_method = 'fixed_size' -unk_threshold = 1 # # when build_vocab_method = 'frequency' - -min_sentence_length = 3 -max_sentence_length = 60 - -# -- config : train -- - -use_which_model = 'ngram' # must be: 'rnn' or 'ngram' -use_gpu = False # whether to use gpu -trainer_count = 1 # number of trainer - - -class Config_rnn(object): - """ - config for RNN language model - """ - rnn_type = 'gru' # or 'lstm' - emb_dim = 200 - hidden_size = 200 - num_layer = 2 - num_passs = 2 - batch_size = 32 - model_file_name_prefix = 'lm_' + rnn_type + '_params_pass_' - - -class Config_ngram(object): - """ - config for N-Gram language model - """ - emb_dim = 200 - hidden_size = 200 - num_layer = 2 - N = 5 - num_passs = 2 - batch_size = 32 - model_file_name_prefix = 'lm_ngram_pass_' - - -# -- config : infer -- - -input_file = 'data/input.txt' # input file contains sentence prefix each line -output_file = 'data/output.txt' # the file to save results -num_words = 10 # the max number of words need to generate -beam_size = 5 # beam_width, the number of the prediction sentence for each prefix diff --git a/language_model/data/chinese.test.txt b/language_model/data/chinese.test.txt deleted file mode 100755 index e0dbe2634313a36c7d234cdf2e5bef151981675b..0000000000000000000000000000000000000000 --- a/language_model/data/chinese.test.txt +++ /dev/null @@ -1,39 +0,0 @@ -我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。 \ No newline at end of file diff --git a/language_model/data/chinese.train.txt b/language_model/data/chinese.train.txt deleted file mode 100755 index e0dbe2634313a36c7d234cdf2e5bef151981675b..0000000000000000000000000000000000000000 --- a/language_model/data/chinese.train.txt +++ /dev/null @@ -1,39 +0,0 @@ -我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 
。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。我 是 中国 人 。 -我 爱 中国 。 \ No newline at end of file diff --git a/language_model/data/input.txt b/language_model/data/input.txt deleted file mode 100644 index d0913ec91b66e77d49008684095baa4f2d668930..0000000000000000000000000000000000000000 --- a/language_model/data/input.txt +++ /dev/null @@ -1,10 +0,0 @@ -我 -我 是 -我 是 中国 -我 爱 -我 是 中国 人。 -我 爱 中国 -我 爱 中国 。我 -我 爱 中国 。我 爱 -我 爱 中国 。我 是 -我 爱 中国 。我 是 中国 \ No newline at end of file diff --git a/language_model/infer.py b/language_model/infer.py deleted file mode 100644 index 26ae4b822d1d76f241ca1b6d0a0439c9db4d7ca5..0000000000000000000000000000000000000000 --- a/language_model/infer.py +++ /dev/null @@ -1,214 +0,0 @@ -# coding=utf-8 -import paddle.v2 as paddle -import gzip -import numpy as np -from utils import * -import network_conf -from config import * - - -def generate_using_rnn(word_id_dict, num_words, beam_size): - """ - Demo: use RNN model to do prediction. - - :param word_id_dict: vocab. - :type word_id_dict: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type. - :param num_words: the number of the words to generate. - :type num_words: int - :param beam_size: beam width. - :type beam_size: int - :return: save prediction results to output_file - """ - - # prepare and cache model - config = Config_rnn() - _, output_layer = network_conf.rnn_lm( - vocab_size=len(word_id_dict), - emb_dim=config.emb_dim, - rnn_type=config.rnn_type, - hidden_size=config.hidden_size, - num_layer=config.num_layer) # network config - model_file_name = config.model_file_name_prefix + str(config.num_passs - - 1) + '.tar.gz' - parameters = paddle.parameters.Parameters.from_tar( - gzip.open(model_file_name)) # load parameters - inferer = paddle.inference.Inference( - output_layer=output_layer, parameters=parameters) - - # tools, different from generate_using_ngram's tools - id_word_dict = dict( - [(v, k) for k, v in word_id_dict.items()]) # {id : word} - - def str2ids(str): - return [[[ - word_id_dict.get(w, word_id_dict['']) for w in str.split() - ]]] - - def ids2str(ids): - return [[[id_word_dict.get(id, ' ') for id in ids]]] - - # generate text - with open(input_file) as file: - output_f = open(output_file, 'w') - for line in file: - line = line.decode('utf-8').strip() - # generate - texts = {} # type: {text : probability} - texts[line] = 1 - for _ in range(num_words): - texts_new = {} - for (text, prob) in texts.items(): - if '' in text: # stop prediction when appear - texts_new[text] = prob - continue - # next word's probability distribution - predictions = inferer.infer(input=str2ids(text)) - predictions[-1][word_id_dict['']] = -1 # filter - # find next beam_size words - for _ in range(beam_size): - cur_maxProb_index = np.argmax( - predictions[-1]) # next word's id - text_new = text + ' ' + id_word_dict[ - cur_maxProb_index] # text append next word - texts_new[text_new] = texts[text] * predictions[-1][ - cur_maxProb_index] - predictions[-1][cur_maxProb_index] = -1 - texts.clear() - if len(texts_new) <= beam_size: - texts = texts_new - else: # cutting - texts = dict( - sorted( - 
texts_new.items(), key=lambda d: d[1], reverse=True) - [:beam_size]) - - # save results to output file - output_f.write(line.encode('utf-8') + '\n') - for (sentence, prob) in texts.items(): - output_f.write('\t' + sentence.encode('utf-8', 'replace') + '\t' - + str(prob) + '\n') - output_f.write('\n') - - output_f.close() - print('already saved results to ' + output_file) - - -def generate_using_ngram(word_id_dict, num_words, beam_size): - """ - Demo: use N-Gram model to do prediction. - - :param word_id_dict: vocab. - :type word_id_dict: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type. - :param num_words: the number of the words to generate. - :type num_words: int - :param beam_size: beam width. - :type beam_size: int - :return: save prediction results to output_file - """ - - # prepare and cache model - config = Config_ngram() - _, output_layer = network_conf.ngram_lm( - vocab_size=len(word_id_dict), - emb_dim=config.emb_dim, - hidden_size=config.hidden_size, - num_layer=config.num_layer) # network config - model_file_name = config.model_file_name_prefix + str(config.num_passs - - 1) + '.tar.gz' - parameters = paddle.parameters.Parameters.from_tar( - gzip.open(model_file_name)) # load parameters - inferer = paddle.inference.Inference( - output_layer=output_layer, parameters=parameters) - - # tools, different from generate_using_rnn's tools - id_word_dict = dict( - [(v, k) for k, v in word_id_dict.items()]) # {id : word} - - def str2ids(str): - return [[ - word_id_dict.get(w, word_id_dict['']) for w in str.split() - ]] - - def ids2str(ids): - return [[id_word_dict.get(id, ' ') for id in ids]] - - # generate text - with open(input_file) as file: - output_f = open(output_file, 'w') - for line in file: - line = line.decode('utf-8').strip() - words = line.split() - if len(words) < config.N: - output_f.write(line.encode('utf-8') + "\n\tnone\n") - continue - # generate - texts = {} # type: {text : probability} - texts[line] = 1 - for _ in range(num_words): - texts_new = {} - for (text, prob) in texts.items(): - if '' in text: # stop prediction when appear - texts_new[text] = prob - continue - # next word's probability distribution - predictions = inferer.infer( - input=str2ids(' '.join(text.split()[-config.N:]))) - predictions[-1][word_id_dict['']] = -1 # filter - # find next beam_size words - for _ in range(beam_size): - cur_maxProb_index = np.argmax( - predictions[-1]) # next word's id - text_new = text + ' ' + id_word_dict[ - cur_maxProb_index] # text append nextWord - texts_new[text_new] = texts[text] * predictions[-1][ - cur_maxProb_index] - predictions[-1][cur_maxProb_index] = -1 - texts.clear() - if len(texts_new) <= beam_size: - texts = texts_new - else: # cutting - texts = dict( - sorted( - texts_new.items(), key=lambda d: d[1], reverse=True) - [:beam_size]) - - # save results to output file - output_f.write(line.encode('utf-8') + '\n') - for (sentence, prob) in texts.items(): - output_f.write('\t' + sentence.encode('utf-8', 'replace') + '\t' - + str(prob) + '\n') - output_f.write('\n') - - output_f.close() - print('already saved results to ' + output_file) - - -def main(): - # init paddle - paddle.init(use_gpu=use_gpu, trainer_count=trainer_count) - - # prepare and cache vocab - if os.path.isfile(vocab_file): - word_id_dict = load_vocab(vocab_file) # load word dictionary - else: - if build_vocab_method == 'fixed_size': - word_id_dict = build_vocab_with_fixed_size( - train_file, vocab_max_size) # build vocab - else: - word_id_dict = 
build_vocab_using_threshhold( - train_file, unk_threshold) # build vocab - save_vocab(word_id_dict, vocab_file) # save vocab - - # generate - if use_which_model == 'rnn': - generate_using_rnn( - word_id_dict=word_id_dict, num_words=num_words, beam_size=beam_size) - elif use_which_model == 'ngram': - generate_using_ngram( - word_id_dict=word_id_dict, num_words=num_words, beam_size=beam_size) - else: - raise Exception('use_which_model must be rnn or ngram!') - - -if __name__ == "__main__": - main() diff --git a/language_model/network_conf.py b/language_model/network_conf.py deleted file mode 100644 index e53ca66cc1adc49c6e7dcb50b77219030e74681d..0000000000000000000000000000000000000000 --- a/language_model/network_conf.py +++ /dev/null @@ -1,95 +0,0 @@ -# coding=utf-8 - -import paddle.v2 as paddle - - -def rnn_lm(vocab_size, emb_dim, rnn_type, hidden_size, num_layer): - """ - RNN language model definition. - - :param vocab_size: size of vocab. - :param emb_dim: embedding vector's dimension. - :param rnn_type: the type of RNN cell. - :param hidden_size: number of unit. - :param num_layer: layer number. - :return: cost and output layer of model. - """ - - assert emb_dim > 0 and hidden_size > 0 and vocab_size > 0 and num_layer > 0 - - # input layers - input = paddle.layer.data( - name="input", type=paddle.data_type.integer_value_sequence(vocab_size)) - target = paddle.layer.data( - name="target", type=paddle.data_type.integer_value_sequence(vocab_size)) - - # embedding layer - input_emb = paddle.layer.embedding(input=input, size=emb_dim) - - # rnn layer - if rnn_type == 'lstm': - rnn_cell = paddle.networks.simple_lstm( - input=input_emb, size=hidden_size) - for _ in range(num_layer - 1): - rnn_cell = paddle.networks.simple_lstm( - input=rnn_cell, size=hidden_size) - elif rnn_type == 'gru': - rnn_cell = paddle.networks.simple_gru(input=input_emb, size=hidden_size) - for _ in range(num_layer - 1): - rnn_cell = paddle.networks.simple_gru( - input=rnn_cell, size=hidden_size) - else: - raise Exception('rnn_type error!') - - # fc(full connected) and output layer - output = paddle.layer.fc( - input=[rnn_cell], size=vocab_size, act=paddle.activation.Softmax()) - - # loss - cost = paddle.layer.classification_cost(input=output, label=target) - - return cost, output - - -def ngram_lm(vocab_size, emb_dim, hidden_size, num_layer, gram_num=4): - """ - N-Gram language model definition. - - :param vocab_size: size of vocab. - :param emb_dim: embedding vector's dimension. - :param hidden_size: size of unit. - :param num_layer: number of hidden layers. - :param gram_size: gram number in n-gram method - :return: cost and output layer of model. 
- """ - - assert emb_dim > 0 and hidden_size > 0 and vocab_size > 0 and num_layer > 0 - - # input layers - emb_layers = [] - for i in range(gram_num): - word = paddle.layer.data( - name="__word%02d__" % (i + 1), - type=paddle.data_type.integer_value(vocab_size)) - emb = paddle.layer.embedding( - input=word, - size=emb_dim, - param_attr=paddle.attr.Param(name="_proj", initial_std=1e-3)) - emb_layers.append(emb) - next_word = paddle.layer.data( - name="__next_word__", type=paddle.data_type.integer_value(vocab_size)) - - # hidden layer - for i in range(num_layer): - hidden = paddle.layer.fc( - input=hidden if i else paddle.layer.concat(input=emb_layers), - size=hidden_size, - act=paddle.activation.Relu()) - - predict_word = paddle.layer.fc( - input=[hidden], size=vocab_size, act=paddle.activation.Softmax()) - - # loss - cost = paddle.layer.classification_cost(input=predict_word, label=next_word) - - return cost, predict_word diff --git a/language_model/reader.py b/language_model/reader.py deleted file mode 100644 index 4a7fedea1d76fc5ad07b7bee303af476dfb87710..0000000000000000000000000000000000000000 --- a/language_model/reader.py +++ /dev/null @@ -1,63 +0,0 @@ -# coding=utf-8 -import collections -import os - - -def rnn_reader(file_name, min_sentence_length, max_sentence_length, - word_id_dict): - """ - create reader for RNN, each line is a sample. - - :param file_name: file name. - :param min_sentence_length: sentence's min length. - :param max_sentence_length: sentence's max length. - :param word_id_dict: vocab with content of '{word, id}', 'word' is string type , 'id' is int type. - :return: data reader. - """ - - def reader(): - UNK = word_id_dict[''] - with open(file_name) as file: - for line in file: - words = line.decode('utf-8', 'ignore').strip().split() - if len(words) < min_sentence_length or len( - words) > max_sentence_length: - continue - ids = [word_id_dict.get(w, UNK) for w in words] - ids.append(word_id_dict['']) - target = ids[1:] - target.append(word_id_dict['']) - yield ids[:], target[:] - - return reader - - -def ngram_reader(file_name, N, word_id_dict): - """ - create reader for N-Gram. - - :param file_name: file name. - :param N: N-Gram's N. - :param word_id_dict: vocab with content of '{word, id}', 'word' is string type , 'id' is int type. - :return: data reader. - """ - assert N >= 2 - - def reader(): - ids = [] - UNK_ID = word_id_dict[''] - cache_size = 10000000 - with open(file_name) as file: - for line in file: - words = line.decode('utf-8', 'ignore').strip().split() - ids += [word_id_dict.get(w, UNK_ID) for w in words] - ids_len = len(ids) - if ids_len > cache_size: # output - for i in range(ids_len - N - 1): - yield tuple(ids[i:i + N]) - ids = [] - ids_len = len(ids) - for i in range(ids_len - N - 1): - yield tuple(ids[i:i + N]) - - return reader diff --git a/language_model/train.py b/language_model/train.py deleted file mode 100644 index a2b79b531b9e6ec4f0546d0685a38f65823df960..0000000000000000000000000000000000000000 --- a/language_model/train.py +++ /dev/null @@ -1,139 +0,0 @@ -# coding=utf-8 -import sys -import paddle.v2 as paddle -import reader -from utils import * -import network_conf -import gzip -from config import * - - -def train(model_cost, train_reader, test_reader, model_file_name_prefix, - num_passes): - """ - train model. - - :param model_cost: cost layer of the model to train. - :param train_reader: train data reader. - :param test_reader: test data reader. - :param model_file_name_prefix: model's prefix name. - :param num_passes: epoch. 
- :return: - """ - - # init paddle - paddle.init(use_gpu=use_gpu, trainer_count=trainer_count) - - # create parameters - parameters = paddle.parameters.create(model_cost) - - # create optimizer - adam_optimizer = paddle.optimizer.Adam( - learning_rate=1e-3, - regularization=paddle.optimizer.L2Regularization(rate=1e-3), - model_average=paddle.optimizer.ModelAverage( - average_window=0.5, max_average_window=10000)) - - # create trainer - trainer = paddle.trainer.SGD( - cost=model_cost, parameters=parameters, update_equation=adam_optimizer) - - # define event_handler callback - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print("\nPass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics)) - else: - sys.stdout.write('.') - sys.stdout.flush() - - # save model each pass - if isinstance(event, paddle.event.EndPass): - result = trainer.test(reader=test_reader) - print("\nTest with Pass %d, %s" % (event.pass_id, result.metrics)) - with gzip.open( - model_file_name_prefix + str(event.pass_id) + '.tar.gz', - 'w') as f: - parameters.to_tar(f) - - # start to train - print('start training...') - trainer.train( - reader=train_reader, event_handler=event_handler, num_passes=num_passes) - - print("Training finished.") - - -def main(): - # prepare vocab - print('prepare vocab...') - if build_vocab_method == 'fixed_size': - word_id_dict = build_vocab_with_fixed_size( - train_file, vocab_max_size) # build vocab - else: - word_id_dict = build_vocab_using_threshhold( - train_file, unk_threshold) # build vocab - save_vocab(word_id_dict, vocab_file) # save vocab - - # init model and data reader - if use_which_model == 'rnn': - # init RNN model - print('prepare rnn model...') - config = Config_rnn() - cost, _ = network_conf.rnn_lm( - len(word_id_dict), config.emb_dim, config.rnn_type, - config.hidden_size, config.num_layer) - - # init RNN data reader - train_reader = paddle.batch( - paddle.reader.shuffle( - reader.rnn_reader(train_file, min_sentence_length, - max_sentence_length, word_id_dict), - buf_size=65536), - batch_size=config.batch_size) - - test_reader = paddle.batch( - paddle.reader.shuffle( - reader.rnn_reader(test_file, min_sentence_length, - max_sentence_length, word_id_dict), - buf_size=65536), - batch_size=config.batch_size) - - elif use_which_model == 'ngram': - # init N-Gram model - print('prepare ngram model...') - config = Config_ngram() - assert config.N == 5 - cost, _ = network_conf.ngram_lm( - vocab_size=len(word_id_dict), - emb_dim=config.emb_dim, - hidden_size=config.hidden_size, - num_layer=config.num_layer) - - # init N-Gram data reader - train_reader = paddle.batch( - paddle.reader.shuffle( - reader.ngram_reader(train_file, config.N, word_id_dict), - buf_size=65536), - batch_size=config.batch_size) - - test_reader = paddle.batch( - paddle.reader.shuffle( - reader.ngram_reader(test_file, config.N, word_id_dict), - buf_size=65536), - batch_size=config.batch_size) - else: - raise Exception('use_which_model must be rnn or ngram!') - - # train model - train( - model_cost=cost, - train_reader=train_reader, - test_reader=test_reader, - model_file_name_prefix=config.model_file_name_prefix, - num_passes=config.num_passs) - - -if __name__ == "__main__": - main() diff --git a/language_model/utils.py b/language_model/utils.py deleted file mode 100644 index 8beef77c686c8850c4437e707c07d89697a21953..0000000000000000000000000000000000000000 --- a/language_model/utils.py +++ /dev/null @@ -1,87 +0,0 @@ 
-# coding=utf-8 -import os -import collections - - -def save_vocab(word_id_dict, vocab_file_name): - """ - save vocab. - - :param word_id_dict: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type. - :param vocab_file_name: vocab file name. - """ - f = open(vocab_file_name, 'w') - for (k, v) in word_id_dict.items(): - f.write(k.encode('utf-8') + '\t' + str(v) + '\n') - print('save vocab to ' + vocab_file_name) - f.close() - - -def load_vocab(vocab_file_name): - """ - load vocab from file. - :param vocab_file_name: vocab file name. - :return: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type. - """ - assert os.path.isfile(vocab_file_name) - dict = {} - with open(vocab_file_name) as file: - for line in file: - if len(line) < 2: - continue - kv = line.decode('utf-8').strip().split('\t') - dict[kv[0]] = int(kv[1]) - return dict - - -def build_vocab_using_threshhold(file_name, unk_threshold): - """ - build vacab using_ threshhold. - - :param file_name: - :param unk_threshold: threshhold. - :type unk_threshold: int. - :return: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type. - """ - counter = {} - with open(file_name) as file: - for line in file: - words = line.decode('utf-8', 'ignore').strip().split() - for word in words: - if word in counter: - counter[word] += 1 - else: - counter[word] = 1 - counter_new = {} - for (word, frequency) in counter.items(): - if frequency >= unk_threshold: - counter_new[word] = frequency - counter.clear() - counter_new = sorted(counter_new.items(), key=lambda d: -d[1]) - words = [word_frequency[0] for word_frequency in counter_new] - word_id_dict = dict(zip(words, range(2, len(words) + 2))) - word_id_dict[''] = 0 - word_id_dict[''] = 1 - return word_id_dict - - -def build_vocab_with_fixed_size(file_name, vocab_max_size): - """ - build vacab with assigned max size. - - :param vocab_max_size: vocab's max size. - :return: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type. 
- """ - words = [] - for line in open(file_name): - words += line.decode('utf-8', 'ignore').strip().split() - - counter = collections.Counter(words) - counter = sorted(counter.items(), key=lambda x: -x[1]) - if len(counter) > vocab_max_size: - counter = counter[:vocab_max_size] - words, counts = zip(*counter) - word_id_dict = dict(zip(words, range(2, len(words) + 2))) - word_id_dict[''] = 0 - word_id_dict[''] = 1 - return word_id_dict diff --git a/nce_cost/.gitignore b/nce_cost/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..203ec9a67426fee99e6228716433bb1bec8ff14f --- /dev/null +++ b/nce_cost/.gitignore @@ -0,0 +1,3 @@ +*.pyc +*.tar.gz +models diff --git a/nce_cost/README.md b/nce_cost/README.md index fce8bdaf80501e5bed650e93efc6c438284031c9..81c26c288874d59d1a3da2f6ca6fdd3cfe304d8a 100644 --- a/nce_cost/README.md +++ b/nce_cost/README.md @@ -1,12 +1,12 @@ # 噪声对比估计加速词向量训练 -## 背景介绍 -在自然语言处理领域中,通常使用特征向量来表示一个单词,但是如何使用准确的词向量来表示语义却是一个难点,详细内容可以在[词向量章节](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md)中查阅到,原作者使用神经概率语言模型(Neural Probabilistic Language Model, NPLM)来训练词向量,尽管 NPLM 有优异的精度表现,但是相对于传统的 N-gram 统计模型,训练时间还是太漫长了\[[3](#参考文献)\]。常用的优化这个问题算法主要有两个:一个是 hierarchical-sigmoid \[[2](#参考文献)\] 另一个 噪声对比估计(Noise-contrastive estimation, NCE)\[[1](#参考文献)\]。为了克服这个问题本文引入了 NCE 方法。本文将以训练 NPLM 作为例子来讲述如何使用 NCE。 -## NCE 概览 -NCE 是一种快速对离散分布进行估计的方法,应用到本文中的问题:训练 NPLM 计算开销很大,原因是 softmax 函数计算时需要考虑每个类别的指数项,必须计算字典中的所有单词,而在一般语料集上面字典往往非常大\[[3](#参考文献)\],从而导致整个训练过程十分耗时。与常用的 hierarchical-sigmoid \[[2](#参考文献)\] 方法相比,NCE 不再使用复杂的二叉树来构造目标函数,而是采用相对简单的随机负采样,以大幅提升计算效率。 +词向量是许多自然语言处理任务的基础,详细介绍可见 PaddleBook 中的[词向量](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md)一节,其中通过训练神经概率语言模型(Neural Probabilistic Language Model, NPLM)得到词向量,是一种流行的方式。然而,神经概率语言模型的最后一层往往需要计算一个词典之上的概率分布,词典越大这一层的计算量也就越大,往往非常耗时。在models的另一篇我们已经介绍了[Hsigmoid加速词向量训练](https://github.com/PaddlePaddle/models/tree/develop/hsigmoid),这里我们介绍另一种加速词向量训练的方法:使用噪声对比估计(Noise-contrastive estimation, NCE)损失函数\[[1](#参考文献)\]。 +## NCE +NPLM 的最后一层 `softmax` 函数计算时需要考虑每个类别的指数项,必须计算字典中的所有单词,而在一般语料集上面字典往往非常大\[[3](#参考文献)\],从而导致整个训练过程十分耗时。NCE 是一种快速对离散分布进行估计的方法。与常用的 hierarchical-sigmoid \[[2](#参考文献)\] 方法相比,NCE 不再使用复杂的二叉树来构造目标函数,而是采用相对简单的随机负采样,以大幅提升计算效率。 -假设已知具体的上下文 $h$,并且知道这个分布为 $P^h(w)$ ,并将从中抽样出来的数据作为正样例,而从一个噪音分布 $P_n(w)$ 抽样的数据作为负样例。我们可以任意选择合适的噪音分布,默认为无偏的均匀分布。这里我们同时假设噪音样例 k 倍于数据样例,则训练数据被抽中的概率为\[[1](#参考文献)\]: + +假设已知具体的上下文 $h$,并且知道这个分布为 $P^h(w)$ ,并将从中抽样出来的数据作为正样例,而从一个噪音分布 $P_n(w)$ 抽样的数据作为负样例。我们可以任意选择合适的噪音分布,默认为无偏的均匀分布。这里我们同时假设噪音样例 $k$ 倍于数据样例,则训练数据被抽中的概率为\[[1](#参考文献)\]: $$P^h(D=1|w,\theta)=\frac { P_\theta^h(w) }{ P^h_\theta(w)+kP_n(w) } =\sigma (\Delta s_\theta(w,h))$$ @@ -17,7 +17,7 @@ J^h(\theta )=E_{ P_d^h }\left[ \log { P^h(D=1|w,\theta ) } \right] +kE_{ P_n }\ $$ \\\\\qquad =E_{ P_d^h }\left[ \log { \sigma (\Delta s_\theta(w,h)) } \right] +kE_{ P_n }\left[ \log (1-\sigma (\Delta s_\theta(w,h))) \right]$$ -总体上来说,NCE 是通过构造逻辑回归(logistic regression),对正样例和负样例做二分类,对于每一个样本,将自身的预测词 label 作为正样例,同时采样出 $k$ 个其他词 label 作为负样例,从而只需要计算样本在这 $k+1$ 个 label 上的概率。相比原始的 softmax 分类需要计算每个类别的分数,然后归一化得到概率,节约了大量的时间消耗。 +总体上来说,NCE 是通过构造逻辑回归(logistic regression),对正样例和负样例做二分类,对于每一个样本,将自身的预测词 label 作为正样例,同时采样出 $k$ 个其他词 label 作为负样例,从而只需要计算样本在这 $k+1$ 个 label 上的概率。相比原始的 `softmax ` 分类需要计算每个类别的分数,然后归一化得到概率,节约了大量的计算时间。 ## 实验数据 本文采用 Penn Treebank (PTB) 数据集([Tomas Mikolov预处理版本](http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz))来训练语言模型。PaddlePaddle 提供 
[paddle.dataset.imikolov](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/imikolov.py) 接口来方便调用这些数据,如果当前目录没有找到数据它会自动下载并验证文件的完整性。并提供大小为5的滑动窗口对数据做预处理工作,方便后期处理。语料语种为英文,共有42068句训练数据,3761句测试数据。 @@ -42,70 +42,60 @@ N-gram 神经概率语言模型详细网络结构见图1: 5. **NCE层**:训练时可以直接使用 PaddlePaddle 提供的 NCE Layer。 -## 训练阶段 -训练直接运行``` python train.py ```。程序第一次运行会检测用户缓存文件夹中是否包含 ptb 数据集,如果未包含,则自动下载。运行过程中,每1000个 iteration 会打印模型训练信息,主要包含训练损失,每个 pass 会计算测试数据集上的损失,并同时会保存最新的模型快照。在 PaddlePaddle 中有已经实现好的 NCE Layer,一些参数需要自行根据实际场景进行设计,可参考的调参方案如下: - - -| 参数名 | 参数作用 | 介绍 | -|:------ |:-------| :--------| -| param\_attr / bias\_attr | 用来设置参数名字 | 可以方便后面预测阶段好来实现网络的参数共享,具体内容在下一个章节里会陈述。| -| num\_neg\_samples | 参数负责控制对负样例的采样个数。 | 可以控制正负样本比例,这个值取值区间为 [1, 字典大小-1],负样本个数越多则整个模型的训练速度越慢,模型精度也会越高 | -| neg\_distribution | 控制生成负样例标签的分布,默认是一个均匀分布。 | 可以自行控制负样本采样时各个类别的采样权重,比如希望正样例为“晴天”时,负样例“洪水”在训练时更被着重区分,则可以将“洪水”这个类别的采样权重增加。 | -| act | 表示使用何种激活函数。 | 根据 NCE 的原理,这里应该使用 sigmoid 函数。 | +## 训练 +在命令行窗口运行命令``` python train.py ```可以直接开启训练任务。 +- 程序第一次运行会检测用户缓存文件夹中是否包含 ptb 数据集,如果未包含,则自动下载。 +- 运行过程中,每10个 batch 会打印模型在训练集上的代价值。 +- 每个 pass 结束后,会计算测试数据集上的损失,并同时会保存最新的模型快照。 -具体代码实现如下: +在模型文件`network_conf.py`中 NCE 调用代码如下: ```python cost = paddle.layer.nce( - input=hidden_layer, - label=next_word, - num_classes=dict_size, - param_attr=paddle.attr.Param(name='nce_w'), - bias_attr=paddle.attr.Param(name='nce_b'), - act=paddle.activation.Sigmoid(), - num_neg_samples=25, - neg_distribution=None) + input=hidden_layer, + label=next_word, + num_classes=dict_size, + param_attr=paddle.attr.Param(name="nce_w"), + bias_attr=paddle.attr.Param(name="nce_b"), + act=paddle.activation.Sigmoid(), + num_neg_samples=25, + neg_distribution=None) ``` +NCE 层的一些重要参数解释如下: -## 预测阶段 -预测直接运行` python infer.py `,程序首先会加载最新模型,然后按照 batch 大小依次进行预测,并打印预测结果。因为训练和预测计算逻辑不一样,预测阶段需要共享 NCE Layer 中的逻辑回归训练时得到的参数,所以要写一个推断层,推断层的参数为预先训练好的参数。 - -具体实现推断层的方法:先是通过 `paddle.attr.Param` 方法获取参数值,然后使用 `paddle.layer.trans_full_matrix_projection` 对隐层输出向量 `hidden_layer` 做一个矩阵右乘,PaddlePaddle 会自行在模型中寻找相同参数名的参数并获取。右乘求和后得到类别向量,将类别向量输入 softmax 做一个归一操作,和为1,从而得到最后的类别概率分布。 - -代码实现如下: - -```python -with paddle.layer.mixed( - size=dict_size, - act=paddle.activation.Softmax(), - bias_attr=paddle.attr.Param(name='nce_b')) as prediction: - prediction += paddle.layer.trans_full_matrix_projection( - input=hidden_layer, param_attr=paddle.attr.Param(name='nce_w')) -``` - -预测的输出形式为: - -``` --------------------------- -No.68 Input: ' for possible -Ground Truth Output: -Predict Output: - --------------------------- -No.69 Input: for possible -Ground Truth Output: on -Predict Output: - --------------------------- -No.70 Input: for possible on -Ground Truth Output: the -Predict Output: the - -``` +| 参数名 | 参数作用 | 介绍 | +|:------ |:-------| :--------| +| param\_attr / bias\_attr | 用来设置参数名字 |方便预测阶段加载参数,具体在预测一节中介绍。| +| num\_neg\_samples | 负样本采样个数|可以控制正负样本比例,这个值取值区间为 [1, 字典大小-1],负样本个数越多则整个模型的训练速度越慢,模型精度也会越高 | +| neg\_distribution | 生成负样例标签的分布,默认是一个均匀分布| 可以自行控制负样本采样时各个类别的采样权重。例如:希望正样例为“晴天”时,负样例“洪水”在训练时更被着重区分,则可以将“洪水”这个类别的采样权重增加| +| act | 使用何种激活函数| 根据 NCE 的原理,这里应该使用 sigmoid 函数 | + +## 预测 +1. 首先修改 `infer.py` 脚本的 `main` 函数指定需要测试的模型。 +2. 需要注意的是,**预测和训练的计算逻辑不同**,需要以一个全连接层 `paddle.layer.fc` 替换训练使用的 `paddle.layer.nce` 层,并直接加载 NCE 学习到的参数,代码如下: + + ```python + prediction = paddle.layer.fc( + size=dict_size, + act=paddle.activation.Softmax(), + bias_attr=paddle.attr.Param(name="nce_b"), + input=hidden_layer, + param_attr=paddle.attr.Param(name="nce_w")) + ``` +3. 
运行 `python infer.py`。程序首先会加载指定的模型,然后按照 batch 大小依次进行预测,并打印预测结果。预测的输出格式如下: + + ```text + 0.6734 their may want to move + + ``` + + 每一行是一条预测结果,内部以“\t”分隔,共计3列: + - 第一列:下一个词的概率。 + - 第二列:模型预测的下一个词。 + - 第三列:输入的 $n$ 个词语,内部以空格分隔。 -每一个短线表示一次的预测,第二行显示第几条测试样例,并给出输入的4个单词,第三行为真实的标签,第四行为预测的标签。 ## 参考文献 1. Mnih A, Kavukcuoglu K. [Learning word embeddings efficiently with noise-contrastive estimation](https://papers.nips.cc/paper/5165-learning-word-embeddings-efficiently-with-noise-contrastive-estimation.pdf)[C]//Advances in neural information processing systems. 2013: 2265-2273. diff --git a/nce_cost/index.html b/nce_cost/index.html new file mode 100644 index 0000000000000000000000000000000000000000..8ceaf6859452f8123ffcca862895a4257a282fe0 --- /dev/null +++ b/nce_cost/index.html @@ -0,0 +1,169 @@
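作为对上文 NCE 目标函数的一个直观补充,下面用一段独立于本例代码的 NumPy 示意,按正文中 $\sigma(\Delta s_\theta(w,h))$ 的形式计算一个正样例与 $k$ 个噪声样例的二分类损失(打分值与噪声概率均为演示用的假设输入,并非本例仓库中的实现):

```python
import numpy as np


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def nce_loss(score_pos, log_pn_pos, scores_neg, log_pn_neg, k):
    # Delta s = s_theta(w, h) - log(k * P_n(w)), matching sigma(Delta s) above
    delta_pos = score_pos - (np.log(k) + log_pn_pos)
    delta_neg = scores_neg - (np.log(k) + log_pn_neg)
    # the positive sample contributes log(sigmoid(Delta s)), and each of the
    # k noise samples contributes log(1 - sigmoid(Delta s))
    return -(np.log(sigmoid(delta_pos)) +
             np.sum(np.log(1.0 - sigmoid(delta_neg))))


# 演示:1 个正样例、k=4 个噪声样例,噪声分布取均匀分布(数值为随意假设)
print(nce_loss(2.0, np.log(0.2), np.array([0.1, -0.5, 0.3, -1.0]),
               np.log(np.full(4, 0.2)), k=4))
```

可以看到,每个样本只涉及 $k+1$ 次打分计算,这正是 NCE 相比完整 `softmax` 节约计算量的来源。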
diff --git a/nce_cost/infer.py b/nce_cost/infer.py index 53e3aef45fc02ac008caa7102836ac47915be1fc..89d80792c85d68ee76234d5558b8f363b8768f92 100644 --- a/nce_cost/infer.py +++ b/nce_cost/infer.py @@ -1,70 +1,49 @@ +#!/usr/bin/env python # -*- encoding:utf-8 -*- -import numpy as np -import glob +import os import gzip -import paddle.v2 as paddle -from nce_conf import network_conf - - -def main(): - paddle.init(use_gpu=False, trainer_count=1) - word_dict = paddle.dataset.imikolov.build_dict() - dict_size = len(word_dict) - - prediction_layer = network_conf( - is_train=False, - hidden_size=128, - embedding_size=512, - dict_size=dict_size) - - models_list = glob.glob('./models/*') - models_list = sorted(models_list) - - with gzip.open(models_list[-1], 'r') as f: - parameters = paddle.parameters.Parameters.from_tar(f) +import numpy as np - idx_word_dict = dict((v, k) for k, v in word_dict.items()) - batch_size = 64 - batch_ins = [] - ins_iter = paddle.dataset.imikolov.test(word_dict, 5) +import paddle.v2 as paddle +from network_conf import ngram_lm - infer_data = [] - infer_data_label = [] - for item in paddle.dataset.imikolov.test(word_dict, 5)(): - infer_data.append((item[:4])) - infer_data_label.append(item[4]) - # Choose 100 samples from the test set to show how to infer. - if len(infer_data_label) == 100: - break - feeding = { - 'firstw': 0, - 'secondw': 1, - 'thirdw': 2, - 'fourthw': 3, - 'fifthw': 4 - } +def infer_a_batch(inferer, test_batch, id_to_word): + probs = inferer.infer(input=test_batch) + for i, res in enumerate(zip(test_batch, probs)): + maxid = res[1].argsort()[-1] + print("%.4f\t%s\t%s" % (res[1][maxid], id_to_word[maxid], + " ".join([id_to_word[w] for w in res[0]]))) - predictions = paddle.infer( - output_layer=prediction_layer, - parameters=parameters, - input=infer_data, - feeding=feeding, - field=['value']) - for i, (prob, data, - label) in enumerate(zip(predictions, infer_data, infer_data_label)): - print '--------------------------' - print "No.%d Input: " % (i+1) + \ - idx_word_dict[data[0]] + ' ' + \ - idx_word_dict[data[1]] + ' ' + \ - idx_word_dict[data[2]] + ' ' + \ - idx_word_dict[data[3]] - print 'Ground Truth Output: ' + idx_word_dict[label] - print 'Predict Output: ' + idx_word_dict[prob.argsort( - kind='heapsort', axis=0)[-1]] - print +def infer(model_path, batch_size): + assert os.path.exists(model_path), "the trained model does not exist." + word_to_id = paddle.dataset.imikolov.build_dict() + id_to_word = dict((v, k) for k, v in word_to_id.items()) + dict_size = len(word_to_id) + paddle.init(use_gpu=False, trainer_count=1) -if __name__ == '__main__': - main() + # load the trained model.
+ with gzip.open(model_path) as f: + parameters = paddle.parameters.Parameters.from_tar(f) + prediction_layer = ngram_lm( + is_train=False, hidden_size=128, emb_size=512, dict_size=dict_size) + inferer = paddle.inference.Inference( + output_layer=prediction_layer, parameters=parameters) + + test_batch = [] + for idx, item in enumerate(paddle.dataset.imikolov.test(word_to_id, 5)()): + test_batch.append((item[:4])) + if len(test_batch) == batch_size: + infer_a_batch(inferer, test_batch, id_to_word) + # clear the buffer so that each sample is predicted only once + test_batch = [] + + if len(test_batch): + infer_a_batch(inferer, test_batch, id_to_word) + test_batch = [] + + +if __name__ == "__main__": + infer("models/model_pass_00000.tar.gz", 10) diff --git a/nce_cost/nce_conf.py b/nce_cost/nce_conf.py deleted file mode 100644 index 962a9ccc80906bc2272245d0e297142397ffb024..0000000000000000000000000000000000000000 --- a/nce_cost/nce_conf.py +++ /dev/null @@ -1,61 +0,0 @@ -# -*- encoding:utf-8 -*- -import math -import paddle.v2 as paddle - - -def network_conf(hidden_size, embedding_size, dict_size, is_train): - - first_word = paddle.layer.data( - name="firstw", type=paddle.data_type.integer_value(dict_size)) - second_word = paddle.layer.data( - name="secondw", type=paddle.data_type.integer_value(dict_size)) - third_word = paddle.layer.data( - name="thirdw", type=paddle.data_type.integer_value(dict_size)) - fourth_word = paddle.layer.data( - name="fourthw", type=paddle.data_type.integer_value(dict_size)) - next_word = paddle.layer.data( - name="fifthw", type=paddle.data_type.integer_value(dict_size)) - - embed_param_attr = paddle.attr.Param( - name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0) - first_embedding = paddle.layer.embedding( - input=first_word, size=embedding_size, param_attr=embed_param_attr) - second_embedding = paddle.layer.embedding( - input=second_word, size=embedding_size, param_attr=embed_param_attr) - third_embedding = paddle.layer.embedding( - input=third_word, size=embedding_size, param_attr=embed_param_attr) - fourth_embedding = paddle.layer.embedding( - input=fourth_word, size=embedding_size, param_attr=embed_param_attr) - - context_embedding = paddle.layer.concat(input=[ - first_embedding, second_embedding, third_embedding, fourth_embedding - ]) - - hidden_layer = paddle.layer.fc( - input=context_embedding, - size=hidden_size, - act=paddle.activation.Tanh(), - bias_attr=paddle.attr.Param(learning_rate=1), - param_attr=paddle.attr.Param( - initial_std=1.
/ math.sqrt(embedding_size * 8), learning_rate=1)) - - if is_train == True: - cost = paddle.layer.nce( - input=hidden_layer, - label=next_word, - num_classes=dict_size, - param_attr=paddle.attr.Param(name='nce_w'), - bias_attr=paddle.attr.Param(name='nce_b'), - act=paddle.activation.Sigmoid(), - num_neg_samples=25, - neg_distribution=None) - return cost - else: - with paddle.layer.mixed( - size=dict_size, - act=paddle.activation.Softmax(), - bias_attr=paddle.attr.Param(name='nce_b')) as prediction: - prediction += paddle.layer.trans_full_matrix_projection( - input=hidden_layer, param_attr=paddle.attr.Param(name='nce_w')) - - return prediction diff --git a/nce_cost/network_conf.py b/nce_cost/network_conf.py new file mode 100644 index 0000000000000000000000000000000000000000..a9e33e1b2d143c9662a34ea6c7fd3690b5d49e4e --- /dev/null +++ b/nce_cost/network_conf.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +# -*- encoding:utf-8 -*- +import math +import paddle.v2 as paddle + + +def ngram_lm(hidden_size, emb_size, dict_size, gram_num=4, is_train=True): + emb_layers = [] + embed_param_attr = paddle.attr.Param( + name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0) + for i in range(gram_num): + word = paddle.layer.data( + name="__word%02d__" % (i), + type=paddle.data_type.integer_value(dict_size)) + emb_layers.append( + paddle.layer.embedding( + input=word, size=emb_size, param_attr=embed_param_attr)) + + next_word = paddle.layer.data( + name="__target_word__", type=paddle.data_type.integer_value(dict_size)) + + context_embedding = paddle.layer.concat(input=emb_layers) + + hidden_layer = paddle.layer.fc( + input=context_embedding, + size=hidden_size, + act=paddle.activation.Tanh(), + param_attr=paddle.attr.Param(initial_std=1. / math.sqrt(emb_size * 8))) + + if is_train: + cost = paddle.layer.nce( + input=hidden_layer, + label=next_word, + num_classes=dict_size, + param_attr=paddle.attr.Param(name="nce_w"), + bias_attr=paddle.attr.Param(name="nce_b"), + act=paddle.activation.Sigmoid(), + num_neg_samples=25, + neg_distribution=None) + return cost + else: + prediction = paddle.layer.fc( + size=dict_size, + act=paddle.activation.Softmax(), + bias_attr=paddle.attr.Param(name="nce_b"), + input=hidden_layer, + param_attr=paddle.attr.Param(name="nce_w")) + + return prediction diff --git a/nce_cost/train.py b/nce_cost/train.py index a8b437c1dd9bfc89fd03598b9a4201693c3074d7..4ab5043725805003cf151c6d0c8af8dbbc8c199f 100644 --- a/nce_cost/train.py +++ b/nce_cost/train.py @@ -1,52 +1,52 @@ +#!/usr/bin/env python # -*- encoding:utf-8 -*- -import paddle.v2 as paddle +import os +import logging import gzip -from nce_conf import network_conf +import paddle.v2 as paddle +from network_conf import ngram_lm + +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) -def main(): +def train(model_save_dir): + if not os.path.exists(model_save_dir): + os.mkdir(model_save_dir) + paddle.init(use_gpu=False, trainer_count=1) word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) - cost = network_conf( - is_train=True, hidden_size=128, embedding_size=512, dict_size=dict_size) + optimizer = paddle.optimizer.Adam(learning_rate=1e-4) + cost = ngram_lm(hidden_size=128, emb_size=512, dict_size=dict_size) parameters = paddle.parameters.create(cost) - adagrad = paddle.optimizer.Adam(learning_rate=1e-4) - trainer = paddle.trainer.SGD(cost, parameters, adagrad) + trainer = paddle.trainer.SGD(cost, parameters, optimizer) def event_handler(event): if isinstance(event, paddle.event.EndIteration): - if 
event.batch_id % 1000 == 0: - print "Pass %d, Batch %d, Cost %f" % ( - event.pass_id, event.batch_id, event.cost) + if event.batch_id and not event.batch_id % 10: + logger.info("Pass %d, Batch %d, Cost %f" % + (event.pass_id, event.batch_id, event.cost)) if isinstance(event, paddle.event.EndPass): result = trainer.test( paddle.batch(paddle.dataset.imikolov.test(word_dict, 5), 64)) - print "Test here.. Pass %d, Cost %f" % (event.pass_id, result.cost) + logger.info("Test Pass %d, Cost %f" % (event.pass_id, result.cost)) - model_name = "./models/model_pass_%05d.tar.gz" % event.pass_id - print "Save model into %s ..." % model_name - with gzip.open(model_name, 'w') as f: + save_path = os.path.join(model_save_dir, + "model_pass_%05d.tar.gz" % event.pass_id) + logger.info("Save model into %s ..." % save_path) + with gzip.open(save_path, "w") as f: parameters.to_tar(f) - feeding = { - 'firstw': 0, - 'secondw': 1, - 'thirdw': 2, - 'fourthw': 3, - 'fifthw': 4 - } - trainer.train( paddle.batch(paddle.dataset.imikolov.train(word_dict, 5), 64), num_passes=1000, - event_handler=event_handler, - feeding=feeding) + event_handler=event_handler) -if __name__ == '__main__': - main() +if __name__ == "__main__": + train(model_save_dir="models") diff --git a/nested_sequence/README.md b/nested_sequence/README.md deleted file mode 100644 index a0990367ef8b03c70c29d285e22ef85907e1d0b7..0000000000000000000000000000000000000000 --- a/nested_sequence/README.md +++ /dev/null @@ -1 +0,0 @@ -TBD diff --git a/nmt_without_attention/README.md b/nmt_without_attention/README.md index a54b715102574dae1b619997a1ed7a2bfc14131c..2fd43bbdda53091506ca574d8c8b894870471c4f 100644 --- a/nmt_without_attention/README.md +++ b/nmt_without_attention/README.md @@ -51,14 +51,15 @@ RNN 的原始结构用一个向量来存储隐状态,然而这种结构的 RNN 在 PaddlePaddle 中,双向编码器可以很方便地调用相关 APIs 实现: ```python -#### Encoder src_word_id = paddle.layer.data( name='source_language_word', type=paddle.data_type.integer_value_sequence(source_dict_dim)) + # source embedding src_embedding = paddle.layer.embedding( input=src_word_id, size=word_vector_dim) -# use bidirectional_gru + +# bidirectional GRU as encoder encoded_vector = paddle.networks.bidirectional_gru( input=src_embedding, size=encoder_size, @@ -84,19 +85,17 @@ encoded_vector = paddle.networks.bidirectional_gru( ### 无注意力机制的解码器 -PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例则介绍的是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。 +- PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例介绍的则是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。 对于流行的RNN单元,PaddlePaddle 已有很好的实现均可直接调用。如果希望在 RNN 每一个时间步实现某些自定义操作,可使用 PaddlePaddle 中的`recurrent_layer_group`。首先,自定义单步逻辑函数,再利用函数 `recurrent_group()` 循环调用单步逻辑函数处理整个序列。本例中的无注意力机制的解码器便是使用`recurrent_layer_group`来实现,其中,单步逻辑函数`gru_decoder_without_attention()`相关代码如下: ```python -#### Decoder +# the initialization state for decoder GRU encoder_last = paddle.layer.last_seq(input=encoded_vector) -encoder_last_projected = paddle.layer.mixed( - size=decoder_size, - act=paddle.activation.Tanh(), - input=paddle.layer.full_matrix_projection(input=encoder_last)) +encoder_last_projected = paddle.layer.fc( + size=decoder_size, act=paddle.activation.Tanh(), input=encoder_last) -# gru step +# the step function for decoder GRU def 
gru_decoder_without_attention(enc_vec, current_word): ''' Step function for gru decoder @@ -106,33 +105,29 @@ def gru_decoder_without_attention(enc_vec, current_word): :type current_word: layer object ''' decoder_mem = paddle.layer.memory( - name='gru_decoder', - size=decoder_size, - boot_layer=encoder_last_projected) + name="gru_decoder", + size=decoder_size, + boot_layer=encoder_last_projected) context = paddle.layer.last_seq(input=enc_vec) - decoder_inputs = paddle.layer.mixed( - size=decoder_size * 3, - input=[ - paddle.layer.full_matrix_projection(input=context), - paddle.layer.full_matrix_projection(input=current_word) - ]) + decoder_inputs = paddle.layer.fc( + size=decoder_size * 3, input=[context, current_word]) gru_step = paddle.layer.gru_step( - name='gru_decoder', + name="gru_decoder", act=paddle.activation.Tanh(), gate_act=paddle.activation.Sigmoid(), input=decoder_inputs, output_mem=decoder_mem, size=decoder_size) - out = paddle.layer.mixed( + out = paddle.layer.fc( size=target_dict_dim, bias_attr=True, act=paddle.activation.Softmax(), - input=paddle.layer.full_matrix_projection(input=gru_step)) - return out + input=gru_step) + return out ``` 在模型训练和测试阶段,解码器的行为有很大的不同: @@ -143,34 +138,14 @@ def gru_decoder_without_attention(enc_vec, current_word): 训练和生成的逻辑分别实现在如下的`if-else`条件分支中: ```python -decoder_group_name = "decoder_group" -group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) +group_input1 = paddle.layer.StaticInput(input=encoded_vector) group_inputs = [group_input1] -if not generating: - trg_embedding = paddle.layer.embedding( - input=paddle.layer.data( - name='target_language_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)), - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) - group_inputs.append(trg_embedding) - - decoder = paddle.layer.recurrent_group( - name=decoder_group_name, - step=gru_decoder_without_attention, - input=group_inputs) - - lbl = paddle.layer.data( - name='target_language_next_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)) - cost = paddle.layer.classification_cost(input=decoder, label=lbl) - - return cost -else: +decoder_group_name = "decoder_group" +if is_generating: trg_embedding = paddle.layer.GeneratedInput( size=target_dict_dim, - embedding_name='_target_language_embedding', + embedding_name="_target_language_embedding", embedding_size=word_vector_dim) group_inputs.append(trg_embedding) @@ -184,6 +159,26 @@ else: max_length=max_length) return beam_gen +else: + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name="target_language_word", + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name="_target_language_embedding")) + group_inputs.append(trg_embedding) + + decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_without_attention, + input=group_inputs) + + lbl = paddle.layer.data( + name="target_language_next_word", + type=paddle.data_type.integer_value_sequence(target_dict_dim)) + cost = paddle.layer.classification_cost(input=decoder, label=lbl) + + return cost ```
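生成分支中的 `paddle.layer.beam_search` 在每个时间步对候选序列做扩展与剪枝。下面用一段独立于 PaddlePaddle 内部实现的示意代码说明柱搜索单步扩展的基本思路(`next_word_probs` 为假设的打分函数,输入已生成的前缀、返回各候选词的概率,仅作演示):

```python
import heapq
import math


def beam_search_step(beams, next_word_probs, beam_size):
    # beams: list of (accumulated log probability, generated word id sequence)
    candidates = []
    for log_prob, seq in beams:
        for word_id, prob in next_word_probs(seq).items():
            candidates.append((log_prob + math.log(prob), seq + [word_id]))
    # keep only the top beam_size candidates at each step
    return heapq.nlargest(beam_size, candidates, key=lambda c: c[0])
```

实际生成时,从起始符(上面配置中的 `bos_id=0`)出发反复执行这样的单步扩展,直到候选序列生成结束符(`eos_id=1`)或长度达到 `max_length` 为止。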
## 数据准备 @@ -191,29 +186,31 @@ else: ## 模型的训练与测试 -在定义好网络结构后,就可以进行模型训练与测试了。根据用户运行时传递的参数是`--train` 还是 `--generate`,Python 脚本的 `main()` 函数分别调用函数`train()`和`generate()`来完成模型的训练与测试。 - ### 模型训练 -模型训练阶段,函数 `train()` 依次完成了如下的逻辑: + +启动模型训练十分简单,只需在命令行窗口中执行`python train.py`。模型训练阶段 `train.py` 脚本中的 `train()` 函数依次完成了如下的逻辑: **a) 由网络定义,解析网络结构,初始化模型参数** -``` -# initialize model +```python +# define the network topology. cost = seq2seq_net(source_dict_dim, target_dict_dim) parameters = paddle.parameters.create(cost) ``` **b) 设定训练过程中的优化策略、定义训练数据读取 `reader`** -``` -# define optimize method and trainer +```python +# define optimization method optimizer = paddle.optimizer.RMSProp( learning_rate=1e-3, gradient_clipping_threshold=10.0, regularization=paddle.optimizer.L2Regularization(rate=8e-4)) + +# define the trainer instance trainer = paddle.trainer.SGD( cost=cost, parameters=parameters, update_equation=optimizer) + # define data reader wmt14_reader = paddle.batch( paddle.reader.shuffle( @@ -223,40 +220,33 @@ wmt14_reader = paddle.batch( **c) 定义事件句柄,打印训练中间结果、保存模型快照** -``` -# define event_handler callback +```python +# define the event_handler callback def event_handler(event): if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0 and event.batch_id > 0: - with gzip.open('models/nmt_without_att_params_batch_%d.tar.gz' % - event.batch_id, 'w') as f: + if not event.batch_id % 100 and event.batch_id: + with gzip.open( + os.path.join(save_dir_path, + "nmt_without_att_%05d_batch_%05d.tar.gz" % + (event.pass_id, event.batch_id)), "w") as f: parameters.to_tar(f) - if event.batch_id % 10 == 0: - print "\nPass %d, Batch %d, Cost%f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - else: - sys.stdout.write('.') - sys.stdout.flush() + if event.batch_id and not event.batch_id % 10: + logger.info("Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics)) ``` **d) 开始训练** -``` -# start to train +```python +# start training trainer.train( reader=wmt14_reader, event_handler=event_handler, num_passes=2) ``` -启动模型训练的十分简单,只需在命令行窗口中执行 - -``` -python nmt_without_attention_v2.py --train -``` - 输出样例为 -``` +```text Pass 0, Batch 0, Cost 267.674663, {'classification_error_evaluator': 1.0} ......... Pass 0, Batch 10, Cost 172.892294, {'classification_error_evaluator': 0.953895092010498} @@ -268,81 +258,80 @@ Pass 0, Batch 30, Cost 153.633665, {'classification_error_evaluator': 0.86438035 Pass 0, Batch 40, Cost 168.170543, {'classification_error_evaluator': 0.8348183631896973} ``` +### 生成翻译结果 +利用训练好的模型生成翻译文本也十分简单。 + +1. 首先请修改`generate.py`脚本中`main`函数传递给`generate`函数的参数,以选择使用哪一个保存的模型来生成。默认参数如下所示: + + ```python + generate( + source_dict_dim=30000, + target_dict_dim=30000, + batch_size=20, + beam_size=3, + model_path="models/nmt_without_att_params_batch_00100.tar.gz") + ``` + +2. 在终端执行命令 `python generate.py`,脚本中的`generate()`依次执行了如下逻辑: + + **a) 加载测试样本** + + ```python + # load data samples for generation + gen_creator = paddle.dataset.wmt14.gen(source_dict_dim) + gen_data = [] + for item in gen_creator(): + gen_data.append((item[0], )) + ``` + + **b) 初始化模型,执行`infer()`为每个输入样本生成`beam search`的翻译结果** + + ```python + beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, is_generating=True) + with gzip.open(init_models_path) as f: + parameters = paddle.parameters.Parameters.from_tar(f) + # prob is the prediction probabilities, and id is the prediction word.
+ beam_result = paddle.infer( + output_layer=beam_gen, + parameters=parameters, + input=gen_data, + field=['prob', 'id']) + ``` + + **c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果** + + ```python + beam_result = inferer.infer(input=test_batch, field=["prob", "id"]) + + gen_sen_idx = np.where(beam_result[1] == -1)[0] + assert len(gen_sen_idx) == len(test_batch) * beam_size + + start_pos, end_pos = 1, 0 + for i, sample in enumerate(test_batch): + print(" ".join([ + src_dict[w] for w in sample[0][1:-1] + ])) # skip the start and ending mark when print the source sentence + for j in xrange(beam_size): + end_pos = gen_sen_idx[i * beam_size + j] + print("%.4f\t%s" % (beam_result[0][i][j], " ".join( + trg_dict[w] for w in beam_result[1][start_pos:end_pos]))) + start_pos = end_pos + 2 + print("\n") + ``` + +设置beam search的宽度为3,输入为一个法文句子,则自动为测试数据生成对应的翻译结果,输出格式如下: + +```text +Elles connaissent leur entreprise mieux que personne . +-3.754819 They know their business better than anyone . +-4.445528 They know their businesses better than anyone . +-5.026885 They know their business better than anybody . -### 模型测试 -模型测试阶段,函数`generate()`执行了依次如下逻辑: - -**a) 加载测试样本** - -``` -# load data samples for generation -gen_creator = paddle.dataset.wmt14.gen(source_dict_dim) -gen_data = [] -for item in gen_creator(): - gen_data.append((item[0], )) ``` - -**b) 初始化模型,执行`infer()`为每个输入样本生成`beam search`的翻译结果** - -``` -beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True) -with gzip.open(init_models_path) as f: - parameters = paddle.parameters.Parameters.from_tar(f) -# prob is the prediction probabilities, and id is the prediction word. -beam_result = paddle.infer( - output_layer=beam_gen, - parameters=parameters, - input=gen_data, - field=['prob', 'id']) -``` - -**c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果** - -``` -# get the dictionary -src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim) - -# the delimited element of generated sequences is -1, -# the first element of each generated sequence is the sequence length -seq_list = [] -seq = [] -for w in beam_result[1]: - if w != -1: - seq.append(w) - else: - seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]])) - seq = [] - -prob = beam_result[0] -for i in xrange(len(gen_data)): - print "\n*******************************************************\n" - print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n" - for j in xrange(beam_size): - print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j] -``` - -模型测试的执行与模型训练类似,只需执行 - -``` -python nmt_without_attention_v2.py --generate -``` -则自动为测试数据生成了对应的翻译结果。 -设置beam search的宽度为3,输入某个法文句子 - -``` -src: Elles connaissent leur entreprise mieux que personne . -``` - -其对应的英文翻译结果为 - -``` -prob = -3.754819: They know their business better than anyone . -prob = -4.445528: They know their businesses better than anyone . -prob = -5.026885: They know their business better than anybody . 
-``` - -* `prob`表示生成句子的得分,随之其后则是翻译生成的句子; -* `<s>` 表示句子的开始,`<e>`表示一个句子的结束,如果出现了在词典中未包含的词,则用`<unk>`替代。 +- 第一行为输入的源语言句子。 +- 第二 ~ beam_size + 1 行是柱搜索生成的 `beam_size` 条翻译结果。 + - 相同行的输出以“\t”分隔为两列,第一列是句子的 log 概率,第二列是翻译结果的文本。 + - 符号`<s>`表示句子的开始,符号`<e>`表示一个句子的结束,如果出现了在词典中未包含的词,则用符号`<unk>`替代。 至此,我们在 PaddlePaddle 上实现了一个初步的机器翻译模型。我们可以看到,PaddlePaddle 提供了灵活丰富的API供大家选择和使用,使得我们能够很方便完成各种复杂网络的配置。机器翻译本身也是个快速发展的领域,各种新方法新思想在不断涌现。在学习完本例后,读者若有兴趣和余力,可基于 PaddlePaddle 平台实现更为复杂、性能更优的机器翻译模型。 diff --git a/nmt_without_attention/generate.py b/nmt_without_attention/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..1de4f462649a55e4ea235f61d9fa522461752f00 --- /dev/null +++ b/nmt_without_attention/generate.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python +import os +import logging +import gzip +import numpy as np + +import paddle.v2 as paddle +from network_conf import seq2seq_net + +logger = logging.getLogger("paddle") +logger.setLevel(logging.WARNING) + + +def infer_a_batch(inferer, test_batch, beam_size, src_dict, trg_dict): + beam_result = inferer.infer(input=test_batch, field=["prob", "id"]) + + # sequences in beam_result[1] are delimited by -1, and the first element + # of each sequence is its length, so both are skipped when decoding + gen_sen_idx = np.where(beam_result[1] == -1)[0] + assert len(gen_sen_idx) == len(test_batch) * beam_size + + start_pos, end_pos = 1, 0 + for i, sample in enumerate(test_batch): + print(" ".join([ + src_dict[w] for w in sample[0][1:-1] + ])) # skip the start and ending mark when print the source sentence + for j in xrange(beam_size): + end_pos = gen_sen_idx[i * beam_size + j] + print("%.4f\t%s" % (beam_result[0][i][j], " ".join( + trg_dict[w] for w in beam_result[1][start_pos:end_pos]))) + start_pos = end_pos + 2 + print("\n") + + +def generate(source_dict_dim, target_dict_dim, model_path, beam_size, + batch_size): + """ + Sequence generation for NMT. + + :param source_dict_dim: size of source dictionary + :type source_dict_dim: int + :param target_dict_dim: size of target dictionary + :type target_dict_dim: int + :param model_path: path for the initial model + :type model_path: string + :param beam_size: the expansion width in each generation step + :type beam_size: int + :param batch_size: the number of training examples in one forward pass + :type batch_size: int + """ + + assert os.path.exists(model_path), "trained model does not exist."
+ + # step 1: prepare dictionary + src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim) + + # step 2: load the trained model + paddle.init(use_gpu=False, trainer_count=1) + with gzip.open(model_path) as f: + parameters = paddle.parameters.Parameters.from_tar(f) + beam_gen = seq2seq_net( + source_dict_dim, + target_dict_dim, + beam_size=beam_size, + max_length=100, + is_generating=True) + inferer = paddle.inference.Inference( + output_layer=beam_gen, parameters=parameters) + + # step 3: iterating over the testing dataset + test_batch = [] + for idx, item in enumerate(paddle.dataset.wmt14.gen(source_dict_dim)()): + test_batch.append([item[0]]) + if len(test_batch) == batch_size: + infer_a_batch(inferer, test_batch, beam_size, src_dict, trg_dict) + test_batch = [] + + if len(test_batch): + infer_a_batch(inferer, test_batch, beam_size, src_dict, trg_dict) + test_batch = [] + + +if __name__ == "__main__": + generate( + source_dict_dim=30000, + target_dict_dim=30000, + batch_size=20, + beam_size=3, + model_path="models/nmt_without_att_params_batch_00100.tar.gz") diff --git a/nmt_without_attention/index.html b/nmt_without_attention/index.html index 35177ee5a679fe4f826dfd219721ef2e36b7df83..d9287ecb4115e910f4c0d2ff29357014e0244391 100644 --- a/nmt_without_attention/index.html +++ b/nmt_without_attention/index.html @@ -93,14 +93,15 @@ RNN 的原始结构用一个向量来存储隐状态,然而这种结构的 RNN 在 PaddlePaddle 中,双向编码器可以很方便地调用相关 APIs 实现: ```python -#### Encoder src_word_id = paddle.layer.data( name='source_language_word', type=paddle.data_type.integer_value_sequence(source_dict_dim)) + # source embedding src_embedding = paddle.layer.embedding( input=src_word_id, size=word_vector_dim) -# use bidirectional_gru + +# bidirectional GRU as encoder encoded_vector = paddle.networks.bidirectional_gru( input=src_embedding, size=encoder_size, @@ -126,19 +127,17 @@ encoded_vector = paddle.networks.bidirectional_gru( ### 无注意力机制的解码器 -PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例则介绍的是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。 +- PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例介绍的则是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。 对于流行的RNN单元,PaddlePaddle 已有很好的实现均可直接调用。如果希望在 RNN 每一个时间步实现某些自定义操作,可使用 PaddlePaddle 中的`recurrent_layer_group`。首先,自定义单步逻辑函数,再利用函数 `recurrent_group()` 循环调用单步逻辑函数处理整个序列。本例中的无注意力机制的解码器便是使用`recurrent_layer_group`来实现,其中,单步逻辑函数`gru_decoder_without_attention()`相关代码如下: ```python -#### Decoder +# the initialization state for decoder GRU encoder_last = paddle.layer.last_seq(input=encoded_vector) -encoder_last_projected = paddle.layer.mixed( - size=decoder_size, - act=paddle.activation.Tanh(), - input=paddle.layer.full_matrix_projection(input=encoder_last)) +encoder_last_projected = paddle.layer.fc( + size=decoder_size, act=paddle.activation.Tanh(), input=encoder_last) -# gru step +# the step function for decoder GRU def gru_decoder_without_attention(enc_vec, current_word): ''' Step function for gru decoder @@ -148,33 +147,29 @@ def gru_decoder_without_attention(enc_vec, current_word): :type current_word: layer object ''' decoder_mem = paddle.layer.memory( - name='gru_decoder', - size=decoder_size, - boot_layer=encoder_last_projected) + name="gru_decoder", + size=decoder_size, + boot_layer=encoder_last_projected) context = 
paddle.layer.last_seq(input=enc_vec) - decoder_inputs = paddle.layer.mixed( - size=decoder_size * 3, - input=[ - paddle.layer.full_matrix_projection(input=context), - paddle.layer.full_matrix_projection(input=current_word) - ]) + decoder_inputs = paddle.layer.fc( + size=decoder_size * 3, input=[context, current_word]) gru_step = paddle.layer.gru_step( - name='gru_decoder', + name="gru_decoder", act=paddle.activation.Tanh(), gate_act=paddle.activation.Sigmoid(), input=decoder_inputs, output_mem=decoder_mem, size=decoder_size) - out = paddle.layer.mixed( + out = paddle.layer.fc( size=target_dict_dim, bias_attr=True, act=paddle.activation.Softmax(), - input=paddle.layer.full_matrix_projection(input=gru_step)) - return out + input=gru_step) + return out ``` 在模型训练和测试阶段,解码器的行为有很大的不同: @@ -185,34 +180,14 @@ def gru_decoder_without_attention(enc_vec, current_word): 训练和生成的逻辑分别实现在如下的`if-else`条件分支中: ```python -decoder_group_name = "decoder_group" -group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) +group_input1 = paddle.layer.StaticInput(input=encoded_vector) group_inputs = [group_input1] -if not generating: - trg_embedding = paddle.layer.embedding( - input=paddle.layer.data( - name='target_language_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)), - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) - group_inputs.append(trg_embedding) - - decoder = paddle.layer.recurrent_group( - name=decoder_group_name, - step=gru_decoder_without_attention, - input=group_inputs) - - lbl = paddle.layer.data( - name='target_language_next_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)) - cost = paddle.layer.classification_cost(input=decoder, label=lbl) - - return cost -else: +decoder_group_name = "decoder_group" +if is_generating: trg_embedding = paddle.layer.GeneratedInput( size=target_dict_dim, - embedding_name='_target_language_embedding', + embedding_name="_target_language_embedding", embedding_size=word_vector_dim) group_inputs.append(trg_embedding) @@ -226,6 +201,26 @@ else: max_length=max_length) return beam_gen +else: + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name="target_language_word", + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name="_target_language_embedding")) + group_inputs.append(trg_embedding) + + decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_without_attention, + input=group_inputs) + + lbl = paddle.layer.data( + name="target_language_next_word", + type=paddle.data_type.integer_value_sequence(target_dict_dim)) + cost = paddle.layer.classification_cost(input=decoder, label=lbl) + + return cost ``` ## 数据准备 @@ -233,29 +228,31 @@ else: ## 模型的训练与测试 -在定义好网络结构后,就可以进行模型训练与测试了。根据用户运行时传递的参数是`--train` 还是 `--generate`,Python 脚本的 `main()` 函数分别调用函数`train()`和`generate()`来完成模型的训练与测试。 - ### 模型训练 - + +启动模型训练十分简单,只需在命令行窗口中执行`python train.py`。模型训练阶段 `train.py` 脚本中的 `train()` 函数依次完成了如下的逻辑: **a) 由网络定义,解析网络结构,初始化模型参数** -``` -# initialize model +```python +# define the network topology.
cost = seq2seq_net(source_dict_dim, target_dict_dim) parameters = paddle.parameters.create(cost) ``` **b) 设定训练过程中的优化策略、定义训练数据读取 `reader`** -``` -# define optimize method and trainer +```python +# define optimization method optimizer = paddle.optimizer.RMSProp( learning_rate=1e-3, gradient_clipping_threshold=10.0, regularization=paddle.optimizer.L2Regularization(rate=8e-4)) + +# define the trainer instance trainer = paddle.trainer.SGD( cost=cost, parameters=parameters, update_equation=optimizer) + # define data reader wmt14_reader = paddle.batch( paddle.reader.shuffle( @@ -265,40 +262,33 @@ wmt14_reader = paddle.batch( **c) 定义事件句柄,打印训练中间结果、保存模型快照** -``` -# define event_handler callback +```python +# define the event_handler callback def event_handler(event): if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0 and event.batch_id > 0: - with gzip.open('models/nmt_without_att_params_batch_%d.tar.gz' % - event.batch_id, 'w') as f: + if not event.batch_id % 100 and event.batch_id: + with gzip.open( + os.path.join(save_dir_path, + "nmt_without_att_%05d_batch_%05d.tar.gz" % + (event.pass_id, event.batch_id)), "w") as f: parameters.to_tar(f) - if event.batch_id % 10 == 0: - print "\nPass %d, Batch %d, Cost%f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - else: - sys.stdout.write('.') - sys.stdout.flush() + if event.batch_id and not event.batch_id % 10: + logger.info("Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics)) ``` **d) 开始训练** -``` -# start to train +```python +# start training trainer.train( reader=wmt14_reader, event_handler=event_handler, num_passes=2) ``` -启动模型训练的十分简单,只需在命令行窗口中执行 - -``` -python nmt_without_attention_v2.py --train -``` - 输出样例为 -``` +```text Pass 0, Batch 0, Cost 267.674663, {'classification_error_evaluator': 1.0} ......... Pass 0, Batch 10, Cost 172.892294, {'classification_error_evaluator': 0.953895092010498} @@ -310,81 +300,80 @@ Pass 0, Batch 30, Cost 153.633665, {'classification_error_evaluator': 0.86438035 Pass 0, Batch 40, Cost 168.170543, {'classification_error_evaluator': 0.8348183631896973} ``` +### 生成翻译结果 +利用训练好的模型生成翻译文本也十分简单。 + +1. 首先请修改`generate.py`脚本中`main`函数传递给`generate`函数的参数,以选择使用哪一个保存的模型来生成。默认参数如下所示: + + ```python + generate( + source_dict_dim=30000, + target_dict_dim=30000, + batch_size=20, + beam_size=3, + model_path="models/nmt_without_att_params_batch_00100.tar.gz") + ``` + +2. 在终端执行命令 `python generate.py`,脚本中的`generate()`依次执行了如下逻辑: + + **a) 加载测试样本** + + ```python + # load data samples for generation + gen_creator = paddle.dataset.wmt14.gen(source_dict_dim) + gen_data = [] + for item in gen_creator(): + gen_data.append((item[0], )) + ``` + + **b) 初始化模型,执行`infer()`为每个输入样本生成`beam search`的翻译结果** + + ```python + beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, is_generating=True) + with gzip.open(init_models_path) as f: + parameters = paddle.parameters.Parameters.from_tar(f) + # prob is the prediction probabilities, and id is the prediction word.
+ beam_result = paddle.infer( + output_layer=beam_gen, + parameters=parameters, + input=gen_data, + field=['prob', 'id']) + ``` + + **c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果** + + ```python + beam_result = inferer.infer(input=test_batch, field=["prob", "id"]) + + gen_sen_idx = np.where(beam_result[1] == -1)[0] + assert len(gen_sen_idx) == len(test_batch) * beam_size + + start_pos, end_pos = 1, 0 + for i, sample in enumerate(test_batch): + print(" ".join([ + src_dict[w] for w in sample[0][1:-1] + ])) # skip the start and ending mark when print the source sentence + for j in xrange(beam_size): + end_pos = gen_sen_idx[i * beam_size + j] + print("%.4f\t%s" % (beam_result[0][i][j], " ".join( + trg_dict[w] for w in beam_result[1][start_pos:end_pos]))) + start_pos = end_pos + 2 + print("\n") + ``` + +设置beam search的宽度为3,输入为一个法文句子,则自动为测试数据生成对应的翻译结果,输出格式如下: + +```text +Elles connaissent leur entreprise mieux que personne . +-3.754819 They know their business better than anyone . +-4.445528 They know their businesses better than anyone . +-5.026885 They know their business better than anybody . -### 模型测试 -模型测试阶段,函数`generate()`执行了依次如下逻辑: - -**a) 加载测试样本** - -``` -# load data samples for generation -gen_creator = paddle.dataset.wmt14.gen(source_dict_dim) -gen_data = [] -for item in gen_creator(): - gen_data.append((item[0], )) ``` - -**b) 初始化模型,执行`infer()`为每个输入样本生成`beam search`的翻译结果** - -``` -beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True) -with gzip.open(init_models_path) as f: - parameters = paddle.parameters.Parameters.from_tar(f) -# prob is the prediction probabilities, and id is the prediction word. -beam_result = paddle.infer( - output_layer=beam_gen, - parameters=parameters, - input=gen_data, - field=['prob', 'id']) -``` - -**c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果** - -``` -# get the dictionary -src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim) - -# the delimited element of generated sequences is -1, -# the first element of each generated sequence is the sequence length -seq_list = [] -seq = [] -for w in beam_result[1]: - if w != -1: - seq.append(w) - else: - seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]])) - seq = [] - -prob = beam_result[0] -for i in xrange(len(gen_data)): - print "\n*******************************************************\n" - print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n" - for j in xrange(beam_size): - print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j] -``` - -模型测试的执行与模型训练类似,只需执行 - -``` -python nmt_without_attention_v2.py --generate -``` -则自动为测试数据生成了对应的翻译结果。 -设置beam search的宽度为3,输入某个法文句子 - -``` -src: Elles connaissent leur entreprise mieux que personne . -``` - -其对应的英文翻译结果为 - -``` -prob = -3.754819: They know their business better than anyone . -prob = -4.445528: They know their businesses better than anyone . -prob = -5.026885: They know their business better than anybody . 
-``` - -* `prob`表示生成句子的得分,随之其后则是翻译生成的句子; -* `<s>` 表示句子的开始,`<e>`表示一个句子的结束,如果出现了在词典中未包含的词,则用`<unk>`替代。 +- 第一行为输入的源语言句子。 +- 第二 ~ beam_size + 1 行是柱搜索生成的 `beam_size` 条翻译结果。 + - 相同行的输出以“\t”分隔为两列,第一列是句子的 log 概率,第二列是翻译结果的文本。 + - 符号`<s>`表示句子的开始,符号`<e>`表示一个句子的结束,如果出现了在词典中未包含的词,则用符号`<unk>`替代。 至此,我们在 PaddlePaddle 上实现了一个初步的机器翻译模型。我们可以看到,PaddlePaddle 提供了灵活丰富的API供大家选择和使用,使得我们能够很方便完成各种复杂网络的配置。机器翻译本身也是个快速发展的领域,各种新方法新思想在不断涌现。在学习完本例后,读者若有兴趣和余力,可基于 PaddlePaddle 平台实现更为复杂、性能更优的机器翻译模型。 diff --git a/nmt_without_attention/network_conf.py b/nmt_without_attention/network_conf.py new file mode 100644 index 0000000000000000000000000000000000000000..77a1dc77c3c85c633cd7fbdf085d02780ded0075 --- /dev/null +++ b/nmt_without_attention/network_conf.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python +import paddle.v2 as paddle +import sys +import gzip + + +def seq2seq_net(source_dict_dim, + target_dict_dim, + word_vector_dim=620, + rnn_hidden_size=1000, + beam_size=1, + max_length=50, + is_generating=False): + """ + Define the network structure of NMT, including encoder and decoder. + + :param source_dict_dim: size of source dictionary + :type source_dict_dim: int + :param target_dict_dim: size of target dictionary + :type target_dict_dim: int + :param word_vector_dim: size of source language word embedding + :type word_vector_dim: int + :param rnn_hidden_size: size of hidden state of encoder and decoder RNN + :type rnn_hidden_size: int + :param beam_size: expansion width in each step when generating + :type beam_size: int + :param max_length: max iteration number in generation + :type max_length: int + :param is_generating: whether to generate sequence or to train + :type is_generating: bool + """ + + decoder_size = encoder_size = rnn_hidden_size + + src_word_id = paddle.layer.data( + name="source_language_word", + type=paddle.data_type.integer_value_sequence(source_dict_dim)) + src_embedding = paddle.layer.embedding( + input=src_word_id, size=word_vector_dim) + + # use bidirectional_gru as the encoder + encoded_vector = paddle.networks.bidirectional_gru( + input=src_embedding, + size=encoder_size, + fwd_act=paddle.activation.Tanh(), + fwd_gate_act=paddle.activation.Sigmoid(), + bwd_act=paddle.activation.Tanh(), + bwd_gate_act=paddle.activation.Sigmoid(), + return_seq=True) + #### Decoder + encoder_last = paddle.layer.last_seq(input=encoded_vector) + encoder_last_projected = paddle.layer.fc( + size=decoder_size, act=paddle.activation.Tanh(), input=encoder_last) + + # gru step + def gru_decoder_without_attention(enc_vec, current_word): + """ + Step function for gru decoder + + :param enc_vec: encoded vector of source language + :type enc_vec: layer object + :param current_word: current input of decoder + :type current_word: layer object + """ + decoder_mem = paddle.layer.memory( + name="gru_decoder", + size=decoder_size, + boot_layer=encoder_last_projected) + + context = paddle.layer.last_seq(input=enc_vec) + + decoder_inputs = paddle.layer.fc( + size=decoder_size * 3, input=[context, current_word]) + + gru_step = paddle.layer.gru_step( + name="gru_decoder", + act=paddle.activation.Tanh(), + gate_act=paddle.activation.Sigmoid(), + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) + + out = paddle.layer.fc( + size=target_dict_dim, + bias_attr=True, + act=paddle.activation.Softmax(), + input=gru_step) + return out + + group_input1 = paddle.layer.StaticInput(input=encoded_vector) + group_inputs = [group_input1] + + decoder_group_name = "decoder_group" + if is_generating: + trg_embedding = paddle.layer.GeneratedInput( + size=target_dict_dim, +
embedding_name="_target_language_embedding", + embedding_size=word_vector_dim) + group_inputs.append(trg_embedding) + + beam_gen = paddle.layer.beam_search( + name=decoder_group_name, + step=gru_decoder_without_attention, + input=group_inputs, + bos_id=0, + eos_id=1, + beam_size=beam_size, + max_length=max_length) + + return beam_gen + else: + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name="target_language_word", + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name="_target_language_embedding")) + group_inputs.append(trg_embedding) + + decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_without_attention, + input=group_inputs) + + lbl = paddle.layer.data( + name="target_language_next_word", + type=paddle.data_type.integer_value_sequence(target_dict_dim)) + cost = paddle.layer.classification_cost(input=decoder, label=lbl) + + return cost diff --git a/nmt_without_attention/nmt_without_attention.py b/nmt_without_attention/nmt_without_attention.py deleted file mode 100644 index 5a61b525e67f7d07f66ae8cc5064c0244bc0b6f3..0000000000000000000000000000000000000000 --- a/nmt_without_attention/nmt_without_attention.py +++ /dev/null @@ -1,263 +0,0 @@ -#!/usr/bin/env python - -import sys -import gzip -import paddle.v2 as paddle - -### Parameters -word_vector_dim = 620 -latent_chain_dim = 1000 - -beam_size = 5 -max_length = 50 - - -def seq2seq_net(source_dict_dim, target_dict_dim, generating=False): - ''' - Define the network structure of NMT, including encoder and decoder. - - :param source_dict_dim: size of source dictionary - :type source_dict_dim : int - :param target_dict_dim: size of target dictionary - :type target_dict_dim: int - ''' - - decoder_size = encoder_size = latent_chain_dim - - #### Encoder - src_word_id = paddle.layer.data( - name='source_language_word', - type=paddle.data_type.integer_value_sequence(source_dict_dim)) - src_embedding = paddle.layer.embedding( - input=src_word_id, size=word_vector_dim) - # use bidirectional_gru - encoded_vector = paddle.networks.bidirectional_gru( - input=src_embedding, - size=encoder_size, - fwd_act=paddle.activation.Tanh(), - fwd_gate_act=paddle.activation.Sigmoid(), - bwd_act=paddle.activation.Tanh(), - bwd_gate_act=paddle.activation.Sigmoid(), - return_seq=True) - #### Decoder - encoder_last = paddle.layer.last_seq(input=encoded_vector) - encoder_last_projected = paddle.layer.mixed( - size=decoder_size, - act=paddle.activation.Tanh(), - input=paddle.layer.full_matrix_projection(input=encoder_last)) - - # gru step - def gru_decoder_without_attention(enc_vec, current_word): - ''' - Step function for gru decoder - - :param enc_vec: encoded vector of source language - :type enc_vec: layer object - :param current_word: current input of decoder - :type current_word: layer object - ''' - decoder_mem = paddle.layer.memory( - name='gru_decoder', - size=decoder_size, - boot_layer=encoder_last_projected) - - context = paddle.layer.last_seq(input=enc_vec) - - decoder_inputs = paddle.layer.mixed( - size=decoder_size * 3, - input=[ - paddle.layer.full_matrix_projection(input=context), - paddle.layer.full_matrix_projection(input=current_word) - ]) - - gru_step = paddle.layer.gru_step( - name='gru_decoder', - act=paddle.activation.Tanh(), - gate_act=paddle.activation.Sigmoid(), - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size) - - out = paddle.layer.mixed( - size=target_dict_dim, - bias_attr=True, - 
act=paddle.activation.Softmax(), - input=paddle.layer.full_matrix_projection(input=gru_step)) - return out - - decoder_group_name = "decoder_group" - group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) - group_inputs = [group_input1] - - if not generating: - trg_embedding = paddle.layer.embedding( - input=paddle.layer.data( - name='target_language_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)), - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) - group_inputs.append(trg_embedding) - - decoder = paddle.layer.recurrent_group( - name=decoder_group_name, - step=gru_decoder_without_attention, - input=group_inputs) - - lbl = paddle.layer.data( - name='target_language_next_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)) - cost = paddle.layer.classification_cost(input=decoder, label=lbl) - - return cost - else: - - trg_embedding = paddle.layer.GeneratedInput( - size=target_dict_dim, - embedding_name='_target_language_embedding', - embedding_size=word_vector_dim) - group_inputs.append(trg_embedding) - - beam_gen = paddle.layer.beam_search( - name=decoder_group_name, - step=gru_decoder_without_attention, - input=group_inputs, - bos_id=0, - eos_id=1, - beam_size=beam_size, - max_length=max_length) - - return beam_gen - - -def train(source_dict_dim, target_dict_dim): - ''' - Training function for NMT - - :param source_dict_dim: size of source dictionary - :type source_dict_dim: int - :param target_dict_dim: size of target dictionary - :type target_dict_dim: int - ''' - # initialize model - cost = seq2seq_net(source_dict_dim, target_dict_dim) - parameters = paddle.parameters.create(cost) - - # define optimize method and trainer - optimizer = paddle.optimizer.RMSProp( - learning_rate=1e-3, - gradient_clipping_threshold=10.0, - regularization=paddle.optimizer.L2Regularization(rate=8e-4)) - trainer = paddle.trainer.SGD( - cost=cost, parameters=parameters, update_equation=optimizer) - # define data reader - wmt14_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.train(source_dict_dim), buf_size=8192), - batch_size=55) - - # define event_handler callback - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0 and event.batch_id > 0: - with gzip.open('models/nmt_without_att_params_batch_%d.tar.gz' % - event.batch_id, 'w') as f: - parameters.to_tar(f) - - if event.batch_id % 10 == 0: - print "\nPass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - else: - sys.stdout.write('.') - sys.stdout.flush() - - # start to train - trainer.train( - reader=wmt14_reader, event_handler=event_handler, num_passes=2) - - -def generate(source_dict_dim, target_dict_dim, init_models_path): - ''' - Generating function for NMT - - :param source_dict_dim: size of source dictionary - :type source_dict_dim: int - :param target_dict_dim: size of target dictionary - :type target_dict_dim: int - :param init_models_path: path for inital model - :type init_models_path: string - ''' - - # load data samples for generation - gen_creator = paddle.dataset.wmt14.gen(source_dict_dim) - gen_data = [] - for item in gen_creator(): - gen_data.append((item[0], )) - - beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True) - with gzip.open(init_models_path) as f: - parameters = paddle.parameters.Parameters.from_tar(f) - # prob is the prediction probabilities, and id is the prediction word. 
- beam_result = paddle.infer( - output_layer=beam_gen, - parameters=parameters, - input=gen_data, - field=['prob', 'id']) - - # get the dictionary - src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim) - - # the delimited element of generated sequences is -1, - # the first element of each generated sequence is the sequence length - seq_list, seq = [], [] - for w in beam_result[1]: - if w != -1: - seq.append(w) - else: - seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]])) - seq = [] - - prob = beam_result[0] - for i in xrange(len(gen_data)): - print "\n*******************************************************\n" - print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n" - for j in xrange(beam_size): - print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j] - - -def usage_helper(): - print "Please specify training/generating phase!" - print "Usage: python nmt_without_attention_v2.py --train/generate" - exit(1) - - -def main(): - if not (len(sys.argv) == 2): - usage_helper() - if sys.argv[1] == '--train': - generating = False - elif sys.argv[1] == '--generate': - generating = True - else: - usage_helper() - - # initialize paddle - paddle.init(use_gpu=False, trainer_count=1) - source_language_dict_dim = 30000 - target_language_dict_dim = 30000 - - if generating: - # modify this path to speicify a trained model. - init_models_path = 'models/nmt_without_att_params_batch_1800.tar.gz' - if not os.path.exists(init_models_path): - print "trained model cannot be found." - exit(1) - generate(source_language_dict_dim, target_language_dict_dim, - init_models_path) - else: - if not os.path.exists('./models'): - os.system('mkdir ./models') - train(source_language_dict_dim, target_language_dict_dim) - - -if __name__ == '__main__': - main() diff --git a/nmt_without_attention/train.py b/nmt_without_attention/train.py new file mode 100644 index 0000000000000000000000000000000000000000..9600df8e5b70cca90543062b040e6dddc540440c --- /dev/null +++ b/nmt_without_attention/train.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +import os +import logging +import gzip +import paddle.v2 as paddle + +from network_conf import seq2seq_net + +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) + + +def train(save_dir_path, source_dict_dim, target_dict_dim): + ''' + Training function for NMT + + :param save_dir_path: path of the directory to save the trained models.
+ :type save_dir_path: str + :param source_dict_dim: size of source dictionary + :type source_dict_dim: int + :param target_dict_dim: size of target dictionary + :type target_dict_dim: int + ''' + if not os.path.exists(save_dir_path): + os.mkdir(save_dir_path) + + # initialize PaddlePaddle + paddle.init(use_gpu=False, trainer_count=1) + + cost = seq2seq_net(source_dict_dim, target_dict_dim) + parameters = paddle.parameters.create(cost) + + # define optimization method and the trainer instance + optimizer = paddle.optimizer.RMSProp( + learning_rate=1e-3, + gradient_clipping_threshold=10.0, + regularization=paddle.optimizer.L2Regularization(rate=8e-4)) + trainer = paddle.trainer.SGD( + cost=cost, parameters=parameters, update_equation=optimizer) + + # define data reader + wmt14_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(source_dict_dim), buf_size=8192), + batch_size=8) + + # define the event_handler callback + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if not event.batch_id % 100 and event.batch_id: + with gzip.open( + os.path.join(save_dir_path, + "nmt_without_att_%05d_batch_%05d.tar.gz" % + (event.pass_id, event.batch_id)), "w") as f: + parameters.to_tar(f) + + if event.batch_id and not event.batch_id % 10: + logger.info("Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics)) + + # start training + trainer.train( + reader=wmt14_reader, event_handler=event_handler, num_passes=2) + + +if __name__ == '__main__': + train(save_dir_path="models", source_dict_dim=30000, target_dict_dim=30000) diff --git a/ntm_addressing_mechanism/README.md b/ntm_addressing_mechanism/README.md deleted file mode 100644 index a0990367ef8b03c70c29d285e22ef85907e1d0b7..0000000000000000000000000000000000000000 --- a/ntm_addressing_mechanism/README.md +++ /dev/null @@ -1 +0,0 @@ -TBD diff --git a/regression/README.md b/regression/README.md deleted file mode 100644 index a0990367ef8b03c70c29d285e22ef85907e1d0b7..0000000000000000000000000000000000000000 --- a/regression/README.md +++ /dev/null @@ -1 +0,0 @@ -TBD diff --git a/scheduled_sampling/README.md b/scheduled_sampling/README.md index 9af4387e125a969e68704b58ea8c600e847dbf7f..644c1b960cb7a153a69b4c4f282008372194943c 100644 --- a/scheduled_sampling/README.md +++ b/scheduled_sampling/README.md @@ -4,7 +4,7 @@ 序列生成任务的生成目标是在给定源输入的条件下,最大化目标序列的概率。训练时该模型将目标序列中的真实元素作为解码器每一步的输入,然后最大化下一个元素的概率。生成时上一步解码得到的元素被用作当前的输入,然后生成下一个元素。可见这种情况下训练阶段和生成阶段的解码器输入数据的概率分布并不一致。 -Scheduled Sampling\[[1](#参考文献)\]是一种解决训练和生成时输入数据分布不一致的方法。在训练早期该方法主要使用目标序列中的真实元素作为解码器输入,可以将模型从随机初始化的状态快速引导至一个合理的状态。随着训练的进行,该方法会逐渐更多地使用生成的元素作为解码器输入,以解决数据分布不一致的问题。 +Scheduled Sampling \[[1](#参考文献)\]是一种解决训练和生成时输入数据分布不一致的方法。在训练早期该方法主要使用目标序列中的真实元素作为解码器输入,可以将模型从随机初始化的状态快速引导至一个合理的状态。随着训练的进行,该方法会逐渐更多地使用生成的元素作为解码器输入,以解决数据分布不一致的问题。 标准的序列到序列模型中,如果序列前面生成了错误的元素,后面的输入状态将会受到影响,而该误差会随着生成过程不断向后累积。Scheduled Sampling以一定概率将生成的元素作为解码器输入,这样即使前面生成错误,其训练目标仍然是最大化真实目标序列的概率,模型会朝着正确的方向进行训练。因此这种方式增加了模型的容错能力。 @@ -24,14 +24,14 @@ Scheduled Sampling主要应用在序列到序列模型的训练阶段,而生 图1给出了这三种方式的衰减曲线,

-<img src="img/decay.jpg">
+<img src="images/decay.jpg">
图1. 线性衰减、指数衰减和反向Sigmoid衰减的衰减曲线
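
The three schedules named in Figure 1 are easy to prototype. Below is a minimal Python sketch of what linear, exponential, and inverse-sigmoid decay of the sampling probability $\epsilon_i$ could look like; the function names and the constants `k` are illustrative assumptions, not values taken from this repository.

```python
import math


def linear_decay(i, k=1e-5, epsilon_min=0.05):
    # epsilon_i falls off linearly with the training step i,
    # clipped at a small minimum probability.
    return max(epsilon_min, 1.0 - k * i)


def exponential_decay(i, k=0.99999):
    # epsilon_i = k^i with 0 < k < 1.
    return k ** i


def inverse_sigmoid_decay(i, k=5000.0):
    # epsilon_i = k / (k + exp(i / k)) with k >= 1: stays near 1 early
    # in training, then follows the S-shaped curve of Figure 1.
    # min() guards math.exp against overflow for very large i.
    return k / (k + math.exp(min(i / k, 700.0)))
```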

如图2所示,在解码器的$t$时刻Scheduled Sampling以概率$\epsilon_i$使用上一时刻的真实元素$y_{t-1}$作为解码器输入,以概率$1-\epsilon_i$使用上一时刻生成的元素$g_{t-1}$作为解码器输入。从图1可知随着$i$的增大$\epsilon_i$会不断减小,解码器将不断倾向于使用生成的元素作为输入,训练阶段和生成阶段的数据分布将变得越来越一致。

-<img src="img/Scheduled_Sampling.jpg">
+<img src="images/Scheduled_Sampling.jpg">
图2. Scheduled Sampling选择不同元素作为解码器输入示意图
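
Concretely, the choice described above is a single Bernoulli draw per decoding step. A minimal sketch follows; the names `y_prev` and `g_prev` for $y_{t-1}$ and $g_{t-1}$ are hypothetical, not identifiers from this repository.

```python
import random


def choose_decoder_input(y_prev, g_prev, epsilon_i):
    # With probability epsilon_i feed the ground-truth token y_{t-1};
    # otherwise feed the token g_{t-1} generated at the previous step.
    return y_prev if random.random() < epsilon_i else g_prev
```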

diff --git a/scheduled_sampling/img/Scheduled_Sampling.jpg b/scheduled_sampling/images/Scheduled_Sampling.jpg
similarity index 100%
rename from scheduled_sampling/img/Scheduled_Sampling.jpg
rename to scheduled_sampling/images/Scheduled_Sampling.jpg
diff --git a/scheduled_sampling/img/decay.jpg b/scheduled_sampling/images/decay.jpg
similarity index 100%
rename from scheduled_sampling/img/decay.jpg
rename to scheduled_sampling/images/decay.jpg
diff --git a/scheduled_sampling/index.html b/scheduled_sampling/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..224f598126cfbe477058a7af45a30dd9d4c8764f
--- /dev/null
+++ b/scheduled_sampling/index.html
@@ -0,0 +1,277 @@
+<!-- 277 lines of HTML omitted -->
diff --git a/sequence_tagging_for_ner/infer.py b/sequence_tagging_for_ner/infer.py
index 693c3a85f7f77e1d5addc6143326b5246043a2c9..cf48bc249c80fd44415d643ffb60bfb0feec4e1f 100644
--- a/sequence_tagging_for_ner/infer.py
+++ b/sequence_tagging_for_ner/infer.py
@@ -1,8 +1,9 @@
 import gzip
 
 import reader
-from network_conf import *
-from utils import *
+import paddle.v2 as paddle
+from network_conf import ner_net
+from utils import load_dict, load_reverse_dict
 
 
 def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
diff --git a/sequence_tagging_for_ner/train.py b/sequence_tagging_for_ner/train.py
index aa7f69087d1f399f87fc2ed92cb4040c5ed0e03e..dd041b6aaaebee0e263abf4bea035d578344409f 100644
--- a/sequence_tagging_for_ner/train.py
+++ b/sequence_tagging_for_ner/train.py
@@ -2,8 +2,11 @@
+import os
 import gzip
 
 import numpy as np
 import reader
-from utils import *
-from network_conf import *
+from utils import logger, load_dict, get_embedding
+from network_conf import ner_net
+
+import paddle.v2 as paddle
+import paddle.v2.evaluator as evaluator
 
 
 def main(train_data_file,
@@ -11,8 +14,12 @@ def main(train_data_file,
          vocab_file,
          target_file,
          emb_file,
+         model_save_dir,
          num_passes=10,
          batch_size=32):
+    if not os.path.exists(model_save_dir):
+        os.mkdir(model_save_dir)
+
     word_dict = load_dict(vocab_file)
     label_dict = load_dict(target_file)
@@ -77,8 +84,9 @@ def main(train_data_file,
         if isinstance(event, paddle.event.EndPass):
             # save parameters
-            with gzip.open("models/params_pass_%d.tar.gz" % event.pass_id,
-                           "w") as f:
+            with gzip.open(
+                    os.path.join(model_save_dir, "params_pass_%d.tar.gz" %
+                                 event.pass_id), "w") as f:
                 parameters.to_tar(f)
 
             result = trainer.test(reader=test_reader, feeding=feeding)
@@ -94,8 +102,8 @@
 
 if __name__ == "__main__":
     main(
-        train_data_file='data/train',
-        test_data_file='data/test',
-        vocab_file='data/vocab.txt',
-        target_file='data/target.txt',
-        emb_file='data/wordVectors.txt')
+        train_data_file="data/train",
+        test_data_file="data/test",
+        vocab_file="data/vocab.txt",
+        target_file="data/target.txt",
+        emb_file="data/wordVectors.txt",
+        model_save_dir="models")
diff --git a/text_classification/infer.py b/text_classification/infer.py
index a7ac4e1d5a0622435336de72902319c3d8fd8616..238c47df6c2e17374614801468ab0fe29320a669 100644
--- a/text_classification/infer.py
+++ b/text_classification/infer.py
@@ -6,15 +6,15 @@
 import gzip
 
 import paddle.v2 as paddle
 
-import network_conf
 import reader
-from utils import *
+from network_conf import fc_net, convolution_net
+from utils import logger, load_dict, load_reverse_dict
 
 
 def infer(topology, data_dir, model_path, word_dict_path, label_dict_path,
           batch_size):
     def _infer_a_batch(inferer, test_batch, ids_2_word, ids_2_label):
-        probs = inferer.infer(input=test_batch, field=['value'])
+        probs = inferer.infer(input=test_batch, field=["value"])
         assert len(probs) == len(test_batch)
         for word_ids, prob in zip(test_batch, probs):
             word_text = " ".join([ids_2_word[id] for id in word_ids[0]])
@@ -22,7 +22,7 @@
                 " ".join(["{:0.4f}".format(p) for p in prob]), word_text))
 
-    logger.info('begin to predict...')
+    logger.info("begin to predict...")
 
     use_default_data = (data_dir is None)
     if use_default_data:
@@ -33,9 +33,9 @@
         test_reader = paddle.dataset.imdb.test(word_dict)
     else:
         assert os.path.exists(
-            word_dict_path), 'the word dictionary file does not exist'
+            word_dict_path), "the word dictionary file does not exist"
         assert os.path.exists(
-            label_dict_path), 'the label dictionary file does not exist'
+            label_dict_path), "the label dictionary file does not exist"
 
         word_dict = load_dict(word_dict_path)
         word_reverse_dict = load_reverse_dict(word_dict_path)
@@ -52,7 +52,7 @@
     # load the trained models
     parameters = paddle.parameters.Parameters.from_tar(
-        gzip.open(model_path, 'r'))
+        gzip.open(model_path, "r"))
     inferer = paddle.inference.Inference(
         output_layer=prob_layer, parameters=parameters)
@@ -70,19 +70,19 @@
         test_batch = []
 
 
-if __name__ == '__main__':
-    model_path = 'dnn_params_pass_00000.tar.gz'
+if __name__ == "__main__":
+    model_path = "models/dnn_params_pass_00000.tar.gz"
     assert os.path.exists(model_path), "the trained model does not exist."
 
-    nn_type = 'dnn'
+    nn_type = "dnn"
     test_dir = None
     word_dict = None
     label_dict = None
 
-    if nn_type == 'dnn':
-        topology = network_conf.fc_net
-    elif nn_type == 'cnn':
-        topology = network_conf.convolution_net
+    if nn_type == "dnn":
+        topology = fc_net
+    elif nn_type == "cnn":
+        topology = convolution_net
 
     infer(
         topology=topology,
diff --git a/text_classification/train.py b/text_classification/train.py
index f635fdf29f594752a1040f2207f373f6217ede64..4f31b09373ce19b5e8806735fd1d449154b27ed2 100644
--- a/text_classification/train.py
+++ b/text_classification/train.py
@@ -1,13 +1,14 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+import os
 import sys
 import gzip
 
 import paddle.v2 as paddle
 
-import network_conf
 import reader
-from utils import *
+from utils import logger, parse_train_cmd, build_dict, load_dict
+from network_conf import fc_net, convolution_net
 
 
 def train(topology,
@@ -15,6 +16,7 @@ def train(topology,
           test_data_dir=None,
           word_dict_path=None,
           label_dict_path=None,
+          model_save_dir="models",
           batch_size=32,
           num_passes=10):
     """
@@ -33,6 +35,8 @@ def train(topology,
     :params num_pass: train pass number
     :type num_pass: int
     """
+    if not os.path.exists(model_save_dir):
+        os.mkdir(model_save_dir)
 
     use_default_data = (train_data_dir is None)
 
@@ -136,8 +140,9 @@ def train(topology,
             result = trainer.test(reader=test_reader, feeding=feeding)
             logger.info("Test at Pass %d, %s \n" % (event.pass_id,
                                                     result.metrics))
-            with gzip.open("dnn_params_pass_%05d.tar.gz" % event.pass_id,
-                           "w") as f:
+            with gzip.open(
+                    os.path.join(model_save_dir, "dnn_params_pass_%05d.tar.gz"
+                                 % event.pass_id), "w") as f:
                 parameters.to_tar(f)
 
     trainer.train(
@@ -151,9 +156,9 @@ def train(topology,
 
 def main(args):
     if args.nn_type == "dnn":
-        topology = network_conf.fc_net
+        topology = fc_net
     elif args.nn_type == "cnn":
-        topology = network_conf.convolution_net
+        topology = convolution_net
 
     train(
         topology=topology,
@@ -162,7 +167,8 @@
         word_dict_path=args.word_dict,
         label_dict_path=args.label_dict,
         batch_size=args.batch_size,
-        num_passes=args.num_passes)
+        num_passes=args.num_passes,
+        model_save_dir=args.model_save_dir)
 
 
 if __name__ == "__main__":
diff --git a/text_classification/utils.py b/text_classification/utils.py
index 7364add220479cfdd1b96369d5c541a2d3764c6d..831d2b3b071742aa233638784cbd7bb16195b29f 100644
--- a/text_classification/utils.py
+++ b/text_classification/utils.py
@@ -5,7 +5,7 @@
 import os
 import argparse
 from collections import defaultdict
 
-logger = logging.getLogger("logger")
+logger = logging.getLogger("paddle")
 logger.setLevel(logging.INFO)
 
 
@@ -60,6 +60,12 @@
help="the number of training examples in one forward/backward pass") parser.add_argument( "--num_passes", type=int, default=10, help="number of passes to train") + parser.add_argument( + "--model_save_dir", + type=str, + required=False, + help=("path to save the trained models."), + default="models") return parser.parse_args()