提交 b090ce32 编写于 作者: L liaogang

Fix conflict with develop

......@@ -95,7 +95,6 @@ function(link_paddle_exe TARGET_NAME)
paddle_parameter
paddle_proto
paddle_cuda
paddle_test_main
${METRIC_LIBS}
${EXTERNAL_LIBS}
${CMAKE_THREAD_LIBS_INIT}
......@@ -130,8 +129,9 @@ endfunction()
# Rest Arguemnts: not used.
function(link_paddle_test TARGET_NAME)
link_paddle_exe(${TARGET_NAME})
target_link_libraries(${TARGET_NAME} ${GTEST_MAIN_LIBRARIES}
${GTEST_LIBRARIES})
target_link_libraries(${TARGET_NAME}
paddle_test_main
${GTEST_LIBRARIES})
endfunction()
# add_unittest_without_exec
......
......@@ -14,9 +14,19 @@
# limitations under the License.
set -e
set -x
BASE_URL='http://paddlepaddle.cdn.bcebos.com/model_zoo/embedding'
# download the dictionary and pretrained model
for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb
do
wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/$file
DOWNLOAD_ITEMS=(baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb)
ITEM_MD5=(fa03a12321eaab6c30a8fcc9442eaea3
f88c8325ee6da6187f1080e8fe66c1cd
927cf70f27f860aff1a5703ebf7f1584
a52e43655cd25d279777ed509a1ae27b
b92c67fe9ff70fea53596080e351ac80)
for ((i=0; i<${#ITEM_MD5[@]}; i++))
do
FILENAME=${DOWNLOAD_ITEMS[${i}]}
REAL_MD5=`wget ${BASE_URL}/${FILENAME} -O - | tee ${FILENAME} | md5sum | cut -d ' ' -f 1`
EXPECTED_MD5=${ITEM_MD5[${i}]}
[ "${EXPECTED_MD5}" = "${REAL_MD5}" ]
done
#!/usr/bin/python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import re
import math
def get_best_pass(log_filename):
with open(log_filename, 'r') as f:
text = f.read()
pattern = re.compile('Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)',
re.S)
results = re.findall(pattern, text)
sorted_results = sorted(results, key=lambda result: float(result[0]))
return sorted_results[0]
log_filename = sys.argv[1]
log = get_best_pass(log_filename)
predict_error = math.sqrt(float(log[0])) / 2
print 'Best pass is %s, error is %s, which means predict get error as %f' % (
log[1], log[0], predict_error)
evaluate_pass = "output/pass-%s" % log[1]
print "evaluating from pass %s" % evaluate_pass
......@@ -72,7 +72,7 @@ PaddlePaddle支持非常多的优化算法(Optimizer),不同的优化算法需
减少数据载入的耗时
++++++++++++++++++
使用 :code:`pydataprovider`时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大的加速数据载入流程。
使用\ :code:`pydataprovider`\ 时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大的加速数据载入流程。
:code:`DataProvider` 缓存池的减小,和之前减小通过减小缓存池来减小内存占用的原理一致。
.. literalinclude:: src/reduce_min_pool_size.py
......
......@@ -4,6 +4,7 @@ RNN相关模型
.. toctree::
:maxdepth: 1
rnn_config_cn.rst
recurrent_group_cn.md
hierarchical_layer_cn.rst
hrnn_rnn_api_compare_cn.rst
RNN 配置
=================
本教程将指导你如何在 PaddlePaddle 中配置循环神经网络(RNN)。PaddlePaddle 高度支持灵活和高效的循环神经网络配置。 在本教程中,您将了解如何:
- 准备用来学习循环神经网络的序列数据。
- 配置循环神经网络架构。
- 使用学习完成的循环神经网络模型生成序列。
我们将使用 vanilla 循环神经网络和 sequence to sequence 模型来指导你完成这些步骤。sequence to sequence 模型的代码可以在`demo / seqToseq`找到。
准备序列数据
---------------------
PaddlePaddle 不需要对序列数据进行任何预处理,例如填充。唯一需要做的是将相应类型设置为输入。例如,以下代码段定义了三个输入。 它们都是序列,它们的大小是`src_dict``trg_dict``trg_dict`
``` sourceCode
settings.input_types = [
integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(settings.trg_dict)),
integer_value_sequence(len(settings.trg_dict))]
```
`process`函数中,每个`yield`函数将返回三个整数列表。每个整数列表被视为一个整数序列:
``` sourceCode
yield src_ids, trg_ids, trg_ids_next
```
有关如何编写数据提供程序的更多细节描述,请参考 [PyDataProvider2](../../ui/data_provider/index.html)。完整的数据提供文件在 `demo/seqToseq/dataprovider.py`
配置循环神经网络架构
-----------------------------------------------
### 简单门控循环神经网络(Gated Recurrent Neural Network)
循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。
![image](../../../tutorials/sentiment_analysis/bi_lstm.jpg)
一般来说,循环网络从 *t* = 1 到 *t* = *T* 或者反向地从 *t* = *T**t* = 1 执行以下操作。
*x*<sub>*t* + 1</sub> = *f*<sub>*x*</sub>(*x*<sub>*t*</sub>),*y*<sub>*t*</sub> = *f*<sub>*y*</sub>(*x*<sub>*t*</sub>)
其中 *f*<sub>*x*</sub>(.) 称为**单步函数**(即单时间步执行的函数,step function),而 *f*<sub>*y*</sub>(.) 称为**输出函数**。在 vanilla 循环神经网络中,单步函数和输出函数都非常简单。然而,PaddlePaddle 可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to sequence 模型演示如何配置复杂的循环神经网络模型。在本节中,我们将使用简单的 vanilla 循环神经网络作为使用`recurrent_group`配置简单循环神经网络的例子。 注意,如果你只需要使用简单的RNN,GRU或LSTM,那么推荐使用`grumemory``lstmemory`,因为它们的计算效率比`recurrent_group`更高。
对于 vanilla RNN,在每个时间步长,**单步函数**为:
*x*<sub>*t* + 1</sub> = *W*<sub>*x*</sub>*x*<sub>*t*</sub> + *W*<sub>*i*</sub>*I*<sub>*t*</sub> + *b*
其中 *x*<sub>*t*</sub> 是RNN状态,并且 *I*<sub>*t*</sub> 是输入,*W*<sub>*x*</sub>*W*<sub>*i*</sub> 分别是RNN状态和输入的变换矩阵。*b* 是偏差。它的**输出函数**只需要*x*<sub>*t*</sub>作为输出。
`recurrent_group`是构建循环神经网络的最重要的工具。 它定义了**单步函数****输出函数**和循环神经网络的输入。注意,这个函数的`step`参数需要实现`step function`(单步函数)和`output function`(输出函数):
``` sourceCode
def simple_rnn(input,
size=None,
name=None,
reverse=False,
rnn_bias_attr=None,
act=None,
rnn_layer_attr=None):
def __rnn_step__(ipt):
out_mem = memory(name=name, size=size)
rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
full_matrix_projection(out_mem)],
name = name,
bias_attr = rnn_bias_attr,
act = act,
layer_attr = rnn_layer_attr,
size = size)
return rnn_out
return recurrent_group(name='%s_recurrent_group' % name,
step=__rnn_step__,
reverse=reverse,
input=input)
```
PaddlePaddle 使用“Memory”(记忆模块)实现单步函数。**Memory**是在PaddlePaddle中构造循环神经网络时最重要的概念。 Memory是在单步函数中循环使用的状态,例如*x*<sub>*t* + 1</sub> = *f*<sub>*x*</sub>(*x*<sub>*t*</sub>)。 一个Memory包含**输出****输入**。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有**boot layer(引导层)**,其输出被用作Memory的初始值。 在我们的例子中,门控循环单元的输出被用作输出Memory。请注意,`rnn_out`层的名称与`out_mem`的名称相同。这意味着`rnn_out` (*x*<sub>*t* + 1</sub>)的输出被用作`out_mem`Memory的**输出**
Memory也可以是序列。在这种情况下,在每个时间步中,我们有一个序列作为循环神经网络的状态。这在构造非常复杂的循环神经网络时是有用的。 其他高级功能包括定义多个Memory,以及使用子序列来定义分级循环神经网络架构。
我们在函数的结尾返回`rnn_out`。 这意味着 `rnn_out` 层的输出被用作门控循环神经网络的**输出**函数。
### Sequence to Sequence Model with Attention
我们将使用 sequence to sequence model with attention 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。
![image](../../../tutorials/text_generation/encoder-decoder-attention-model.png)
在这个模型中,源序列 *S* = {*s*<sub>1</sub>, …, *s*<sub>*T*</sub>} 用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态 *H*<sub>*S*</sub> = {*H*<sub>1</sub>, …, *H*<sub>*T*</sub>} 被称为 *编码向量*。解码器是门控循环神经网络。当解读每一个*y*<sub>*t*</sub>时, 这个门控循环神经网络生成一系列权重 *W*<sub>*S*</sub><sup>*t*</sup> = {*W*<sub>1</sub><sup>*t*</sup>, …, *W*<sub>*T*</sub><sup>*t*</sup>}, 用于计算编码向量的加权和。加权和用来生成*y*<sub>*t*</sub>
模型的编码器部分如下所示。它叫做`grumemory`来表示门控循环神经网络。如果网络架构简单,那么推荐使用循环神经网络的方法,因为它比 `recurrent_group` 更快。我们已经实现了大多数常用的循环神经网络架构,可以参考 [Layers](../../ui/api/trainer_config_helpers/layers_index.html) 了解更多细节。
我们还将编码向量投射到 `decoder_size` 维空间。这通过获得反向循环网络的第一个实例,并将其投射到 `decoder_size` 维空间完成:
``` sourceCode
# 定义源语句的数据层
src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
# 计算每个词的词向量
src_embedding = embedding_layer(
input=src_word_id,
size=word_vector_dim,
param_attr=ParamAttr(name='_source_language_embedding'))
# 应用前向循环神经网络
src_forward = grumemory(input=src_embedding, size=encoder_size)
# 应用反向递归神经网络(reverse=True表示反向循环神经网络)
src_backward = grumemory(input=src_embedding,
size=encoder_size,
reverse=True)
# 将循环神经网络的前向和反向部分混合在一起
encoded_vector = concat_layer(input=[src_forward, src_backward])
# 投射编码向量到 decoder_size
encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
size = decoder_size)
# 计算反向RNN的第一个实例
backward_first = first_seq(input=src_backward)
# 投射反向RNN的第一个实例到 decoder size
decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
```
解码器使用 `recurrent_group` 来定义循环神经网络。单步函数和输出函数在 `gru_decoder_with_attention` 中定义:
``` sourceCode
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
trg_embedding = embedding_layer(
input=data_layer(name='target_language_word',
size=target_dict_dim),
size=word_vector_dim,
param_attr=ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# 对于配备有注意力机制的解码器,在训练中,
# 目标向量(groudtruth)是数据输入,
# 而源序列的编码向量可以被无边界的memory访问
# StaticInput 意味着不同时间步的输入都是相同的值,
# 否则它以一个序列输入,不同时间步的输入是不同的。
# 所有输入序列应该有相同的长度。
decoder = recurrent_group(name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
```
单步函数的实现如下所示。首先,它定义解码网络的**Memory**。然后定义 attention,门控循环单元单步函数和输出函数:
``` sourceCode
def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
# 定义解码器的Memory
# Memory的输出定义在 gru_step 内
# 注意 gru_step 应该与它的Memory名字相同
decoder_mem = memory(name='gru_decoder',
size=decoder_size,
boot_layer=decoder_boot)
# 计算 attention 加权编码向量
context = simple_attention(encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
# 混合当前词向量和attention加权编码向量
decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context),
full_matrix_projection(current_word)],
size = decoder_size * 3)
# 定义门控循环单元循环神经网络单步函数
gru_step = gru_step_layer(name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
# 定义输出函数
out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
size=target_dict_dim,
bias_attr=True,
act=SoftmaxActivation())
return out
```
生成序列
-----------------
训练模型后,我们可以使用它来生成序列。通常的做法是使用**beam search** 生成序列。以下代码片段定义 beam search 算法。注意,`beam_search` 函数假设 `step` 的输出函数返回的是下一个时刻输出词的 softmax 归一化概率向量。我们对模型进行了以下更改。
- 使用 `GeneratedInput` 来表示 trg\_embedding。 `GeneratedInput` 将上一时间步所生成的词的向量来作为当前时间步的输入。
- 使用 `beam_search` 函数。这个函数需要设置:
- `bos_id`: 开始标记。每个句子都以开始标记开头。
- `eos_id`: 结束标记。每个句子都以结束标记结尾。
- `beam_size`: beam search 算法中的beam大小。
- `max_length`: 生成序列的最大长度。
- 使用 `seqtext_printer_evaluator` 根据索引矩阵和字典打印文本。这个函数需要设置:
- `id_input`: 数据的整数ID,用于标识生成的文件中的相应输出。
- `dict_file`: 用于将词ID转换为词的字典文件。
- `result_file`: 生成结果文件的路径。
代码如下:
``` sourceCode
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
# 在生成时,解码器基于编码源序列和最后生成的目标词预测下一目标词。
# 编码源序列(编码器输出)必须由只读Memory的 StaticInput 指定。
# 这里, GeneratedInputs 自动获取上一个生成的词,并在最开始初始化为起始词,如 <s>。
trg_embedding = GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
beam_gen = beam_search(name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0, # Beginnning token.
eos_id=1, # End of sentence token.
beam_size=beam_size,
max_length=max_length)
seqtext_printer_evaluator(input=beam_gen,
id_input=data_layer(name="sent_id", size=1),
dict_file=trg_dict_path,
result_file=gen_trans_file)
outputs(beam_gen)
```
注意,这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务,请参阅 [Semantic Role Labeling Demo](../../demo/semantic_role_labeling/index.html) 了解更多详细信息。
完整的配置文件在`demo/seqToseq/seqToseq_net.py`
RNN 配置
RNN配置
========
本教程将指导你如何在 PaddlePaddle
......@@ -20,7 +20,7 @@ PaddlePaddle
不需要对序列数据进行任何预处理,例如填充。唯一需要做的是将相应类型设置为输入。例如,以下代码段定义了三个输入。
它们都是序列,它们的大小是\ ``src_dict``\ ,\ ``trg_dict``\ 和\ ``trg_dict``\ :
.. code:: sourcecode
.. code:: python
settings.input_types = [
integer_value_sequence(len(settings.src_dict)),
......@@ -29,12 +29,11 @@ PaddlePaddle
在\ ``process``\ 函数中,每个\ ``yield``\ 函数将返回三个整数列表。每个整数列表被视为一个整数序列:
.. code:: sourcecode
.. code:: python
yield src_ids, trg_ids, trg_ids_next
有关如何编写数据提供程序的更多细节描述,请参考
`PyDataProvider2 <../../ui/data_provider/index.html>`__\ 。完整的数据提供文件在
有关如何编写数据提供程序的更多细节描述,请参考 :ref:`api_pydataprovider2` 。完整的数据提供文件在
``demo/seqToseq/dataprovider.py``\ 。
配置循环神经网络架构
......@@ -45,18 +44,17 @@ PaddlePaddle
循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。
.. figure:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
:alt: image
.. image:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
:align: center
image
一般来说,循环网络从 :math:`t=1` 到 :math:`t=T` 或者反向地从 :math:`t=T` 到 :math:`t=1` 执行以下操作。
一般来说,循环网络从 *t* = 1 到 *t* = *T* 或者反向地从 *t* = *T* 到 *t*
= 1 执行以下操作。
.. math::
*x*\ \ *t* + 1 = *f*\ \ *x*\ (*x*\ \ *t*\ ),\ *y*\ \ *t*\  = *f*\ \ *y*\ (*x*\ \ *t*\ )
x_{t+1} = f_x(x_t), y_t = f_y(x_t)
其中 *f*\ \ *x*\ (.) 称为\ **单步函数**\ (即单时间步执行的函数,step
function),而 *f*\ \ *y*\ (.) 称为\ **输出函数**\ 。在 vanilla
其中 :math:`f_x(.)` 称为\ **单步函数**\ (即单时间步执行的函数,step
function),而 :math:`f_y(.)` 称为\ **输出函数**\ 。在 vanilla
循环神经网络中,单步函数和输出函数都非常简单。然而,PaddlePaddle
可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to
sequence
......@@ -67,16 +65,17 @@ vanilla
对于 vanilla RNN,在每个时间步长,\ **单步函数**\ 为:
*x*\ \ *t* + 1 = *W*\ \ *x*\ \ *x*\ \ *t*\  + *W*\ \ *i*\ \ *I*\ \ *t*\  + *b*
.. math::
其中 *x*\ \ *t*\ 是RNN状态,并且 *I*\ \ *t*\ 是输入,\ *W*\ \ *x*\ 和
*W*\ \ *i*\ 分别是RNN状态和输入的变换矩阵。\ *b*
是偏差。它的\ **输出函数**\ 只需要\ *x*\ \ *t*\ 作为输出。
x_{t+1} = W_x x_t + W_i I_t + b
其中 :math:`x_t` 是RNN状态,并且 :math:`I_t` 是输入,:math:`W_x` 和
:math:`W_i` 分别是RNN状态和输入的变换矩阵。:math:`b` 是偏差。它的\ **输出函数**\ 只需要 :math:`x_t` 作为输出。
``recurrent_group``\ 是构建循环神经网络的最重要的工具。
它定义了\ **单步函数**\ ,\ **输出函数**\ 和循环神经网络的输入。注意,这个函数的\ ``step``\ 参数需要实现\ ``step function``\ (单步函数)和\ ``output function``\ (输出函数):
.. code:: sourcecode
.. code:: python
def simple_rnn(input,
size=None,
......@@ -102,7 +101,7 @@ vanilla
PaddlePaddle
使用“Memory”(记忆模块)实现单步函数。\ **Memory**\ 是在PaddlePaddle中构造循环神经网络时最重要的概念。
Memory是在单步函数中循环使用的状态,例如\ *x*\ \ *t* + 1 = *f*\ \ *x*\ (*x*\ \ *t*\ )
Memory是在单步函数中循环使用的状态,例如 :math:`x_{t+1} = f_x(x_t)`
一个Memory包含\ **输出**\ 和\ **输入**\ 。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有\ **boot
layer(引导层)**\ ,其输出被用作Memory的初始值。
在我们的例子中,门控循环单元的输出被用作输出Memory。请注意,\ ``rnn_out``\ 层的名称与\ ``out_mem``\ 的名称相同。这意味着\ ``rnn_out``
......@@ -120,30 +119,25 @@ Sequence to Sequence Model with Attention
我们将使用 sequence to sequence model with attention
作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。
.. figure:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
:alt: image
image
.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
:align: center
在这个模型中,源序列 *S* = {*s*\ 1, …, \ *s*\ \ *T*\ }
在这个模型中,源序列 :math:`S = \{s_1, \dots, s_T\}`
用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态
*H*\ \ *S*\  = {*H*\ 1, …, \ *H*\ \ *T*\ } 被称为
*编码向量*\ 。解码器是门控循环神经网络。当解读每一个\ *y*\ \ *t*\ 时,
这个门控循环神经网络生成一系列权重
*W*\ \ *S*\ \ *t*\  = {*W*\ 1\ *t*\ , …, \ *W*\ \ *T*\ \ *t*\ },
用于计算编码向量的加权和。加权和用来生成\ *y*\ \ *t*\ 。
:math:`H_S = \{H_1, \dots, H_T\}` 被称为
*编码向量*\ 。解码器是门控循环神经网络。当解读每一个 :math:`y_t` 时,
这个门控循环神经网络生成一系列权重 :math:`W_S^t = \{W_1^t, \dots, W_T^t\}` ,
用于计算编码向量的加权和。加权和用来生成 :math:`y_t` 。
模型的编码器部分如下所示。它叫做\ ``grumemory``\ 来表示门控循环神经网络。如果网络架构简单,那么推荐使用循环神经网络的方法,因为它比
``recurrent_group``
更快。我们已经实现了大多数常用的循环神经网络架构,可以参考
`Layers <../../ui/api/trainer_config_helpers/layers_index.html>`__
了解更多细节。
更快。我们已经实现了大多数常用的循环神经网络架构,可以参考 :ref:`api_trainer_config_helpers_layers` 了解更多细节。
我们还将编码向量投射到 ``decoder_size``
维空间。这通过获得反向循环网络的第一个实例,并将其投射到
``decoder_size`` 维空间完成:
.. code:: sourcecode
.. code:: python
# 定义源语句的数据层
src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
......@@ -174,7 +168,7 @@ Sequence to Sequence Model with Attention
解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在
``gru_decoder_with_attention`` 中定义:
.. code:: sourcecode
.. code:: python
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
......@@ -198,7 +192,7 @@ Sequence to Sequence Model with Attention
单步函数的实现如下所示。首先,它定义解码网络的\ **Memory**\ 。然后定义
attention,门控循环单元单步函数和输出函数:
.. code:: sourcecode
.. code:: python
def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
# 定义解码器的Memory
......@@ -253,7 +247,7 @@ attention,门控循环单元单步函数和输出函数:
代码如下:
.. code:: sourcecode
.. code:: python
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
......@@ -279,9 +273,6 @@ attention,门控循环单元单步函数和输出函数:
result_file=gen_trans_file)
outputs(beam_gen)
注意,这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务,请参阅
`Semantic Role Labeling
Demo <../../demo/semantic_role_labeling/index.html>`__
了解更多详细信息。
注意,这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务,请参阅 :ref:`semantic_role_labeling` 了解更多详细信息。
完整的配置文件在\ ``demo/seqToseq/seqToseq_net.py``\ 。
......@@ -7,10 +7,11 @@
.. toctree::
:maxdepth: 1
usage/cmd_parameter/index_cn.rst
usage/concepts/use_concepts_cn.rst
usage/cluster/cluster_train_cn.md
usage/cluster/k8s/k8s_cn.md
usage/cluster/k8s/k8s_distributed_cn.md
usage/k8s/k8s_cn.md
usage/k8s/k8s_distributed_cn.md
开发标准
--------
......
......@@ -7,8 +7,10 @@ Usage
.. toctree::
:maxdepth: 1
usage/cmd_parameter/index_en.md
usage/cmd_parameter/index_en.rst
usage/cluster/cluster_train_en.md
usage/k8s/k8s_en.md
usage/k8s/k8s_aws_en.md
Development
------------
......
# 参数概述
虽然Paddle看起来包含了众多参数,但是大部分参数是为开发者提供的,或者已经在集群提交环境中自动设置,因此用户并不需要关心它们。在此,根据这些参数的使用场合,我们将它们划分为不同的类别。例如,`通用`类别中的参数可用于所有场合。某些参数只可用于特定的层中,而有些参数需要在集群多机训练中使用等。
<html>
<table border="2" frame="border">
<thead>
<tr>
<th scope="col" class="left"></th>
<th scope="col" class="left">参数</th>
<th scope="col" class="left">本地训练</th>
<th scope="col" class="left">集群训练</th>
<th scope="col" class="left">本地测试</th>
<th scope="col" class="left">集群测试</th>
</tr>
</thead>
<tbody>
<tr>
<td class="left" rowspan="9">通用</td>
<td class="left">job</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">use_gpu</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">local</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">config</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">config_args</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">num_passes</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">trainer_count</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">version</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">show_layer_stat</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left" rowspan="15">训练</td><td class="left">dot_period</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">test_period</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">saving_period</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">show_parameter_stats_period</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">init_model_path</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">load_missing_parameter_strategy</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">saving_period_by_batches</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">use_old_updater</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">enable_grad_share</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">grad_share_block_num</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">log_error_clipping</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">log_clipping</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">save_only_one</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">allow_inefficient_sparse_update</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">start_pass</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">训练/测试</td><td class="left">save_dir</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left" rowspan = "2">训练过程中测试</td><td class="left">test_period</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">average_test_period</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left" rowspan = "5">测试</td><td class="left">model_list</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">test_wait</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">test_pass</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">predict_output_dir</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">distribute_test</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">Auc/正负对验证(PnpairValidation)</td><td class="left">predict_file</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left" rowspan = "6">GPU</td><td class="left">gpu_id</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">parallel_nn</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">allow_only_one_model_on_one_gpu</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">cudnn_dir</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">cuda_dir</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">cudnn_conv_workspace_limit_in_mb</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left" rowspan = "4">递归神经网络(RNN)</td>
<td class="left">beam_size</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">rnn_use_batch</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">prev_batch_state</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">diy_beam_search_prob_so</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left" rowspan = "2">度量学习(metric learning)</td><td class="left">external</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">data_server_port</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left" rowspan = "16">参数服务器(PServer)</td><td class="left">start_pserver</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">pservers</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">port</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">port_num</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">ports_num_for_sparse</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">nics</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">rdma_tcp</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">small_messages</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">loadsave_parameters_in_pserver</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">log_period_server</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">pserver_num_threads</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">sock_send_buf_size</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">sock_recv_buf_size</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">num_gradient_servers</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">parameter_block_size</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">parameter_block_size_for_sparse</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left" rowspan = "3">异步随机梯度下降(Async SGD)</td><td class="left">async_count</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">async_lagged_ratio_min</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">async_lagged_ratio_default</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left" rowspan = "8">性能调优(Performance Tuning)</td><td class="left">log_barrier_abstract</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">log_barrier_lowest_nodes</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">log_barrier_show_log</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">check_sparse_distribution_batches</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">check_sparse_distribution_ratio</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">check_sparse_distribution_unbalance_degree</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">check_sparse_distribution_in_pserver</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">show_check_sparse_distribution_log</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">数据提供器(Data Provider)</td><td class="left">memory_threshold_on_load_data</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left" rowspan = "2">随机数</td><td class="left">seed</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">thread_local_rand_use_global_seed</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">单元测试</td><td class="left">checkgrad_eps</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">矩阵/向量</td><td class="left">enable_parallel_vector</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
</tbody>
</table>
</html>
# 细节描述
## 通用
* `--job`
- 工作模式,包括: **train, test, checkgrad**,其中checkgrad主要为开发者使用,使用者不需要关心。
- 类型: string (默认: train)
* `--config`
- 用于指定网络配置文件。
- 类型: string (默认: null).
* `--use_gpu`
- 训练过程是否使用GPU,设置为true使用GPU模式,否则使用CPU模式。
- 类型: bool (默认: 1).
* `--local`
 - 训练过程是否为本地模式,设置为true使用本地训练或者使用集群上的一个节点,否则使用多机训练。
- 类型: bool (默认: 1).
* `--trainer_count`
- 指定一台机器上使用的线程数。例如,trainer_count = 4, 意思是在GPU模式下使用4个GPU,或者在CPU模式下使用4个线程。每个线程(或GPU)分配到当前数据块样本数的四分之一。也就是说,如果在训练配置中设置batch_size为512,每个线程分配到128个样本用于训练。
- 类型: int32 (默认: 1).
* `--num_passes`
- 当模式为`--job=train`时, 该参数的意思是训练num_passes轮。每轮会将数据集中的所有训练样本使用一次。当模式为`--job=test`时,意思是使用第test_pass个模型到第 num_passes-1 个模型测试数据。
- 类型: int32 (默认: 100).
* `--config_args`
- 传递给配置文件的参数。格式: key1=value1,key2=value2.
- 类型: string (默认: null).
* `--version`
- 是否打印版本信息。
- 类型: bool (默认: 0).
* `--show_layer_stat`
- 是否显示**每个批次数据**中每层的数值统计.
- 类型: bool (默认: 0).
## 训练
* `--log_period`
- 每log_period个批次打印日志进度.
- 类型: int32 (默认: 100).
* `--dot_period`
- 每dot_period个批次输出符号'.'.
- 类型: int32 (默认: 1).
* `--saving_period`
- 每saving_period轮保存训练参数.
- 类型: int32 (默认: 1).
* `--save_dir`
- 保存模型参数的目录,需要明确指定,但不需要提前创建。
- 类型: string (默认: null).
* `--start_pass`
- 从start_pass轮开始训练,会加载上一轮的参数。
- 类型: int32 (默认: 0).
* `--show_parameter_stats_period`
- 在训练过程中每show_parameter_stats_period个批次输出参数统计。默认不显示。
- 类型: int32 (默认: 0).
* `--save_only_one`
- 只保存最后一轮的参数,而之前的参数将会被删除。
- 类型: bool (默认: 0).
* `--load_missing_parameter_strategy`
- 当模型参数不存在时,指定加载的方式。目前支持fail/rand/zero三种操作.
- `fail`: 程序直接退出.
- `rand`: 根据网络配置中的**initial\_strategy**采用均匀分布或者高斯分布初始化。均匀分布的范围是: **[mean - std, mean + std]**, 其中mean和std是训练配置中的参数.
- `zero`: 所有参数置为零.
- 类型: string (默认: fail).
* `--init_model_path`
- 初始化模型的路径。如果设置该参数,start\_pass将不起作用。同样也可以在测试模式中指定模型路径。
- 类型: string (默认: null).
* `--saving_period_by_batches`
- 在一轮中每saving_period_by_batches个批次保存一次参数。
- 类型: int32 (默认: 0).
* `--log_error_clipping`
- 当在网络层配置中设置**error_clipping_threshold**时,该参数指示是否打印错误截断日志。如果为true,**每批次**的反向传播将会打印日志信息。该截断会影响**输出的梯度**.
- 类型: bool (默认: 0).
* `--log_clipping`
- 当在训练配置中设置**gradient_clipping_threshold**时,该参数指示是否打印日志截断信息。该截断会影响**权重更新的梯度**.
- 类型: bool (默认: 0).
* `--use_old_updater`
- 是否使用旧的RemoteParameterUpdater。 默认使用ConcurrentRemoteParameterUpdater,主要为开发者使用,使用者通常无需关心.
- 类型: bool (默认: 0).
* `--enable_grad_share`
- 启用梯度参数的阈值,在多CPU训练时共享该参数.
- 类型: int32 (默认: 100 \* 1024 \* 1024).
* `--grad_share_block_num`
- 梯度参数的分块数目,在多CPU训练时共享该参数.
- 类型: int32 (默认: 64).
## 测试
* `--test_pass`
- 加载test_pass轮的模型用于测试.
- 类型: int32 (默认: -1).
* `--test_period`
- 如果为0,每轮结束时对所有测试数据进行测试;如果不为0,每test_period个批次对所有测试数据进行测试.
- 类型: int32 (默认: 0).
* `--test_wait`
- 指示当指定轮的测试模型不存在时,是否需要等待该轮模型参数。如果在训练期间同时发起另外一个进程进行测试,可以使用该参数.
- 类型: bool (默认: 0).
* `--model_list`
- 测试时指定的存储模型列表的文件.
- 类型: string (默认: "", null).
* `--predict_output_dir`
- 保存网络层输出结果的目录。该参数在网络配置的Outputs()中指定,默认为null,意思是不保存结果。在测试阶段,如果你想要保存某些层的特征图,请指定该目录。需要注意的是,网络层的输出是经过激活函数之后的值.
- 类型: string (默认: "", null).
* `--average_test_period`
- 使用`average_test_period`个批次的参数平均值进行测试。该参数必须能被FLAGS_log_period整除,默认为0,意思是不使用平均参数执行测试.
- 类型: int32 (默认: 0).
* `--distribute_test`
- 在分布式环境中测试,将多台机器的测试结果合并.
- 类型: bool (默认: 0).
* `--predict_file`
- 保存预测结果的文件名。该参数默认为null,意思是不保存结果。目前该参数仅用于AucValidationLayer和PnpairValidationLayer层,每轮都会保存预测结果.
- 类型: string (默认: "", null).
## GPU
* `--gpu_id`
- 指示使用哪个GPU核.
- 类型: int32 (默认: 0).
* `--allow_only_one_model_on_one_gpu`
- 如果为true,一个GPU设备上不允许配置多个模型.
- 类型: bool (默认: 1).
* `--parallel_nn`
- 指示是否使用多线程来计算一个神经网络。如果为false,设置gpu_id指定使用哪个GPU核(训练配置中的设备属性将会无效)。如果为true,GPU核在训练配置中指定(gpu_id无效).
- 类型: bool (默认: 0).
* `--cudnn_dir`
- 选择路径来动态加载NVIDIA CuDNN库,例如,/usr/local/cuda/lib64. [默认]: LD_LIBRARY_PATH
- 类型: string (默认: "", null)
* `--cuda_dir`
- 选择路径来动态加载NVIDIA CUDA库,例如,/usr/local/cuda/lib64. [默认]: LD_LIBRARY_PATH
- 类型: string (默认: "", null)
* `--cudnn_conv_workspace_limit_in_mb`
- 指定cuDNN的最大工作空间容限,单位是MB,默认为4096MB=4GB.
- 类型: int32 (默认: 4096MB=4GB)
## 自然语言处理(NLP): RNN/LSTM/GRU
* `--rnn_use_batch`
- 指示在简单的RecurrentLayer层的计算中是否使用批处理方法.
- 类型: bool (默认: 0).
* `--prev_batch_state`
- 标识是否为连续的batch计算.
- 类型: bool (默认: 0).
* `--beam_size`
- 集束搜索使用广度优先搜索的方式构建查找树。在树的每一层上,都会产生当前层状态的所有继承结果,按启发式损失的大小递增排序。然而,每层上只能保存固定数目个最好的状态,该数目是提前定义好的,称之为集束大小.
- 类型: int32 (默认: 1).
* `--diy_beam_search_prob_so`
 - 用户可以自定义beam search的方法,编译成动态库,供PaddlePaddle加载。 该参数用于指定动态库路径.
- 类型: string (默认: "", null).
## 度量学习(Metric Learning)
* `--external`
- 指示是否使用外部机器进行度量学习.
- 类型: bool (默认: 0).
* `--data_server_port`
- 数据服务器(data server)的监听端口,主要用在度量学习中.
- 类型: int32 (默认: 21134).
## 数据支持(DataProvider)
* `--memory_threshold_on_load_data`
- 内存容限阈值,当超过该阈值时,停止加载数据.
- 类型: double (默认: 1.0).
## 单元测试
* `--checkgrad_eps`
- 使用checkgrad模式时的参数变化大小.
- 类型: double (默认: 1e-05).
## 参数服务器和分布式通信
* `--start_pserver`
- 指示是否开启参数服务器(parameter server).
- 类型: bool (默认: 0).
* `--pservers`
- 参数服务器的IP地址,以逗号间隔.
- 类型: string (默认: "127.0.0.1").
* `--port`
- 参数服务器的监听端口.
- 类型: int32 (默认: 20134).
* `--ports_num`
- 发送参数的端口号,根据默认端口号递增.
- 类型: int32 (默认: 1).
* `--trainer_id`
 - 在分布式训练中,每个训练节点必须指定一个唯一的id号,从0到num_trainers-1。0号训练节点是主训练节点。使用者无需关心这个参数.
- 类型: int32 (默认: 0).
* `--num_gradient_servers`
- 梯度服务器的数量,该参数在集群提交环境中自动设置.
- 类型: int32 (默认: 1).
* `--small_messages`
- 如果消息数据太小,建议将该参数设为true,启动快速应答,无延迟.
- 类型: bool (默认: 0).
* `--sock_send_buf_size`
- 限制套接字发送缓冲区的大小。如果仔细设置的话,可以有效减小网络的阻塞.
- 类型: int32 (默认: 1024 \* 1024 \* 40).
* `--sock_recv_buf_size`
- 限制套接字接收缓冲区的大小.
- 类型: int32 (默认: 1024 \* 1024 \* 40).
* `--parameter_block_size`
- 参数服务器的参数分块大小。如果未设置,将会自动计算出一个合适的值.
- 类型: int32 (默认: 0).
* `--parameter_block_size_for_sparse`
- 参数服务器稀疏更新的参数分块大小。如果未设置,将会自动计算出一个合适的值.
- 类型: int32 (默认: 0).
* `--log_period_server`
- 在参数服务器终端每log_period_server个批次打印日志进度.
- 类型: int32 (默认: 500).
* `--loadsave_parameters_in_pserver`
- 在参数服务器上加载和保存参数,只有当设置了sparse_remote_update参数时才有效.
- 类型: bool (默认: 0).
* `--pserver_num_threads`
- 同步执行操作的线程数.
- 类型: bool (默认: 1).
* `--ports_num_for_sparse`
- 发送参数的端口号,根据默认值递增(port + ports_num),用于稀疏训练中.
- 类型: int32 (默认: 0).
* `--nics`
- 参数服务器的网络设备名称,已经在集群提交环境中完成设置.
- 类型: string (默认: "xgbe0,xgbe1").
* `--rdma_tcp`
- 使用rdma还是tcp传输协议,该参数已经在集群提交环境中完成设置.
- 类型: string (默认: "tcp").
## 异步随机梯度下降(Async SGD)
* `--async_count`
- 定义异步训练的长度,如果为0,则使用同步训练.
- 类型: int32 (默认: 0).
* `--async_lagged_ratio_min`
- 控制`config_.async_lagged_grad_discard_ratio()`的最小值.
- 类型: double (默认: 1.0).
* `--async_lagged_ratio_default`
- 如果在网络配置中未设置async_lagged_grad_discard_ratio,则使用该参数作为默认值.
- 类型: double (默认: 1.5).
## 性能调优(Performance Tuning)
* `--log_barrier_abstract`
- 如果为true,则显示阻隔性能的摘要信息.
- 类型: bool (默认: 1).
* `--log_barrier_show_log`
- 如果为true,则总会显示阻隔摘要信息,即使间隔很小.
- 类型: bool (默认: 0).
* `--log_barrier_lowest_nodes`
- 最少显示多少个节点.
- 类型: int32 (默认: 5).
* `--check_sparse_distribution_in_pserver`
- 指示是否检查所有参数服务器上的稀疏参数的分布是均匀的.
- 类型: bool (默认: 0).
* `--show_check_sparse_distribution_log`
- 指示是否显示参数服务器上的稀疏参数分布的日志细节.
- 类型: bool (默认: 0).
* `--allow_inefficient_sparse_update`
- 指示是否允许低效率的稀疏更新.
- 类型: bool (默认: 0).
* `--check_sparse_distribution_batches`
- 每运行多少个批次执行一次稀疏参数分布的检查.
- 类型: int32 (默认: 100).
* `--check_sparse_distribution_ratio`
- 如果检查到分配在不同参数服务器上的参数的分布不均匀次数大于check_sparse_distribution_ratio * check_sparse_distribution_batches次,程序停止.
- 类型: double (默认: 0.6).
* `--check_sparse_distribution_unbalance_degree`
- 不同参数服务器上数据大小的最大值与最小值的比率.
- 类型: double (默认: 2).
## 矩阵/向量/随机数
* `--enable_parallel_vector`
- 启动并行向量的阈值.
- 类型: int32 (默认: 0).
* `--seed`
- 随机数的种子。srand(time)的为0.
- 类型: int32 (默认: 1)
* `--thread_local_rand_use_global_seed`
- 是否将全局种子应用于本地线程的随机数.
- 类型: bool (默认: 0).
......@@ -73,7 +73,7 @@
- type: bool (default: 0).
* `--load_missing_parameter_strategy`
- Specify the loading operation when model file is missing. Now support fail/rand/zere three operations.
- Specify the loading operation when model file is missing. Now support fail/rand/zero three operations.
- `fail`: program will exit.
- `rand`: uniform or normal distribution according to **initial\_strategy** in network config. Uniform range is: **[mean - std, mean + std]**, where mean and std are configures in trainer config.
- `zero`: all parameters are zero.
......@@ -118,11 +118,11 @@
- type: int32 (default: 0).
* `--test_wait`
- Whether to wait for parameter per pass if not exist. If set test_data_path in submitting environment of cluster, it will launch one process to perfom testing, so we need to set test_wait=1. Note that in the cluster submitting environment, this argument has been set True by default.
 - Whether to wait for parameter per pass if not exist. It can be used when user launch another process to perfom testing during the training process.
- type: bool (default: 0).
* `--model_list`
- File that saves the model list when testing. It was set automatically when using cluster submitting environment after setting model_path.
- File that saves the model list when testing.
- type: string (default: "", null).
* `--predict_output_dir`
......@@ -212,7 +212,7 @@
- type: bool (default: 0).
* `--pservers`
- Comma separated IP addresses of pservers. It is set automatically in cluster submitting environment.
- Comma separated IP addresses of pservers.
- type: string (default: "127.0.0.1").
* `--port`
......
.. _cmd_line_index:
设置命令行参数
===============
.. toctree::
:maxdepth: 1
use_case_cn.md
arguments_cn.md
detail_introduction_cn.md
```eval_rst
.. _cmd_line_index:
```
# Set Command-line Parameters
* [Use Case](use_case_en.md)
* [Arguments](arguments_en.md)
* [Detailed Descriptions](detail_introduction_en.md)
.. _cmd_line_index:
Set Command-line Parameters
===========================
.. toctree::
:maxdepth: 1
use_case_en.md
arguments_en.md
detail_introduction_en.md
# 使用案例
## 本地训练
本地训练的实验,诸如图像分类,自然语言处理等,通常都会使用下面这些命令行参数。
```
paddle train \
--use_gpu=1/0 \ #1:GPU,0:CPU(默认为1)
--config=network_config \
--save_dir=output \
--trainer_count=COUNT \ #(默认为1)
--test_period=M \ #(默认为0)
--num_passes=N \ #(默认为100)
--log_period=K \ #(默认为100)
--dot_period=1000 \ #(默认为1)
#[--show_parameter_stats_period=100] \ #(默认为0)
#[--saving_period_by_batches=200] \ #(默认为0)
```
根据你的任务,可以选择是否使用参数`show_parameter_stats_period``saving_period_by_batches`
### 1) 将命令参数传给网络配置
`config_args`是一个很有用的参数,用于将参数传递给网络配置。
```
--config_args=generating=1,beam_size=5,layer_num=10 \
```
`get_config_arg`可用于在网络配置中解析这些参数,如下所示:
```
generating = get_config_arg('generating', bool, False)
beam_size = get_config_arg('beam_size', int, 3)
layer_num = get_config_arg('layer_num', int, 8)
```
`get_config_arg`:
```
get_config_arg(name, type, default_value)
```
- name: `--config_args`中指定的名字
- type: 值类型,包括bool, int, str, float等
- default_value: 默认值
### 2) 使用模型初始化网络
增加如下参数:
```
--init_model_path=model_path
--load_missing_parameter_strategy=rand
```
## 本地测试
方法一:
```
paddle train --job=test \
--use_gpu=1/0 \
--config=network_config \
--trainer_count=COUNT \
--init_model_path=model_path \
```
- 使用init\_model\_path指定测试的模型
- 只能测试单个模型
方法二:
```
paddle train --job=test \
--use_gpu=1/0 \
--config=network_config \
--trainer_count=COUNT \
--model_list=model.list \
```
- 使用model_list指定测试的模型列表
- 可以测试多个模型,文件model.list如下所示:
```
./alexnet_pass1
./alexnet_pass2
```
方法三:
```
paddle train --job=test \
--use_gpu=1/0 \
--config=network_config \
--trainer_count=COUNT \
--save_dir=model \
--test_pass=M \
--num_passes=N \
```
这种方式必须使用Paddle存储的模型路径格式,如:`model/pass-%5d`。测试的模型包括从第M轮到第N-1轮存储的所有模型。例如,M=12,N=14这种写法将会测试模型`model/pass-00012``model/pass-00013`
## 稀疏训练
当输入是维度很高的稀疏数据时,通常使用稀疏训练来加速计算过程。例如,输入数据的字典维数是1百万,但是每个样本仅包含几个词。在Paddle中,稀疏矩阵的乘积应用于前向传播过程,而稀疏更新在反向传播之后的权重更新时进行。
### 1) 本地训练
用户需要在网络配置中指定**sparse\_update=True**。请参照网络配置的文档了解更详细的信息。
### 2) 集群训练
在集群上训练一个稀疏模型需要加上下面的参数。同时用户需要在网络配置中指定**sparse\_remote\_update=True**。请参照网络配置的文档了解更详细的信息。
```
--ports_num_for_sparse=1 #(默认为0)
```
## parallel_nn
用户可以设置`parallel_nn`来混合使用GPU和CPU计算网络层的参数。也就是说,你可以将网络配置成某些层使用GPU计算,而其他层使用CPU计算。另一种方式是将网络层划分到不同的GPU上去计算,这样可以减小GPU内存,或者采用并行计算来加速某些层的更新。
如果你想使用这些特性,你需要在网络配置中指定设备的ID号(表示为deviceId),并且加上下面的命令行参数:
```
--parallel_nn=true
```
### 案例一:GPU和CPU混合使用
请看下面的例子:
```
#command line:
paddle train --use_gpu=true --parallel_nn=true trainer_count=COUNT
default_device(0)
fc1=fc_layer(...)
fc2=fc_layer(...)
fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1))
```
- default_device(0): 设置默认设备号为0。这意味着除了指定device=-1的层之外,其他所有层都会使用GPU计算,每层使用的GPU号依赖于参数trainer\_count和gpu\_id(默认为0)。在此,fc1和fc2层在GPU上计算。
- device=-1: fc3层使用CPU计算。
- trainer_count:
- trainer_count=1: 如果未设置gpu\_id,那么fc1和fc2层将会使用第1个GPU来计算。否则使用gpu\_id指定的GPU。
- trainer_count>1: 在trainer\_count个GPU上使用数据并行来计算某一层。例如,trainer\_count=2意味着0号和1号GPU将会使用数据并行来计算fc1和fc2层。
### 案例二:在不同设备上指定层
```
#command line:
paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT
#network:
fc2=fc_layer(input=l1, layer_attr=ExtraAttr(device=0), ...)
fc3=fc_layer(input=l1, layer_attr=ExtraAttr(device=1), ...)
fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...)
```
在本例中,我们假设一台机器上有4个GPU。
- trainer_count=1:
- 使用0号GPU计算fc2层。
- 使用1号GPU计算fc3层。
- 使用CPU计算fc4层。
- trainer_count=2:
- 使用0号和1号GPU计算fc2层。
- 使用2号和3号GPU计算fc3层。
- 使用CPU两线程计算fc4层。
- trainer_count=4:
- 运行失败(注意到我们已经假设机器上有4个GPU),因为参数`allow_only_one_model_on_one_gpu`默认设置为真。
**当`device!=-1`时设备ID号的分配:**
```
(deviceId + gpu_id + threadId * numLogicalDevices_) % numDevices_
deviceId: 在层中指定
gpu_id: 默认为0
threadId: 线程ID号,范围: 0,1,..., trainer_count-1
numDevices_: 机器的设备(GPU)数目
numLogicalDevices_: min(max(deviceId + 1), numDevices_)
```
......@@ -134,14 +134,14 @@ fc2=fc_layer(...)
fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1))
```
- default_device(0): set default device ID to 0. This means that except the layers with device=-1, all layers will use a GPU, and the specific GPU used for each layer depends on trainer\_count and gpu\_id (0 by default). Here, layer l1 and l2 are computed on the GPU.
- default_device(0): set default device ID to 0. This means that except the layers with device=-1, all layers will use a GPU, and the specific GPU used for each layer depends on trainer\_count and gpu\_id (0 by default). Here, layer fc1 and fc2 are computed on the GPU.
- device=-1: use the CPU for layer l3.
- device=-1: use the CPU for layer fc3.
- trainer_count:
- trainer_count=1: if gpu\_id is not set, then use the first GPU to compute layers l1 and l2. Otherwise use the GPU with gpu\_id.
- trainer_count=1: if gpu\_id is not set, then use the first GPU to compute layers fc1 and fc2. Otherwise use the GPU with gpu\_id.
- trainer_count>1: use trainer\_count GPUs to compute one layer using data parallelism. For example, trainer\_count=2 means that GPUs 0 and 1 will use data parallelism to compute layer l1 and l2.
- trainer_count>1: use trainer\_count GPUs to compute one layer using data parallelism. For example, trainer\_count=2 means that GPUs 0 and 1 will use data parallelism to compute layer fc1 and fc2.
### Case 2: Specify Layers in Different Devices
......@@ -157,14 +157,14 @@ fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...)
In this case, we assume that there are 4 GPUs in one machine.
- trainer_count=1:
- Use GPU 0 to compute layer l2.
- Use GPU 1 to compute layer l3.
- Use CPU to compute layer l4.
- Use GPU 0 to compute layer fc2.
- Use GPU 1 to compute layer fc3.
- Use CPU to compute layer fc4.
- trainer_count=2:
- Use GPU 0 and 1 to compute layer l2.
- Use GPU 2 and 3 to compute layer l3.
- Use CPU to compute l4 in two threads.
- Use GPU 0 and 1 to compute layer fc2.
- Use GPU 2 and 3 to compute layer fc3.
- Use CPU to compute fc4 in two threads.
- trainer_count=4:
- It will fail (note, we have assumed that there are 4 GPUs in machine), because argument `allow_only_one_model_on_one_gpu` is true by default.
......
此差异已折叠。
# Kubernetes 单机训练
# Kubernetes单机训练
在这篇文档里,我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中,我们将介绍如何启动分布式训练作业。
......
# Kubernetes 分布式训练
# Kubernetes分布式训练
前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里,我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练,文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务,进行分布式训练的方法,与此不同的是,本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群,进行分布式训练的方案。
......@@ -22,7 +22,7 @@
首先,我们需要拥有一个Kubernetes集群,在这个集群中所有node与pod都可以互相通信。关于Kubernetes集群搭建,可以参考[官方文档](http://kubernetes.io/docs/getting-started-guides/kubeadm/),在以后的文章中我们也会介绍AWS上搭建的方案。本文假设大家能找到几台物理机,并且可以按照官方文档在上面部署Kubernetes。在本文的环境中,Kubernetes集群中所有node都挂载了一个[MFS](http://moosefs.org/)(Moose filesystem,一种分布式文件系统)共享目录,我们通过这个目录来存放训练文件与最终输出的模型。关于MFS的安装部署,可以参考[MooseFS documentation](https://moosefs.com/documentation.html)。在训练之前,用户将配置与训练数据切分好放在MFS目录中,训练时,程序从此目录拷贝文件到容器内进行训练,将结果保存到此目录里。整体的结构图如下:
![paddle on kubernetes结构图](k8s-paddle-arch.png)
![paddle on kubernetes结构图](src/k8s-paddle-arch.png)
上图描述了一个3节点的分布式训练场景,Kubernetes集群的每个node上都挂载了一个MFS目录,这个目录可以通过volume的形式挂载到容器中。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行,每个pod包含一个PaddlePaddle容器。在容器创建后,会启动pserver与trainer进程,读取volume中的数据进行这次分布式训练。
......
# Paddle On Kubernetes
>In this article, we will introduce how to run Paddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run Paddle training job on distributed cluster.
## Build Docker Image
In distributed Kubernetes cluster, we will use Ceph or other shared storage system for storing training related data so that all processes in Paddle training can retrieve data from Ceph. In this example, we will only demo training job on single machine. In order to simplify the requirement of the environment, we will directly put training data into Paddle's Docker Image, so we need to create a Paddle Docker image that already includes the training data.
Paddle's [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) introduces how to download and train data by using script from Paddle's source code.
And `paddledev/paddle:cpu-demo-latest` image has the Paddle source code and demo. (Caution: Default Paddle image `paddledev/paddle:cpu-latest` doesn't include the source code, Paddle's different versions of image can be referred here: [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html)), so we run this container and download the training data, and then commit the whole container to be a new Docker image.
### Run Docker Container
```
$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
```
### Download Training Data
Getting into `/root/paddle/demo/quick_start/data` Directory,using `get_data.sh` to download training data.
Then getting into `/root/paddle/demo/quick_start` Directory, using `preprocess.sh` to pre-process training data.
```
$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh
Downloading Amazon Electronics reviews data...
--2016-10-31 01:33:43-- http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 495854086 (473M) [application/x-gzip]
Saving to: 'reviews_Electronics_5.json.gz'
10% [=======> ] 874,279 64.7KB/s eta 2h 13m
```
### Modify Startup Script
After downloading the data,modify `/root/paddle/demo/quick_start/train.sh` file contents are as follows (one more cd cmd):
```
set -e
cd /root/paddle/demo/quick_start
cfg=trainer_config.lr.py
#cfg=trainer_config.emb.py
#cfg=trainer_config.cnn.py
#cfg=trainer_config.lstm.py
#cfg=trainer_config.bidi-lstm.py
#cfg=trainer_config.db-lstm.py
paddle train \
--config=$cfg \
--save_dir=./output \
--trainer_count=4 \
--log_period=20 \
--num_passes=15 \
--use_gpu=false \
--show_parameter_stats_period=100 \
--test_all_data_in_one_period=1 \
2>&1 | tee 'train.log'
```
### Commit Docker Image
```
$ docker commit quick_start_data mypaddle/paddle:quickstart
```
## Use Kubernetes For Training
>We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes.
### Create Yaml Files
The output result in container will be demolished when job finished (container stopped running), so we need to mount the volume out to the local disk when creating the container to store the training result. Using our previously created image, we can create a [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job), the yaml contents are as follows:
```
apiVersion: batch/v1
kind: Job
metadata:
name: quickstart
spec:
parallelism: 1
completions: 1
template:
metadata:
name: quickstart
spec:
volumes:
- name: output
hostPath:
path: /home/work/paddle_output
containers:
- name: pi
image: mypaddle/paddle:quickstart
command: ["bin/bash", "-c", "/root/paddle/demo/quick_start/train.sh"]
volumeMounts:
- name: output
mountPath: /root/paddle/demo/quick_start/output
restartPolicy: Never
```
### Start Paddle Job
Using the above yaml file to start the Kubernetes job.
```
$ kubectl create -f paddle.yaml
```
Get the detailed status of the job:
```
$ kubectl get job
NAME DESIRED SUCCESSFUL AGE
quickstart 1 0 58s
$ kubectl describe job quickstart
Name: quickstart
Namespace: default
Image(s): registry.baidu.com/public/paddle:cpu-demo-latest
Selector: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84
Parallelism: 1
Completions: 1
Start Time: Mon, 31 Oct 2016 11:20:16 +0800
Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
Pods Statuses: 0 Running / 1 Succeeded / 0 Failed
Volumes:
output:
Type: HostPath (bare host directory volume)
Path: /home/work/paddle_output
Events:
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
--------- -------- ----- ---- ------------- -------- ------ -------
1m 1m 1 {job-controller } Normal SuccessfulCreate Created pod: quickstart-fa0wx
```
### Get Training Result
We can use kubectl command to take a look at the status of related pod.
```
$ kubectl describe pod quickstart-fa0wx
Name: quickstart-fa0wx
Namespace: default
Node: paddle-demo-let02/10.206.202.44
Start Time: Mon, 31 Oct 2016 11:20:17 +0800
Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
Status: Succeeded
IP: 10.0.0.9
Controllers: Job/quickstart
Containers:
quickstart:
Container ID: docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465
Image: registry.baidu.com/public/paddle:cpu-demo-latest
Image ID: docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750
Port:
Command:
bin/bash
-c
/root/paddle/demo/quick_start/train.sh
QoS Tier:
cpu: BestEffort
memory: BestEffort
State: Terminated
Reason: Completed
Exit Code: 0
Started: Mon, 31 Oct 2016 11:20:20 +0800
Finished: Mon, 31 Oct 2016 11:21:46 +0800
Ready: False
Restart Count: 0
Environment Variables:
Conditions:
Type Status
Ready False
Volumes:
output:
Type: HostPath (bare host directory volume)
Path: /home/work/paddle_output
```
We can also ssh to Kubernetes node to take a look at the training result.
```
[root@paddle-demo-let02 paddle_output]# ll
total 60
drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000
drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014
```
# 中文词向量模型的使用 #
----------
本文档介绍如何在PaddlePaddle平台上,使用预训练的标准格式词向量模型。
在此感谢 @lipeng 提出的代码需求,并给出的相关模型格式的定义。
## 介绍 ###
### 中文字典 ###
我们的字典使用内部的分词工具对百度知道和百度百科的语料进行分词后产生。分词风格如下: "《红楼梦》"将被分为 "《","红楼梦","》",和 "《红楼梦》"。字典采用UTF8编码,输出有2列:词本身和词频。字典共包含 3206325个词和3个特殊标记:
- `<s>`: 分词序列的开始
- `<e>`: 分词序列的结束
- `<unk>`: 未知词
### 中文词向量的预训练模型 ###
遵循文章 [A Neural Probabilistic Language Model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)中介绍的方法,模型采用 n-gram 语言模型,结构如下图:6元上下文作为输入层->全连接层->softmax层 。对应于字典,我们预训练得到4种不同维度的词向量,分别为:32维、64维、128维和256维。
<center>![](./neural-n-gram-model.png)</center>
<center>Figure 1. neural-n-gram-model</center>
### 下载和数据抽取 ###
运行以下的命令下载和获取我们的字典和预训练模型:
cd $PADDLE_ROOT/demo/model_zoo/embedding
./pre_DictAndModel.sh
## 中文短语改写的例子 ##
以下示范如何使用预训练的中文字典和词向量进行短语改写。
### 数据的准备和预处理 ###
首先,运行以下的命令下载数据集。该数据集(utf8编码)包含20个训练样例,5个测试样例和2个生成式样例。
cd $PADDLE_ROOT/demo/seqToseq/data
./paraphrase_data.sh
第二步,将数据处理成规范格式,在训练数集上训练生成词向量字典(数据将保存在 `$PADDLE_SOURCE_ROOT/demo/seqToseq/data/pre-paraphrase`):
cd $PADDLE_ROOT/demo/seqToseq/
python preprocess.py -i data/paraphrase [--mergeDict]
- 其中,如果使用`--mergeDict`选项,源语言短语和目标语言短语的字典将被合并(源语言和目标语言共享相同的编码字典)。本实例中,源语言和目标语言都是相同的语言,因此可以使用该选项。
### 使用用户指定的词向量字典 ###
使用如下命令,从预训练模型中,根据用户指定的字典,抽取对应的词向量构成新的词表:
cd $PADDLE_ROOT/demo/model_zoo/embedding
python extract_para.py --preModel PREMODEL --preDict PREDICT --usrModel USRMODEL--usrDict USRDICT -d DIM
- `--preModel PREMODEL`: 预训练词向量字典模型的路径
- `--preDict PREDICT`: 预训练模型使用的字典的路径
- `--usrModel USRMODEL`: 抽取出的新词表的保存路径
- `--usrDict USRDICT`: 用户指定新的字典的路径,用于构成新的词表
- `-d DIM`: 参数(词向量)的维度
此处,你也可以简单的运行以下的命令:
cd $PADDLE_ROOT/demo/seqToseq/data/
./paraphrase_model.sh
运行成功以后,你将会看到以下的模型结构:
paraphrase_model
|--- _source_language_embedding
|--- _target_language_embedding
### 在PaddlePaddle平台训练模型 ###
首先,配置模型文件,配置如下(可以参考保存在 `demo/seqToseq/paraphrase/train.conf`的配置):
from seqToseq_net import *
is_generating = False
################## Data Definition #####################
train_conf = seq_to_seq_data(data_dir = "./data/pre-paraphrase",
job_mode = job_mode)
############## Algorithm Configuration ##################
settings(
learning_method = AdamOptimizer(),
batch_size = 50,
learning_rate = 5e-4)
################# Network configure #####################
gru_encoder_decoder(train_conf, is_generating, word_vector_dim = 32)
这个配置与`demo/seqToseq/translation/train.conf` 基本相同
然后,使用以下命令进行模型训练:
cd $PADDLE_SOURCE_ROOT/demo/seqToseq/paraphrase
./train.sh
其中,`train.sh``demo/seqToseq/translation/train.sh` 基本相同,只有2个配置不一样:
- `--init_model_path`: 初始化模型的路径配置为`data/paraphrase_modeldata/paraphrase_model`
- `--load_missing_parameter_strategy`:如果参数模型文件缺失,除词向量模型外的参数将使用正态分布随机初始化
如果用户想要了解详细的数据集的格式、模型的结构和训练过程,请查看 [Text generation Tutorial](../text_generation/index_cn.md).
## 可选功能 ##
### 观测词向量
PaddlePaddle 平台为想观测词向量的用户提供了将二进制词向量模型转换为文本模型的功能:
cd $PADDLE_ROOT/demo/model_zoo/embedding
python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM
- `-i INPUT`: 输入的(二进制)词向量模型名称
- `-o OUTPUT`: 输出的文本模型名称
- `-d DIM`: (词向量)参数维度
运行完以上命令,用户可以在输出的文本模型中看到:
0,4,32156096
-0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
......
- 其中,第一行是`PaddlePaddle` 输出文件的格式说明,包含3个属性::
- `PaddlePaddle`的版本号,本例中为0
- 浮点数占用的字节数,本例中为4
- 总计的参数个数,本例中为32,156,096
- 其余行是(词向量)参数行(假设词向量维度为32)
- 每行打印32个参数以','分隔
- 共有32,156,096/32 = 1,004,877行,也就是说,模型共包含1,004,877个被向量化的词
### 词向量模型的修正
`PaddlePaddle` 为想修正词向量模型的用户提供了将文本词向量模型转换为二进制模型的命令:
cd $PADDLE_ROOT/demo/model_zoo/embedding
python paraconvert.py --t2b -i INPUT -o OUTPUT
- `-i INPUT`: 输入的文本词向量模型名称
- `-o OUTPUT`: 输出的二进制词向量模型名称
请注意,输入的文本格式如下:
-0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
......
- 输入文本中没有头部(格式说明)行
- (输入文本)每行存储一个词,以逗号','分隔
doc/tutorials/gan/gan.png

32.5 KB | W: | H:

doc/tutorials/gan/gan.png

17.4 KB | W: | H:

doc/tutorials/gan/gan.png
doc/tutorials/gan/gan.png
doc/tutorials/gan/gan.png
doc/tutorials/gan/gan.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -4,9 +4,7 @@ This demo implements GAN training described in the original [GAN paper](https://
The high-level structure of GAN is shown in Figure. 1 below. It is composed of two major parts: a generator and a discriminator, both of which are based on neural networks. The generator takes in some kind of noise with a known distribution and transforms it into an image. The discriminator takes in an image and determines whether it is artificially generated by the generator or a real image. So the generator and the discriminator are in a competitive game in which generator is trying to generate image to look as real as possible to fool the discriminator, while the discriminator is trying to distinguish between real and fake images.
<p align="center">
<img src="./gan.png" width="500" height="300">
</p>
<center>![](./gan.png)</center>
<p align="center">
Figure 1. GAN-Model-Structure
<a href="https://ishmaelbelghazi.github.io/ALI/">figure credit</a>
......@@ -111,9 +109,7 @@ $python gan_trainer.py -d uniform --useGpu 1
```
The generated samples can be found in ./uniform_samples/ and one example is shown below as Figure 2. One can see that it roughly recovers the 2D uniform distribution.
<p align="center">
<img src="./uniform_sample.png" width="300" height="300">
</p>
<center>![](./uniform_sample.png)</center>
<p align="center">
Figure 2. Uniform Sample
</p>
......@@ -135,9 +131,7 @@ To train the GAN model on mnist data, one can use the following command:
$python gan_trainer.py -d mnist --useGpu 1
```
The generated sample images can be found at ./mnist_samples/ and one example is shown below as Figure 3.
<p align="center">
<img src="./mnist_sample.png" width="300" height="300">
</p>
<center>![](./mnist_sample.png)</center>
<p align="center">
Figure 3. MNIST Sample
</p>
doc/tutorials/gan/uniform_sample.png

20.1 KB | W: | H:

doc/tutorials/gan/uniform_sample.png

24.3 KB | W: | H:

doc/tutorials/gan/uniform_sample.png
doc/tutorials/gan/uniform_sample.png
doc/tutorials/gan/uniform_sample.png
doc/tutorials/gan/uniform_sample.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -2,6 +2,7 @@
* [快速入门](quick_start/index_cn.rst)
* [个性化推荐](rec/ml_regression_cn.rst)
* [图像分类](image_classification/index_cn.md)
* [情感分析](sentiment_analysis/index_cn.md)
* [语义角色标注](semantic_role_labeling/index_cn.md)
* [机器翻译](text_generation/index_cn.md)
......@@ -9,3 +10,4 @@
## 常用模型
* [ResNet模型](imagenet_model/resnet_model_cn.md)
* [词向量模型](embedding_model/index_cn.md)
......@@ -7,6 +7,7 @@ There are several examples and demos here.
* [Sentiment Analysis](sentiment_analysis/index_en.md)
* [Semantic Role Labeling](semantic_role_labeling/index_en.md)
* [Text Generation](text_generation/index_en.md)
* [Image Auto-Generation](gan/index_en.md)
## Model Zoo
* [ImageNet: ResNet](imagenet_model/resnet_model_en.md)
......
add_subdirectory(cuda)
add_subdirectory(function)
add_subdirectory(utils)
add_subdirectory(testing)
add_subdirectory(math)
add_subdirectory(parameter)
add_subdirectory(gserver)
......
......@@ -178,6 +178,7 @@ namespace std {
%newobject ParameterOptimizer::create;
%newobject ParameterOptimizer::needSpecialTraversal;
%newobject ParameterUpdater::createLocalUpdater;
%newobject ParameterUpdater::createRemoteUpdater;
%feature("director") UpdateCallback;
%feature("autodoc", 1); // To generate method stub, for code hint in ide
......
......@@ -803,6 +803,8 @@ private:
public:
static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
int passCount);
~ParameterUpdater();
/**
......
......@@ -15,15 +15,25 @@ limitations under the License. */
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/trainer/RemoteParameterUpdater.h"
#include "paddle/trainer/ThreadParameterUpdater.h"
ParameterUpdater::ParameterUpdater() : m(new ParameterUpdaterPrivate()) {}
ParameterUpdater *ParameterUpdater::createLocalUpdater(
OptimizationConfig *config) {
auto param = new ParameterUpdater();
param->m->updater.reset(new paddle::SgdThreadUpdater(config->m->getConfig()));
return param;
auto updater = new ParameterUpdater();
updater->m->updater.reset(
new paddle::SgdThreadUpdater(config->m->getConfig()));
return updater;
}
ParameterUpdater *ParameterUpdater::createRemoteUpdater(
OptimizationConfig *config, int passCount) {
auto updater = new ParameterUpdater();
updater->m->updater.reset(new paddle::RemoteParameterUpdater(
config->m->getConfig(), passCount, nullptr));
return updater;
}
ParameterUpdater::~ParameterUpdater() { delete m; }
......
......@@ -48,78 +48,6 @@ extern void hl_max_sequence_forward(real* input,
extern void hl_max_sequence_backward(
real* outputGrad, int* index, real* inputGrad, int numSequences, int dim);
/**
* @brief Context projection forward.
*
* @param[in] input input sequence.
* @param[in] sequence sequence index.
* @param[in] weightData padding data.
* @param[out] output output sequence.
* @param[in] numSequences number of sequences.
* @param[in] inputDim input sequence dimension.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
* @param[in] beginPad number of extra timesteps added at the
* beginning.
* @param[in] isPadding trainable padding.
*
*/
extern void hl_context_projection_forward(real* input,
const int* sequence,
real* weightData,
real* output,
int numSequences,
int inputDim,
int contextLength,
int contextStart,
int beginPad,
bool isPadding);
/**
* @brief Context projection backward data.
*
* @param[in] outputGrad output gradient.
* @param[in] sequence sequence index.
* @param[out] inputGrad input gradient.
* @param[in] numSequences number of sequences.
* @param[in] inputDim input sequence dimension.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
*
*/
extern void hl_context_projection_backward_data(real* outputGrad,
const int* sequence,
real* inputGrad,
int numSequences,
int inputDim,
int contextLength,
int contextStart);
/**
* @brief Context projection backward weight.
*
* @param[in] outputGrad output gradient.
* @param[in] sequence sequence index.
* @param[out] weightGrad weight gradient.
* @param[in] numSequences number of sequences.
* @param[in] weightDim input sequence dimension.
* @param[in] totalPad number of extra timesteps.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
* @param[in] beginPad number of extra timesteps added at the
* beginning.
*
*/
extern void hl_context_projection_backward_weight(real* outputGrad,
const int* sequence,
real* weightGrad,
int numSequences,
int weightDim,
int totalPad,
int contextLength,
int contextStart,
int beginPad);
/**
* @brief Memory copy from sequence to batch.
*
......
......@@ -27,35 +27,6 @@ inline void hl_max_sequence_forward(real* input,
inline void hl_max_sequence_backward(
real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {}
inline void hl_context_projection_forward(real* input,
const int* sequence,
real* weightData,
real* output,
int numSequences,
int inputDim,
int contextLength,
int contextStart,
int beginPad,
bool isPadding) {}
inline void hl_context_projection_backward_data(real* outputGrad,
const int* sequence,
real* inputGrad,
int numSequences,
int inputDim,
int contextLength,
int contextStart) {}
inline void hl_context_projection_backward_weight(real* outputGrad,
const int* sequence,
real* weightGrad,
int numSequences,
int weightDim,
int totalPad,
int contextLength,
int contextStart,
int beginPad) {}
inline void hl_sequence2batch_copy(real* batch,
real* sequence,
const int* batchIndex,
......
......@@ -90,258 +90,6 @@ void hl_max_sequence_backward(real* outputGrad,
CHECK_SYNC("hl_max_sequence_backward failed");
}
template <bool padding>
__global__ void KeContextProjectionForward(real* input,
const int* sequence,
real* weightData,
real* output,
int inputDim,
int contextLength,
int contextStart,
int beginPad) {
int idx = threadIdx.x;
int blockSize = blockDim.x;
int sequenceId = blockIdx.x;
int seqStart = sequence[sequenceId];
int seqEnd = sequence[sequenceId+1];
real value = 0;
int instances = seqEnd - seqStart + contextLength - 1;
output += seqStart * inputDim * contextLength;
input += seqStart * inputDim;
for (int k = 0; k <= inputDim / blockSize; k++) {
if (idx < inputDim) {
for (int i = 0; i < instances; i++) {
// i + contextStart;
if ((i + contextStart) < 0) {
if (padding) {
value = weightData[i * inputDim + idx];
} else {
continue;
}
} else if ((i + contextStart) >= (seqEnd - seqStart)) {
if (padding) {
value =
weightData[(beginPad + i + contextStart - (seqEnd - seqStart)) *
inputDim + idx];
} else {
continue;
}
} else {
value = input[(i + contextStart) * inputDim + idx];
}
int outx = (i - contextLength) < 0 ? i : (contextLength - 1);
int outy = (i - contextLength) < 0 ? 0 : (i - (contextLength - 1));
real* output_r =
output + outy * inputDim * contextLength + outx * inputDim;
for (int j = outy; j < seqEnd - seqStart; j++) {
output_r[idx] += value;
if (j - outy == outx) break;
output_r += (contextLength - 1) * inputDim;
}
}
}
idx += blockSize;
}
}
void hl_context_projection_forward(real* input,
const int* sequence,
real* weightData,
real* output,
int numSequences,
int inputDim,
int contextLength,
int contextStart,
int beginPad,
bool isPadding) {
CHECK_NOTNULL(input);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(output);
CHECK(!isPadding || weightData);
int blockSize = 128;
int blocksX = numSequences;
int blocksY = 1;
dim3 threads(blockSize, 1);
dim3 grid(blocksX, blocksY);
if (isPadding) {
KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
(input, sequence, weightData, output, inputDim,
contextLength, contextStart, beginPad);
} else {
KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
(input, sequence, weightData, output, inputDim,
contextLength, contextStart, beginPad);
}
CHECK_SYNC("hl_context_projection_forward failed");
}
__global__ void KeContextProjectionBackwardData(real* outputGrad,
const int* sequence,
real* inputGrad,
int inputDim,
int contextLength,
int contextStart) {
int idx = threadIdx.x;
int blockSize = blockDim.x;
int sequenceId = blockIdx.x;
int seqStart = sequence[sequenceId];
int seqEnd = sequence[sequenceId+1];
real value = 0;
int instances = seqEnd - seqStart + contextLength - 1;
outputGrad += seqStart * inputDim * contextLength;
inputGrad += seqStart * inputDim;
for (int k = 0; k <= inputDim / blockSize; k++) {
if (idx < inputDim) {
for (int i = 0; i < instances; i++) {
if ((i + contextStart) < 0) {
continue;
} else if ((i + contextStart) >= (seqEnd - seqStart)) {
continue;
} else {
// value = 0;
value = inputGrad[(i + contextStart) * inputDim + idx];
}
int outx = (i - contextLength) < 0 ? i : (contextLength - 1);
int outy = (i - contextLength) < 0 ? 0 : (i - (contextLength - 1));
real* output_r =
outputGrad + outy * inputDim * contextLength + outx * inputDim;
for (int j = outy; j < seqEnd - seqStart; j++) {
value += output_r[idx];
if (j - outy == outx) break;
output_r += (contextLength - 1) * inputDim;
}
inputGrad[(i + contextStart) * inputDim + idx] = value;
}
}
idx += blockSize;
}
}
void hl_context_projection_backward_data(real* outputGrad,
const int* sequence,
real* inputGrad,
int numSequences,
int inputDim,
int contextLength,
int contextStart) {
CHECK_NOTNULL(outputGrad);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(inputGrad);
int blockSize = 128;
int blocksX = numSequences;
int blocksY = 1;
dim3 threads(blockSize, 1);
dim3 grid(blocksX, blocksY);
KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
(outputGrad, sequence, inputGrad, inputDim, contextLength, contextStart);
CHECK_SYNC("hl_context_projection_backward_data failed");
}
template<int THREADS_X, int THREADS_Y>
__global__ void KeContextProjectionBackwardWeight(real* outputGrad,
const int* sequence,
real* weightGrad,
int numSequences,
int weightDim,
int contextLength,
int contextStart,
int beginPad) {
__shared__ real sum_s[THREADS_Y][THREADS_X];
int padOfBlock = (weightDim + THREADS_X - 1) / THREADS_X;
const int idx = threadIdx.x;
const int idy = threadIdx.y;
int padId = blockIdx.x / padOfBlock;
int weightIdx = idx + THREADS_X * (blockIdx.x % padOfBlock);
int instanceId;
real value = 0;
real* output_r;
sum_s[idy][idx] = 0.0f;
if (weightIdx < weightDim) {
for (int seqId = idy; seqId < numSequences; seqId += THREADS_Y) {
int seqStart = sequence[seqId];
int seqEnd = sequence[seqId+1];
output_r = outputGrad + seqStart * weightDim * contextLength;
if (contextStart < 0) {
if (padId + contextStart < 0) {
instanceId = padId;
} else {
// beginPad > 0;
instanceId = (padId - beginPad) + (seqEnd - seqStart) - contextStart;
}
} else {
if (padId + (seqEnd - seqStart) < contextStart) {
continue;
} else {
// beginPad == 0;
instanceId = padId + (seqEnd - seqStart) - contextStart;
}
}
int outx = (instanceId - contextLength) < 0 ?
instanceId : (contextLength - 1);
int outy = (instanceId - contextLength) < 0 ?
0 : (instanceId - (contextLength - 1));
output_r += outy * weightDim * contextLength + outx * weightDim;
for (int j = outy; j < seqEnd - seqStart; j++) {
value += output_r[weightIdx];
if (j - outy == outx) break;
output_r += (contextLength - 1) * weightDim;
}
}
sum_s[idy][idx] = value;
}
__syncthreads();
for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
if (idy < stride) {
sum_s[idy][idx] += sum_s[idy + stride][idx];
}
__syncthreads();
}
__syncthreads();
if (weightIdx < weightDim) {
if (idy == 0) {
weightGrad[padId * weightDim + weightIdx] += sum_s[0][idx];
}
}
}
void hl_context_projection_backward_weight(real* outputGrad,
const int* sequence,
real* weightGrad,
int numSequences,
int weightDim,
int totalPad,
int contextLength,
int contextStart,
int beginPad) {
CHECK_NOTNULL(outputGrad);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(weightGrad);
int threadsX = 32;
int threadsY = 32;
int blocksX = totalPad * ((weightDim + threadsX - 1) / threadsX);
dim3 threads(threadsX, threadsY);
dim3 grid(blocksX, 1);
KeContextProjectionBackwardWeight<32, 32>
<<< grid, threads, 0, STREAM_DEFAULT >>>
(outputGrad, sequence, weightGrad, numSequences, weightDim,
contextLength, contextStart, beginPad);
CHECK_SYNC("hl_context_projection_backward_weight failed");
}
template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
real* table,
......
file(GLOB h_files . *_op.h)
file(GLOB cpp_files . *_op.cpp)
file(GLOB h_files . *Op.h)
file(GLOB cpp_files . *Op.cpp)
list(APPEND h_files Function.h)
list(APPEND cpp_files Function.cpp)
if(WITH_GPU)
file(GLOB cu_files . *_op_gpu.cu)
file(GLOB cu_files . *OpGpu.cu)
cuda_compile(cu_objs ${cu_files})
endif()
......@@ -16,10 +16,15 @@ add_library(paddle_test_main STATIC TestMain.cpp)
add_dependencies(paddle_test_main ${external_project_dependencies})
if(WITH_GPU)
if(WITH_TESTING)
# TODO:
# file(GLOB test_files . *_op_test.cpp)
# file(GLOB test_files . *OpTest.cpp)
# add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
add_simple_unittest(cross_map_normal_op_test)
add_simple_unittest(CrossMapNormalOpTest)
add_unittest(ContextProjectionOpTest
ContextProjectionOpTest.cpp
../gserver/tests/TestUtil.cpp)
endif()
endif()
add_style_check_target(paddle_function ${h_files})
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "ContextProjectionOp.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/Vector.h"
namespace paddle {
template <>
void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
const CpuMatrix* input_mat,
const CpuMatrix* weight_mat,
const CpuIVector& seq_vec,
size_t context_length,
int context_start,
size_t begin_pad) {
const int* starts = seq_vec.getData();
const size_t num_sequences = seq_vec.getSize() - 1;
auto w_mat = const_cast<CpuMatrix*>(weight_mat);
auto in_mat = const_cast<CpuMatrix*>(input_mat);
for (size_t i = 0; i < num_sequences; ++i) {
for (size_t j = 0; j < context_length; ++j) {
int begin = starts[i] + context_start + j;
int end = starts[i + 1] + context_start + j;
int dst_begin = starts[i];
int dst_end = starts[i + 1];
if (begin < starts[i]) {
int64_t pad_size =
std::min(starts[i] - begin, starts[i + 1] - starts[i]);
MatrixPtr mat = out_mat->subMatrix(starts[i], pad_size);
if (w_mat) {
MatrixPtr sub = w_mat->subMatrix(j, pad_size);
mat->addAtOffset(*sub, j * in_mat->getWidth());
}
dst_begin = starts[i] + pad_size;
begin = starts[i];
}
if (end > starts[i + 1]) {
int64_t pad_size =
std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
MatrixPtr mat = out_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
if (w_mat) {
MatrixPtr sub = w_mat->subMatrix(
begin_pad + context_start + j - pad_size, pad_size);
mat->addAtOffset(*sub, j * in_mat->getWidth());
}
dst_end = starts[i + 1] - pad_size;
end = starts[i + 1];
}
if (end <= begin) continue;
MatrixPtr src = in_mat->subMatrix(begin, end - begin);
MatrixPtr dst = out_mat->subMatrix(dst_begin, dst_end - dst_begin);
dst->addAtOffset(*src, j * in_mat->getWidth());
}
}
}
/**
* \param inputs[0] input value.
* \param inputs[1] input weight.
* \param inputs[2] input sequence.
* \param outputs[0] output value.
*/
template <DeviceType Device>
class ContextProjectionForwardFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
context_length_ = config.get<size_t>("context_length");
context_start_ = config.get<int>("context_start");
begin_pad_ = config.get<size_t>("begin_pad");
}
void calc(const Arguments& inputs,
const Arguments& outputs,
const Arguments& inouts) override {
CHECK_EQ(3, inputs.size());
CHECK_EQ(1, outputs.size());
CHECK_EQ(0, inouts.size());
CHECK(outputs[0].getData() && inputs[0].getData() && inputs[2].getData());
CHECK_EQ(outputs[0].dims_.size(), 2);
CHECK_EQ(inputs[0].dims_.size(), 2);
CHECK_EQ(inputs[1].dims_.size(), 2);
CHECK_EQ(inputs[2].dims_.size(), 1);
/// dim of output = dim of input * context_length
CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
/// dim of input == dim of weight
CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
/// input and output has the same batch_size
CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
auto out_mat = std::make_shared<typename MatrixT<Device>::type>(
outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
const auto in_mat = std::make_shared<typename MatrixT<Device>::type>(
inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
const auto w_mat =
!inputs[1].getData()
? nullptr
: std::make_shared<typename MatrixT<Device>::type>(
inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
typename SequenceT<Device>::type seq_vec(
inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
ContextProjectionForward<Device>(out_mat.get(),
in_mat.get(),
w_mat.get(),
seq_vec,
context_length_,
context_start_,
begin_pad_);
}
private:
size_t context_length_;
int context_start_;
size_t begin_pad_;
};
template <>
void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
CpuMatrix* in_grad_mat,
CpuMatrix* w_grad_mat,
const CpuIVector& seq_vec,
size_t context_length,
int context_start,
size_t begin_pad,
bool is_padding,
size_t total_pad) {
CHECK(out_grad_mat);
size_t input_dim = in_grad_mat ? in_grad_mat->getWidth()
: w_grad_mat ? w_grad_mat->getWidth() : 0;
const int* starts = seq_vec.getData();
size_t num_sequences = seq_vec.getSize() - 1;
for (size_t i = 0; i < num_sequences; ++i) {
for (size_t j = 0; j < context_length; ++j) {
int begin = starts[i] + context_start + j;
int end = starts[i + 1] + context_start + j;
int dst_begin = starts[i];
int dst_end = starts[i + 1];
if (begin < starts[i]) {
int64_t pad_size =
std::min(starts[i] - begin, starts[i + 1] - starts[i]);
if (is_padding && w_grad_mat) {
MatrixPtr mat = out_grad_mat->subMatrix(starts[i], pad_size);
MatrixPtr sub = w_grad_mat->subMatrix(j, pad_size);
sub->addAtOffset(*mat, j * input_dim);
}
dst_begin = starts[i] + pad_size;
begin = starts[i];
}
if (end > starts[i + 1]) {
int64_t pad_size =
std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
if (is_padding && w_grad_mat) {
MatrixPtr mat =
out_grad_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
MatrixPtr sub = w_grad_mat->subMatrix(
begin_pad + context_start + j - pad_size, pad_size);
sub->addAtOffset(*mat, j * input_dim);
}
dst_end = starts[i + 1] - pad_size;
end = starts[i + 1];
}
if (end <= begin) continue;
if (!in_grad_mat) continue;
MatrixPtr src = in_grad_mat->subMatrix(begin, end - begin);
MatrixPtr dst = out_grad_mat->subMatrix(dst_begin, dst_end - dst_begin);
src->addAtOffset(*dst, j * input_dim);
}
}
}
/**
* \param inputs[0] input grad.
* \param inputs[1] weight grad.
* \param inputs[2] input sequence.
* \param outputs[0] output value.
*/
template <DeviceType Device>
class ContextProjectionBackwardFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
context_length_ = config.get<size_t>("context_length");
context_start_ = config.get<int>("context_start");
begin_pad_ = config.get<size_t>("begin_pad");
is_padding_ = config.get<bool>("is_padding");
total_pad_ = config.get<size_t>("total_pad");
}
void calc(const Arguments& inputs,
const Arguments& outputs,
const Arguments& inouts) override {
CHECK_EQ(3, inputs.size());
CHECK_EQ(1, outputs.size());
CHECK_EQ(0, inouts.size());
CHECK(outputs[0].getData() && inputs[2].getData());
CHECK_EQ(outputs[0].dims_.size(), 2);
CHECK_EQ(inputs[0].dims_.size(), 2);
CHECK_EQ(inputs[1].dims_.size(), 2);
CHECK_EQ(inputs[2].dims_.size(), 1);
/// dim of input == dim of weight
CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
/// input and output has the same batch_size
CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
/// dim of output = dim of input * context_length
CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
auto in_grad_mat =
!inputs[0].getData()
? nullptr
: std::make_shared<typename MatrixT<Device>::type>(
inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
auto w_grad_mat =
!inputs[1].getData()
? nullptr
: std::make_shared<typename MatrixT<Device>::type>(
inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
typename SequenceT<Device>::type seq_vec(
inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
ContextProjectionBackward<Device>(out_grad_mat.get(),
in_grad_mat ? in_grad_mat.get() : nullptr,
w_grad_mat ? w_grad_mat.get() : nullptr,
seq_vec,
context_length_,
context_start_,
begin_pad_,
is_padding_,
total_pad_);
}
private:
size_t context_length_;
int context_start_;
size_t begin_pad_;
bool is_padding_;
size_t total_pad_;
};
/**
* \param inputs[0] input grad.
* \param inputs[1] input sequence.
* \param outputs[0] output grad.
*/
template <DeviceType Device>
class ContextProjectionBackwardDataFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
context_length_ = config.get<size_t>("context_length");
context_start_ = config.get<int>("context_start");
}
void calc(const Arguments& inputs,
const Arguments& outputs,
const Arguments& inouts) override {
CHECK_EQ(2, inputs.size());
CHECK_EQ(1, outputs.size());
CHECK_EQ(0, inouts.size());
CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
CHECK_EQ(outputs[0].dims_.size(), 2);
CHECK_EQ(inputs[0].dims_.size(), 2);
CHECK_EQ(inputs[1].dims_.size(), 1);
CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
/// input and output has the same batch_size
CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
const auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
typename SequenceT<Device>::type seq_vec(
inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
ContextProjectionBackwardData<Device>(out_grad_mat.get(),
in_grad_mat.get(),
seq_vec,
context_length_,
context_start_);
}
private:
size_t context_length_;
int context_start_;
};
/**
* \param inputs[0] weight grad.
* \param inputs[1] input sequence.
* \param outputs[0] output grad.
*/
template <DeviceType Device>
class ContextProjectionBackwardWeightFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
context_length_ = config.get<size_t>("context_length");
context_start_ = config.get<int>("context_start");
begin_pad_ = config.get<size_t>("begin_pad");
total_pad_ = config.get<size_t>("total_pad");
}
void calc(const Arguments& inputs,
const Arguments& outputs,
const Arguments& inouts) override {
CHECK_EQ(2, inputs.size());
CHECK_EQ(1, outputs.size());
CHECK_EQ(0, inouts.size());
CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
CHECK_EQ(outputs[0].dims_.size(), 2);
CHECK_EQ(inputs[0].dims_.size(), 2);
CHECK_EQ(inputs[1].dims_.size(), 1);
CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
typename SequenceT<Device>::type seq_vec(
inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
ContextProjectionBackwardWeight<Device>(out_grad_mat.get(),
w_grad_mat.get(),
seq_vec,
context_length_,
context_start_,
total_pad_,
begin_pad_);
}
private:
size_t context_length_;
int context_start_;
size_t begin_pad_;
size_t total_pad_;
};
REGISTER_TYPED_FUNC(ContextProjectionForward,
CPU,
ContextProjectionForwardFunc);
REGISTER_TYPED_FUNC(ContextProjectionBackward,
CPU,
ContextProjectionBackwardFunc);
#ifndef PADDLE_ONLY_CPU
REGISTER_TYPED_FUNC(ContextProjectionForward,
GPU,
ContextProjectionForwardFunc);
REGISTER_TYPED_FUNC(ContextProjectionBackward,
GPU,
ContextProjectionBackwardFunc);
REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
GPU,
ContextProjectionBackwardDataFunc);
REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
GPU,
ContextProjectionBackwardWeightFunc);
#endif
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Function.h"
namespace paddle {
/**
* \brief Context Projection Forward.
*
* \param[out] outputs output data.
* \param[in] input input data.
* \param[in] weight input weight.
* \param[in] sequence input data.
* \param[in] context_length consecutive rows for concatenation.
* \param[in] context_start context start position.
* \param[in] begin_pad begining pad position.
* \param[in] is_padding whether padding 0 or not.
*
*/
template <DeviceType Device>
void ContextProjectionForward(typename MatrixT<Device>::type* output,
const typename MatrixT<Device>::type* input,
const typename MatrixT<Device>::type* weight,
const typename SequenceT<Device>::type& sequence,
size_t context_length,
int context_start,
size_t begin_pad);
/**
* \brief Context Projection Backward.
*
* \param[out] outputs output gradient.
* \param[in] input input gradient.
* \param[in] weight input weight gradient.
* \param[in] sequence input data.
* \param[in] context_length consecutive rows for concatenation.
* \param[in] context_start context start position.
* \param[in] begin_pad begining pad position.
* \param[in] is_padding whether padding 0 or not.
*
*/
template <DeviceType Device>
void ContextProjectionBackward(typename MatrixT<Device>::type* out_grad,
typename MatrixT<Device>::type* in_grad,
typename MatrixT<Device>::type* w_grad,
const typename SequenceT<Device>::type& seq_vec,
size_t context_length,
int context_start,
size_t begin_pad,
bool is_padding,
size_t total_pad);
template <DeviceType Device>
void ContextProjectionBackwardData(
typename MatrixT<Device>::type* out_grad,
typename MatrixT<Device>::type* in_grad,
const typename SequenceT<Device>::type& sequence,
size_t context_length,
int context_start);
template <DeviceType Device>
void ContextProjectionBackwardWeight(
typename MatrixT<Device>::type* out_grad,
typename MatrixT<Device>::type* w_grad,
const typename SequenceT<Device>::type& seq_vec,
size_t context_length,
int context_start,
size_t total_pad,
size_t begin_pad);
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "ContextProjectionOp.h"
namespace paddle {
template <bool padding>
__global__ void KeContextProjectionForward(const real* input,
const int* sequence,
const real* weight,
real* output,
int input_dim,
int context_length,
int context_start,
int begin_pad) {
int idx = threadIdx.x;
int block_size = blockDim.x;
int sequenceId = blockIdx.x;
int seq_start = sequence[sequenceId];
int seq_end = sequence[sequenceId+1];
real value = 0;
int instances = seq_end - seq_start + context_length - 1;
output += seq_start * input_dim * context_length;
input += seq_start * input_dim;
for (int k = 0; k <= input_dim / block_size; k++) {
if (idx < input_dim) {
for (int i = 0; i < instances; i++) {
// i + context_start;
if ((i + context_start) < 0) {
if (padding) {
value = weight[i * input_dim + idx];
} else {
continue;
}
} else if ((i + context_start) >= (seq_end - seq_start)) {
if (padding) {
value =
weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
input_dim + idx];
} else {
continue;
}
} else {
value = input[(i + context_start) * input_dim + idx];
}
int outx = (i - context_length) < 0 ? i : (context_length - 1);
int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
real* output_r =
output + outy * input_dim * context_length + outx * input_dim;
for (int j = outy; j < seq_end - seq_start; j++) {
output_r[idx] += value;
if (j - outy == outx) break;
output_r += (context_length - 1) * input_dim;
}
}
}
idx += block_size;
}
}
/**
* @brief Context projection forward.
*
* @param[in] input input sequence.
* @param[in] sequence sequence index.
* @param[in] weight padding data.
* @param[out] output output sequence.
* @param[in] num_sequences number of sequences.
* @param[in] input_dim input sequence dimension.
* @param[in] context_length context length.
* @param[in] context_start context start.
* @param[in] begin_pad number of extra timesteps added at the
* beginning.
*
*/
void hl_context_projection_forward(const real* input,
const int* sequence,
const real* weight,
real* output,
size_t num_sequences,
size_t input_dim,
size_t context_length,
int context_start,
size_t begin_pad) {
CHECK_NOTNULL(input);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(output);
int block_size = 128;
int blocks_x = num_sequences;
int blocks_y = 1;
dim3 threads(block_size, 1);
dim3 grid(blocks_x, blocks_y);
if (weight) {
KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
(input, sequence, weight, output, input_dim,
context_length, context_start, begin_pad);
} else {
KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
(input, sequence, weight, output, input_dim,
context_length, context_start, begin_pad);
}
CHECK_SYNC("hl_context_projection_forward failed");
}
template <>
void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix* output,
const GpuMatrix* input,
const GpuMatrix* weight,
const GpuIVector& sequence,
size_t context_length,
int context_start,
size_t begin_pad) {
CHECK(input && output);
hl_context_projection_forward(input->getData(),
sequence.getData(),
weight ? weight->getData() : nullptr,
output->getData(),
sequence.getSize() - 1,
input->getWidth(),
context_length,
context_start,
begin_pad);
}
__global__ void KeContextProjectionBackwardData(real* out_grad,
const int* sequence,
real* in_grad,
int input_dim,
int context_length,
int context_start) {
int idx = threadIdx.x;
int block_size = blockDim.x;
int sequenceId = blockIdx.x;
int seq_start = sequence[sequenceId];
int seq_end = sequence[sequenceId+1];
real value = 0;
int instances = seq_end - seq_start + context_length - 1;
out_grad += seq_start * input_dim * context_length;
in_grad += seq_start * input_dim;
for (int k = 0; k <= input_dim / block_size; k++) {
if (idx < input_dim) {
for (int i = 0; i < instances; i++) {
if ((i + context_start) < 0) {
continue;
} else if ((i + context_start) >= (seq_end - seq_start)) {
continue;
} else {
// value = 0;
value = in_grad[(i + context_start) * input_dim + idx];
}
int outx = (i - context_length) < 0 ? i : (context_length - 1);
int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
real* output_r =
out_grad + outy * input_dim * context_length + outx * input_dim;
for (int j = outy; j < seq_end - seq_start; j++) {
value += output_r[idx];
if (j - outy == outx) break;
output_r += (context_length - 1) * input_dim;
}
in_grad[(i + context_start) * input_dim + idx] = value;
}
}
idx += block_size;
}
}
/**
* @brief Context projection backward data.
*
* @param[in] out_grad output gradient.
* @param[in] sequence sequence index.
* @param[out] input_grad input gradient.
* @param[in] num_sequences number of sequences.
* @param[in] input_dim input sequence dimension.
* @param[in] context_length context length.
* @param[in] context_start context start.
*
*/
void hl_context_projection_backward_data(real* out_grad,
const int* sequence,
real* input_grad,
size_t num_sequences,
size_t input_dim,
size_t context_length,
int context_start) {
CHECK_NOTNULL(out_grad);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(input_grad);
int block_size = 128;
int blocks_x = num_sequences;
int blocks_y = 1;
dim3 threads(block_size, 1);
dim3 grid(blocks_x, blocks_y);
KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
(out_grad, sequence, input_grad, input_dim, context_length, context_start);
CHECK_SYNC("hl_context_projection_backward_data failed");
}
template <>
void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
GpuMatrix* in_grad,
const GpuIVector& sequence,
size_t context_length,
int context_start) {
CHECK(in_grad && out_grad);
hl_context_projection_backward_data(out_grad->getData(),
sequence.getData(),
in_grad->getData(),
sequence.getSize() - 1,
in_grad->getWidth(),
context_length,
context_start);
}
template<int THREADS_X, int THREADS_Y>
__global__ void KeContextProjectionBackwardWeight(real* out_grad,
const int* sequence,
real* w_grad,
int num_sequences,
int w_dim,
int context_length,
int context_start,
int begin_pad) {
__shared__ real sum_s[THREADS_Y][THREADS_X];
int pad_of_block = (w_dim + THREADS_X - 1) / THREADS_X;
const int idx = threadIdx.x;
const int idy = threadIdx.y;
int padId = blockIdx.x / pad_of_block;
int weight_idx = idx + THREADS_X * (blockIdx.x % pad_of_block);
int instanceId;
real value = 0;
real* output_r;
sum_s[idy][idx] = 0.0f;
if (weight_idx < w_dim) {
for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
int seq_start = sequence[seqId];
int seq_end = sequence[seqId+1];
output_r = out_grad + seq_start * w_dim * context_length;
if (context_start < 0) {
if (padId + context_start < 0) {
instanceId = padId;
} else {
// begin_pad > 0;
instanceId = (padId - begin_pad) +
(seq_end - seq_start) - context_start;
}
} else {
if (padId + (seq_end - seq_start) < context_start) {
continue;
} else {
// begin_pad == 0;
instanceId = padId + (seq_end - seq_start) - context_start;
}
}
int outx = (instanceId - context_length) < 0 ?
instanceId : (context_length - 1);
int outy = (instanceId - context_length) < 0 ?
0 : (instanceId - (context_length - 1));
output_r += outy * w_dim * context_length + outx * w_dim;
for (int j = outy; j < seq_end - seq_start; j++) {
value += output_r[weight_idx];
if (j - outy == outx) break;
output_r += (context_length - 1) * w_dim;
}
}
sum_s[idy][idx] = value;
}
__syncthreads();
for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
if (idy < stride) {
sum_s[idy][idx] += sum_s[idy + stride][idx];
}
__syncthreads();
}
__syncthreads();
if (weight_idx < w_dim) {
if (idy == 0) {
w_grad[padId * w_dim + weight_idx] += sum_s[0][idx];
}
}
}
/**
* @brief Context projection backward weight.
*
* @param[in] out_grad output gradient.
* @param[in] sequence sequence index.
* @param[out] w_grad weight gradient.
* @param[in] num_sequences number of sequences.
* @param[in] w_dim input sequence dimension.
* @param[in] total_pad number of extra timesteps.
* @param[in] context_length context length.
* @param[in] context_start context start.
* @param[in] begin_pad number of extra timesteps added at the
* beginning.
*
*/
void hl_context_projection_backward_weight(real* out_grad,
const int* sequence,
real* w_grad,
size_t num_sequences,
size_t w_dim,
size_t total_pad,
size_t context_length,
int context_start,
size_t begin_pad) {
CHECK_NOTNULL(out_grad);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(w_grad);
int threads_x = 32;
int threads_y = 32;
int blocks_x = total_pad * ((w_dim + threads_x - 1) / threads_x);
dim3 threads(threads_x, threads_y);
dim3 grid(blocks_x, 1);
KeContextProjectionBackwardWeight<32, 32>
<<< grid, threads, 0, STREAM_DEFAULT >>>
(out_grad, sequence, w_grad, num_sequences, w_dim,
context_length, context_start, begin_pad);
CHECK_SYNC("hl_context_projection_backward_weight failed");
}
template <>
void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
GpuMatrix* out_grad,
GpuMatrix* w_grad,
const GpuIVector& seq_vec,
size_t context_length,
int context_start,
size_t total_pad,
size_t begin_pad) {
CHECK(out_grad && w_grad);
hl_context_projection_backward_weight(out_grad->getData(),
seq_vec.getData(),
w_grad->getData(),
seq_vec.getSize() - 1,
w_grad->getWidth(),
total_pad,
context_length,
context_start,
begin_pad);
}
template <>
void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
GpuMatrix* in_grad,
GpuMatrix* w_grad,
const GpuIVector& sequence,
size_t context_length,
int context_start,
size_t begin_pad,
bool is_padding,
size_t total_pad) {
CHECK(out_grad);
if (in_grad) {
ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
out_grad,
in_grad,
sequence,
context_length,
context_start);
}
if (is_padding && w_grad) {
ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
out_grad,
w_grad,
sequence,
context_length,
context_start,
total_pad,
begin_pad);
}
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "FunctionTest.h"
#include "paddle/gserver/tests/TestUtil.h"
#include "paddle/math/Matrix.h"
using namespace paddle; // NOLINT
void testMatrixProjectionForward(int context_start,
size_t context_length,
bool is_padding,
size_t batch_size,
size_t input_dim) {
size_t pad = std::max(0, -context_start) +
std::max(0, (int)(context_start + context_length - 1));
if (pad == 0) is_padding = false;
FunctionCompare compare("ContextProjectionForward",
FuncConfig()
.set("context_length", context_length)
.set("context_start", context_start)
.set("begin_pad", std::max(0, -context_start)));
CpuMatrix cpu_in(batch_size, input_dim);
cpu_in.randomizeUniform();
GpuMatrix gpu_in(batch_size, input_dim);
gpu_in.copyFrom(cpu_in);
auto cpu_weight =
is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
auto gpu_weight =
is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
if (is_padding) {
cpu_weight->randomizeUniform();
gpu_weight->copyFrom(*cpu_weight);
}
IVectorPtr cpu_seq;
generateSequenceStartPositions(batch_size, cpu_seq);
IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
gpu_seq->copyFrom(*cpu_seq);
CpuMatrix cpu_out(batch_size, input_dim * context_length);
GpuMatrix gpu_out(batch_size, input_dim * context_length);
cpu_out.randomizeUniform();
gpu_out.copyFrom(cpu_out);
compare.getCpuFunction()->calc(
{Tensor(cpu_in.getData(), Dims{batch_size, input_dim}),
Tensor(cpu_weight ? cpu_weight->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
Dims{cpu_seq->getSize()})},
{Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})},
{});
compare.getGpuFunction()->calc(
{Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
Dims{gpu_seq->getSize()})},
{Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})},
{});
autotest::TensorCheckEqual(cpu_out, gpu_out);
}
void testMatrixProjectionBackward(int context_start,
int context_length,
bool is_padding,
size_t batch_size,
size_t input_dim) {
size_t pad = std::max(0, -context_start) +
std::max(0, (int)(context_start + context_length - 1));
if (pad == 0) is_padding = false;
FunctionCompare compare("ContextProjectionBackward",
FuncConfig()
.set("context_length", context_length)
.set("context_start", context_start)
.set("begin_pad", std::max(0, -context_start))
.set("is_padding", is_padding)
.set("total_pad", pad));
CpuMatrix cpu_in_grad(batch_size, input_dim);
cpu_in_grad.randomizeUniform();
GpuMatrix gpu_in_grad(batch_size, input_dim);
gpu_in_grad.copyFrom(cpu_in_grad);
CpuMatrix cpu_out_grad(batch_size, input_dim * context_length);
cpu_out_grad.randomizeUniform();
GpuMatrix gpu_out_grad(batch_size, input_dim * context_length);
gpu_out_grad.copyFrom(cpu_out_grad);
IVectorPtr cpu_seq;
generateSequenceStartPositions(batch_size, cpu_seq);
IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
gpu_seq->copyFrom(*cpu_seq);
auto cpu_w_grad =
is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
auto gpu_w_grad =
is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
if (is_padding) {
cpu_w_grad->randomizeUniform();
gpu_w_grad->copyFrom(*cpu_w_grad);
}
compare.getCpuFunction()->calc(
{Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}),
Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
Dims{cpu_seq->getSize()})},
{Tensor(cpu_out_grad.getData(),
Dims{batch_size, input_dim * context_length})},
{});
compare.getGpuFunction()->calc(
{Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}),
Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
Dims{gpu_seq->getSize()})},
{Tensor(gpu_out_grad.getData(),
Dims{batch_size, input_dim * context_length})},
{});
autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad);
if (is_padding) {
autotest::TensorCheckErr(*cpu_w_grad, *gpu_w_grad);
}
}
TEST(ContextProjection, projection) {
for (auto context_start : {-5, -3, -1, 0, 3}) {
for (auto context_length : {1, 2, 5, 7}) {
for (auto trainable_padding : {false, true}) {
for (auto batch_size : {1, 2, 5, 20, 100}) {
for (auto input_dim : {15, 32, 63, 128, 200}) {
VLOG(3) << " context_start=" << context_start
<< " context_length=" << context_length
<< " trainable_padding=" << trainable_padding
<< " batch_size=" << batch_size
<< " input_dim=" << input_dim;
testMatrixProjectionForward(context_start,
context_length,
trainable_padding,
batch_size,
input_dim);
testMatrixProjectionBackward(context_start,
context_length,
trainable_padding,
batch_size,
input_dim);
}
}
}
}
}
}
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "cross_map_normal_op.h"
#include "CrossMapNormalOp.h"
#include "paddle/math/Vector.h"
namespace paddle {
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "cross_map_normal_op.h"
#include "CrossMapNormalOp.h"
namespace paddle {
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include <gtest/gtest.h>
#include "FunctionTest.h"
namespace paddle {
TEST(CrossMapNormal, real) {
for (size_t numSamples : {5, 32}) {
for (size_t channels : {1, 5, 32}) {
......@@ -69,3 +71,5 @@ TEST(CrossMapNormalGrad, real) {
}
}
}
} // namespace paddle
......@@ -30,20 +30,48 @@ real FuncConfig::get<real>(const std::string& key) const {
return it->second.r;
}
template <>
int FuncConfig::get<int>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.i;
}
template <>
bool FuncConfig::get<bool>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.b;
}
template <>
FuncConfig& FuncConfig::set<size_t>(const std::string& key, size_t v) {
CHECK(valueMap_.count(key) == 0) << "Duplicated value: " << key;
CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
valueMap_[key].s = v;
return *this;
}
template <>
FuncConfig& FuncConfig::set<real>(const std::string& key, real v) {
CHECK(valueMap_.count(key) == 0) << "Duplicated value: " << key;
CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
valueMap_[key].r = v;
return *this;
}
template <>
FuncConfig& FuncConfig::set<int>(const std::string& key, int v) {
CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
valueMap_[key].i = v;
return *this;
}
template <>
FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
valueMap_[key].b = v;
return *this;
}
ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
} // namespace paddle
......@@ -40,6 +40,19 @@ struct MatrixT<DEVICE_TYPE_GPU> {
using type = GpuMatrix;
};
template <DeviceType Device>
struct SequenceT;
template <>
struct SequenceT<DEVICE_TYPE_CPU> {
using type = CpuIVector;
};
template <>
struct SequenceT<DEVICE_TYPE_GPU> {
using type = GpuIVector;
};
typedef std::vector<size_t> Dims;
class Tensor {
......@@ -59,6 +72,8 @@ public:
union value {
size_t s;
real r;
int i;
bool b;
};
template <typename T>
......
......@@ -33,25 +33,33 @@ public:
// init cpu and gpu arguments
auto initArgs = [=](
Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) {
for (auto arg : inArgs) {
for (const auto arg : inArgs) {
size_t size = sizeof(real);
for (auto dim : arg.dims_) {
for (const auto dim : arg.dims_) {
size *= dim;
}
cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
cpuArgs.emplace_back(
Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
gpuArgs.emplace_back(
Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
// will use an api to refactor this code.
CpuVector cpuVector(size / sizeof(real),
(real*)cpuArgs.back().getData());
GpuVector gpuVector(size / sizeof(real),
(real*)gpuArgs.back().getData());
cpuVector.uniform(0.001, 1);
gpuVector.copyFrom(cpuVector);
if (arg.getData()) {
// todo(tianbing), waste unnecessary mem here
cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
cpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
gpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
// already init outside
} else {
cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
cpuArgs.emplace_back(
Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
gpuArgs.emplace_back(
Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
// will use an api to refactor this code.
CpuVector cpuVector(size / sizeof(real),
(real*)cpuArgs.back().getData());
GpuVector gpuVector(size / sizeof(real),
(real*)gpuArgs.back().getData());
cpuVector.uniform(0.001, 1);
gpuVector.copyFrom(cpuVector);
}
}
};
initArgs(cpuInputs, gpuInputs, inputs);
......@@ -81,6 +89,10 @@ public:
checkArgs(cpuInouts, gpuInouts);
}
std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
std::shared_ptr<FunctionBase> getGpuFunction() const { return gpu; }
protected:
std::shared_ptr<FunctionBase> cpu;
std::shared_ptr<FunctionBase> gpu;
......@@ -95,8 +107,3 @@ protected:
};
} // namespace paddle
using paddle::FunctionCompare;
using paddle::FuncConfig;
using paddle::Dims;
using paddle::Tensor;
......@@ -38,6 +38,32 @@ ContextProjection::ContextProjection(const ProjectionConfig& config,
CHECK_EQ(inputDim * totalPad, parameter->getSize());
weight_.reset(new Weight(totalPad, inputDim, parameter));
}
// init forward_ and backward_ functions
init();
}
bool ContextProjection::init() {
size_t context_length = config_.context_length();
int context_start = config_.context_start();
bool is_padding = config_.trainable_padding();
size_t total_pad = is_padding ? beginPad_ + endPad_ : 0;
createFunction(forward_,
"ContextProjectionForward",
FuncConfig()
.set("context_length", context_length)
.set("context_start", context_start)
.set("begin_pad", beginPad_));
createFunction(backward_,
"ContextProjectionBackward",
FuncConfig()
.set("context_length", context_length)
.set("context_start", context_start)
.set("begin_pad", beginPad_)
.set("is_padding", is_padding)
.set("total_pad", total_pad));
return true;
}
void ContextProjection::resetState() {
......@@ -78,25 +104,29 @@ LayerStatePtr ContextProjection::getState() {
}
void ContextProjection::forward() {
CHECK(in_->value);
CHECK(in_->value && out_->value);
CHECK(in_->sequenceStartPositions);
auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
int64_t inputDim = in_->value->getWidth();
int64_t dim = out_->value->getWidth();
CHECK_EQ(dim, inputDim * config_.context_length());
size_t input_dim = in_->value->getWidth();
size_t dim = out_->value->getWidth();
CHECK_EQ(dim, input_dim * config_.context_length());
size_t batch_size = in_->value->getHeight();
CHECK_EQ(forward_.size(), 1) << "Only one forward function here";
REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
bool isPadding = config_.trainable_padding();
out_->value->contextProjectionForward(
*(in_->value),
state_ ? state_.get() : isPadding ? weight_->getW().get() : nullptr,
*startPositions,
config_.context_length(),
config_.context_start(),
beginPad_,
state_ ? true : isPadding);
bool is_padding = config_.trainable_padding();
/// first use state_, otherwise use weight_(padding false === w nullptr)
auto w_ptr =
state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
auto start_pos = in_->sequenceStartPositions;
forward_[0]->calc({Tensor(in_->value->getData(), Dims{batch_size, input_dim}),
Tensor(w_ptr ? w_ptr->getData() : nullptr,
Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
Tensor(reinterpret_cast<real*>(
const_cast<int*>(start_pos->getData(useGpu_))),
Dims{start_pos->getSize()})},
{Tensor(out_->value->getData(), Dims{batch_size, dim})},
{});
if (state_ && config_.context_start() < 0) {
CHECK_EQ(1, in_->getNumSequences());
......@@ -118,41 +148,27 @@ void ContextProjection::forward() {
}
void ContextProjection::backward(const UpdateCallback& callback) {
CHECK(in_->value);
int64_t inputDim = in_->value->getWidth();
int64_t dim = out_->value->getWidth();
CHECK_EQ(dim, inputDim * config_.context_length());
auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
CHECK(in_->value && out_->value && out_->grad);
size_t input_dim = in_->value->getWidth();
size_t dim = out_->value->getWidth();
CHECK_EQ(dim, input_dim * config_.context_length());
size_t batch_size = in_->value->getHeight();
CHECK_EQ(batch_size, out_->value->getHeight());
CHECK_EQ(backward_.size(), 1) << "Only one backward function here";
REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str());
bool isPadding = config_.trainable_padding();
if (!out_->grad->useGpu()) {
out_->grad->contextProjectionBackward(
in_->grad.get(),
isPadding ? weight_->getWGrad().get() : nullptr,
*startPositions,
config_.context_length(),
config_.context_start(),
beginPad_,
isPadding);
} else {
if (in_->grad) {
out_->grad->contextProjectionBackwardData(*(in_->grad),
*startPositions,
config_.context_length(),
config_.context_start());
}
if (isPadding && weight_->getWGrad()) {
out_->grad->contextProjectionBackwardWeight(
*(weight_->getWGrad()),
*startPositions,
config_.context_length(),
config_.context_start(),
weight_->getWGrad()->getHeight(),
beginPad_);
}
}
bool is_padding = config_.trainable_padding();
auto start_pos = in_->sequenceStartPositions;
auto w_ptr = is_padding ? weight_->getWGrad() : nullptr;
backward_[0]->calc({Tensor(in_->grad ? in_->grad->getData() : nullptr,
Dims{batch_size, input_dim}),
Tensor(w_ptr ? w_ptr->getData() : nullptr,
Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
Tensor(reinterpret_cast<real*>(
const_cast<int*>(start_pos->getData(useGpu_))),
Dims{start_pos->getSize()})},
{Tensor(out_->grad->getData(), Dims{batch_size, dim})},
{});
if (config_.trainable_padding()) {
weight_->getParameterPtr()->incUpdate(callback);
......
......@@ -61,6 +61,8 @@ public:
virtual LayerStatePtr getState();
virtual bool init();
protected:
std::unique_ptr<Weight> weight_;
/// number of extra timesteps added at the beginning
......
......@@ -88,11 +88,37 @@ public:
*/
virtual LayerStatePtr getState() { return nullptr; }
/**
* init forward_ and backward_ functions
*/
virtual bool init() { return true; }
/**
* Get output size of projection.
*/
size_t getOutputSize() const { return config_.output_size(); }
protected:
/**
* Create layer function. Function is called in forward or backward.
* \param function, Layer::forward_ or Layer::backward_
* \param name, function name
* \param config, initialization configuration for the function
*/
void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
const std::string& name,
const FuncConfig& config) {
if (useGpu_) {
function.emplace_back(
FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
} else {
function.emplace_back(
FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
}
auto& func = function.back();
func->init(config);
}
protected:
/// Config of projection
ProjectionConfig config_;
......@@ -106,5 +132,9 @@ protected:
const Argument* out_;
/// Store `passType` passed to forward()
PassType passType_;
/// Layer forward function
std::vector<std::shared_ptr<FunctionBase>> forward_;
/// Layer backward function
std::vector<std::shared_ptr<FunctionBase>> backward_;
};
} // namespace paddle
......@@ -65,9 +65,3 @@ TEST(LinearChainCRF, decoding) {
}
}
}
int main(int argc, char** argv) {
initMain(argc, argv);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -730,9 +730,3 @@ TEST(ProtoSequenceDataProvider, test) {
} // end for (int numIdSlots : numSlotsArray)
} // end for (int numSparseNonValueVecSlots : numSlotsArray)
}
int main(int argc, char** argv) {
initMain(argc, argv);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -242,9 +242,3 @@ TEST(Layer, WarpCTCLayer) {
}
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
return RUN_ALL_TESTS();
}
......@@ -1304,68 +1304,6 @@ void GpuMatrix::maxSequenceBackward(Matrix& outputGrad,
hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim);
}
void GpuMatrix::contextProjectionForward(Matrix& input,
Matrix* weight,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding) {
CHECK(dynamic_cast<GpuMatrix*>(&input));
CHECK(dynamic_cast<const GpuIVector*>(&sequence));
if (weight) CHECK(dynamic_cast<GpuMatrix*>(weight));
CHECK_EQ(getWidth(), input.getWidth() * contextLength);
hl_context_projection_forward(input.getData(),
sequence.getData(),
isPadding ? weight->getData() : NULL,
getData(),
sequence.getSize() - 1,
input.getWidth(),
contextLength,
contextStart,
beginPad,
isPadding);
}
void GpuMatrix::contextProjectionBackwardData(Matrix& inputGrad,
const IVector& sequence,
int contextLength,
int contextStart) {
CHECK(dynamic_cast<GpuMatrix*>(&inputGrad));
CHECK(dynamic_cast<const GpuIVector*>(&sequence));
CHECK_EQ(getWidth(), inputGrad.getWidth() * contextLength);
hl_context_projection_backward_data(getData(),
sequence.getData(),
inputGrad.getData(),
sequence.getSize() - 1,
inputGrad.getWidth(),
contextLength,
contextStart);
}
void GpuMatrix::contextProjectionBackwardWeight(Matrix& weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
int totalPad,
size_t beginPad) {
CHECK(dynamic_cast<GpuMatrix*>(&weightGrad));
CHECK(dynamic_cast<const GpuIVector*>(&sequence));
CHECK_EQ(getWidth(), weightGrad.getWidth() * contextLength);
hl_context_projection_backward_weight(getData(),
sequence.getData(),
weightGrad.getData(),
sequence.getSize() - 1,
weightGrad.getWidth(),
totalPad,
contextLength,
contextStart,
beginPad);
}
void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
CHECK(data.useGpu_ == true && W.useGpu_ == true)
<< "Matrix type are not equal";
......@@ -2203,113 +2141,6 @@ void CpuMatrix::maxSequenceBackward(Matrix& outputGrad,
}
}
void CpuMatrix::contextProjectionForward(Matrix& input,
Matrix* weight,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding) {
auto input_ptr = dynamic_cast<CpuMatrix*>(&input);
auto seq_ptr = dynamic_cast<const CpuIVector*>(&sequence);
CHECK(input_ptr && seq_ptr);
if (weight) CHECK(dynamic_cast<CpuMatrix*>(weight));
CHECK_EQ(getWidth(), input_ptr->getWidth() * contextLength);
const int* starts = seq_ptr->getData();
size_t numSequences = seq_ptr->getSize() - 1;
for (size_t i = 0; i < numSequences; ++i) {
for (int j = 0; j < contextLength; ++j) {
int begin = starts[i] + contextStart + j;
int end = starts[i + 1] + contextStart + j;
int dstBegin = starts[i];
int dstEnd = starts[i + 1];
if (begin < starts[i]) {
int64_t padSize =
std::min(starts[i] - begin, starts[i + 1] - starts[i]);
MatrixPtr mat = this->subMatrix(starts[i], padSize);
if (isPadding) {
MatrixPtr sub = weight->subMatrix(j, padSize);
mat->addAtOffset(*sub, j * input_ptr->getWidth());
}
dstBegin = starts[i] + padSize;
begin = starts[i];
}
if (end > starts[i + 1]) {
int64_t padSize =
std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
MatrixPtr mat = this->subMatrix(starts[i + 1] - padSize, padSize);
if (isPadding) {
MatrixPtr sub =
weight->subMatrix(beginPad + contextStart + j - padSize, padSize);
mat->addAtOffset(*sub, j * input_ptr->getWidth());
}
dstEnd = starts[i + 1] - padSize;
end = starts[i + 1];
}
if (end <= begin) continue;
MatrixPtr src = input_ptr->subMatrix(begin, end - begin);
MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin);
dst->addAtOffset(*src, j * input_ptr->getWidth());
}
}
}
void CpuMatrix::contextProjectionBackward(Matrix* inputGrad,
Matrix* weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding) {
if (inputGrad) CHECK(dynamic_cast<CpuMatrix*>(inputGrad));
if (weightGrad) CHECK(dynamic_cast<CpuMatrix*>(weightGrad));
CHECK(dynamic_cast<const CpuIVector*>(&sequence));
int64_t inputDim = inputGrad ? inputGrad->getWidth()
: weightGrad ? weightGrad->getWidth() : 0;
CHECK_EQ(getWidth(), static_cast<size_t>(inputDim * contextLength));
const int* starts = sequence.getData();
size_t numSequences = sequence.getSize() - 1;
for (size_t i = 0; i < numSequences; ++i) {
for (int j = 0; j < contextLength; ++j) {
int begin = starts[i] + contextStart + j;
int end = starts[i + 1] + contextStart + j;
int dstBegin = starts[i];
int dstEnd = starts[i + 1];
if (begin < starts[i]) {
int64_t padSize =
std::min(starts[i] - begin, starts[i + 1] - starts[i]);
if (isPadding && weightGrad) {
MatrixPtr mat = this->subMatrix(starts[i], padSize);
MatrixPtr sub = weightGrad->subMatrix(j, padSize);
sub->addAtOffset(*mat, j * inputDim);
}
dstBegin = starts[i] + padSize;
begin = starts[i];
}
if (end > starts[i + 1]) {
int64_t padSize =
std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
if (isPadding && weightGrad) {
MatrixPtr mat = this->subMatrix(starts[i + 1] - padSize, padSize);
MatrixPtr sub = weightGrad->subMatrix(
beginPad + contextStart + j - padSize, padSize);
sub->addAtOffset(*mat, j * inputDim);
}
dstEnd = starts[i + 1] - padSize;
end = starts[i + 1];
}
if (end <= begin) continue;
if (!inputGrad) continue;
MatrixPtr src = inputGrad->subMatrix(begin, end - begin);
MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin);
src->addAtOffset(*dst, j * inputDim);
}
}
}
inline void vecAddTo(real* a, const real* b, size_t len) {
for (unsigned int i = 0; i < len; ++i) {
a[i] += b[i];
......
......@@ -972,42 +972,6 @@ public:
LOG(FATAL) << "Not implemeted";
}
virtual void contextProjectionForward(Matrix& input,
Matrix* weight,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding) {
LOG(FATAL) << "Not implemeted";
}
virtual void contextProjectionBackward(Matrix* inputGrad,
Matrix* weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding) {
LOG(FATAL) << "Not implemeted";
}
virtual void contextProjectionBackwardData(Matrix& inputGrad,
const IVector& sequence,
int contextLength,
int contextStart) {
LOG(FATAL) << "Not implemeted";
}
virtual void contextProjectionBackwardWeight(Matrix& weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
int totalPad,
size_t beginPad) {
LOG(FATAL) << "Not implemeted";
}
/**
* @code
* this.row[i] += table.row[ids[i]]
......@@ -1442,26 +1406,6 @@ public:
const IVector& sequence,
IVector& index);
void contextProjectionForward(Matrix& input,
Matrix* weight,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding);
void contextProjectionBackwardData(Matrix& inputGrad,
const IVector& sequence,
int contextLength,
int contextStart);
void contextProjectionBackwardWeight(Matrix& weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
int totalPad,
size_t beginPad);
void bilinearForward(const Matrix& in,
const size_t inImgH,
const size_t inImgW,
......@@ -1648,22 +1592,6 @@ public:
const IVector& sequence,
IVector& index);
void contextProjectionForward(Matrix& input,
Matrix* weight,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding);
void contextProjectionBackward(Matrix* inputGrad,
Matrix* weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding);
real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
virtual real* getRowBuf(size_t row) { return getRow(row); }
......
......@@ -120,9 +120,3 @@ TEST(MemoryHandle, Gpu) {
}
}
#endif
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
return RUN_ALL_TESTS();
}
......@@ -242,10 +242,4 @@ TEST(BaseMatrix, Other) {
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
paddle::initMain(argc, argv);
return RUN_ALL_TESTS();
}
#endif
......@@ -77,11 +77,4 @@ TEST(CpuGpuVector, subCreate) {
checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2);
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
int ret = RUN_ALL_TESTS();
return ret;
}
#endif
......@@ -114,9 +114,3 @@ TEST(ExecViaCpu, test1) {
testWrapper(functor);
}
#endif
int main(int argc, char** argv) {
paddle::initMain(argc, argv);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -291,10 +291,4 @@ TEST(Matrix, multiBinaryCrossEntropy) {
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
paddle::initMain(argc, argv);
return RUN_ALL_TESTS();
}
#endif
......@@ -169,9 +169,3 @@ TEST(SIMDFunction, decayL1_WithoutLR) {
ASSERT_NEAR(dest[i], simd_dest[i], EPSILON);
}
}
int main(int argc, char** argv) {
paddle::initMain(argc, argv);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -561,9 +561,3 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
checkSMatrixEqual2(matA, matD);
#endif
}
int main(int argc, char** argv) {
paddle::initMain(argc, argv);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -1163,11 +1163,3 @@ TEST(Quaternary, CompareOp) {
TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
#endif
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
hl_start();
hl_init(0);
return RUN_ALL_TESTS();
}
......@@ -459,11 +459,3 @@ void testSparseMomentum(size_t size, bool useGpu) {
}
TEST(Training, SparseMomentum) { testCase(testSparseMomentum); }
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
hl_start();
hl_init(FLAGS_gpu_id);
return RUN_ALL_TESTS();
}
......@@ -53,9 +53,3 @@ TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
checkMatrixEqual(cBatchTransMat, cMat_d2h);
}
#endif
int main(int argc, char** argv) {
paddle::initMain(argc, argv);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -139,11 +139,3 @@ TEST(sgdUpdate, GPU) {
testMatrixCase(testSgdUpdate<GpuMatrix>);
}
#endif
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
hl_start();
hl_init(0);
return RUN_ALL_TESTS();
}
......@@ -171,11 +171,4 @@ TEST(SMatrix, sMatrixCollectBias) {
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
int ret = RUN_ALL_TESTS();
return ret;
}
#endif
......@@ -23,15 +23,6 @@ limitations under the License. */
using namespace paddle; // NOLINT
int main(int argc, char** argv) {
paddle::initMain(argc, argv);
testing::InitGoogleTest(&argc, argv);
int ret = RUN_ALL_TESTS();
return ret;
}
class CommonTest : public ::testing::Test {
protected:
CommonTest() : testStat_("test") {}
......
# for paddle test case
if(WITH_TESTING)
add_library(paddle_test_main STATIC TestMain.cpp)
add_dependencies(paddle_test_main gen_proto_cpp)
endif()
......@@ -56,7 +56,7 @@ class RemoteParameterUpdater : public ParameterUpdater {
public:
RemoteParameterUpdater(
const OptimizationConfig& config,
int expectedPpassCount,
int expectedPassCount,
std::unique_ptr<ParameterUpdater>&& localUpdater = nullptr);
~RemoteParameterUpdater() {
if (controllerThread_) {
......@@ -146,7 +146,7 @@ protected:
BatchStatus batchStatus_;
/// controller thread for sync-sgd
std::unique_ptr<std::thread> controllerThread_;
/// passed alread finished
/// passed already finished
int64_t passCount_;
/// expected passes to finished
int64_t expectedPassCount_;
......
......@@ -37,7 +37,7 @@ unsigned int* ThreadLocalRand::getSeed() {
p = new unsigned int(defaultSeed_ - 1);
} else {
p = new unsigned int(defaultSeed_ + getTID());
LOG(INFO) << "thread use undeterministic rand seed:" << *p;
VLOG(3) << "thread use undeterministic rand seed:" << *p;
}
seed_.set(p);
}
......
......@@ -125,7 +125,7 @@ void registerInitFunction(std::function<void()> func, int priority) {
void runInitFunctions() {
std::call_once(g_onceFlag, []() {
LOG(INFO) << "Calling runInitFunctions";
VLOG(3) << "Calling runInitFunctions";
if (g_initFuncs) {
std::sort(g_initFuncs->begin(),
g_initFuncs->end(),
......@@ -139,7 +139,7 @@ void runInitFunctions() {
g_initFuncs = nullptr;
}
g_initialized = true;
LOG(INFO) << "Call runInitFunctions done.";
VLOG(3) << "Call runInitFunctions done.";
});
}
......@@ -231,7 +231,7 @@ std::string join(const std::string& part1, const std::string& part2) {
} // namespace path
void copyFileToPath(const std::string& file, const std::string& dir) {
LOG(INFO) << "copy " << file << " to " << dir;
VLOG(3) << "copy " << file << " to " << dir;
std::string fileName = path::basename(file);
std::string dst = path::join(dir, fileName);
std::ifstream source(file, std::ios_base::binary);
......
......@@ -96,9 +96,3 @@ TEST(CustomStackTrace, normalTest) {
}
});
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
paddle::initMain(argc, argv);
return RUN_ALL_TESTS();
}
......@@ -44,8 +44,3 @@ TEST(SIMDFlags, normalPrint) {
LOG(INFO) << "Has AVX2: " << std::boolalpha << HAS_AVX2;
LOG(INFO) << "Has AVX512: " << std::boolalpha << HAS_AVX512;
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -53,9 +53,3 @@ TEST(ThreadSpinLock, normalTest) {
});
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
paddle::initMain(argc, argv);
return RUN_ALL_TESTS();
}
......@@ -79,8 +79,3 @@ TEST(AsyncThreadPool, addBatchJobWithResults) {
ASSERT_EQ(res[i], i);
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -64,9 +64,3 @@ TEST(ThreadBarrier, normalTest) {
});
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
paddle::initMain(argc, argv);
return RUN_ALL_TESTS();
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册