Commit 0c444686 authored by: S sweetsky0901

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into my_spp_op

@@ -16,6 +16,8 @@ cmake_minimum_required(VERSION 3.0)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
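# The two SET() lines above override CMake's default RelWithDebInfo flags (typically -O2 -g -DNDEBUG)
# so that RelWithDebInfo builds keep full -O3 optimization alongside debug symbols.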
include(system)
......
## Evaluator Design
### Problem Statement
During training or inference, we provide an evaluation function to measure the model performance, such as accuracy and precision. In the operator-based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we can only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics over every N passes/batches the user wants.
### Evaluator Design
Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
1. Initialize the metric state and add it into the block.
2. Calculate the concerned metrics for every mini-batch. A single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for a single minibatch each time it runs.
3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the values from different devices.
### Implementation
This design is shown in the Python API.
Each metric operator needs to calculate the metric statistic and return the batch-aware states; the Python side is responsible for accumulating the states for each pass, as the sketch below illustrates.
```python
...
```
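A minimal sketch of this accumulation pattern (hypothetical class and method names; the actual PaddlePaddle API is elided above):

```python
class AccuracyEvaluator:
    """Accumulates per-minibatch accuracy statistics across one pass."""

    def __init__(self):
        # Step 1: initialize the metric state.
        self.num_correct = 0
        self.num_samples = 0

    def update(self, batch_correct, batch_size):
        # Step 2: the operator computes per-minibatch statistics;
        # the Python side accumulates them.
        self.num_correct += batch_correct
        self.num_samples += batch_size

    def eval(self):
        # Step 3: merge minibatch statistics into the pass-level metric.
        if self.num_samples == 0:
            return 0.0
        return self.num_correct / self.num_samples
```
......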
# gserver package unittests
add_simple_unittest(test_LinearChainCRF)
add_simple_unittest(test_RecurrentLayer)
@@ -29,6 +28,26 @@ gserver_test(test_KmaxSeqScore)
gserver_test(test_Expand)
gserver_test(test_MaxPoolingWithMaskOutput)
set(PYTHON_PATH
${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/gserver/tests)
function(gserver_test_with_python TARGET)
add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
add_test(NAME ${TARGET}
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endfunction()
gserver_test_with_python(test_PyDataProvider2)
if(WITH_PYTHON)
gserver_test_with_python(test_PyDataProvider)
endif()
if(NOT MOBILE_INFERENCE)
gserver_test_with_python(test_CompareTwoNets)
# TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it.
gserver_test_with_python(test_RecurrentGradientMachine)
endif()
########## test_MKLDNN layers and activations ##########
if(WITH_MKLDNN)
  add_unittest_without_exec(test_MKLDNN
@@ -36,18 +55,7 @@ if(WITH_MKLDNN)
    MKLDNNTester.cpp
    LayerGradUtil.cpp)
  add_test(NAME test_MKLDNN
           COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
           WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
endif()
@@ -55,67 +63,35 @@ endif()
if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
  add_unittest_without_exec(test_WarpCTCLayer
    test_WarpCTCLayer.cpp)
  add_test(NAME test_WarpCTCLayer
           COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
           WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
endif()
if(NOT MOBILE_INFERENCE)
  ################## test_Evaluator #############
  add_unittest(test_Evaluator
    test_Evaluator.cpp)
  ########### test_NetworkCompare ###############
  add_unittest_without_exec(test_NetworkCompare
    test_NetworkCompare.cpp)
  if(WITH_GPU)
    set(use_gpu true)
  else()
    set(use_gpu false)
  endif()
  add_test(NAME test_NetworkCompare
           COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu}
           WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
  ############ test_CompareSparse ################
  add_unittest_without_exec(test_CompareSparse
    test_CompareSparse.cpp)
  if(NOT ON_TRAVIS)
    add_test(NAME test_CompareSparse
             COMMAND ${PYTHON_PATH} ./.set_port.sh -p port -n 6
                     ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
             WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
  endif()
endif()
@@ -198,27 +198,27 @@ c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
h_t = o_t \odot act_h(c_t)
$$
where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
are diagonal weight matrices for peephole connections. In our implementation,
we use vectors to represent these diagonal weight matrices. The b terms
denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
is the non-linear activation function, such as the logistic sigmoid, and
$i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector $h$.
The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
are the cell input and cell output activation functions, and `tanh` is usually
used for them. $\tilde{c_t}$ is also called the candidate hidden state,
which is computed based on the current input and the previous hidden state.
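For reference, a sketch of the gate formulas implied by this description and consistent with the kernels below (the input terms $W_{x*}x_t$ and recurrent terms $W_{h*}h_{t-1}$ are computed outside the unit computation, as noted at the end of this docstring):
$$
i_t = \sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ic} \odot c_{t-1} + b_i) \\
f_t = \sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{fc} \odot c_{t-1} + b_f) \\
\tilde{c_t} = act_g(W_{xc}x_{t} + W_{hc}h_{t-1} + b_c) \\
o_t = \sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{oc} \odot c_t + b_o)
$$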
Set `use_peepholes` False to disable peephole connection. The formula
is omitted here; please refer to the paper
http://www.bioinf.jku.at/publications/older/2604.pdf for details.
Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$
operations on the input $x_{t}$ are NOT included in this operator.
Users can choose to use a fully-connected operator before the LSTM operator.
)DOC");
......
@@ -73,15 +73,15 @@ class LSTMKernel : public framework::OpKernel<T> {
      T* bias_data = const_cast<T*>(bias->data<T>());
      // the code style in LstmMetaValue will be updated later.
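      // bias layout: 4 * frame_size gate biases first, then the three
      // frame_size-wide peephole weight vectors (ig, fg, og).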
      lstm_value.check_ig = bias_data + 4 * frame_size;
      lstm_value.check_fg = lstm_value.check_ig + frame_size;
      lstm_value.check_og = lstm_value.check_fg + frame_size;
    } else {
      lstm_value.check_ig = nullptr;
      lstm_value.check_fg = nullptr;
      lstm_value.check_og = nullptr;
    }
    lstm_value.prev_state_value = nullptr;
    Tensor ordered_c0;
    const size_t* order = batch_gate->lod()[2].data();
    if (cell_t0) {
@@ -90,7 +90,7 @@ class LSTMKernel : public framework::OpKernel<T> {
      // to reorder.
      ReorderInitState<Place, T>(device_ctx, *cell_t0, order, &ordered_c0,
                                 true);
      lstm_value.prev_state_value = ordered_c0.data<T>();
    }
    // Use the local variable as here.
@@ -140,14 +140,14 @@ class LSTMKernel : public framework::OpKernel<T> {
                        static_cast<T>(1.0));
      }
      lstm_value.gate_value = gate_t.data<T>();
      lstm_value.output_value = out_t.data<T>();
      lstm_value.state_value = cell_t.data<T>();
      lstm_value.state_active_value = cell_pre_act_t.data<T>();
      math::LstmUnitFunctor<Place, T>::compute(device_ctx, lstm_value,
                                               frame_size, cur_batch_size,
                                               gate_act, cell_act, cand_act);
      lstm_value.prev_state_value = lstm_value.state_value;
    }
    math::Batch2LoDTensorFunctor<Place, T> to_seq;
@@ -214,13 +214,13 @@ class LSTMGradKernel : public framework::OpKernel<T> {
    math::LstmMetaValue<T> lstm_value;
    if (bias && ctx.Attr<bool>("use_peepholes")) {
      T* bias_data = const_cast<T*>(bias->data<T>());
      lstm_value.check_ig = bias_data + 4 * frame_size;
      lstm_value.check_fg = lstm_value.check_ig + frame_size;
      lstm_value.check_og = lstm_value.check_fg + frame_size;
    } else {
      lstm_value.check_ig = nullptr;
      lstm_value.check_fg = nullptr;
      lstm_value.check_og = nullptr;
    }
    math::LstmMetaGrad<T> lstm_grad;
@@ -231,13 +231,13 @@ class LSTMGradKernel : public framework::OpKernel<T> {
    }
    if (bias && bias_g && ctx.Attr<bool>("use_peepholes")) {
      T* bias_g_data = bias_g->data<T>();
      lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size;
      lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size;
      lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size;
    } else {
      lstm_grad.check_ig_grad = nullptr;
      lstm_grad.check_fg_grad = nullptr;
      lstm_grad.check_og_grad = nullptr;
    }
    math::LoDTensor2BatchFunctor<Place, T> to_batch;
@@ -276,26 +276,26 @@ class LSTMGradKernel : public framework::OpKernel<T> {
      Tensor gate = batch_gate->Slice(bstart, bend);
      Tensor cell = batch_cell.Slice(bstart, bend);
      Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
      lstm_value.gate_value = gate.data<T>();
      lstm_value.state_value = cell.data<T>();
      lstm_value.state_active_value = cell_pre_act.data<T>();
      Tensor out_g = batch_hidden_g.Slice(bstart, bend);
      Tensor gate_g = batch_gate_g.Slice(bstart, bend);
      Tensor cell_g = batch_cell_g.Slice(bstart, bend);
      lstm_grad.state_grad = cell_g.data<T>();
      lstm_grad.gate_grad = gate_g.data<T>();
      lstm_grad.output_grad = out_g.data<T>();
      if (n > 0) {
        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
        Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart);
        Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart);
        lstm_value.prev_state_value = cell_pre.data<T>();
        lstm_grad.prev_state_grad = cell_pre_g.data<T>();
      } else {
        lstm_value.prev_state_value = c0 ? ordered_c0.data<T>() : nullptr;
        lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
      }
      int cur_batch_size = bend - bstart;
......
@@ -26,278 +26,284 @@ namespace detail {
template <class T, class Op>
void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
                                     int frame_size,
                                     activation_mode_t active_node,
                                     activation_mode_t active_gate,
                                     activation_mode_t active_state) {
  T r_value_in;
  T r_value_ig;
  T r_value_fg;
  T r_value_og;
  T r_checkI;
  T r_checkF;
  T r_checkO;
  T r_state;
  T r_prev_state = 0;
  T r_state_atv;
  T r_out;
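  // gate_value layout: [in, ig, fg, og], four contiguous segments of
  // frame_size entries each.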
  T *value_in = value.gate_value;
  T *value_ig = value.gate_value + frame_size;
  T *value_fg = value.gate_value + frame_size * 2;
  T *value_og = value.gate_value + frame_size * 3;

  for (int i = 0; i < frame_size; i++) {
    r_value_in = value_in[i];
    r_value_ig = value_ig[i];
    r_value_fg = value_fg[i];
    r_value_og = value_og[i];
    r_checkI = value.check_ig ? value.check_ig[i] : 0;
    r_checkF = value.check_fg ? value.check_fg[i] : 0;
    r_checkO = value.check_og ? value.check_og[i] : 0;

    if (value.prev_state_value) {
      r_prev_state = value.prev_state_value[i];
    }

    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
       r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node,
       active_gate, active_state);

    value_in[i] = r_value_in;
    value_ig[i] = r_value_ig;
    value_fg[i] = r_value_fg;
    value_og[i] = r_value_og;
    value.state_value[i] = r_state;
    value.state_active_value[i] = r_state_atv;
    value.output_value[i] = r_out;
  }
}
template <class T, class Op>
void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
                                      LstmMetaGrad<T> grad, int frame_size,
                                      activation_mode_t active_node,
                                      activation_mode_t active_gate,
                                      activation_mode_t active_state) {
  T r_value_in;
  T r_value_ig;
  T r_value_fg;
  T r_value_og;
  T r_grad_in;
  T r_grad_ig;
  T r_grad_fg;
  T r_grad_og;
  T r_prev_state = 0;
  T r_prev_state_grad;
  T r_state;
  T r_state_grad;
  T r_state_atv;
  T r_output_grad;
  T r_checkI;
  T r_checkF;
  T r_checkO;
  T r_checkIGrad;
  T r_checkFGrad;
  T r_checkOGrad;

  T *value_in = value.gate_value;
  T *value_ig = value.gate_value + frame_size;
  T *value_fg = value.gate_value + frame_size * 2;
  T *value_og = value.gate_value + frame_size * 3;
  T *grad_in = grad.gate_grad;
  T *grad_ig = grad.gate_grad + frame_size;
  T *grad_fg = grad.gate_grad + frame_size * 2;
  T *grad_og = grad.gate_grad + frame_size * 3;

  for (int i = 0; i < frame_size; i++) {
    r_value_in = value_in[i];
    r_value_ig = value_ig[i];
    r_value_fg = value_fg[i];
    r_value_og = value_og[i];
    r_checkI = value.check_ig ? value.check_ig[i] : 0;
    r_checkF = value.check_fg ? value.check_fg[i] : 0;
    r_checkO = value.check_og ? value.check_og[i] : 0;
    r_state = value.state_value[i];
    r_state_atv = value.state_active_value[i];
    r_output_grad = grad.output_grad[i];
    r_state_grad = grad.state_grad[i];
    if (value.prev_state_value) {
      r_prev_state = value.prev_state_value[i];
    }

    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
       r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
       r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
       r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
       active_state);

    grad_in[i] = r_grad_in;
    grad_ig[i] = r_grad_ig;
    grad_fg[i] = r_grad_fg;
    grad_og[i] = r_grad_og;
    grad.state_grad[i] = r_state_grad;

    if (grad.prev_state_grad) grad.prev_state_grad[i] = r_prev_state_grad;
    if (value.prev_state_value) {
      if (grad.check_ig_grad) grad.check_ig_grad[i] += r_checkIGrad;
      if (grad.check_fg_grad) grad.check_fg_grad[i] += r_checkFGrad;
    }
    if (grad.check_og_grad) grad.check_og_grad[i] += r_checkOGrad;
  }
}
template <class T, class Op>
void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
                                   int frame_size,
                                   activation_mode_t active_node,
                                   activation_mode_t active_gate,
                                   activation_mode_t active_state) {
#ifdef __AVX__
  __m256 r_value_in;
  __m256 r_value_ig;
  __m256 r_value_fg;
  __m256 r_value_og;
  __m256 r_checkI = _mm256_set1_ps(0.0f);
  __m256 r_checkF = _mm256_set1_ps(0.0f);
  __m256 r_checkO = _mm256_set1_ps(0.0f);
  __m256 r_state;
  __m256 r_prev_state = _mm256_set1_ps(0.0f);
  __m256 r_state_atv;
  __m256 r_out;

  __m256 *value_in = (__m256 *)value.gate_value;
  __m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
  __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
  __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);

  for (int i = 0; i < frame_size / 8; i++) {
    r_value_in = value_in[i];
    r_value_ig = value_ig[i];
    r_value_fg = value_fg[i];
    r_value_og = value_og[i];
    if (value.check_ig) {
      r_checkI = ((__m256 *)value.check_ig)[i];
      r_checkF = ((__m256 *)value.check_fg)[i];
      r_checkO = ((__m256 *)value.check_og)[i];
    }

    if (value.prev_state_value) {
      r_prev_state = ((__m256 *)value.prev_state_value)[i];
    }

    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
       r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node,
       active_gate, active_state);

    value_in[i] = r_value_in;
    value_ig[i] = r_value_ig;
    value_fg[i] = r_value_fg;
    value_og[i] = r_value_og;
    ((__m256 *)value.state_value)[i] = r_state;
    ((__m256 *)value.state_active_value)[i] = r_state_atv;
    ((__m256 *)value.output_value)[i] = r_out;
  }
#endif
}
template <class T, class Op>
void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
                                    LstmMetaGrad<T> grad, int frame_size,
                                    activation_mode_t active_node,
                                    activation_mode_t active_gate,
                                    activation_mode_t active_state) {
#ifdef __AVX__
  __m256 r_value_in;
  __m256 r_value_ig;
  __m256 r_value_fg;
  __m256 r_value_og;
  __m256 r_grad_in;
  __m256 r_grad_ig;
  __m256 r_grad_fg;
  __m256 r_grad_og;
  __m256 r_prev_state = _mm256_set1_ps(0.0f);
  __m256 r_prev_state_grad;
  __m256 r_state_grad;
  __m256 r_state;
  __m256 r_state_atv;
  __m256 r_output_grad;
  __m256 r_checkI = _mm256_set1_ps(0.0f);
  __m256 r_checkF = _mm256_set1_ps(0.0f);
  __m256 r_checkO = _mm256_set1_ps(0.0f);
  __m256 r_checkIGrad;
  __m256 r_checkFGrad;
  __m256 r_checkOGrad;

  __m256 *value_in = (__m256 *)value.gate_value;
  __m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
  __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
  __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
  __m256 *grad_in = (__m256 *)grad.gate_grad;
  __m256 *grad_ig = (__m256 *)(grad.gate_grad + frame_size);
  __m256 *grad_fg = (__m256 *)(grad.gate_grad + frame_size * 2);
  __m256 *grad_og = (__m256 *)(grad.gate_grad + frame_size * 3);

  for (int i = 0; i < frame_size / 8; i++) {
    r_value_in = value_in[i];
    r_value_ig = value_ig[i];
    r_value_fg = value_fg[i];
    r_value_og = value_og[i];
    if (value.check_ig) {
      r_checkI = ((__m256 *)value.check_ig)[i];
      r_checkF = ((__m256 *)value.check_fg)[i];
      r_checkO = ((__m256 *)value.check_og)[i];
    }
    r_state = ((__m256 *)value.state_value)[i];
    r_state_atv = ((__m256 *)value.state_active_value)[i];
    r_output_grad = ((__m256 *)grad.output_grad)[i];
    r_state_grad = ((__m256 *)grad.state_grad)[i];
    if (value.prev_state_value) {
      r_prev_state = ((__m256 *)value.prev_state_value)[i];
    }

    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
       r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
       r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
       r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
       active_state);

    grad_in[i] = r_grad_in;
    grad_ig[i] = r_grad_ig;
    grad_fg[i] = r_grad_fg;
    grad_og[i] = r_grad_og;
    ((__m256 *)grad.state_grad)[i] = r_state_grad;

    if (grad.prev_state_grad)
      ((__m256 *)grad.prev_state_grad)[i] = r_prev_state_grad;
    if (value.prev_state_value) {
      if (grad.check_ig_grad) ((__m256 *)grad.check_ig_grad)[i] += r_checkIGrad;
      if (grad.check_fg_grad) ((__m256 *)grad.check_fg_grad)[i] += r_checkFGrad;
    }
    if (grad.check_og_grad) ((__m256 *)grad.check_og_grad)[i] += r_checkOGrad;
  }
#endif
}
template <class T, class Op>
void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frame_size,
                      activation_mode_t active_node,
                      activation_mode_t active_gate,
                      activation_mode_t active_state) {
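  // Take the AVX path only when frame_size is a multiple of 8 (eight floats
  // per __m256 register) and T is float; otherwise fall back to the scalar
  // loop. The same check guards cpu_lstm_backward below.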
  if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
    avx_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
                                     active_gate, active_state);
  } else {
    naive_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
                                       active_gate, active_state);
  }
}

template <class T, class Op>
void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
                       int frame_size, activation_mode_t active_node,
                       activation_mode_t active_gate,
                       activation_mode_t active_state) {
  if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
    avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, active_node,
                                      active_gate, active_state);
  } else {
    naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size,
                                        active_node, active_gate,
                                        active_state);
  }
}
......
@@ -26,189 +26,192 @@ namespace math {
namespace detail {

/*
 * threads(frame_per_block, batch_per_block)
 * grid(frame_blocks, batch_blocks)
 */
template <class T, class Op, bool is_batch>
__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
                              int batch_size, activation_mode_t active_node,
                              activation_mode_t active_gate,
                              activation_mode_t active_state) {
  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (frame_idx >= frame_size) return;

  int batch_idx = 0;
  if (is_batch) {
    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
    if (batch_idx >= batch_size) return;
    value.gate_value += batch_idx * frame_size * 4;
    value.output_value += batch_idx * frame_size;
    value.state_value += batch_idx * frame_size;
    value.state_active_value += batch_idx * frame_size;
  }

  T r_state;
  T r_prev_state = 0;
  T r_state_atv;
  T r_out;
  T r_value_in;
  T r_value_ig;
  T r_value_fg;
  T r_value_og;

  T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0;
  T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0;
  T r_checkO = value.check_og ? value.check_og[frame_idx] : 0;

  r_value_in = value.gate_value[frame_idx];
  r_value_ig = value.gate_value[frame_idx + frame_size];
  r_value_fg = value.gate_value[frame_idx + frame_size * 2];
  r_value_og = value.gate_value[frame_idx + frame_size * 3];

  if (value.prev_state_value) {
    if (is_batch) value.prev_state_value += batch_idx * frame_size;
    r_prev_state = value.prev_state_value[frame_idx];
  }

  op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
     r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, active_gate,
     active_state);

  value.gate_value[frame_idx] = r_value_in;
  value.gate_value[frame_idx + frame_size] = r_value_ig;
  value.gate_value[frame_idx + frame_size * 2] = r_value_fg;
  value.gate_value[frame_idx + frame_size * 3] = r_value_og;
  value.state_value[frame_idx] = r_state;
  value.state_active_value[frame_idx] = r_state_atv;
  value.output_value[frame_idx] = r_out;
}
/*
 * threads(frame_per_block, batch_per_block)
 * grid(frame_blocks, batch_blocks)
 */
template <class T, class Op, bool is_batch>
__global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
                               LstmMetaGrad<T> grad, int frame_size,
                               int batch_size, activation_mode_t active_node,
                               activation_mode_t active_gate,
                               activation_mode_t active_state) {
  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (frame_idx >= frame_size) return;

  int batch_idx = 0;
  if (is_batch) {
    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
    if (batch_idx >= batch_size) return;
    value.gate_value += batch_idx * frame_size * 4;
    value.state_value += batch_idx * frame_size;
    value.state_active_value += batch_idx * frame_size;
    grad.gate_grad += batch_idx * frame_size * 4;
    grad.state_grad += batch_idx * frame_size;
    grad.output_grad += batch_idx * frame_size;
  }

  T r_value_in;
  T r_value_ig;
  T r_value_fg;
  T r_value_og;
  T r_grad_in;
  T r_grad_ig;
  T r_grad_fg;
  T r_grad_og;
  T r_prev_state = 0;
  T r_prev_state_grad;
  T r_state;
  T r_state_grad;
  T r_state_atv;
  T r_output_grad;
  T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0;
  T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0;
  T r_checkO = value.check_og ? value.check_og[frame_idx] : 0;
  T r_checkIGrad;
  T r_checkFGrad;
  T r_checkOGrad;

  r_value_in = value.gate_value[frame_idx];
  r_value_ig = value.gate_value[frame_idx + frame_size];
  r_value_fg = value.gate_value[frame_idx + frame_size * 2];
  r_value_og = value.gate_value[frame_idx + frame_size * 3];
  r_state = value.state_value[frame_idx];
  r_state_atv = value.state_active_value[frame_idx];
  r_output_grad = grad.output_grad[frame_idx];
  r_state_grad = grad.state_grad[frame_idx];

  if (value.prev_state_value) {
    if (is_batch) value.prev_state_value += batch_idx * frame_size;
    r_prev_state = value.prev_state_value[frame_idx];
  }
  op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
     r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
     r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
     r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
     active_state);

  grad.gate_grad[frame_idx] = r_grad_in;
  grad.gate_grad[frame_idx + frame_size] = r_grad_ig;
  grad.gate_grad[frame_idx + frame_size * 2] = r_grad_fg;
  grad.gate_grad[frame_idx + frame_size * 3] = r_grad_og;
  grad.state_grad[frame_idx] = r_state_grad;
  if (grad.prev_state_grad) {
    if (is_batch) grad.prev_state_grad += batch_idx * frame_size;
    grad.prev_state_grad[frame_idx] = r_prev_state_grad;
  }
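  // In batch mode, threads from different batch rows accumulate into the
  // same peephole-weight gradients, so atomic adds are required.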
  if (is_batch) {
    if (value.prev_state_value) {
      if (grad.check_ig_grad)
        paddle::platform::CudaAtomicAdd(grad.check_ig_grad + frame_idx,
                                        r_checkIGrad);
      if (grad.check_fg_grad)
        paddle::platform::CudaAtomicAdd(grad.check_fg_grad + frame_idx,
                                        r_checkFGrad);
    }
    if (grad.check_og_grad)
      paddle::platform::CudaAtomicAdd(grad.check_og_grad + frame_idx,
                                      r_checkOGrad);
  } else {
    if (value.prev_state_value) {
      if (grad.check_ig_grad) grad.check_ig_grad[frame_idx] += r_checkIGrad;
      if (grad.check_fg_grad) grad.check_fg_grad[frame_idx] += r_checkFGrad;
    }
    if (grad.check_og_grad) grad.check_og_grad[frame_idx] += r_checkOGrad;
  }
}
template <class T, class Op>
void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
                      LstmMetaValue<T> value, int frame_size, int batch_size,
                      activation_mode_t active_node,
                      activation_mode_t active_gate,
                      activation_mode_t active_state) {
  dim3 threads;
  dim3 grid;
  if (batch_size == 1) {
    int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
    int frame_blocks = (frame_size + 1024 - 1) / 1024;
    threads = dim3(frame_per_block, 1);
    grid = dim3(frame_blocks, 1);
  } else {
    /* frame_per_block = 32 batch_per_block = 32 */
    threads = dim3(32, 32);
    grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
  }

  auto stream =
      reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
  if (batch_size == 1) {
    KeLstmForward<T, Op,
                  /* is_batch= */ false><<<grid, threads, 0, stream>>>(
        op, value, frame_size, batch_size, active_node, active_gate,
        active_state);
  } else {
    KeLstmForward<T, Op,
                  /* is_batch= */ true><<<grid, threads, 0, stream>>>(
        op, value, frame_size, batch_size, active_node, active_gate,
        active_state);
  }
}
@@ -216,34 +219,34 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
template <class T, class Op>
void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
                       int frame_size, int batch_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate,
                       activation_mode_t active_state) {
  dim3 threads;
  dim3 grid;
  if (batch_size == 1) {
    int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
    int frame_blocks = (frame_size + 1024 - 1) / 1024;
    threads = dim3(frame_per_block, 1);
    grid = dim3(frame_blocks, 1);
  } else {
    /* frame_per_block = 32 batch_per_block = 16 */
    threads = dim3(32, 16);
    grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16);
  }

  auto stream =
      reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
  if (batch_size == 1) {
    KeLstmBackward<T, Op,
                   /* is_batch= */ false><<<grid, threads, 0, stream>>>(
        op, value, grad, frame_size, batch_size, active_node, active_gate,
        active_state);
  } else {
    KeLstmBackward<T, Op,
                   /* is_batch= */ true><<<grid, threads, 0, stream>>>(
        op, value, grad, frame_size, batch_size, active_node, active_gate,
        active_state);
  }
}
......
@@ -27,19 +27,19 @@ namespace forward {
template <class T>
class lstm {
 public:
  HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og,
                             T &prev_state, T &state, T &state_atv, T &output,
                             T &checkI, T &checkF, T &checkO,
                             activation_mode_t active_node,
                             activation_mode_t active_gate,
                             activation_mode_t active_state) {
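    // value_in/ig/fg/og hold the pre-activation gate sums; the peephole
    // contributions (prev_state * check*) are added before the gate
    // activations below.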
    value_in = activation(value_in, active_node);
    value_ig = activation(value_ig + prev_state * checkI, active_gate);
    value_fg = activation(value_fg + prev_state * checkF, active_gate);
    state = value_in * value_ig + prev_state * value_fg;
    value_og = activation(value_og + state * checkO, active_gate);
    state_atv = activation(state, active_state);
    output = value_og * state_atv;
  }
#ifndef __NVCC__
#ifndef __AVX__  // If not compiled with AVX instructions, disable AVX by default
@@ -48,24 +48,27 @@ class lstm {
  // Only float supports AVX optimization
  static const bool avx = std::is_same<T, float>::value;
  HOSTDEVICE void operator()(__m256 &value_in, __m256 &value_ig,
                             __m256 &value_fg, __m256 &value_og,
                             __m256 &prev_state, __m256 &state,
                             __m256 &state_atv, __m256 &output, __m256 &checkI,
                             __m256 &checkF, __m256 &checkO,
                             activation_mode_t active_node,
                             activation_mode_t active_gate,
                             activation_mode_t active_state) {
    value_in = activation(value_in, active_node);
    value_ig =
        activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)),
                   active_gate);
    value_fg =
        activation(_mm256_add_ps(value_fg, _mm256_mul_ps(prev_state, checkF)),
                   active_gate);
    state = _mm256_add_ps(_mm256_mul_ps(value_in, value_ig),
                          _mm256_mul_ps(prev_state, value_fg));
    value_og = activation(_mm256_add_ps(value_og, _mm256_mul_ps(state, checkO)),
                          active_gate);
    state_atv = activation(state, active_state);
    output = _mm256_mul_ps(value_og, state_atv);
  }
#endif
#endif
@@ -78,25 +81,26 @@ namespace backward {
template <class T>
class lstm {
 public:
  HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og,
                             T &grad_in, T &grad_ig, T &grad_fg, T &grad_og,
                             T &prev_state, T &prev_state_grad, T &state,
                             T &state_grad, T &state_atv, T &output_grad,
                             T &checkI, T &checkF, T &checkO, T &checkIGrad,
                             T &checkFGrad, T &checkOGrad,
                             activation_mode_t active_node,
                             activation_mode_t active_gate,
                             activation_mode_t active_state) {
    grad_og = activation(output_grad * state_atv, value_og, active_gate);
    state_grad += activation(output_grad * value_og, state_atv, active_state) +
                  grad_og * checkO;
    grad_in = activation(state_grad * value_ig, value_in, active_node);
    grad_ig = activation(state_grad * value_in, value_ig, active_gate);
    grad_fg = activation(state_grad * prev_state, value_fg, active_gate);
    prev_state_grad =
        grad_ig * checkI + grad_fg * checkF + state_grad * value_fg;
    checkIGrad = grad_ig * prev_state;
    checkFGrad = grad_fg * prev_state;
    checkOGrad = grad_og * state;
  }
#ifndef __NVCC__
#ifndef __AVX__  // If not compiled with AVX instructions, disable AVX by default
@@ -105,32 +109,32 @@ class lstm {
  // Only float supports AVX optimization
  static const bool avx = std::is_same<T, float>::value;
HOSTDEVICE void operator()( HOSTDEVICE void operator()(
__m256 &valueIn, __m256 &valueIg, __m256 &valueFg, __m256 &valueOg, __m256 &value_in, __m256 &value_ig, __m256 &value_fg, __m256 &value_og,
__m256 &gradIn, __m256 &gradIg, __m256 &gradFg, __m256 &gradOg, __m256 &grad_in, __m256 &grad_ig, __m256 &grad_fg, __m256 &grad_og,
__m256 &prevState, __m256 &prevStateGrad, __m256 &state, __m256 &prev_state, __m256 &prev_state_grad, __m256 &state,
__m256 &stateGrad, __m256 &stateAtv, __m256 &outputGrad, __m256 &checkI, __m256 &state_grad, __m256 &state_atv, __m256 &output_grad,
__m256 &checkF, __m256 &checkO, __m256 &checkIGrad, __m256 &checkFGrad, __m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad,
__m256 &checkOGrad, activation_mode_t active_node, __m256 &checkFGrad, __m256 &checkOGrad, activation_mode_t active_node,
activation_mode_t active_gate, activation_mode_t active_state) { activation_mode_t active_gate, activation_mode_t active_state) {
gradOg = grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og,
activation(_mm256_mul_ps(outputGrad, stateAtv), valueOg, active_gate); active_gate);
stateGrad = _mm256_add_ps( state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og),
activation(_mm256_mul_ps(outputGrad, valueOg), stateAtv, active_state), state_atv, active_state),
stateGrad); state_grad);
stateGrad = _mm256_add_ps(_mm256_mul_ps(gradOg, checkO), stateGrad); state_grad = _mm256_add_ps(_mm256_mul_ps(grad_og, checkO), state_grad);
gradIn = grad_in =
activation(_mm256_mul_ps(stateGrad, valueIg), valueIn, active_node); activation(_mm256_mul_ps(state_grad, value_ig), value_in, active_node);
gradIg = grad_ig =
activation(_mm256_mul_ps(stateGrad, valueIn), valueIg, active_gate); activation(_mm256_mul_ps(state_grad, value_in), value_ig, active_gate);
gradFg = grad_fg = activation(_mm256_mul_ps(state_grad, prev_state), value_fg,
activation(_mm256_mul_ps(stateGrad, prevState), valueFg, active_gate); active_gate);
prevStateGrad = _mm256_add_ps(_mm256_mul_ps(gradIg, checkI), prev_state_grad = _mm256_add_ps(_mm256_mul_ps(grad_ig, checkI),
_mm256_mul_ps(gradFg, checkF)); _mm256_mul_ps(grad_fg, checkF));
prevStateGrad = prev_state_grad =
_mm256_add_ps(_mm256_mul_ps(stateGrad, valueFg), prevStateGrad); _mm256_add_ps(_mm256_mul_ps(state_grad, value_fg), prev_state_grad);
checkIGrad = _mm256_mul_ps(gradIg, prevState); checkIGrad = _mm256_mul_ps(grad_ig, prev_state);
checkFGrad = _mm256_mul_ps(gradFg, prevState); checkFGrad = _mm256_mul_ps(grad_fg, prev_state);
checkOGrad = _mm256_mul_ps(gradOg, state); checkOGrad = _mm256_mul_ps(grad_og, state);
} }
#endif #endif
#endif #endif
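For readers tracing the rename, it may help to see the gradient equations this backward functor computes. Below is a sketch in conventional peephole-LSTM notation; the symbols are our own mapping, not part of the diff: \(\delta\) marks gradients, \(a_t\) is the cell candidate (`value_in`), \(i_t, f_t, o_t\) are the gates, \(c_t\) is the cell state with \(\tanh(c_t)\) saved in `state_atv`, \(\delta h_t\) is `output_grad`, and \(w_{ic}, w_{fc}, w_{oc}\) are the peephole weights `checkI/checkF/checkO`. In this code, `activation(x, y, mode)` in the backward namespace multiplies `x` by the activation derivative evaluated at the saved forward output `y`.

```latex
\begin{aligned}
\delta o_t &= \bigl(\delta h_t \odot \tanh(c_t)\bigr)\,\sigma'(o_t) \\
\delta c_t &\mathrel{{+}{=}} \bigl(\delta h_t \odot o_t\bigr)\,\tanh'(c_t)
             + \delta o_t \odot w_{oc} \\
\delta a_t &= \bigl(\delta c_t \odot i_t\bigr)\,\tanh'(a_t) \\
\delta i_t &= \bigl(\delta c_t \odot a_t\bigr)\,\sigma'(i_t) \\
\delta f_t &= \bigl(\delta c_t \odot c_{t-1}\bigr)\,\sigma'(f_t) \\
\delta c_{t-1} &= \delta i_t \odot w_{ic} + \delta f_t \odot w_{fc}
                  + \delta c_t \odot f_t \\
\delta w_{ic} &= \delta i_t \odot c_{t-1}, \quad
\delta w_{fc} = \delta f_t \odot c_{t-1}, \quad
\delta w_{oc} = \delta o_t \odot c_t
\end{aligned}
```

Each line corresponds one-to-one with an assignment in the scalar body above; the AVX overload computes the same expressions with `_mm256_mul_ps`/`_mm256_add_ps` over eight floats at a time.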
...

@@ -30,12 +30,12 @@ struct LstmUnitFunctor<platform::CPUPlace, T> {

      detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
                               ActiveType(cand_act), ActiveType(gate_act),
                               ActiveType(cell_act));
-      value.gateValue += frame_size * 4;
-      value.stateValue += frame_size;
-      value.stateActiveValue += frame_size;
-      value.outputValue += frame_size;
-      if (value.prevStateValue) {
-        value.prevStateValue += frame_size;
+      value.gate_value += frame_size * 4;
+      value.state_value += frame_size;
+      value.state_active_value += frame_size;
+      value.output_value += frame_size;
+      if (value.prev_state_value) {
+        value.prev_state_value += frame_size;
      }
    }
  }
@@ -53,20 +53,20 @@ struct LstmUnitGradFunctor<platform::CPUPlace, T> {

                               frame_size, ActiveType(cand_act),
                               ActiveType(gate_act), ActiveType(cell_act));
-      value.gateValue += frame_size * 4;
-      value.stateValue += frame_size;
-      value.stateActiveValue += frame_size;
-      value.outputValue += frame_size;
-      if (value.prevStateValue) {
-        value.prevStateValue += frame_size;
+      value.gate_value += frame_size * 4;
+      value.state_value += frame_size;
+      value.state_active_value += frame_size;
+      value.output_value += frame_size;
+      if (value.prev_state_value) {
+        value.prev_state_value += frame_size;
      }

-      grad.gateGrad += frame_size * 4;
-      grad.stateGrad += frame_size;
-      grad.stateActiveGrad += frame_size;
-      grad.outputGrad += frame_size;
-      if (grad.prevStateGrad) {
-        grad.prevStateGrad += frame_size;
+      grad.gate_grad += frame_size * 4;
+      grad.state_grad += frame_size;
+      grad.state_active_grad += frame_size;
+      grad.output_grad += frame_size;
+      if (grad.prev_state_grad) {
+        grad.prev_state_grad += frame_size;
      }
    }
  }
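The `+=` pointer arithmetic above advances every buffer to the next batch row: the packed gate buffer holds four gates per frame, hence the `frame_size * 4` stride, while each state buffer advances by `frame_size`. A minimal NumPy sketch of the same layout; the shapes and the gate order are illustrative assumptions, not taken from this diff:

```python
import numpy as np

batch_size, frame_size = 3, 8

# Flat buffers, as the C++ functors see them: gates are packed four per
# frame, per-state values are one per frame.
gate_value = np.random.rand(batch_size * 4 * frame_size).astype(np.float32)
state_value = np.random.rand(batch_size * frame_size).astype(np.float32)

gate_offset, state_offset = 0, 0
for b in range(batch_size):
    # Mirrors `value.gate_value += frame_size * 4`: one step skips all
    # four packed gates of the current row.
    gates = gate_value[gate_offset:gate_offset + 4 * frame_size]
    in_gate, forget_gate, cell_cand, out_gate = np.split(gates, 4)  # assumed order
    # Mirrors `value.state_value += frame_size`.
    state = state_value[state_offset:state_offset + frame_size]
    gate_offset += frame_size * 4
    state_offset += frame_size
```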
...

@@ -31,26 +31,26 @@ typedef enum {

template <class T>
struct LstmMetaValue {
-  T *gateValue;
-  T *prevStateValue;
-  T *stateValue;
-  T *stateActiveValue;
-  T *outputValue;
-  T *checkIg;
-  T *checkFg;
-  T *checkOg;
+  T *gate_value;
+  T *prev_state_value;
+  T *state_value;
+  T *state_active_value;
+  T *output_value;
+  T *check_ig;
+  T *check_fg;
+  T *check_og;
};

template <class T>
struct LstmMetaGrad {
-  T *gateGrad;
-  T *prevStateGrad;
-  T *stateGrad;
-  T *stateActiveGrad;
-  T *outputGrad;
-  T *checkIgGrad;
-  T *checkFgGrad;
-  T *checkOgGrad;
+  T *gate_grad;
+  T *prev_state_grad;
+  T *state_grad;
+  T *state_active_grad;
+  T *output_grad;
+  T *check_ig_grad;
+  T *check_fg_grad;
+  T *check_og_grad;
};

inline activation_mode_t ActiveType(const std::string &type) {

...
@@ -5,4 +5,6 @@ if(WITH_PYTHON)
              ${GLOB_OP_LIB})
endif(WITH_PYTHON)

-cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB})
+if(WITH_DOC)
+  cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB})
+endif(WITH_DOC)
@@ -185,7 +185,14 @@ EOF

    ${DOCKERFILE_GPU_ENV}
    ADD go/cmd/pserver/pserver /usr/bin/
    ADD go/cmd/master/master /usr/bin/
+EOF
+    if [[ ${WITH_DOC:-OFF} == 'ON' ]]; then
+        cat >> /paddle/build/Dockerfile <<EOF
    ADD paddle/pybind/print_operators_doc /usr/bin/
+EOF
+    fi
+    cat >> /paddle/build/Dockerfile <<EOF
    # default command shows the paddle version and exit
    CMD ["paddle", "version"]
EOF

...
-################# test_Compare ############################
-add_unittest_without_exec(test_Compare
-    test_Compare.cpp)
-add_test(NAME test_Compare
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python
-        ${CMAKE_CURRENT_BINARY_DIR}/test_Compare
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-
-################# test_Trainer ###########################
-add_unittest_without_exec(test_Trainer
-    test_Trainer.cpp)
-add_test(NAME test_Trainer
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_Trainer
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+set(PYTHON_PATH
+    ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+    ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests)
+
+function(trainer_test TARGET)
+    add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
+    add_test(NAME ${TARGET}
+        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endfunction()
+
+trainer_test(test_Compare)
+trainer_test(test_PyDataProviderWrapper)
+trainer_test(test_recurrent_machine_generation)
+trainer_test(test_Trainer)

############### test_TrainerOnePass ##########################
if(WITH_PYTHON)

@@ -22,32 +20,13 @@ if(WITH_PYTHON)

    add_unittest_without_exec(test_TrainerOnePass
        test_TrainerOnePass.cpp)
    add_test(NAME test_TrainerOnePass
-        COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
-            ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests
-            ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
-            ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
+        COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
+            ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif()

-################# test_recurrent_machine_generation ###############
-add_unittest_without_exec(test_recurrent_machine_generation
-    test_recurrent_machine_generation.cpp)
-add_test(NAME test_recurrent_machine_generation
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_recurrent_machine_generation
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-
-#################### test_PyDataProviderWrapper #########################
-add_unittest_without_exec(test_PyDataProviderWrapper
-    test_PyDataProviderWrapper.cpp)
-add_test(NAME test_PyDataProviderWrapper
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
-        ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests
-        ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProviderWrapper
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)

#################### test_config_parser #########################
add_test(NAME test_config_parser
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py
+    COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE}
+        ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py
    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
@@ -14,13 +14,14 @@ import optimizer

import backward
import regularizer
from param_attr import ParamAttr
+from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, GPUPlace

Tensor = LoDTensor

__all__ = framework.__all__ + executor.__all__ + [
    'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
-    'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr'
+    'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr',
+    'DataFeeder'
]

...
from __future__ import print_function
import core
import numpy
import six.moves as six

from framework import Variable

__all__ = ['DataFeeder']


class DataToLoDTensorConverter(object):
    def __init__(self, place, lod_level, shape, dtype):
        self.place = place
        self.lod_level = lod_level
        self.shape = shape
        if dtype == core.DataType.FP32:
            self.dtype = 'float32'
        elif dtype == core.DataType.INT64:
            self.dtype = 'int64'
        elif dtype == core.DataType.FP64:
            self.dtype = 'float64'
        elif dtype == core.DataType.INT32:
            self.dtype = 'int32'
        else:
            raise ValueError("dtype must be any of [int32, float32, int64, "
                             "float64]")

        self.data = []
        self.lod = []

        for i in six.range(lod_level):
            self.lod.append([0])

    def feed(self, data):
        self._feed_impl_(data, self.lod, self.lod_level)

    def _feed_impl_(self, data, lod, lod_level):
        # Recursively flatten nested sequences, recording a running offset
        # per LoD level; at level 0 the raw values are appended as-is.
        if lod_level == 0:
            self.data.append(data)
        else:
            cur_lod_len = len(data)
            lod[-1].append(lod[-1][-1] + cur_lod_len)
            for each_data in data:
                self._feed_impl_(each_data, lod[:-1], lod_level - 1)

    def done(self):
        arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape)
        t = core.LoDTensor()
        t.set(arr, self.place)
        if self.lod_level > 0:
            t.set_lod(self.lod)
        return t


class DataFeeder(object):
    def __init__(self, feed_list, place):
        self.feed_dtypes = []
        self.feed_names = []
        self.feed_shapes = []
        self.feed_lod_level = []
        for each_var in feed_list:
            if not isinstance(each_var, Variable):
                raise TypeError("Feed list should contain a list of Variables")
            self.feed_dtypes.append(each_var.dtype)
            self.feed_names.append(each_var.name)
            shape = each_var.shape
            batch_size_dim = -1
            for i, s in enumerate(shape):
                if s < 0:
                    batch_size_dim = i
                    break
            if batch_size_dim == -1:
                raise ValueError("Variable {0} must have a batch size "
                                 "dimension".format(each_var.name))
            self.feed_lod_level.append(each_var.lod_level)
            self.feed_shapes.append(shape)
        self.place = place

    def feed(self, iterable):
        converter = []
        for lod_level, shape, dtype in six.zip(
                self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
            converter.append(
                DataToLoDTensorConverter(
                    place=self.place,
                    lod_level=lod_level,
                    shape=shape,
                    dtype=dtype))

        for each_sample in iterable:
            for each_converter, each_slot in six.zip(converter, each_sample):
                each_converter.feed(each_slot)
        ret_dict = {}
        for each_name, each_converter in six.zip(self.feed_names, converter):
            ret_dict[each_name] = each_converter.done()
        return ret_dict
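For readers new to LoD, here is a minimal sketch of what `DataToLoDTensorConverter` accumulates for a single `lod_level=1` slot. The import path is assumed from this file's package, and the values are illustrative:

```python
import paddle.v2.fluid.core as core
from paddle.v2.fluid.data_feeder import DataToLoDTensorConverter

conv = DataToLoDTensorConverter(
    place=core.CPUPlace(), lod_level=1, shape=[-1, 1],
    dtype=core.DataType.INT64)

conv.feed([[1], [2]])         # first sequence, two steps
conv.feed([[3], [4], [5]])    # second sequence, three steps

# _feed_impl_ appended running offsets, so the LoD is [[0, 2, 5]] and the
# flattened data holds all five steps; done() returns a (5, 1) LoDTensor.
t = conv.done()
```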
@@ -22,6 +22,7 @@ train_reader = paddle.batch(
    batch_size=BATCH_SIZE)

place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)

exe.run(fluid.default_startup_program())

@@ -31,12 +32,8 @@ for pass_id in range(PASS_NUM):

    fluid.io.save_persistables(exe, "./fit_a_line.model/")
    fluid.io.load_persistables(exe, "./fit_a_line.model/")
    for data in train_reader():
-        x_data = np.array(map(lambda _: _[0], data)).astype("float32")
-        y_data = np.array(map(lambda _: _[1], data)).astype("float32")
-
        avg_loss_value, = exe.run(fluid.default_main_program(),
-                                  feed={'x': x_data,
-                                        'y': y_data},
+                                  feed=feeder.feed(data),
                                  fetch_list=[avg_cost])

        if avg_loss_value[0] < 10.0:

...
@@ -113,23 +113,14 @@ train_reader = paddle.batch(

place = fluid.CPUPlace()
exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
exe.run(fluid.default_startup_program())

for pass_id in range(PASS_NUM):
    accuracy.reset(exe)
    for data in train_reader():
-        img_data = np.array(map(lambda x: x[0].reshape(data_shape),
-                                data)).astype("float32")
-        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-        batch_size = 1
-        for i in y_data.shape:
-            batch_size = batch_size * i
-        y_data = y_data.reshape([batch_size, 1])
-
        loss, acc = exe.run(fluid.default_main_program(),
-                            feed={"pixel": img_data,
-                                  "label": y_data},
+                            feed=feeder.feed(data),
                            fetch_list=[avg_cost] + accuracy.metrics)
        pass_acc = accuracy.eval(exe)
        print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(

...
@@ -28,17 +28,9 @@ def load_parameter(file_name, h, w):
        return np.fromfile(f, dtype=np.float32).reshape(h, w)


-def db_lstm():
+def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
+            **ignored):
    # 8 features
-    word = fluid.layers.data(name='word_data', shape=[1], dtype='int64')
-    predicate = fluid.layers.data(name='verb_data', shape=[1], dtype='int64')
-    ctx_n2 = fluid.layers.data(name='ctx_n2_data', shape=[1], dtype='int64')
-    ctx_n1 = fluid.layers.data(name='ctx_n1_data', shape=[1], dtype='int64')
-    ctx_0 = fluid.layers.data(name='ctx_0_data', shape=[1], dtype='int64')
-    ctx_p1 = fluid.layers.data(name='ctx_p1_data', shape=[1], dtype='int64')
-    ctx_p2 = fluid.layers.data(name='ctx_p2_data', shape=[1], dtype='int64')
-    mark = fluid.layers.data(name='mark_data', shape=[1], dtype='int64')
    predicate_embedding = fluid.layers.embedding(
        input=predicate,
        size=[pred_len, word_dim],

@@ -120,8 +112,25 @@ def to_lodtensor(data, place):

def main():
    # define network topology
-    feature_out = db_lstm()
-    target = fluid.layers.data(name='target', shape=[1], dtype='int64')
+    word = fluid.layers.data(
+        name='word_data', shape=[1], dtype='int64', lod_level=1)
+    predicate = fluid.layers.data(
+        name='verb_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n2 = fluid.layers.data(
+        name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n1 = fluid.layers.data(
+        name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_0 = fluid.layers.data(
+        name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p1 = fluid.layers.data(
+        name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p2 = fluid.layers.data(
+        name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
+    mark = fluid.layers.data(
+        name='mark_data', shape=[1], dtype='int64', lod_level=1)
+    feature_out = db_lstm(**locals())
+    target = fluid.layers.data(
+        name='target', shape=[1], dtype='int64', lod_level=1)
    crf_cost = fluid.layers.linear_chain_crf(
        input=feature_out,
        label=target,

@@ -139,6 +148,11 @@ def main():
            paddle.dataset.conll05.test(), buf_size=8192),
        batch_size=BATCH_SIZE)
    place = fluid.CPUPlace()
+    feeder = fluid.DataFeeder(
+        feed_list=[
+            word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
+        ],
+        place=place)
    exe = fluid.Executor(place)

    exe.run(fluid.default_startup_program())

@@ -150,28 +164,8 @@ def main():
    batch_id = 0
    for pass_id in xrange(PASS_NUM):
        for data in train_data():
-            word_data = to_lodtensor(map(lambda x: x[0], data), place)
-            ctx_n2_data = to_lodtensor(map(lambda x: x[1], data), place)
-            ctx_n1_data = to_lodtensor(map(lambda x: x[2], data), place)
-            ctx_0_data = to_lodtensor(map(lambda x: x[3], data), place)
-            ctx_p1_data = to_lodtensor(map(lambda x: x[4], data), place)
-            ctx_p2_data = to_lodtensor(map(lambda x: x[5], data), place)
-            verb_data = to_lodtensor(map(lambda x: x[6], data), place)
-            mark_data = to_lodtensor(map(lambda x: x[7], data), place)
-            target = to_lodtensor(map(lambda x: x[8], data), place)
-
            outs = exe.run(fluid.default_main_program(),
-                           feed={
-                               'word_data': word_data,
-                               'ctx_n2_data': ctx_n2_data,
-                               'ctx_n1_data': ctx_n1_data,
-                               'ctx_0_data': ctx_0_data,
-                               'ctx_p1_data': ctx_p1_data,
-                               'ctx_p2_data': ctx_p2_data,
-                               'verb_data': verb_data,
-                               'mark_data': mark_data,
-                               'target': target
-                           },
+                           feed=feeder.feed(data),
                           fetch_list=[avg_cost])
            avg_cost_val = np.array(outs[0])

...
@@ -37,20 +37,14 @@ train_reader = paddle.batch(

place = fluid.CPUPlace()
exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
exe.run(fluid.default_startup_program())

for pass_id in range(PASS_NUM):
    accuracy.reset(exe)
    for data in train_reader():
-        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
-                                data)).astype("float32")
-        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-        y_data = y_data.reshape([BATCH_SIZE, 1])
-
        loss, acc = exe.run(fluid.default_main_program(),
-                            feed={"pixel": img_data,
-                                  "label": y_data},
+                            feed=feeder.feed(data),
                            fetch_list=[avg_cost] + accuracy.metrics)
        pass_acc = accuracy.eval(exe)
        print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc=" +

...
@@ -48,40 +48,22 @@ test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
exe.run(fluid.default_startup_program())

PASS_NUM = 100
for pass_id in range(PASS_NUM):
    accuracy.reset(exe)
    for data in train_reader():
-        x_data = np.array(map(lambda x: x[0], data)).astype("float32")
-        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-        y_data = np.expand_dims(y_data, axis=1)
-
-        tensor_x = fluid.LoDTensor()
-        tensor_x.set(x_data, place)
-
-        tensor_y = fluid.LoDTensor()
-        tensor_y.set(y_data, place)
-
-        outs = exe.run(fluid.default_main_program(),
-                       feed={'x': tensor_x,
-                             'y': tensor_y},
-                       fetch_list=[avg_cost] + accuracy.metrics)
-        out = np.array(outs[0])
-        acc = np.array(outs[1])
+        out, acc = exe.run(fluid.default_main_program(),
+                           feed=feeder.feed(data),
+                           fetch_list=[avg_cost] + accuracy.metrics)
        pass_acc = accuracy.eval(exe)

        test_accuracy.reset(exe)
        for data in test_reader():
-            x_data = np.array(map(lambda x: x[0], data)).astype("float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = np.expand_dims(y_data, axis=1)
-
            out, acc = exe.run(inference_program,
-                               feed={'x': x_data,
-                                     'y': y_data},
+                               feed=feeder.feed(data),
                               fetch_list=[avg_cost] + test_accuracy.metrics)

        test_pass_acc = test_accuracy.eval(exe)

...
@@ -4,10 +4,8 @@ import paddle.v2 as paddle

import paddle.v2.fluid as fluid


-def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32):
-    data = fluid.layers.data(name="words", shape=[1], dtype="int64")
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
+                    hid_dim=32):
    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
    conv_3 = fluid.nets.sequence_conv_pool(
        input=emb,

@@ -55,8 +53,11 @@ def main():

    dict_dim = len(word_dict)
    class_dim = 2

+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    cost, accuracy, acc_out = convolution_net(
-        input_dim=dict_dim, class_dim=class_dim)
+        data, label, input_dim=dict_dim, class_dim=class_dim)

    train_data = paddle.batch(
        paddle.reader.shuffle(

@@ -64,24 +65,15 @@ def main():
        batch_size=BATCH_SIZE)
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
    exe.run(fluid.default_startup_program())

    for pass_id in xrange(PASS_NUM):
        accuracy.reset(exe)
        for data in train_data():
-            tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
-
-            label = np.array(map(lambda x: x[1], data)).astype("int64")
-            label = label.reshape([BATCH_SIZE, 1])
-
-            tensor_label = fluid.LoDTensor()
-            tensor_label.set(label, place)
-
-            cost_val, acc_val = exe.run(
-                fluid.default_main_program(),
-                feed={"words": tensor_words,
-                      "label": tensor_label},
-                fetch_list=[cost, acc_out])
+            cost_val, acc_val = exe.run(fluid.default_main_program(),
+                                        feed=feeder.feed(data),
+                                        fetch_list=[cost, acc_out])
            pass_acc = accuracy.eval(exe)
            print("cost=" + str(cost_val) + " acc=" + str(acc_val) +

...
@@ -3,14 +3,14 @@ import paddle.v2 as paddle

import paddle.v2.fluid as fluid


-def stacked_lstm_net(input_dim,
+def stacked_lstm_net(data,
+                     label,
+                     input_dim,
                     class_dim=2,
                     emb_dim=128,
                     hid_dim=512,
                     stacked_num=3):
    assert stacked_num % 2 == 1
-    data = fluid.layers.data(name="words", shape=[1], dtype="int64")
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
    # add bias attr

@@ -65,8 +65,11 @@ def main():

    dict_dim = len(word_dict)
    class_dim = 2

+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    cost, accuracy, acc_out = stacked_lstm_net(
-        input_dim=dict_dim, class_dim=class_dim)
+        data, label, input_dim=dict_dim, class_dim=class_dim)

    train_data = paddle.batch(
        paddle.reader.shuffle(

@@ -74,24 +77,15 @@ def main():
        batch_size=BATCH_SIZE)
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
    exe.run(fluid.default_startup_program())

    for pass_id in xrange(PASS_NUM):
        accuracy.reset(exe)
        for data in train_data():
-            tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
-
-            label = np.array(map(lambda x: x[1], data)).astype("int64")
-            label = label.reshape([BATCH_SIZE, 1])
-
-            tensor_label = fluid.LoDTensor()
-            tensor_label.set(label, place)
-
-            cost_val, acc_val = exe.run(
-                fluid.default_main_program(),
-                feed={"words": tensor_words,
-                      "label": tensor_label},
-                fetch_list=[cost, acc_out])
+            cost_val, acc_val = exe.run(fluid.default_main_program(),
+                                        feed=feeder.feed(data),
+                                        fetch_list=[cost, acc_out])
            pass_acc = accuracy.eval(exe)
            print("cost=" + str(cost_val) + " acc=" + str(acc_val) +

...
@@ -57,23 +57,16 @@ train_reader = paddle.batch(

place = fluid.CPUPlace()
exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(
+    feed_list=[first_word, second_word, third_word, forth_word, next_word],
+    place=place)
exe.run(fluid.default_startup_program())

for pass_id in range(PASS_NUM):
    for data in train_reader():
-        input_data = [[data_idx[idx] for data_idx in data] for idx in xrange(5)]
-        input_data = map(lambda x: np.array(x).astype("int64"), input_data)
-        input_data = map(lambda x: np.expand_dims(x, axis=1), input_data)
-
        avg_cost_np = exe.run(fluid.default_main_program(),
-                              feed={
-                                  'firstw': input_data[0],
-                                  'secondw': input_data[1],
-                                  'thirdw': input_data[2],
-                                  'forthw': input_data[3],
-                                  'nextw': input_data[4]
-                              },
+                              feed=feeder.feed(data),
                              fetch_list=[avg_cost])
        if avg_cost_np[0] < 5.0:
            exit(0)  # if avg cost is less than 5.0, we think our code is good.

...
from __future__ import print_function
import paddle.v2.fluid as fluid


def test_converter():
    img = fluid.layers.data(name='image', shape=[1, 28, 28])
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
    # Two samples, each a (flattened image, label) pair; the feeder reshapes
    # the 784 pixel values back to [1, 28, 28] and stacks the batch dimension.
    result = feeder.feed([[[0] * 784, [9]], [[1] * 784, [1]]])
    print(result)


if __name__ == '__main__':
    test_converter()