diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 6937d13dbaa60723f8d55964db8db1eb03f4057b..f6bf54d33943e628b08743d40efc5ae7d1cf53b2 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -70,8 +70,8 @@ paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param
 paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
 paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
 paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
-paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None))
-paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid'))
+paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False))
+paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False))
 paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)
diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc
index 087f903a8bba9a4bfcd7eaabd7098555442a904e..752d706cbfab8eb3027fe9610c25b7400ecfed1d 100644
--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
@@ -137,6 +137,10 @@ class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool, defalut: False) "
                   "whether to compute reversed GRU.")
         .SetDefault(false);
+    AddAttr<bool>("origin_mode",
+                  "bool"
+                  "use origin mode in article https://arxiv.org/abs/1412.3555")
+        .SetDefault(false);
     AddComment(R"DOC(
 GRU Operator implements part calculations of the complete GRU as following:
 
@@ -221,6 +225,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
  public:
   void BatchCompute(const framework::ExecutionContext& context) const {
     using DeviceContext = paddle::platform::CPUDeviceContext;
+    bool origin_mode = context.Attr<bool>("origin_mode");
     auto* input = context.Input<LoDTensor>("Input");
     auto* h0 = context.Input<Tensor>("H0");
     auto* weight = context.Input<Tensor>("Weight");
@@ -327,7 +332,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
         math::detail::forward_final_output(
             math::detail::forward::gru_finalOutput<T>(), gru_value, frame_size,
-            cur_batch_size, active_node);
+            cur_batch_size, active_node, origin_mode);
 
         gru_value.prev_out_value = gru_value.output_value;
       }
@@ -351,7 +356,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
 
         math::GRUUnitFunctor<DeviceContext, T>::compute(
             dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
-            active_gate);
+            active_gate, origin_mode);
 
         gru_value.prev_out_value = gru_value.output_value;
       }
diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc
index 55721c283dd18c2f9642563a9ce1eabfce16fd7b..ba918b3def22e3c60c4155f77ecbaad85d520928 100644
--- a/paddle/fluid/operators/gru_op.cu.cc
+++ b/paddle/fluid/operators/gru_op.cu.cc
@@ -21,6 +21,7 @@ template <typename DeviceContext, typename T>
 class GRUKernel : public framework::OpKernel<T> {
  public:
   void BatchCompute(const framework::ExecutionContext& context) const {
+    bool origin_mode = context.Attr<bool>("origin_mode");
     auto* input = context.Input<LoDTensor>("Input");
     auto* h0 = context.Input<Tensor>("H0");
     auto* weight = context.Input<Tensor>("Weight");
@@ -87,7 +88,7 @@ class GRUKernel : public framework::OpKernel<T> {
       gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
       math::GRUUnitFunctor<DeviceContext, T>::compute(
           dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
-          active_gate);
+          active_gate, origin_mode);
       gru_value.prev_out_value = gru_value.output_value;
     }
diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h
index 0b551e8046be16c95f7d6b10b68b32a9af594f73..45c769ee37260bf912ebc848d58019557f4adc07 100644
--- a/paddle/fluid/operators/gru_op.h
+++ b/paddle/fluid/operators/gru_op.h
@@ -41,6 +41,7 @@ template <typename DeviceContext, typename T>
 class GRUGradKernel : public framework::OpKernel<T> {
  public:
   void BatchCompute(const framework::ExecutionContext& context) const {
+    bool origin_mode = context.Attr<bool>("origin_mode");
     auto* h0 = context.Input<Tensor>("H0");
     auto* weight = context.Input<Tensor>("Weight");
     const T* weight_data = weight->data<T>();
@@ -146,7 +147,7 @@ class GRUGradKernel : public framework::OpKernel<T> {
 
       math::GRUUnitGradFunctor<DeviceContext, T>::compute(
           dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node,
-          active_gate);
+          active_gate, origin_mode);
     }
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc
index 82a808b01e99ec33b0ca00a065fb301d3c633b19..e3beedcf10b6286c92371c48cae7912aef35e7a3 100644
--- a/paddle/fluid/operators/gru_unit_op.cc
+++ b/paddle/fluid/operators/gru_unit_op.cc
@@ -111,6 +111,13 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
                   "The activation type used in update gate and reset gate.")
         .SetDefault(sigmoid)
         .InEnum({identity, sigmoid, tanh, relu});
+    AddAttr<bool>("origin_mode",
+                  "bool"
+                  "use origin mode in article (https://arxiv.org/pdf/1406.1078.pdf)")
+        .SetDefault(false);
     AddComment(R"DOC(
 GRUUnit Operator implements partial calculations of the GRU unit as following:
diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h
index 451ec61ba1f7239d92c6dfbad0b2961e74e1bc17..712ef05d8631ac74b92795321202cb5590286e82 100644
--- a/paddle/fluid/operators/gru_unit_op.h
+++ b/paddle/fluid/operators/gru_unit_op.h
@@ -113,7 +113,11 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     auto c = g.slice(c_offsets, extents);  // output candidate
 
     // calculate final output
-    h.device(place) = u * (c - h_p) + h_p;
+    if (context.Attr<bool>("origin_mode")) {
+      h.device(place) = c + u * (h_p - c);  // (1 - u) * c + u * h_p
+    } else {
+      h.device(place) = u * (c - h_p) + h_p;  // u * c + (1 - u) * h_p
+    }
   }
 };
@@ -180,11 +184,19 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     auto c = g.slice(c_offsets, extents);  // output candidate
 
     // backward for unactivated update gate
-    ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
-                   d_g.slice(u_offsets, extents), d_h * (c - h_p));
-    // backward for unactivated output candidate
-    ActGradCompute(context.Attr<int>("activation"), place, c, c,
-                   d_g.slice(c_offsets, extents), d_h * u);
+    if (context.Attr<bool>("origin_mode")) {
+      ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
+                     d_g.slice(u_offsets, extents), d_h * (h_p - c));
+      // backward for unactivated output candidate
+      ActGradCompute(context.Attr<int>("activation"), place, c, c,
+                     d_g.slice(c_offsets, extents), d_h * (1 - u));
+    } else {
+      ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
+                     d_g.slice(u_offsets, extents), d_h * (c - h_p));
+      // backward for unactivated output candidate
+      ActGradCompute(context.Attr<int>("activation"), place, c, c,
+                     d_g.slice(c_offsets, extents), d_h * u);
+    }
     // backward for reset_hidden_prev
     auto blas = math::GetBlas<DeviceContext, T>(context);
     blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
@@ -213,7 +225,11 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
       T* hidden_prev_grad_data =
           hidden_prev_grad->mutable_data<T>(context.GetPlace());
       auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
-      d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u);
+      if (context.Attr<bool>("origin_mode")) {
+        d_h_p.device(place) = d_r_h_p * r + d_h * u;
+      } else {
+        d_h_p.device(place) = d_r_h_p * r + d_h * (1 - u);
+      }
       blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1,
                 gate_grad_data, frame_size * 3, weight_data, frame_size * 2, 1,
                 hidden_prev_grad_data, frame_size);
diff --git a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
index 47c771f7c5c01b651423c7886207abf4a4297019..c6dd972e12b763283a4212d4c56844afb1c2fd7a 100644
--- a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
@@ -56,7 +56,8 @@ template <class OpFinalOutput, typename T>
 void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
                                        T *gate_value, T *prev_output_value,
                                        T *output_value, int frame_size,
-                                       ActivationType active_node) {
+                                       ActivationType active_node,
+                                       bool origin_mode) {
   T r_value_update_gate;
   T r_value_frame_state;
   T r_prev_out = 0;
@@ -72,7 +73,7 @@ void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
     }
 
     op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
-                    &r_output, active_node);
+                    &r_output, active_node, origin_mode);
 
     frame_state[i] = r_value_frame_state;
     output_value[i] = r_output;
@@ -146,7 +147,8 @@ template <class OpFinalOutput, typename T>
 void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
                                      T *gate_value, T *prev_output_value,
                                      T *output_value, int frame_size,
-                                     ActivationType active_node) {
+                                     ActivationType active_node,
+                                     bool origin_mode) {
 #ifdef __AVX__
   __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f);
   __m256 r_value_frame_state, r_value_frame_state_last = _mm256_set1_ps(0.0f);
@@ -180,7 +182,7 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
     }
 
     op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
-                    &r_output, active_node);
+                    &r_output, active_node, origin_mode);
 
     _mm256_storeu_ps(reinterpret_cast<float *>(frame_state + i),
                      r_value_frame_state);
@@ -190,7 +192,7 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
   if (rest > 0) {
     i = n - block;
     op_final_output(&r_value_update_gate_last, &r_value_frame_state_last,
-                    &r_prev_out_last, &r_output, active_node);
+                    &r_prev_out_last, &r_output, active_node, origin_mode);
 
     _mm256_storeu_ps(reinterpret_cast<float *>(frame_state + i),
                      r_value_frame_state_last);
@@ -227,17 +229,18 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
 template <class OpFinalOutput, typename T>
 inline void forward_final_output(OpFinalOutput op_final_output,
                                  GRUMetaValue<T> value, int frame_size,
-                                 int batch_size, ActivationType active_node) {
+                                 int batch_size, ActivationType active_node,
+                                 bool origin_mode) {
   for (int b = 0; b < batch_size; b++) {
     if (OpFinalOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
         (sizeof(T) == 4)) {
       hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
                                       value.prev_out_value, value.output_value,
-                                      frame_size, active_node);
+                                      frame_size, active_node, origin_mode);
     } else {
       hl_naive_gru_forward_final_output(
           op_final_output, value.gate_value, value.prev_out_value,
-          value.output_value, frame_size, active_node);
+          value.output_value, frame_size, active_node, origin_mode);
     }
 
     value.gate_value += frame_size * 3;
@@ -253,7 +256,8 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
                                       T *gate_grad, T *prev_out_value,
                                       T *prev_out_grad, T *output_grad,
                                       int frame_size,
-                                      ActivationType active_node) {
+                                      ActivationType active_node,
+                                      bool origin_mode) {
   T r_update_gate_value;
   T r_update_gate_grad;
   T r_frame_state_value;
@@ -279,7 +283,7 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
 
     op_state_grad(&r_update_gate_value, &r_update_gate_grad,
                   &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
-                  &r_prev_out_grad, &r_out_grad, active_node);
+                  &r_prev_out_grad, &r_out_grad, active_node, origin_mode);
 
     update_gate_grad[i] = r_update_gate_grad;
     frame_state_grad[i] = r_frame_state_grad;
@@ -338,8 +342,8 @@ template <class OpStateGrad, typename T>
 void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
                                     T *gate_grad, T *prev_out_value,
                                     T *prev_out_grad, T *output_grad,
-                                    int frame_size,
-                                    ActivationType active_node) {
+                                    int frame_size, ActivationType active_node,
+                                    bool origin_mode) {
 #ifdef __AVX__
   __m256 r_update_gate_value;
   __m256 r_update_gate_grad;
@@ -368,7 +372,7 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
 
     op_state_grad(&r_update_gate_value, &r_update_gate_grad,
                   &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
-                  &r_prev_out_grad, &r_out_grad, active_node);
+                  &r_prev_out_grad, &r_out_grad, active_node, origin_mode);
 
     update_gate_grad[i] = r_update_gate_grad;
     frame_state_grad[i] = r_frame_state_grad;
@@ -431,16 +435,18 @@ template <class OpStateGrad, typename T>
 inline void backward_state_grad(OpStateGrad op_state_grad,
                                 GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                                 int frame_size, int batch_size,
-                                ActivationType active_node) {
+                                ActivationType active_node, bool origin_mode) {
   for (int b = 0; b < batch_size; b++) {
     if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
-      hl_avx_gru_backward_state_grad(
-          op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
-          grad.prev_out_grad, grad.output_grad, frame_size, active_node);
+      hl_avx_gru_backward_state_grad(op_state_grad, value.gate_value,
+                                     grad.gate_grad, value.prev_out_value,
+                                     grad.prev_out_grad, grad.output_grad,
+                                     frame_size, active_node, origin_mode);
     } else {
-      hl_naive_gru_backward_state_grad(
-          op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
-          grad.prev_out_grad, grad.output_grad, frame_size, active_node);
+      hl_naive_gru_backward_state_grad(op_state_grad, value.gate_value,
+                                       grad.gate_grad, value.prev_out_value,
+                                       grad.prev_out_grad, grad.output_grad,
+                                       frame_size, active_node, origin_mode);
     }
 
     value.gate_value += frame_size * 3;
diff --git a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
index 813d69f6aba722609a0523a5be71d32f91f76d59..6b57da1046a05b15b9c3302104d9f4d12c52227f 100644
--- a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
@@ -72,7 +72,8 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
                                         T *gate_value, T *prev_output_value,
                                         T *output_value, int frame_size,
                                         int batch_size,
-                                        ActivationType active_node) {
+                                        ActivationType active_node,
+                                        bool origin_mode) {
   const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (frame_idx >= frame_size) return;
   int batch_idx = 0;
@@ -94,7 +95,7 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
   }
 
   op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
-                  &r_output, active_node);
+                  &r_output, active_node, origin_mode);
 
   gate_value[frame_idx + frame_size * 2] = r_value_frame_state;
   output_value[frame_idx] = r_output;
@@ -109,7 +110,8 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
                                        T *gate_grad, T *prev_out_value,
                                        T *prev_out_grad, T *output_grad,
                                        int frame_size, int batch_size,
-                                       ActivationType active_node) {
+                                       ActivationType active_node,
+                                       bool origin_mode) {
   const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (frame_idx >= frame_size) return;
   int batch_idx = 0;
@@ -139,7 +141,7 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
 
   op_state_grad(&r_update_gate_value, &r_update_gate_grad, &r_frame_state_value,
                 &r_frame_state_grad, &r_prev_out_value, &r_prev_out_grad,
-                &r_out_grad, active_node);
+                &r_out_grad, active_node, origin_mode);
 
   gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
   gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad;
diff --git a/paddle/fluid/operators/math/detail/gru_kernel.h b/paddle/fluid/operators/math/detail/gru_kernel.h
index f6d192358bd84eb56a2e01eb36f28d8832ef271f..894f5f04d2451151964965bd721ff35e353ff2b5 100644
--- a/paddle/fluid/operators/math/detail/gru_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_kernel.h
@@ -57,10 +57,16 @@ class gru_finalOutput {
  public:
   HOSTDEVICE void operator()(T *value_update_gate, T *value_frame_state,
                              T *prev_out, T *value_output,
-                             ActivationType act_input) {
+                             ActivationType act_input, bool origin_mode) {
     *value_frame_state = activation(*value_frame_state, act_input);
-    *value_output = *prev_out - ((*value_update_gate) * (*prev_out)) +
-                    ((*value_update_gate) * (*value_frame_state));
+    if (origin_mode) {
+      *value_output = ((*value_update_gate) * (*prev_out)) +
+                      *value_frame_state -
+                      ((*value_update_gate) * (*value_frame_state));
+    } else {
+      *value_output = *prev_out - ((*value_update_gate) * (*prev_out)) +
+                      ((*value_update_gate) * (*value_frame_state));
+    }
   }
 #ifndef __NVCC__
 #ifndef __AVX__
@@ -69,11 +75,20 @@ class gru_finalOutput {
   static const bool avx = true;
   HOSTDEVICE void operator()(__m256 *value_update_gate,
                              __m256 *value_frame_state, __m256 *prev_out,
-                             __m256 *value_output, ActivationType act_input) {
+                             __m256 *value_output, ActivationType act_input,
+                             bool origin_mode) {
     *value_frame_state = activation(*value_frame_state, act_input);
-    *value_output = _mm256_add_ps(
-        _mm256_sub_ps(*prev_out, _mm256_mul_ps(*value_update_gate, *prev_out)),
-        _mm256_mul_ps(*value_update_gate, *value_frame_state));
+    if (origin_mode) {
+      *value_output = _mm256_sub_ps(
+          _mm256_add_ps(_mm256_mul_ps(*value_update_gate, *prev_out),
+                        *value_frame_state),
+          _mm256_mul_ps(*value_update_gate, *value_frame_state));
+    } else {
+      *value_output = _mm256_add_ps(
+          _mm256_sub_ps(*prev_out,
+                        _mm256_mul_ps(*value_update_gate, *prev_out)),
+          _mm256_mul_ps(*value_update_gate, *value_frame_state));
+    }
   }
 #endif
 #endif
@@ -88,13 +103,23 @@ class gru_stateGrad {
   HOSTDEVICE void operator()(T *value_update_gate, T *grad_update_gate,
                              T *value_frame_state, T *grad_frame_state,
                              T *value_prev_out, T *grad_prev_out,
-                             T *grad_output, ActivationType act_input) {
-    *grad_update_gate = (*grad_output * (*value_frame_state));
-    *grad_update_gate -= (*grad_output * (*value_prev_out));
-    *grad_prev_out -= (*grad_output * (*value_update_gate));
-    *grad_prev_out += *grad_output;
-    *grad_frame_state = activation(*grad_output * (*value_update_gate),
-                                   *value_frame_state, act_input);
+                             T *grad_output, ActivationType act_input,
+                             bool origin_mode) {
+    if (origin_mode) {
+      *grad_update_gate =
+          (*grad_output) * ((*value_prev_out) - (*value_frame_state));
+      *grad_prev_out += (*grad_output * (*value_update_gate));
+      *grad_frame_state = activation(
+          *grad_output * (static_cast<T>(1.0) - (*value_update_gate)),
+          *value_frame_state, act_input);
+    } else {
+      *grad_update_gate =
+          (*grad_output) * ((*value_frame_state) - (*value_prev_out));
+      *grad_prev_out +=
+          (*grad_output * (static_cast<T>(1.0) - *value_update_gate));
+      *grad_frame_state = activation(*grad_output * (*value_update_gate),
+                                     *value_frame_state, act_input);
+    }
   }
 #ifndef __NVCC__
 #ifndef __AVX__
@@ -106,17 +131,27 @@ class gru_stateGrad {
                              __m256 *value_frame_state,
                              __m256 *grad_frame_state, __m256 *value_prev_out,
                              __m256 *grad_prev_out, __m256 *grad_output,
-                             ActivationType act_input) {
-    *grad_update_gate = _mm256_mul_ps(*grad_output, *value_frame_state);
-    *grad_update_gate = _mm256_sub_ps(
-        *grad_update_gate, _mm256_mul_ps(*grad_output, *value_prev_out));
-    *grad_prev_out = _mm256_add_ps(
-        _mm256_sub_ps(*grad_prev_out,
-                      _mm256_mul_ps(*grad_output, *value_update_gate)),
-        *grad_output);
-    *grad_frame_state =
-        activation(_mm256_mul_ps(*grad_output, *value_update_gate),
-                   *value_frame_state, act_input);
+                             ActivationType act_input, bool origin_mode) {
+    if (origin_mode) {
+      *grad_update_gate = _mm256_mul_ps(
+          *grad_output, _mm256_sub_ps(*value_prev_out, *value_frame_state));
+      *grad_prev_out = _mm256_add_ps(
+          *grad_prev_out, _mm256_mul_ps(*grad_output, *value_update_gate));
+      *grad_frame_state = activation(
+          _mm256_mul_ps(*grad_output, _mm256_sub_ps(_mm256_set1_ps(1.0f),
+                                                    *value_update_gate)),
+          *value_frame_state, act_input);
+    } else {
+      *grad_update_gate = _mm256_mul_ps(
+          *grad_output, _mm256_sub_ps(*value_frame_state, *value_prev_out));
+      *grad_prev_out = _mm256_add_ps(
+          *grad_prev_out,
+          _mm256_mul_ps(*grad_output, _mm256_sub_ps(_mm256_set1_ps(1.0f),
+                                                    *value_update_gate)));
+      *grad_frame_state =
+          activation(_mm256_mul_ps(*grad_output, *value_update_gate),
+                     *value_frame_state, act_input);
+    }
   }
 #endif
 #endif
diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc
index 0e15b81deef43a932d4b2d3f545393b0ad9e080c..07c5cbf33378e6f6cee8a82448f55399966a2574 100644
--- a/paddle/fluid/operators/math/gru_compute.cc
+++ b/paddle/fluid/operators/math/gru_compute.cc
@@ -23,7 +23,8 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
   static void compute(const platform::CPUDeviceContext &context,
                       GRUMetaValue<T> value, int frame_size, int batch_size,
                       const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate) {
+                      const detail::ActivationType active_gate,
+                      bool origin_mode) {
 #ifndef __NVCC__
     auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
     if (value.prev_out_value) {
@@ -43,7 +44,8 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
     }
 
     detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
-                                 frame_size, batch_size, active_node);
+                                 frame_size, batch_size, active_node,
+                                 origin_mode);
 #endif
   }
 };
@@ -54,10 +56,12 @@ struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> {
                       GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                       int frame_size, int batch_size,
                       const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate) {
+                      const detail::ActivationType active_gate,
+                      bool origin_mode) {
 #ifndef __NVCC__
     detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
-                                grad, frame_size, batch_size, active_node);
+                                grad, frame_size, batch_size, active_node,
+                                origin_mode);
     auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
     if (value.prev_out_value && grad.prev_out_grad) {
       blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
diff --git a/paddle/fluid/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu
index 1327d914952d57aab6e5d17090d0ea976a6d4755..ec7e4d2228c38161bb1f3f97ec21b91db454adb4 100644
--- a/paddle/fluid/operators/math/gru_compute.cu
+++ b/paddle/fluid/operators/math/gru_compute.cu
@@ -24,7 +24,8 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
   static void compute(const platform::CUDADeviceContext &context,
                       GRUMetaValue<T> value, int frame_size, int batch_size,
                       const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate) {
+                      const detail::ActivationType active_gate,
+                      bool origin_mode) {
     auto stream = context.stream();
     dim3 threads;
     dim3 grid;
@@ -73,14 +74,14 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
                                       T><<<grid, threads, 0, stream>>>(
           detail::forward::gru_finalOutput<T>(), value.gate_value,
           value.prev_out_value, value.output_value, frame_size, batch_size,
-          active_node);
+          active_node, origin_mode);
     } else {
       detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
                                       /* is_batch= */ true,
                                       T><<<grid, threads, 0, stream>>>(
           detail::forward::gru_finalOutput<T>(), value.gate_value,
           value.prev_out_value, value.output_value, frame_size, batch_size,
-          active_node);
+          active_node, origin_mode);
     }
   }
 };
@@ -91,7 +92,8 @@ struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
                       GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                       int frame_size, int batch_size,
                       const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate) {
+                      const detail::ActivationType active_gate,
+                      bool origin_mode) {
     auto stream = context.stream();
     dim3 threads;
     dim3 grid;
@@ -111,14 +113,14 @@ struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
           /* is_batch= */ false><<<grid, threads, 0, stream>>>(
           detail::backward::gru_stateGrad<T>(), value.gate_value,
           grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
-          grad.output_grad, frame_size, batch_size, active_node);
+          grad.output_grad, frame_size, batch_size, active_node, origin_mode);
     } else {
       detail::KeGruBackwardStateGrad<
           detail::backward::gru_stateGrad<T>,
           /* is_batch= */ true><<<grid, threads, 0, stream>>>(
           detail::backward::gru_stateGrad<T>(), value.gate_value,
           grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
-          grad.output_grad, frame_size, batch_size, active_node);
+          grad.output_grad, frame_size, batch_size, active_node, origin_mode);
     }
 
     auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
diff --git a/paddle/fluid/operators/math/gru_compute.h b/paddle/fluid/operators/math/gru_compute.h
index c5816b16cd90410fcc48929931c25d0d561ad653..f5ddec0aaa275a32a5a9937699066a170edc0825 100644
--- a/paddle/fluid/operators/math/gru_compute.h
+++ b/paddle/fluid/operators/math/gru_compute.h
@@ -44,7 +44,8 @@ struct GRUUnitFunctor {
   static void compute(const DeviceContext &context, GRUMetaValue<T> value,
                       int frame_size, int batch_size,
                       const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate);
+                      const detail::ActivationType active_gate,
+                      bool origin_mode);
 };
 
 template <typename DeviceContext, typename T>
@@ -52,7 +53,8 @@ struct GRUUnitGradFunctor {
   static void compute(const DeviceContext &context, GRUMetaValue<T> value,
                       GRUMetaGrad<T> grad, int frame_size, int batch_size,
                       const detail::ActivationType active_node,
-                      const detail::ActivationType active_gate);
+                      const detail::ActivationType active_gate,
+                      bool origin_mode);
 };
 
 }  // namespace math
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 56971cff43b919f946091cebf909731a23d60736..093571a93b4a79a0330a82fce586c34e18db2d3a 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -864,12 +864,14 @@ def dynamic_gru(input,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
-                h_0=None):
+                h_0=None,
+                origin_mode=False):
     """
     **Gated Recurrent Unit (GRU) Layer**
 
-    Refer to `Empirical Evaluation of Gated Recurrent Neural Networks on
-    Sequence Modeling <https://arxiv.org/abs/1412.3555>`_ .
+    if origin_mode is False, then the equation of a gru step is from paper
+    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
+    Modeling <https://arxiv.org/abs/1412.3555>`_ .
 
     The formula is as follows:
 
@@ -883,6 +885,21 @@ def dynamic_gru(input,
 
         h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}
 
+
+    if origin_mode is True then the equation is from paper
+    `Learning Phrase Representations using RNN Encoder-Decoder for Statistical
+    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
+
+    .. math::
+
+        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
+
+        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
+
+        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
+
+        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
+
     The :math:`\odot` is the element-wise product of the vectors. :math:`act_g`
     is the update gate and reset gate activation function and :math:`sigmoid`
     is usually used for it. :math:`act_c` is the activation function for
@@ -980,7 +997,8 @@ def dynamic_gru(input,
         attrs={
             'is_reverse': is_reverse,
             'gate_activation': gate_activation,
-            'activation': candidate_activation
+            'activation': candidate_activation,
+            'origin_mode': origin_mode
         })
     return hidden
 
@@ -991,9 +1009,14 @@ def gru_unit(input,
              param_attr=None,
              bias_attr=None,
              activation='tanh',
-             gate_activation='sigmoid'):
+             gate_activation='sigmoid',
+             origin_mode=False):
     """
-    GRU unit layer. The equation of a gru step is:
+    **GRU unit layer**
+
+    if origin_mode is True, then the equation of a gru step is from paper
+    `Learning Phrase Representations using RNN Encoder-Decoder for Statistical
+    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
 
         .. math::
             u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
@@ -1002,7 +1025,21 @@ def gru_unit(input,
 
             m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
 
-            h_t & = dot((1-u_t), m_t) + dot(u_t, h_{t-1})
+            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
+
+    if origin_mode is False, then the equation of a gru step is from paper
+    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
+    Modeling <https://arxiv.org/abs/1412.3555>`_
+
math:: + u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u) + + r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r) + + m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m) + + h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t) + The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms of the equation above, the :math:`z_t` is split into 3 parts - diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py index f61a447fd77d001c1a9c42c46ba9a0d747e926ec..6606162733487b15ef55f1a4677fb382e6e7e0ac 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_op.py @@ -31,7 +31,8 @@ def gru( is_reverse, act_state, act_gate, - dtype='float32'): + dtype='float32', + origin_mode=False): def _seq_to_batch(lod, is_reverse): idx_in_seq_list = [] seq_lens = lod[0] @@ -66,7 +67,10 @@ def gru( w_c = w.flatten()[D * D * 2:].reshape((D, D)) c = act_state(np.dot(r_h_p, w_c) + g[:, D * 2:]) g = np.hstack((u_r, c)) - h = u * c + (1 - u) * h_p + if origin_mode: + h = (1 - u) * c + u * h_p + else: + h = u * c + (1 - u) * h_p return g, r_h_p, h T = sum(lod[0]) @@ -110,6 +114,7 @@ class TestGRUOp(OpTest): self.act_state = 'tanh' self.act_gate = 'sigmoid' self.dtype = 'float64' + self.origin_mode = False self.set_confs() T = sum(self.lod[0]) @@ -126,7 +131,8 @@ class TestGRUOp(OpTest): batch_gate, batch_reset_hidden_prev, batch_hidden, hidden = gru( input, self.lod, h0, weight, bias, self.is_reverse, - ACTIVATION[self.act_state], ACTIVATION[self.act_gate], self.dtype) + ACTIVATION[self.act_state], ACTIVATION[self.act_gate], self.dtype, + self.origin_mode) self.inputs = {'Input': (input, self.lod), 'Weight': weight} if self.with_bias: @@ -145,7 +151,8 @@ class TestGRUOp(OpTest): self.attrs = { 'activation': self.act_state, 'gate_activation': self.act_gate, - 'is_reverse': self.is_reverse + 'is_reverse': self.is_reverse, + 'origin_mode': self.origin_mode } def test_check_output(self): @@ -155,12 +162,24 @@ class TestGRUOp(OpTest): self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden']) +class TestGRUOriginMode(TestGRUOp): + def set_confs(self): + self.origin_mode = True + + class TestGRUOp2(TestGRUOp): def set_confs(self): self.D = 19 self.dtype = 'float32' +class TestGRUOp2OriginMode(TestGRUOp): + def set_confs(self): + self.D = 19 + self.dtype = 'float32' + self.origin_mode = True + + class TestGRUOpNoInitial(TestGRUOp): def set_confs(self): self.with_h0 = False @@ -182,5 +201,11 @@ class TestGRUOpReverse(TestGRUOp): self.is_reverse = True +class TestGRUOpReverseOriginMode(TestGRUOp): + def set_confs(self): + self.is_reverse = True + self.origin_mode = True + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py index b5a66fdf086f2abc0c9a8af663241b9eda739407..78f2f030f5b6dc9d827f3930dff590a0f5b784fb 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py @@ -53,7 +53,7 @@ class TestGRUUnitOp(OpTest): GRUActivationType.relu: relu, } - def set_inputs(self): + def set_inputs(self, origin_mode=False): batch_size = self.batch_size frame_size = self.frame_size self.op_type = 'gru_unit' @@ -68,10 +68,11 @@ class TestGRUUnitOp(OpTest): } self.attrs = { 'activation': GRUActivationType.tanh, - 'gate_activation': GRUActivationType.sigmoid + 'gate_activation': GRUActivationType.sigmoid, + 'origin_mode': origin_mode } - def 
+    def set_outputs(self, origin_mode=False):
         # GRU calculations
         batch_size = self.batch_size
         frame_size = self.frame_size
@@ -93,7 +94,10 @@ class TestGRUUnitOp(OpTest):
         c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
                                                     g[:, frame_size * 2:])
         g = np.hstack((u_r, c))
-        h = u * c + (1 - u) * h_p
+        if origin_mode:
+            h = (1 - u) * c + u * h_p
+        else:
+            h = u * c + (1 - u) * h_p
         self.outputs = {
             'Gate': g.astype('float64'),
             'ResetHiddenPrev': r_h_p.astype('float64'),
@@ -111,8 +115,14 @@ class TestGRUUnitOp(OpTest):
         self.check_grad(['Input', 'HiddenPrev', 'Weight'], ['Hidden'])
 
 
+class TestGRUUnitOpOriginMode(TestGRUUnitOp):
+    def setUp(self):
+        self.set_inputs(origin_mode=True)
+        self.set_outputs(origin_mode=True)
+
+
 class TestGRUUnitOpWithBias(TestGRUUnitOp):
-    def set_inputs(self):
+    def set_inputs(self, origin_mode=False):
         batch_size = self.batch_size
         frame_size = self.frame_size
         super(TestGRUUnitOpWithBias, self).set_inputs()
@@ -120,7 +130,8 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp):
             -0.1, 0.1, (1, frame_size * 3)).astype('float64')
         self.attrs = {
             'activation': GRUActivationType.identity,
-            'gate_activation': GRUActivationType.sigmoid
+            'gate_activation': GRUActivationType.sigmoid,
+            'origin_mode': origin_mode
         }
 
     def test_check_grad(self):
@@ -132,5 +143,11 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp):
             no_grad_set=set('Input'))
 
 
+class TestGRUUnitOpWithBiasOriginMode(TestGRUUnitOpWithBias):
+    def setUp(self):
+        self.set_inputs(origin_mode=True)
+        self.set_outputs(origin_mode=True)
+
+
 if __name__ == '__main__':
     unittest.main()
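
Note: the origin_mode attribute only swaps how the update gate weights the previous hidden state against the candidate state; the gates themselves are computed identically in both modes. Below is a minimal NumPy sketch of the two final-output rules, mirroring the reference step used in test_gru_op.py above. The helper name gru_final_output and the toy inputs are illustrative only, not part of the patch.

import numpy as np


def gru_final_output(u, c, h_p, origin_mode=False):
    # u: sigmoid-activated update gate, c: tanh-activated candidate state,
    # h_p: previous hidden state h_{t-1}; all of shape (batch, frame_size).
    if origin_mode:
        # origin mode (https://arxiv.org/abs/1406.1078): h_t = u_t * h_{t-1} + (1 - u_t) * c_t
        return u * h_p + (1 - u) * c
    # default mode (https://arxiv.org/abs/1412.3555): h_t = (1 - u_t) * h_{t-1} + u_t * c_t
    return (1 - u) * h_p + u * c


u = np.full((1, 3), 0.25)
c = np.ones((1, 3))
h_p = np.zeros((1, 3))
print(gru_final_output(u, c, h_p))                    # [[0.25 0.25 0.25]]
print(gru_final_output(u, c, h_p, origin_mode=True))  # [[0.75 0.75 0.75]]

With the same gate value the two modes weight h_{t-1} and the candidate in opposite ways, which is why the backward expressions for the update-gate and previous-state gradients are likewise swapped between the two branches in gru_kernel.h and gru_unit_op.h.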