diff --git a/CMakeLists.txt b/CMakeLists.txt index 1252e7539816016dfdf1b90b8941fa42e6bb85e0..264420ad830ed39b38f1918951d8d66c84fd5ee9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,6 +127,7 @@ include(external/warpctc) # download, build, install warpctc include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 +include(external/nccl) include(cudnn) # set cudnn libraries, must before configure include(configure) # add paddle env configuration @@ -159,7 +160,7 @@ set(EXTERNAL_LIBS if(WITH_GPU) list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) if(NOT WITH_DSO) - list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) + list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) endif(NOT WITH_DSO) endif(WITH_GPU) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index db8f5ab0456792f903093b9cf20e2541f00add5c..24ddb24399dabeec9b8e5faf36be3eb21f420111 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -62,11 +62,11 @@ else() FIND_PACKAGE(CUDA REQUIRED) if(${CUDA_VERSION_MAJOR} VERSION_LESS 7) - message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile") + message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile") endif() if(NOT CUDNN_FOUND) - message(FATAL_ERROR "Paddle need cudnn to compile") + message(FATAL_ERROR "Paddle needs cudnn to compile") endif() set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}") diff --git a/cmake/external/nccl.cmake b/cmake/external/nccl.cmake new file mode 100644 index 0000000000000000000000000000000000000000..dfbbed58c9ed7cc57809b3d33a29ce26a35d75a2 --- /dev/null +++ b/cmake/external/nccl.cmake @@ -0,0 +1,50 @@ +INCLUDE(ExternalProject) + +SET(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl) + +INCLUDE_DIRECTORIES(${NCCL_SOURCE_DIR}/src/extern_nccl/src) + + +if(WITH_DSO) + # If we use DSO, we do not build nccl, just download the dependencies + set(NCCL_BUILD_COMMAND "") + set(NCCL_INSTALL_COMMAND "") + set(NCCL_INSTALL_DIR "") +else() + # otherwise, we build nccl and link it. + set(NCCL_BUILD_COMMAND "make -j 8") + set(NCCL_INSTALL_COMMAND "make install") + SET(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl) +endif() + +ExternalProject_Add( + extern_nccl + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/NVIDIA/nccl.git" + GIT_TAG "v1.3.4-1" + PREFIX "${NCCL_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "${NCCL_BUILD_COMMAND}" + INSTALL_COMMAND "${NCCL_INSTALL_COMMAND}" + INSTALL_DIR "${NCCL_INSTALL_DIR}" + TEST_COMMAND "" +) + +if (WITH_DSO) + if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";") + add_library(nccl STATIC ${dummyfile}) + else() + add_library(nccl INTERFACE) + endif() +else() + ADD_LIBRARY(nccl STATIC IMPORTED GLOBAL) + SET_PROPERTY(TARGET nccl PROPERTY IMPORTED_LOCATION + ${NCCL_INSTALL_DIR}/lib/libnccl.a) +endif() + +add_dependencies(nccl extern_nccl) + +LIST(APPEND external_project_dependencies nccl) diff --git a/paddle/framework/op_info.h b/paddle/framework/op_info.h index e926180780609c0a8ffc6270627835c50bbce782..59a64d71371b546f76eabdeed7e7514e8fb0f84a 100644 --- a/paddle/framework/op_info.h +++ b/paddle/framework/op_info.h @@ -87,11 +87,8 @@ class OpInfoMap { } } - template - void IterAllInfo(Callback callback) { - for (auto& it : map_) { - callback(it.first, it.second); - } + const std::unordered_map& map() const { + return map_; } private: diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc index c302217e5aacdc17800238770d689b7fb65804f3..8e92c81d1137472737230be79d71824593d3256f 100644 --- a/paddle/framework/var_desc.cc +++ b/paddle/framework/var_desc.cc @@ -18,6 +18,10 @@ limitations under the License. */ namespace paddle { namespace framework { +VarDesc::VarType VarDescBind::GetType() const { return desc_.type(); } + +void VarDescBind::SetType(VarDesc::VarType type) { desc_.set_type(type); } + void VarDescBind::SetShape(const std::vector &dims) { VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims()); } diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h index af4c26ca0a77b444852cc01545a8b585a5c3afcc..929de1f836fa906966ff125c70380d85d062afdf 100644 --- a/paddle/framework/var_desc.h +++ b/paddle/framework/var_desc.h @@ -75,9 +75,9 @@ class VarDescBind { int32_t GetLodLevel() const; - VarDesc::VarType GetType() const { return desc_.type(); } + VarDesc::VarType GetType() const; - void SetType(VarDesc::VarType type) { desc_.set_type(type); } + void SetType(VarDesc::VarType type); bool Persistable() const { return desc_.persistable(); } diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index 9b54c95b55cc9b503de5ff527ac983eb4752ddb0..2c21a5b2aaecb17a52a5de9a98664068f2255d83 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -339,9 +339,13 @@ private: * clear all grad */ void clearGrads() { - output_.grad->zeroMem(); + if (output_.grad) { + output_.grad->zeroMem(); + } for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - outputOtherDevice_[i].grad->zeroMem(); + if (outputOtherDevice_[i].grad) { + outputOtherDevice_[i].grad->zeroMem(); + } } } diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 75fcc1cda165197fc4413efc6bbbc440088cb4cd..f97bc837dca09060c55cae6a5524c49cd69df28b 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -115,7 +115,8 @@ set(DEPS_OPS softmax_with_cross_entropy_op sum_op pool_op - pool_with_index_op) + pool_with_index_op + lstm_op) op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc @@ -126,6 +127,7 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(sum_op DEPS net_op) op_library(pool_op DEPS pooling) op_library(pool_with_index_op DEPS pooling) +op_library(lstm_op DEPS sequence2batch lstm_compute) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) diff --git a/paddle/operators/conv2d_op.h b/paddle/operators/conv2d_op.h index f629728f68d65ce81b4910cae7f89ab06d6d94b8..0621389a79eee6b5e75b1eab309b49f8aa4a97ca 100644 --- a/paddle/operators/conv2d_op.h +++ b/paddle/operators/conv2d_op.h @@ -114,7 +114,7 @@ class GemmConv2DKernel : public framework::OpKernel { // im2col Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); im2col(context.device_context(), in_slice, col, strides[0], strides[1], - paddings[0], paddings[1]); + paddings[0], paddings[0], paddings[1], paddings[1]); // gemm Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); @@ -213,7 +213,8 @@ class GemmConvGrad2DKernel : public framework::OpKernel { Tensor in_grad_slice = in_grad_batch.Slice(g * in_step, (g + 1) * in_step); col2im(context.device_context(), in_grad_slice, col, strides[0], - strides[1], paddings[0], paddings[1]); + strides[1], paddings[0], paddings[0], paddings[1], + paddings[1]); } } } @@ -235,7 +236,8 @@ class GemmConvGrad2DKernel : public framework::OpKernel { out_grad_batch.Slice(g * out_step, (g + 1) * out_step); Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); im2col(context.device_context(), in_slice, col, strides[0], - strides[1], paddings[0], paddings[1]); + strides[1], paddings[0], paddings[0], paddings[1], + paddings[1]); // gemm Tensor filter_grad_slice = diff --git a/paddle/operators/conv2dtranspose_op.cc b/paddle/operators/conv2dtranspose_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c1b231906e2f172b6f9cee55f850d1a5ec6c3221 --- /dev/null +++ b/paddle/operators/conv2dtranspose_op.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/conv2dtranspose_op.h" + +namespace paddle { +namespace operators { + +void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of Conv2DTransposeOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of Conv2DTransposeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of Conv2DTransposeOp should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + + for (size_t i = 0; i < paddings.size(); ++i) { + PADDLE_ENFORCE_EQ(paddings[i], 0, + "No Padding allowed in conv transpose op."); + } + + PADDLE_ENFORCE_EQ(in_dims.size(), 4, + "Conv2DTransposeOp input should be 4-D tensor."); + PADDLE_ENFORCE_EQ(filter_dims.size(), 4, + "Conv2DTransposeOp filter should be 4-D tensor."); + PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0], + "input and kernel input dimension should be equal."); + + auto output_height = (in_dims[2] - 1) * strides[0] + filter_dims[2]; + auto output_width = (in_dims[3] - 1) * strides[1] + filter_dims[3]; + ctx->SetOutputDim("Output", + {in_dims[0], filter_dims[1], output_height, output_width}); +} + +Conv2DTransposeOpMaker::Conv2DTransposeOpMaker( + framework::OpProto* proto, framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "Input", + "(Tensor) The input tensor of convolution transpose operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of input channels, H and W is the height and width of image."); + AddInput("Filter", + "(Tensor) The filter tensor of convolution transpose operator." + "The format of the filter tensor is CMHW, where C is the number of " + "output image channels, M is the number of input image channels, " + "H and W is height and width of filter. " + "We enforce groups number == 1 and padding == 0 in " + "convolution transpose Scenario."); + AddOutput("Output", + "(Tensor) The output tensor of convolution transpose operator." + "The format of output tensor is also NCHW."); + AddAttr>("strides", + "strides of convolution transpose operator.") + .SetDefault({1, 1}); + AddAttr>("paddings", + "paddings of convolution transpose operator.") + .SetDefault({0, 0}); + AddComment(R"DOC( +The convolution transpose operation calculates the output based on the input, filter +and strides, paddings, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +)DOC"); +} + +void Conv2DTransposeOpGrad::InferShape( + framework::InferShapeContext* ctx) const { + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); + } + if (ctx->HasOutput(framework::GradVarName("Filter"))) { + ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); + } +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(conv2dtranspose, ops::Conv2DTransposeOp, + ops::Conv2DTransposeOpMaker, conv2dtranspose_grad, + ops::Conv2DTransposeOpGrad); + +REGISTER_OP_CPU_KERNEL( + conv2dtranspose, + ops::GemmConv2DTransposeKernel); +REGISTER_OP_CPU_KERNEL( + conv2dtranspose_grad, + ops::GemmConv2DTransposeGradKernel); diff --git a/paddle/operators/conv2dtranspose_op.cu b/paddle/operators/conv2dtranspose_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..761bc1959e69be94f43571728e6b92a322558b99 --- /dev/null +++ b/paddle/operators/conv2dtranspose_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/conv2dtranspose_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_GPU_KERNEL( + conv2dtranspose, + ops::GemmConv2DTransposeKernel); +REGISTER_OP_GPU_KERNEL( + conv2dtranspose_grad, + ops::GemmConv2DTransposeGradKernel); diff --git a/paddle/operators/conv2dtranspose_op.h b/paddle/operators/conv2dtranspose_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8c70b3dcec1e26ab3d8a42d88040764c643b5ae6 --- /dev/null +++ b/paddle/operators/conv2dtranspose_op.h @@ -0,0 +1,254 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/im2col.h" +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +// Define Op classes in .h file so that other conv transpose +// operator implementations can reuse the code. +class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Conv2DTransposeOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker); +}; + +class Conv2DTransposeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override; +}; + +class Conv2DTransposeOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override; +}; + +template +class GemmConv2DTransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + // The filter will be reshaped, so it should not be constant pointer + Tensor filter = *context.Input("Filter"); + + Tensor* output = context.Output("Output"); + + std::vector strides = context.Attr>("strides"); + + // TODO(Zhuoyuan): Paddings can be added in future. + // groups will alway be disabled in conv2dtranspose. + + const int batch_size = input->dims()[0]; + const int m = input->dims()[1]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + + const int k_h = filter.dims()[2]; + const int k_w = filter.dims()[3]; + + const int c = output->dims()[1]; // output channels + const int o_h = output->dims()[2]; + const int o_w = output->dims()[3]; + + paddle::operators::math::Col2ImFunctor< + paddle::operators::math::ColFormat::kCFO, Place, T> + col2im; + + // use col_shape in the im2col and col2im calculation + DDim col_shape = {c, k_h, k_w, h, w}; + + // use col_matrix_shape in the gemm calculation + DDim col_matrix_shape = {c * k_h * k_w, h * w}; + + Tensor col; + col.mutable_data(col_shape, context.GetPlace()); + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. + Tensor col_matrix; + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + + DDim output_shape = {c, o_h, o_w}; + DDim input_matrix_shape = {m, h * w}; + + DDim filter_matrix_shape = {m, c * k_h * k_w}; + filter.Resize(filter_matrix_shape); + + // convolution transpose: gemm + col2im (similar to conv-backward on input) + + output->mutable_data(context.GetPlace()); + auto t = framework::EigenVector::Flatten(*output); + t.device(context.GetEigenDevice()) = t.constant(static_cast(0)); + + for (int i = 0; i < batch_size; i++) { + // batch with size (M, h * w) + Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); + // filter size: (M, c * k_h * k_w) + + // output size: (c, o_h, o_w) + Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape); + + // col_matrix = filter * input_batch + // of shape (c * k_h * k_w, h * w) + math::matmul(context.device_context(), filter, true, + input_batch, false, T(1.0), &col_matrix, T(0.0)); + col2im(context.device_context(), output_batch, col, strides[0], + strides[1], 0, 0, 0, 0); + } + } +}; + +template +class GemmConv2DTransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + + // For filter, we do not use const pointer b/c we will do reshape, + // but we should avoid modifying its value. + Tensor filter = *context.Input("Filter"); + + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + + std::vector strides = context.Attr>("strides"); + // Actually, no paddings and groups allowed in conv transpose. + std::vector paddings = context.Attr>("paddings"); + + const int batch_size = input->dims()[0]; + const int m = input->dims()[1]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + + const int k_h = filter.dims()[2]; + const int k_w = filter.dims()[3]; + + const int c = output_grad->dims()[1]; // output channels + const int o_h = output_grad->dims()[2]; + const int o_w = output_grad->dims()[3]; + + // Only im2col functor required for bp to get to the right shape + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kCFO, Place, T> + im2col; + + // use col_shape in the im2col and col2im calculation + DDim col_shape = {c, k_h, k_w, h, w}; + + // use col_matrix_shape in the gemm calculation + DDim col_matrix_shape_f = {c * h * w, k_h * k_w}; + + Tensor col; + col.mutable_data(col_shape, context.GetPlace()); + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. + + DDim output_shape = {c, o_h, o_w}; + DDim input_matrix_shape = {m, h * w}; + + DDim filter_matrix_shape = {m, c * k_h * k_w}; + filter.Resize(filter_matrix_shape); + + // convolution transpose grad on input: + // im2col + gemm (similar to conv-forward) + // input need to compute gradient + if (input_grad) { + Tensor col_matrix; + col_matrix.ShareDataWith(col); + DDim col_matrix_shape = {c * k_h * k_w, h * w}; + col_matrix.Resize(col_matrix_shape); + + input_grad->mutable_data(context.GetPlace()); + auto t = framework::EigenVector::Flatten(*input_grad); + t.device(context.GetEigenDevice()) = t.constant(static_cast(0)); + + for (int i = 0; i < batch_size; i++) { + // batch with size (c, o_h * o_w) + Tensor output_grad_batch = + output_grad->Slice(i, i + 1).Resize(output_shape); + // filter of size (m, c * k_h * k_w) + + // batch with size (m, h, w) + Tensor input_grad_batch = + input_grad->Slice(i, i + 1).Resize(input_matrix_shape); + + // im2col: dy from (c, o_h, o_w) -> (c * k_h * k_w, h * w) + im2col(context.device_context(), output_grad_batch, col, strides[0], + strides[1], paddings[0], paddings[0], paddings[1], paddings[1]); + + // gemm: dx = filter * dy + // (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, c, h) + math::matmul(context.device_context(), filter, false, + col_matrix, false, T(1.0), &input_grad_batch, + T(0.0)); + } + } + + // filter gradient required + if (filter_grad) { + Tensor col_matrix_f; + col_matrix_f.ShareDataWith(col); + DDim col_matrix_shape_f = {c * h * w, k_h * k_w}; + col_matrix_f.Resize(col_matrix_shape_f); + + filter_grad->mutable_data(context.GetPlace()); + Tensor filter_grad_ = *filter_grad; + filter_grad_.Resize(filter_matrix_shape); + auto t = framework::EigenVector::Flatten(filter_grad_); + t.device(context.GetEigenDevice()) = t.constant(static_cast(0)); + + for (int i = 0; i < batch_size; ++i) { + // batch with size (c, o_h, o_w) + Tensor output_grad_batch = + output_grad->Slice(i, i + 1).Resize(output_shape); + // input batch + Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); + + // im2col: (c * h * w, k_h * k_w) + im2col(context.device_context(), output_grad_batch, col, strides[0], + strides[1], paddings[0], paddings[0], paddings[1], paddings[1]); + + // gemm: d_filter = x * y_grad^T + // (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, c, h) + math::matmul(context.device_context(), in_batch, false, + col_matrix_f, true, T(1.0), &filter_grad_, + T(1.0)); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc index a994d916761da3b44cb60bd4c1c767cd1987522f..ed78e9e3a3a49b7ff0990b8d13cfe2dae594b722 100644 --- a/paddle/operators/crop_op.cc +++ b/paddle/operators/crop_op.cc @@ -59,7 +59,8 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker { "The input should be a k-D tensor(k > 0 and k < 7)"); AddInput("Y", "The input used as reference for cropping" - " with the same dimension as X. "); + " with the same dimension as X. ") + .AsDispensable(); AddOutput("Out", "The output of crop op " "with the same dimension as X."); diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc deleted file mode 100644 index 7c422c81fc479fa2e317bdee1b66017096381d27..0000000000000000000000000000000000000000 --- a/paddle/operators/fc_op.cc +++ /dev/null @@ -1,200 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/framework/op_registry.h" -#include "paddle/operators/net_op.h" - -namespace paddle { -namespace operators { - -class FCOp : public NetOp { - public: - FCOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : NetOp(type, inputs, outputs, attrs) { - PADDLE_ENFORCE(!Inputs("X").empty(), - "Inputs(X) of FCOp should not be null."); - PADDLE_ENFORCE(!Inputs("W").empty(), - "Inputs(W) of FCOp should not be null."); - PADDLE_ENFORCE(!Outputs("MulOut").empty(), - "Outputs(MulOut) of FCOp should not be null."); - PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName, - "Output(Out) of FCOp should not be null."); - - auto x = Inputs("X"); - auto w = Inputs("W"); - auto mul_out = Outputs("MulOut"); - PADDLE_ENFORCE_EQ( - x.size(), w.size(), - "The size of inputs X(%d) should be the same as that of weights W(%d).", - x.size(), w.size()); - PADDLE_ENFORCE_EQ(mul_out.size(), x.size(), - "The size of intermediate mul_out(%d) should be the same " - "as that of inputs X(%d).", - mul_out.size(), x.size()); - - size_t n = x.size(); - PADDLE_ENFORCE_GE(n, static_cast(1), - "The size of inputs X(%d) should be no less than 1.", n); - - auto x_num_col_dims = Attr>("xNumColDims"); - - // Set all values or set no values (use the default value) - if (!x_num_col_dims.empty()) { - PADDLE_ENFORCE_EQ(x_num_col_dims.size(), n, - "The size of attribute xNumColDims(%d) should be the " - "same as that of inputs X(%d).", - x_num_col_dims.size(), n); - } else { - x_num_col_dims.resize(n); - for (size_t i = 0; i < n; i++) { - x_num_col_dims[i] = 1; - } - } - - // mul_out[i] = X[i] * W[i] - for (size_t i = 0; i < n; i++) { - framework::AttributeMap mul_attr; - mul_attr["x_num_col_dims"] = static_cast(x_num_col_dims[i]); - mul_attr["y_num_col_dims"] = static_cast(1); - AppendOp( - framework::OpRegistry::CreateOp("mul", {{"X", {x[i]}}, {"Y", {w[i]}}}, - {{"Out", {mul_out[i]}}}, mul_attr)); - } - - // sum_out = X[0] * W[0] + ... + X[n-1] * W[n-1] - auto sum_out = mul_out[0]; - if (n > 1) { - PADDLE_ENFORCE_NE(Output("SumOut"), framework::kEmptyVarName, - "Output(SumOut) of FCOp should not be null when the " - "size of Inputs(X) > 1."); - - sum_out = Output("SumOut"); - AppendOp(framework::OpRegistry::CreateOp("sum", {{"X", {mul_out}}}, - {{"Out", {sum_out}}}, {})); - } else { - if (Output("SumOut") != framework::kEmptyVarName) { - this->Rename(Output("SumOut"), framework::kEmptyVarName); - } - } - - // add_out = sum_out + b - auto b = Input("B"); - auto add_out = sum_out; - if (b != framework::kEmptyVarName) { - PADDLE_ENFORCE_NE( - Output("AddOut"), framework::kEmptyVarName, - "Output(AddOut) of FCOp should not be null when Input(B) is set."); - - add_out = Output("AddOut"); - AppendOp(framework::OpRegistry::CreateOp( - "elementwise_add", {{"X", {sum_out}}, {"Y", {Input("B")}}}, - {{"Out", {add_out}}}, {})); - } else { - if (Output("AddOut") != framework::kEmptyVarName) { - this->Rename(Output("AddOut"), framework::kEmptyVarName); - } - } - - auto activation = Attr("activation"); - AppendOp(framework::OpRegistry::CreateOp(activation, {{"X", {add_out}}}, - {{"Y", {Output("Out")}}}, {})); - CompleteAddOp(false); - } -}; - -class FCOpMaker : public framework::OpProtoAndCheckerMaker { - public: - FCOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", - "(A vector of Tensors) each input Tensor can be of arbitrary " - "dimension, and will be reshaped to a 2-D matrix of size " - "(minibatch, number_of_input_features) according to attribute " - "xNumColDims.") - .AsDuplicable(); - AddInput("W", - "(A vector of Tensors) the weights of FC operator, a " - "vector of 2-D matrix of size " - "(number_of_input_features, number_of_neurons).") - .AsDuplicable(); - AddInput("B", - "(Tensor) the bias of FC operator, a 1-D vector of size " - "number_of_neurons."); - - AddOutput("Out", - "(Tensor) the activated output matrix of FC operator, a 2-D " - "matrix of size (minibatch, number_of_neurons)."); - AddOutput("MulOut", - "(A vector of Tensors) the intermediate outputs of FC operator, " - "each Tensor saving the product of X_i * W_i.") - .AsIntermediate() - .AsDuplicable(); - AddOutput( - "SumOut", - "(Tensor) the intermediate output of FC operator, " - "saving the sum of the products of X and W, that is sum{X_i * W_i}.") - .AsIntermediate(); - AddOutput("AddOut", - "(Tensor) the non-actived output of FC operator, " - "saving sum{X_i * W_i} + B.") - .AsIntermediate(); - AddAttr( - "activation", - "(string, default identity) the activation type of FC operator.") - .SetDefault("identity") - .InEnum({"identity", "sigmoid", "softmax"}); - AddAttr>( - "xNumColDims", - "(std::vector) The inputs Tensors of FC operator can be of " - "more than 2 dimensions. In that case, each input Tensor `X_i` will be " - "reshaped to a 2-D matrix. The matrix's first dimension " - "(the length of column) will be the product of `X_i`'s last " - "`xNumColDims_i` dimensions, that is " - "`X_i.dims[0] x ... x X_i.dims[xNumColDims_i - 1]`. " - "The matrix's second dimension (the length of row) will be the product " - "of `X_i`'s first `rank - xNumColDims_i` dimensions, that is " - "`X_i.dims[xNumColDims_i] x ... x X_i.dims[rank - 1]`)") - .SetDefault(std::vector{}); - - AddComment(R"DOC( -Fully Connected Operator, known as Fully Connected Layer or Inner Product Layer -in Convolutional Neural Networks. Neurons in a fully connected layer have -full connections to all activations in the previous layer. -It computes an inner product of a set of -learned weights with a matrix multiplication followed by a bias offset -(optionally). - -Equation: - Out = Act(sum_n{X_i * W_i} + B) - -where X_i is Tensor that will be reshaped to a 2-D matrix of size (M x K), -usually M is the minibatch size and K is the number of input features. -W_i is a 2-D matrix of size (K x N), where N means the number of neurons -in the fully connected layer. B is a 1-D vector of size N. -Thus, the output Out is a 2-D matrix of size (M x N). -Activation type can be set to `identity` (default), `sigmoid` or `softmax`. - -All the inputs can carry the LoD (Level of Details) information, -or not. But the output only shares the LoD with first input (`X[0]`). -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(fc, ops::FCOp, ops::FCOpMaker); diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc index 72dd841c85ce9934a57263d10c366e679693c471..a596f93769780419d27b7c0b40631d3da78e6700 100644 --- a/paddle/operators/gru_unit_op.cc +++ b/paddle/operators/gru_unit_op.cc @@ -54,8 +54,7 @@ class GRUUnitOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( weight_width, frame_size * 3, "The shape of Weight matrix must be [frame_size, frame_size * 3]."); - auto bias = Input("Bias"); - if (bias != framework::kEmptyVarName) { + if (ctx->HasInput("Bias")) { auto bias_dims = ctx->GetInputDim("Bias"); int bias_height = bias_dims[0]; int bias_width = bias_dims[1]; @@ -89,7 +88,8 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker { "weights of output candidate with shape [frame_size, frame_size]"); AddInput("Bias", "(Tensor) Bias vector with shape [1, frame_size * 3] concating " - "bias of the update gate, reset gate and output candidate."); + "bias of the update gate, reset gate and output candidate.") + .AsDispensable(); AddOutput("Gate", "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the " "output of update gate, reset gate and output candidate") diff --git a/paddle/operators/identity_op.cc b/paddle/operators/identity_op.cc deleted file mode 100644 index 2cc632205e63abbe412b09af4b894420ac512ec5..0000000000000000000000000000000000000000 --- a/paddle/operators/identity_op.cc +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/operators/net_op.h" -#include "paddle/operators/scale_op.h" - -namespace paddle { -namespace operators { - -// The identity operator is an alias of the scale operator. This is also an -// example for creating an alias for an existing operator. -template -class IdentityOpMaker : public framework::OpProtoAndCheckerMaker { - public: - IdentityOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The input tensor of identity operator."); - AddOutput("Y", "The output tensor of identity operator."); - AddComment(R"DOC( -The identity operator is an alias of the scale operator -with the attribute scale fixed to 1.0. -)DOC"); - } -}; - -template -class IdentityOp : public NetOp { - public: - IdentityOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : NetOp(type, inputs, outputs, attrs) { - PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName, - "Input(X) of IdentityOp should not be null."); - PADDLE_ENFORCE_NE(Output("Y"), framework::kEmptyVarName, - "Output(Y) of IdentityOp should not be null."); - - AppendOp(framework::OpRegistry::CreateOp( - "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Y")}}}, - {{"scale", static_cast(1)}})); - CompleteAddOp(false); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp, - ops::IdentityOpMaker); diff --git a/paddle/operators/interp_op.cc b/paddle/operators/interp_op.cc deleted file mode 100644 index d02b01c3f3a1b30ec27253140203b076a98ce0c2..0000000000000000000000000000000000000000 --- a/paddle/operators/interp_op.cc +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/framework/op_registry.h" -#include "paddle/operators/net_op.h" - -namespace paddle { -namespace operators { - -class InterpOp : public NetOp { - public: - InterpOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : NetOp(type, inputs, outputs, attrs) { - PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName, - "Input(X) of InterpOp should not be null."); - PADDLE_ENFORCE_NE(Input("Y"), framework::kEmptyVarName, - "Input(Y) of InterpOp should not be null."); - PADDLE_ENFORCE_NE(Input("W"), framework::kEmptyVarName, - "Input(W) of InterpOp should not be null."); - PADDLE_ENFORCE_NE(Output("SubOut"), framework::kEmptyVarName, - "Output(SubOut) of InterpOp should not be null."); - PADDLE_ENFORCE_NE(Output("MulOut"), framework::kEmptyVarName, - "Output(MulOut) of InterpOp should not be null."); - PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName, - "Output(Out) of InterpOp should not be null."); - - // SubOut = X - Y - auto x = Input("X"); - auto y = Input("Y"); - auto sub_out = Output("SubOut"); - AppendOp(framework::OpRegistry::CreateOp( - "elementwise_sub", {{"X", {x}}, {"Y", {y}}}, {{"Out", {sub_out}}}, {})); - - // MulOut = SubOut * W = (X - Y) * W - auto w = Input("W"); - auto mul_out = Output("MulOut"); - AppendOp(framework::OpRegistry::CreateOp( - "elementwise_mul", {{"X", {sub_out}}, {"Y", {w}}}, {{"Out", {mul_out}}}, - {{"axis", 0}})); - - // Out = MulOut + Y = (X - Y) * W + Y = X * W + Y * (1 - W) - AppendOp(framework::OpRegistry::CreateOp("elementwise_add", - {{"X", {mul_out}}, {"Y", {y}}}, - {{"Out", {Output("Out")}}}, {})); - - CompleteAddOp(false); - } -}; - -class InterpOpMaker : public framework::OpProtoAndCheckerMaker { - public: - InterpOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", - "(Tensor), 2-D Matrix of shape [batch_size, data_dim]" - "containing data samples, the first input of interp_op"); - AddInput("Y", - "(Tensor), 2-D Matrix of shape `[batch_size, data_dim]`" - "containing data samples, the second input of interp_op"); - AddInput("W", - "(Tensor), 1-D Vector of shape [batch_size]," - "the interpolated values in the half-open interval [0.0, 1.0)"); - AddOutput("SubOut", - "(Tensor), the intermediate subtraction outputs, saving X - Y.") - .AsIntermediate(); - AddOutput("MulOut", - "(Tensor), the intermediate multiplication outputs," - "saving the elementwise multiplication of (X - Y) and W.") - .AsIntermediate(); - AddOutput("Out", - "(Tensor), the output of interp_op, same shape with X," - "returns the first-dimensional piecewise linear interpolant " - "between X and Y"); - AddComment(R"DOC( - Linear Interpolation with two inputs, used in NEURAL TURING MACHINE. - - Equation: - Out.row[i] = X.row[i] * W[i] + Y.row[i] * (1 - W[i]) - = (X.row[i] - Y.row[i]) * W[i] + Y.row[i] - - Example: - X = [[1,2],[3,4]], - Y = [[2,1],[4,3]], - W = [0.3, 0.4] - - Then, Out = [[1.7,1.3],[3.6,3.4]] - - where 1.7 = 1*0.3+2*(1-0.3), - 1.3 = 2*0.3+1*(1-0.3), - 3.6 = 3*0.4+4*(1-0.4), - 3.4 = 4*0.4+3*(1-0.4) -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(interp, ops::InterpOp, ops::InterpOpMaker); diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc index b88cd14d78f616b0e57386ab891dad1a872bfe65..ad86a2e5bc23b2b0ea853971cf79dec745e9706a 100644 --- a/paddle/operators/lookup_table_op.cc +++ b/paddle/operators/lookup_table_op.cc @@ -32,6 +32,9 @@ class LookupTableOp : public framework::OperatorWithKernel { auto table_dims = ctx->GetInputDim("W"); auto ids_dims = ctx->GetInputDim("Ids"); + PADDLE_ENFORCE_EQ(ids_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[1], 1); + ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]}); ctx->ShareLoD("Ids", /*->*/ "Out"); } @@ -53,7 +56,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { " which is a learnable parameter."); AddInput("Ids", "An input with type int32 or int64" - "contains the ids to be looked up in W."); + "contains the ids to be looked up in W." + "Ids must be a column vector with rank = 2." + "The 2nd dimension size must be 1"); AddOutput("Out", "The lookup results, which have the same type with W."); AddComment(R"DOC( This operator is used to perform lookups on the parameter W, diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0a089b7c2dc1e05224525bc4fe5399ec39036d01 --- /dev/null +++ b/paddle/operators/lstm_op.cc @@ -0,0 +1,226 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/lstm_op.h" + +namespace paddle { +namespace operators { + +class LSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(Hidden) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Output(Cell) of LSTM should not be null."); + + auto x_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(Cell) and Input(Hidden) of LSTM should not " + "be null at the same time."); + auto h_dims = ctx->GetInputDim("H0"); + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + } + + int frame_size = x_dims[1] / 4; + auto w_dims = ctx->GetInputDim("Weight"); + PADDLE_ENFORCE_EQ(w_dims.size(), 2, + "The rank of Input(Weight) should be 2."); + PADDLE_ENFORCE_EQ(w_dims[0], frame_size, + "The first dimension of Input(Weight) " + "should be %d.", + frame_size); + PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size, + "The second dimension of Input(Weight) " + "should be 4 * %d.", + frame_size); + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + if (ctx->Attrs().Get("usePeepholes")) { + PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size, + "The second dimension of Input(Bias) should be " + "7 * %d if enable peepholes connection", + frame_size); + } else { + PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + "The second dimension of Input(Bias) should be " + "4 * %d if disable peepholes connection", + frame_size); + } + ctx->SetOutputDim("Hidden", {x_dims[0], frame_size}); + ctx->SetOutputDim("Cell", {x_dims[0], frame_size}); + ctx->SetOutputDim("BatchGate", x_dims); + ctx->ShareLoD("Input", "Hidden"); + ctx->ShareLoD("Input", "Cell"); + } +}; + +class LSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LSTMOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(LoDTensor) the first input is a LodTensor, which support " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X 4D), where, T is the " + "total time steps in this mini-batch, D is the hidden size."); + AddInput("H0", + "(Tensor, optional) the initial hidden state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size, D is the hidden size."); + AddInput("C0", + "(Tensor, optional) the initial cell state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `H0` and `C0` can be NULL but only at the same time"); + AddInput("Weight", + "(Tensor) the learnable hidden-hidden weights." + " - The shape is (D x 4D), where D is the hidden size. " + " - Weight = {W_ch, W_ih, W_fh, W_oh}"); + AddInput("Bias", + "(Tensor) the learnable weights, which contains two parts: " + "input-hidden bias weight and peephole connections weight if " + "setting `usePeepholes` True. " + "1. `usePeepholes = False` " + " - The shape is (1 x 4D). " + " - Bias = {b_c, b_i, b_f, b_o}." + "2. `usePeepholes = True` " + " - The shape is (1 x 7D). " + " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddOutput("BatchGate", + "(LoDTensor) This LoDTensor contains input gate, forget gate " + "and output gate after the nonlinear computation. This " + "LoDTensor has the same shape with the reorganized input, which " + "was also be called batch input. The LoD size is 2. The first " + "LoD is the batch offsets and the second LoD contains the " + "indexes, which denote the position of reorganized sequence " + "in the raw input.") + .AsIntermediate(); + AddOutput("Hidden", + "(LoDTensor) the hidden state lod tensor of LSTM operator. " + "The shape and lod is the same with the `Input`."); + AddOutput("Cell", + "(LoDTensor) the cell state lod tensor of LSTM operator. " + "The shape and lod is the same with the `Input`."); + AddAttr("usePeepholes", + "(bool, defalut: True) " + "whether to enable diagonal/peephole connections.") + .SetDefault(true); + AddAttr("isReverse", + "(bool, defalut: False) " + "whether to compute reversed LSTM.") + .SetDefault(false); + AddAttr( + "gateActivation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid"); + AddAttr("cellActivation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh"); + AddAttr("candidateActivation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh"); + AddComment(R"DOC(Long-Short Term Memory (LSTM) Operator + +The defalut implementation is diagonal/peephole connection [1], the formula is +as follows + + i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) + + f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) + + \tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) + + o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) + + c_t = f_t ⊙ c_{t-1} + i_t ⊙ \tilde{c_t} + + h_t = o_t ⊙ act_h(c_t) + +where the W terms denote weight matrices (e.g. \f$W_{xi}\f$ is the matrix +of weights from the input gate to the input), \f$W_{ic}, W_{fc}, W_{oc}\f$ +are diagonal weight matrices for peephole connections. In our implenmention, +We use vectors to reprenset these diagonal weight matrices. The b terms +denote bias vectors (\f$b_i\f$ is the input gate bias vector), \f$\sigma\f$ +is the non-line actications, such as logistic sigmoid function, and +\f$i, f, o\f$ and \f$c\f$ are respectively the input gate, forget gate, +output gate and cell activation vectors, all of which are the same size as +the cell output activation vector \f$h\f$. + +The ⊙ is the element-wise product of the vectors, \f$act_g\f$ and \f$act_h\f$ +are the cell input and cell output activation functions, `tanh` is usually +used for them. \f$\tilde{c_t}\f$ is also called candidate hidden state, +which is computed based on the current input and the previous hidden state. + +Set `usePeepholes` False to disable peephole connection [2]. The formula +is omitted here. + +@note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$ +operations on the input x_{t} were NOT included in this operator. +Users can choose to use fully-connect operator before LSTM operator. + +[1] Hasim Sak, Andrew Senior, and Francoise Beaufays. Long short-term memory +recurrent neural network architectures for large scale acoustic modeling. +INTERSPEECH, 2014. + +[2] S. Hochreiter and J. Schmidhuber. Long Short-Term Memory. +Neural Computation, 9(8):1735-1780, 1997. + +)DOC"); + } +}; + +class LSTMGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")), + "Input(Hidden@GRAD) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cell")), + "Input(Cell@GRAD) should not be null"); + ctx->SetOutputDim(framework::GradVarName("Weight"), + ctx->GetInputDim("Weight")); + ctx->SetOutputDim(framework::GradVarName("Bias"), ctx->GetInputDim("Bias")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp); +REGISTER_OP_CPU_KERNEL(lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CPU_KERNEL(lstm_grad, + ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/operators/lstm_op.cu b/paddle/operators/lstm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..9ad56941553bf19a56c25f41f76fe20dfa3a106f --- /dev/null +++ b/paddle/operators/lstm_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/lstm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_GPU_KERNEL(lstm_grad, + ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..0af5694c48fcb4437e3acd422606de013bb2e145 --- /dev/null +++ b/paddle/operators/lstm_op.h @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/lstm_compute.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/sequence2batch.h" + +namespace paddle { +namespace operators { + +using framework::LoDTensor; +using framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class LSTMKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* weight = ctx.Input("Weight"); + auto* bias = ctx.Input("Bias"); + + auto* batch_gate = ctx.Output("BatchGate"); + batch_gate->mutable_data(ctx.GetPlace()); + auto* hidden_out = ctx.Output("Hidden"); + hidden_out->mutable_data(ctx.GetPlace()); + auto* cell_out = ctx.Output("Cell"); + cell_out->mutable_data(ctx.GetPlace()); + + // Now the function ShareLoD in InferShape is not implemented. + // So copy LoD here. + ctx.ShareLoD("Input", "Hidden"); + ctx.ShareLoD("Input", "Cell"); + + bool is_reverse = ctx.Attr("isReverse"); + math::LoDTensor2BatchFunctor to_batch; + to_batch(ctx.device_context(), *input, *batch_gate, is_reverse); + + auto in_dims = input->dims(); + int frame_size = static_cast(in_dims[1] / 4); + framework::DDim dims({in_dims[0], frame_size}); + + if (bias) { + Eigen::array extents({{1, 4 * frame_size}}); + Eigen::array offsets({{0, 0}}); + auto b = EigenMatrix::From(*bias); + auto gate = EigenMatrix::From(*batch_gate); + gate.device(ctx.GetEigenDevice()) = + gate + + b.slice(offsets, extents) + .reshape(Eigen::array({{1, frame_size * 4}})) + .broadcast( + Eigen::array({{static_cast(in_dims[0]), 1}})); + } + + math::LstmMetaValue lstm_value; + T* bias_data = const_cast(bias->data()); + // the code style in LstmMetaValue will be updated later. + lstm_value.checkIg = bias_data + 4 * frame_size; + lstm_value.checkFg = lstm_value.checkIg + frame_size; + lstm_value.checkOg = lstm_value.checkFg + frame_size; + lstm_value.prevStateValue = nullptr; + + framework::LoDTensor batch_out, batch_cell, batch_cell_pre_act; + batch_out.mutable_data(dims, ctx.GetPlace()); + batch_cell.mutable_data(dims, ctx.GetPlace()); + batch_cell_pre_act.mutable_data(dims, ctx.GetPlace()); + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto gate_act = ctx.Attr("gateActivation"); + auto cell_act = ctx.Attr("cellActivation"); + auto cand_act = ctx.Attr("candidateActivation"); + + for (size_t n = 0; n < num_batch; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor gate_t = batch_gate->Slice(bstart, bend); + Tensor out_t = batch_out.Slice(bstart, bend); + Tensor cell_t = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act_t = batch_cell_pre_act.Slice(bstart, bend); + + int cur_batch_size = bend - bstart; + + if (n != 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_hidden_t = batch_out.Slice(pre_h_start, pre_h_end); + math::matmul(ctx.device_context(), pre_hidden_t, false, + *weight, false, static_cast(1.0), &gate_t, + static_cast(1.0)); + } + // else if : FIXME support the initial hidden and cell + + lstm_value.gateValue = gate_t.data(); + lstm_value.outputValue = out_t.data(); + lstm_value.stateValue = cell_t.data(); + lstm_value.stateActiveValue = cell_pre_act_t.data(); + math::LstmUnitFunctor::compute(ctx.device_context(), lstm_value, + frame_size, cur_batch_size, + gate_act, cell_act, cand_act); + lstm_value.prevStateValue = lstm_value.stateValue; + } + + math::Batch2LoDTensorFunctor to_seq; + batch_out.set_lod(batch_gate->lod()); + // restore the output hidden in LoDTensor from the batch hidden + to_seq(ctx.device_context(), batch_out, *hidden_out); + + batch_cell.set_lod(batch_gate->lod()); + // restore the output cell state in LoDTensor from the batch cell + to_seq(ctx.device_context(), batch_cell, *cell_out); + } +}; + +template +class LSTMGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override {} +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/lstm_unit_op.h b/paddle/operators/lstm_unit_op.h index a0ff498c1d3ed2aaa10f5473ef91de168c250649..625b1852c2f0eb2ed435f73fea251c40c614a7dd 100644 --- a/paddle/operators/lstm_unit_op.h +++ b/paddle/operators/lstm_unit_op.h @@ -19,7 +19,6 @@ namespace paddle { namespace operators { -using framework::LoDTensor; using framework::Tensor; template diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index 72ce8585045b5166df424a401442db39b47ab098..5598669ef96535b7d47150052b3841771c37c60b 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -1,3 +1,5 @@ +add_subdirectory(detail) + if(WITH_GPU) nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context operator) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor) @@ -7,6 +9,8 @@ if(WITH_GPU) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) + nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) + nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) else() cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) @@ -14,6 +18,8 @@ else() cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator) cc_library(pooling SRCS pooling.cc DEPS device_context) cc_library(vol2col SRCS vol2col.cc DEPS device_context) + cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) + cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) endif() cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) diff --git a/paddle/operators/math/cross_entropy.cu b/paddle/operators/math/cross_entropy.cu index 367190e6b0682ec62550e869e2f04c3a2b2cbec3..db878129d650d663e187ecabb106eea0e39db6fa 100644 --- a/paddle/operators/math/cross_entropy.cu +++ b/paddle/operators/math/cross_entropy.cu @@ -22,8 +22,6 @@ namespace { template __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label, const int N, const int D) { - // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file. - // CUDA_1D_KERNEL_LOOP(i, N) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { PADDLE_ASSERT(label[i] >= 0 && label[i] < D); diff --git a/paddle/operators/math/detail/CMakeLists.txt b/paddle/operators/math/detail/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..49cf228de2204cb4888cf645a0cb68ed04cc3371 --- /dev/null +++ b/paddle/operators/math/detail/CMakeLists.txt @@ -0,0 +1,5 @@ +if(WITH_AVX) + cc_library(activation_functions SRCS hl_cpu_functions.cc hl_avx_functions.cc) +else() + cc_library(activation_functions SRCS hl_cpu_functions.cc) +endif() diff --git a/paddle/operators/math/detail/hl_activation_functions.h b/paddle/operators/math/detail/hl_activation_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..9d7d9914f0090bff17049038dfa2288d84f3dbda --- /dev/null +++ b/paddle/operators/math/detail/hl_activation_functions.h @@ -0,0 +1,188 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef HL_ACTIVATION_FUNCTIONS_H_ +#define HL_ACTIVATION_FUNCTIONS_H_ + +#include "hl_functions.h" +#include "paddle/operators/math/lstm_compute.h" + +/** + * Active functions: sigmoid, relu, tanh and linear. + */ +#define FLOAT_ACTIVE_FUNCTION \ + { \ + hppl::typef::sigmoid, hppl::typef::relu, hppl::typef::tanh, \ + hppl::typef::linear \ + } + +#define DOUBLE_ACTIVE_FUNCTION \ + { \ + hppl::typed::sigmoid, hppl::typed::relu, hppl::typed::tanh, \ + hppl::typed::linear \ + } + +#define AVX_ACTIVE_FUNCTION \ + { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear } + +namespace hppl { + +using activation_mode_t = paddle::operators::math::activation_mode_t; + +/** + * Hppl supports sigmoid, relu, tanh, linear active functions + * for neural networks' forward and backward activation. + */ +template +class Active { + public: + typedef T (*forward)(T); + typedef T (*backward)(T, T); +}; + +template +struct ForwardActType; + +template <> +struct ForwardActType { + using type = Active::forward; +}; + +template <> +struct ForwardActType { + using type = Active::forward; +}; + +template +struct BackwardActType; + +template <> +struct BackwardActType { + using type = Active::backward; +}; + +template <> +struct BackwardActType { + using type = Active::backward; +}; + +#ifdef __NVCC__ +namespace gpu { +static __device__ Active::forward forward[] = FLOAT_ACTIVE_FUNCTION; +static __device__ Active::backward backward[] = FLOAT_ACTIVE_FUNCTION; + +static __device__ Active::forward forward_d[] = DOUBLE_ACTIVE_FUNCTION; +static __device__ Active::backward backward_d[] = + DOUBLE_ACTIVE_FUNCTION; + +template +struct ForwardAct { + __device__ typename ForwardActType::type operator()( + activation_mode_t type); +}; + +template <> +struct ForwardAct { + __device__ ForwardActType::type operator()(activation_mode_t type) { + return forward[type]; + } +}; + +template <> +struct ForwardAct { + __device__ ForwardActType::type operator()(activation_mode_t type) { + return forward_d[type]; + } +}; + +template +struct BackwardAct { + __device__ typename BackwardActType::type operator()( + activation_mode_t type); +}; + +template <> +struct BackwardAct { + __device__ BackwardActType::type operator()(activation_mode_t type) { + return backward[type]; + } +}; + +template <> +struct BackwardAct { + __device__ BackwardActType::type operator()(activation_mode_t type) { + return backward_d[type]; + } +}; + +} // namespace gpu +#else +namespace cpu { +static Active::forward forward[] = FLOAT_ACTIVE_FUNCTION; +static Active::backward backward[] = FLOAT_ACTIVE_FUNCTION; + +static Active::forward forward_d[] = DOUBLE_ACTIVE_FUNCTION; +static Active::backward backward_d[] = DOUBLE_ACTIVE_FUNCTION; + +template +struct ForwardAct { + typename ForwardActType::type operator()(activation_mode_t type); +}; + +template <> +struct ForwardAct { + ForwardActType::type operator()(activation_mode_t type) { + return forward[type]; + } +}; + +template <> +struct ForwardAct { + ForwardActType::type operator()(activation_mode_t type) { + return forward_d[type]; + } +}; + +template +struct BackwardAct { + typename BackwardActType::type operator()(activation_mode_t type); +}; + +template <> +struct BackwardAct { + BackwardActType::type operator()(activation_mode_t type) { + return backward[type]; + } +}; + +template <> +struct BackwardAct { + BackwardActType::type operator()(activation_mode_t type) { + return backward_d[type]; + } +}; + +} // namespace cpu + +#ifdef __AVX__ +namespace avx { +static Active<__m256>::forward forward[] = AVX_ACTIVE_FUNCTION; +static Active<__m256>::backward backward[] = AVX_ACTIVE_FUNCTION; +} // namespace avx +#endif +#endif + +} // namespace hppl + +#endif // HL_ACTIVATION_FUNCTIONS_H_ diff --git a/paddle/operators/math/detail/hl_avx_functions.cc b/paddle/operators/math/detail/hl_avx_functions.cc new file mode 100644 index 0000000000000000000000000000000000000000..415bac5d93ee00244d072b0998c6941b14d4f8d8 --- /dev/null +++ b/paddle/operators/math/detail/hl_avx_functions.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "hl_functions.h" +// TODO(qingqing) refine this dependence +#include "paddle/cuda/src/avx_mathfun.h" + +namespace hppl { + +__m256 exp(__m256 a) { return exp256_ps(a); } + +__m256 relu(const __m256 a) { + __m256 tmp = _mm256_set1_ps(0.0f); + return _mm256_max_ps(a, tmp); +} + +__m256 sigmoid(const __m256 a) { + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); + __m256 tmp = _mm256_max_ps(a, min); + tmp = _mm256_min_ps(tmp, max); + tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); + tmp = exp(tmp); + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); + tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); + return tmp; +} + +__m256 tanh(const __m256 a) { + __m256 max = _mm256_set1_ps(EXP_MAX_INPUT); + __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); + tmp = _mm256_min_ps(tmp, max); + tmp = exp(tmp); + return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f), + _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), + _mm256_set1_ps(1.0f)); +} + +__m256 linear(const __m256 a) { return a; } + +__m256 relu(const __m256 a, const __m256 b) { + return _mm256_mul_ps( + a, _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS), + _mm256_set1_ps(1.0f))); +} + +__m256 sigmoid(const __m256 a, const __m256 b) { + return _mm256_mul_ps(_mm256_mul_ps(a, b), + _mm256_sub_ps(_mm256_set1_ps(1.0f), b)); +} + +__m256 tanh(const __m256 a, const __m256 b) { + return _mm256_mul_ps( + a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b))); +} + +__m256 linear(const __m256 a, const __m256 b) { return a; } +} // namespace hppl diff --git a/paddle/operators/math/detail/hl_avx_functions.h b/paddle/operators/math/detail/hl_avx_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..35f4eabb4c07c6cc9d2edded02e5b6290b1232f8 --- /dev/null +++ b/paddle/operators/math/detail/hl_avx_functions.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef HL_AVX_FUNCTIONS_H_ +#define HL_AVX_FUNCTIONS_H_ + +#include + +namespace hppl { +__m256 relu(const __m256 a); +__m256 sigmoid(const __m256 a); +__m256 tanh(const __m256 a); +__m256 linear(const __m256 a); + +__m256 relu(const __m256 a, const __m256 b); +__m256 sigmoid(const __m256 a, const __m256 b); +__m256 tanh(const __m256 a, const __m256 b); +__m256 linear(const __m256 a, const __m256 b); +} // namespace hppl + +#endif // HL_AVX_FUNCTIONS_H_ diff --git a/paddle/operators/math/detail/hl_cpu_functions.cc b/paddle/operators/math/detail/hl_cpu_functions.cc new file mode 100644 index 0000000000000000000000000000000000000000..21ec78f9629af0e4673a56517d76ac6734f57db8 --- /dev/null +++ b/paddle/operators/math/detail/hl_cpu_functions.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "hl_functions.h" + +namespace hppl { +namespace typef { + +float relu(const float a) { + return a > static_cast(0.0) ? a : static_cast(0.0); +} + +float sigmoid(const float a) { + const float min = SIGMOID_THRESHOLD_MIN; + const float max = SIGMOID_THRESHOLD_MAX; + float tmp = (a < min) ? min : ((a > max) ? max : a); + return static_cast(1.0) / (static_cast(1.0) + exp(-tmp)); +} + +float tanh(const float a) { + float tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + return (2.0 / (1.0 + exp(tmp))) - 1.0; +} + +float linear(const float a) { return a; } + +float relu(const float a, const float b) { return a * (b > 0.0 ? 1.0 : 0.0); } + +float sigmoid(const float a, const float b) { + return a * b * (static_cast(1) - b); +} + +float tanh(const float a, const float b) { + return a * (static_cast(1) - b * b); +} + +float linear(const float a, const float b) { return a; } + +} // namespace typef + +namespace typed { +double relu(const double a) { + return a > static_cast(0.0) ? a : static_cast(0.0); +} + +double sigmoid(const double a) { + const double min = SIGMOID_THRESHOLD_MIN; + const double max = SIGMOID_THRESHOLD_MAX; + double tmp = (a < min) ? min : ((a > max) ? max : a); + return static_cast(1.0) / (static_cast(1.0) + exp(-tmp)); +} + +double tanh(const double a) { + double tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + return (2.0 / (1.0 + exp(tmp))) - 1.0; +} + +double linear(const double a) { return a; } + +double relu(const double a, const double b) { + return a * (b > 0.0 ? 1.0 : 0.0); +} + +double sigmoid(const double a, const double b) { + return a * b * (static_cast(1) - b); +} + +double tanh(const double a, const double b) { + return a * (static_cast(1) - b * b); +} + +double linear(const double a, const double b) { return a; } + +} // namespace typed +} // namespace hppl diff --git a/paddle/operators/math/detail/hl_functions.h b/paddle/operators/math/detail/hl_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..3e2f0c9ee6d3ae2ed598c4d5f09b85b7d61fdd51 --- /dev/null +++ b/paddle/operators/math/detail/hl_functions.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef HL_FUNCTIONS_H_ +#define HL_FUNCTIONS_H_ + +/** + * sigmoid threshold maximum + */ +#define SIGMOID_THRESHOLD_MIN -40.0 + +/** + * sigmoid threshold minimum + */ +#define SIGMOID_THRESHOLD_MAX 13.0 + +/** + * The maximum input value for exp, used to avoid overflow problem. + * currently only used for tanh function. + */ +#define EXP_MAX_INPUT 40.0 + +#ifndef __NVCC__ +namespace hppl { +namespace typef { +float relu(const float a); +float sigmoid(const float a); +float tanh(const float a); +float linear(const float a); + +float relu(const float a, const float b); +float sigmoid(const float a, const float b); +float tanh(const float a, const float b); +float linear(const float a, const float b); + +} // namespace typef + +namespace typed { +double relu(const double a); +double sigmoid(const double a); +double tanh(const double a); +double linear(const double a); + +double relu(const double a, const double b); +double sigmoid(const double a, const double b); +double tanh(const double a, const double b); +double linear(const double a, const double b); +} // namespace typed + +} // namespace hppl + +#ifdef __AVX__ +#include "hl_avx_functions.h" +#endif + +#else +#include "hl_gpu_functions.h" +#endif + +#endif // HL_FUNCTIONS_H_ diff --git a/paddle/operators/math/detail/hl_gpu_functions.h b/paddle/operators/math/detail/hl_gpu_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..72f2204e7b2cfdba1367b51e3731dde11fb292d6 --- /dev/null +++ b/paddle/operators/math/detail/hl_gpu_functions.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef HL_GPU_FUNCTIONS_CUH_ +#define HL_GPU_FUNCTIONS_CUH_ + +#include "hl_base.h" + +namespace hppl { +namespace typef { + +__device__ static float relu(const float a) { return a > 0.0f ? a : 0.0f; } + +__device__ static float sigmoid(const float a) { + const float min = SIGMOID_THRESHOLD_MIN; + const float max = SIGMOID_THRESHOLD_MAX; + float tmp = (a < min) ? min : ((a > max) ? max : a); + return __fdividef(1.0f, 1.0f + __expf(-tmp)); +} + +__device__ static float tanh(const float a) { + float tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + return __fdividef(2.0f, (1.0f + __expf(-2.0f * tmp))) - 1.0f; +} + +__device__ static float linear(const float a) { return a; } + +__device__ static float relu(const float a, const float b) { + return a * (b > 0.0f ? 1.0f : 0.0f); +} + +__device__ static float sigmoid(const float a, const float b) { + return a * b * (1.0f - b); +} + +__device__ static float tanh(const float a, const float b) { + return a * (1.0f - b * b); +} + +__device__ static float linear(const float a, const float b) { return a; } + +} // namespace typef + +namespace typed { + +__device__ static double relu(const double a) { return a > 0.0 ? a : 0.0; } + +__device__ static double sigmoid(const double a) { + const double min = SIGMOID_THRESHOLD_MIN; + const double max = SIGMOID_THRESHOLD_MAX; + double tmp = (a < min) ? min : ((a > max) ? max : a); + return 1.0 / (1.0 + exp(-tmp)); +} + +__device__ static double tanh(const double a) { + double tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + return (2.0 / (1.0 + exp(-2.0 * a))) - 1.0; +} + +__device__ static double linear(const double a) { return a; } + +__device__ static double relu(const double a, const double b) { + return a * (b > 0.0 ? 1.0 : 0.0); +} + +__device__ static double sigmoid(const double a, const double b) { + return a * b * (1 - b); +} + +__device__ static double tanh(const double a, const double b) { + return a * (1.0 - b * b); +} + +__device__ static double linear(const double a, const double b) { return a; } + +} // namespace typef + +} // namespace hppl + +#endif // HL_GPU_FUNCTIONS_CUH_ diff --git a/paddle/operators/math/detail/lstm_cpu_kernel.h b/paddle/operators/math/detail/lstm_cpu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..74d51d7bc9b91f4c8088384d77183131f57aafab --- /dev/null +++ b/paddle/operators/math/detail/lstm_cpu_kernel.h @@ -0,0 +1,310 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/operators/math/detail/hl_activation_functions.h" +#include "paddle/operators/math/lstm_compute.h" + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +#ifndef __NVCC__ + +template +void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, + int frameSize, + activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { + T rValueIn; + T rValueIg; + T rValueFg; + T rValueOg; + T rCheckI; + T rCheckF; + T rCheckO; + T rState; + T rPrevState = 0; + T rStateAtv; + T rOut; + + T *valueIn = value.gateValue; + T *valueIg = value.gateValue + frameSize; + T *valueFg = value.gateValue + frameSize * 2; + T *valueOg = value.gateValue + frameSize * 3; + + for (int i = 0; i < frameSize; i++) { + rValueIn = valueIn[i]; + rValueIg = valueIg[i]; + rValueFg = valueFg[i]; + rValueOg = valueOg[i]; + rCheckI = value.checkIg[i]; + rCheckF = value.checkFg[i]; + rCheckO = value.checkOg[i]; + + if (value.prevStateValue) { + rPrevState = value.prevStateValue[i]; + } + + hppl::cpu::ForwardAct act; + op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv, + rOut, rCheckI, rCheckF, rCheckO, act(active_node), act(active_gate), + act(active_state)); + + valueIn[i] = rValueIn; + valueIg[i] = rValueIg; + valueFg[i] = rValueFg; + valueOg[i] = rValueOg; + value.stateValue[i] = rState; + value.stateActiveValue[i] = rStateAtv; + value.outputValue[i] = rOut; + } +} + +template +void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, + LstmMetaGrad grad, int frameSize, + activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { + T rValueIn; + T rValueIg; + T rValueFg; + T rValueOg; + T rGradIn; + T rGradIg; + T rGradFg; + T rGradOg; + T rPrevState = 0; + T rPrevStateGrad; + T rState; + T rStateGrad; + T rStateAtv; + T rOutputGrad; + T rCheckI; + T rCheckF; + T rCheckO; + T rCheckIGrad; + T rCheckFGrad; + T rCheckOGrad; + + T *valueIn = value.gateValue; + T *valueIg = value.gateValue + frameSize; + T *valueFg = value.gateValue + frameSize * 2; + T *valueOg = value.gateValue + frameSize * 3; + T *gradIn = grad.gateGrad; + T *gradIg = grad.gateGrad + frameSize; + T *gradFg = grad.gateGrad + frameSize * 2; + T *gradOg = grad.gateGrad + frameSize * 3; + + for (int i = 0; i < frameSize; i++) { + rValueIn = valueIn[i]; + rValueIg = valueIg[i]; + rValueFg = valueFg[i]; + rValueOg = valueOg[i]; + rCheckI = value.checkIg[i]; + rCheckF = value.checkFg[i]; + rCheckO = value.checkOg[i]; + rState = value.stateValue[i]; + rStateAtv = value.stateActiveValue[i]; + rOutputGrad = grad.outputGrad[i]; + rStateGrad = grad.stateGrad[i]; + if (value.prevStateValue) { + rPrevState = value.prevStateValue[i]; + } + + hppl::cpu::BackwardAct act; + op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, + rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, + rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, + rCheckOGrad, act(active_node), act(active_gate), act(active_state)); + + gradIn[i] = rGradIn; + gradIg[i] = rGradIg; + gradFg[i] = rGradFg; + gradOg[i] = rGradOg; + grad.stateGrad[i] = rStateGrad; + + if (grad.prevStateGrad) grad.prevStateGrad[i] = rPrevStateGrad; + if (value.prevStateValue) { + if (grad.checkIgGrad) grad.checkIgGrad[i] += rCheckIGrad; + if (grad.checkFgGrad) grad.checkFgGrad[i] += rCheckFGrad; + } + if (grad.checkOgGrad) grad.checkOgGrad[i] += rCheckOGrad; + } +} + +template +void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, int frameSize, + activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { +#ifdef __AVX__ + __m256 rValueIn; + __m256 rValueIg; + __m256 rValueFg; + __m256 rValueOg; + __m256 rCheckI; + __m256 rCheckF; + __m256 rCheckO; + __m256 rState; + __m256 rPrevState = _mm256_set1_ps(0.0f); + __m256 rStateAtv; + __m256 rOut; + + __m256 *valueIn = (__m256 *)value.gateValue; + __m256 *valueIg = (__m256 *)(value.gateValue + frameSize); + __m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2); + __m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3); + + for (int i = 0; i < frameSize / 8; i++) { + rValueIn = valueIn[i]; + rValueIg = valueIg[i]; + rValueFg = valueFg[i]; + rValueOg = valueOg[i]; + rCheckI = ((__m256 *)value.checkIg)[i]; + rCheckF = ((__m256 *)value.checkFg)[i]; + rCheckO = ((__m256 *)value.checkOg)[i]; + + if (value.prevStateValue) { + rPrevState = ((__m256 *)value.prevStateValue)[i]; + } + + op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv, + rOut, rCheckI, rCheckF, rCheckO, hppl::avx::forward[active_node], + hppl::avx::forward[active_gate], hppl::avx::forward[active_state]); + + valueIn[i] = rValueIn; + valueIg[i] = rValueIg; + valueFg[i] = rValueFg; + valueOg[i] = rValueOg; + ((__m256 *)value.stateValue)[i] = rState; + ((__m256 *)value.stateActiveValue)[i] = rStateAtv; + ((__m256 *)value.outputValue)[i] = rOut; + } +#endif +} + +template +void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, + LstmMetaGrad grad, int frameSize, + activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { +#ifdef __AVX__ + __m256 rValueIn; + __m256 rValueIg; + __m256 rValueFg; + __m256 rValueOg; + __m256 rGradIn; + __m256 rGradIg; + __m256 rGradFg; + __m256 rGradOg; + __m256 rPrevState = _mm256_set1_ps(0.0f); + __m256 rPrevStateGrad; + __m256 rStateGrad; + __m256 rState; + __m256 rStateAtv; + __m256 rOutputGrad; + __m256 rCheckI; + __m256 rCheckF; + __m256 rCheckO; + __m256 rCheckIGrad; + __m256 rCheckFGrad; + __m256 rCheckOGrad; + + __m256 *valueIn = (__m256 *)value.gateValue; + __m256 *valueIg = (__m256 *)(value.gateValue + frameSize); + __m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2); + __m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3); + __m256 *gradIn = (__m256 *)grad.gateGrad; + __m256 *gradIg = (__m256 *)(grad.gateGrad + frameSize); + __m256 *gradFg = (__m256 *)(grad.gateGrad + frameSize * 2); + __m256 *gradOg = (__m256 *)(grad.gateGrad + frameSize * 3); + + for (int i = 0; i < frameSize / 8; i++) { + rValueIn = valueIn[i]; + rValueIg = valueIg[i]; + rValueFg = valueFg[i]; + rValueOg = valueOg[i]; + rCheckI = ((__m256 *)value.checkIg)[i]; + rCheckF = ((__m256 *)value.checkFg)[i]; + rCheckO = ((__m256 *)value.checkOg)[i]; + rState = ((__m256 *)value.stateValue)[i]; + rStateAtv = ((__m256 *)value.stateActiveValue)[i]; + rOutputGrad = ((__m256 *)grad.outputGrad)[i]; + rStateGrad = ((__m256 *)grad.stateGrad)[i]; + if (value.prevStateValue) { + rPrevState = ((__m256 *)value.prevStateValue)[i]; + } + + op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, + rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, + rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, + rCheckOGrad, hppl::avx::backward[active_node], + hppl::avx::backward[active_gate], hppl::avx::backward[active_state]); + + gradIn[i] = rGradIn; + gradIg[i] = rGradIg; + gradFg[i] = rGradFg; + gradOg[i] = rGradOg; + ((__m256 *)grad.stateGrad)[i] = rStateGrad; + + if (grad.prevStateGrad) ((__m256 *)grad.prevStateGrad)[i] = rPrevStateGrad; + if (value.prevStateValue) { + if (grad.checkIgGrad) ((__m256 *)grad.checkIgGrad)[i] += rCheckIGrad; + if (grad.checkFgGrad) ((__m256 *)grad.checkFgGrad)[i] += rCheckFGrad; + } + if (grad.checkOgGrad) ((__m256 *)grad.checkOgGrad)[i] += rCheckOGrad; + } +#endif +} + +template +void cpu_lstm_forward(Op op, LstmMetaValue value, int frameSize, + activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { + if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same::value)) { + avx_lstm_forward_one_sequence(op, value, frameSize, active_node, + active_gate, active_state); + } else { + naive_lstm_forward_one_sequence(op, value, frameSize, active_node, + active_gate, active_state); + } +} + +template +void cpu_lstm_backward(Op op, LstmMetaValue value, LstmMetaGrad grad, + int frameSize, activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { + if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same::value)) { + avx_lstm_backward_one_sequence(op, value, grad, frameSize, active_node, + active_gate, active_state); + } else { + naive_lstm_backward_one_sequence(op, value, grad, frameSize, active_node, + active_gate, active_state); + } +} + +#endif + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/detail/lstm_gpu_kernel.h b/paddle/operators/math/detail/lstm_gpu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..9573eaefb6a9d678ef70f2e2bffdc6a3011b21ea --- /dev/null +++ b/paddle/operators/math/detail/lstm_gpu_kernel.h @@ -0,0 +1,256 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/operators/math/detail/hl_activation_functions.h" +#include "paddle/operators/math/lstm_compute.h" +#include "paddle/platform/cuda_helper.h" +#include "paddle/platform/device_context.h" + +#include + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +/* + * threads(framePerBlock, batchPerBlock) + * grid(frameBlocks, batchBlocks) + */ +template +__global__ void KeLstmForward(Op op, LstmMetaValue value, int frameSize, + int batchSize, activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { + const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (frameIdx >= frameSize) return; + + int batchIdx = 0; + if (isBatch) { + batchIdx = blockIdx.y * blockDim.y + threadIdx.y; + if (batchIdx >= batchSize) return; + value.gateValue += batchIdx * frameSize * 4; + value.outputValue += batchIdx * frameSize; + value.stateValue += batchIdx * frameSize; + value.stateActiveValue += batchIdx * frameSize; + } + + T rState; + T rPrevState = 0; + T rStateAtv; + T rOut; + T rValueIn; + T rValueIg; + T rValueFg; + T rValueOg; + T rCheckI = value.checkIg[frameIdx]; + T rCheckF = value.checkFg[frameIdx]; + T rCheckO = value.checkOg[frameIdx]; + + rValueIn = value.gateValue[frameIdx]; + rValueIg = value.gateValue[frameIdx + frameSize]; + rValueFg = value.gateValue[frameIdx + frameSize * 2]; + rValueOg = value.gateValue[frameIdx + frameSize * 3]; + + if (value.prevStateValue) { + if (isBatch) value.prevStateValue += batchIdx * frameSize; + rPrevState = value.prevStateValue[frameIdx]; + } + + hppl::gpu::ForwardAct act; + op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv, + rOut, rCheckI, rCheckF, rCheckO, act(active_node), act(active_gate), + act(active_state)); + + value.gateValue[frameIdx] = rValueIn; + value.gateValue[frameIdx + frameSize] = rValueIg; + value.gateValue[frameIdx + frameSize * 2] = rValueFg; + value.gateValue[frameIdx + frameSize * 3] = rValueOg; + + value.stateValue[frameIdx] = rState; + value.stateActiveValue[frameIdx] = rStateAtv; + value.outputValue[frameIdx] = rOut; +} + +/* + * threads(framePerBlock, batchPerBlock) + * grid(frameBlocks, batchBlocks) + */ +template +__global__ void KeLstmBackward(Op op, LstmMetaValue value, + LstmMetaGrad grad, int frameSize, + int batchSize, activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { + const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (frameIdx >= frameSize) return; + + int batchIdx = 0; + if (isBatch) { + batchIdx = blockIdx.y * blockDim.y + threadIdx.y; + if (batchIdx >= batchSize) return; + value.gateValue += batchIdx * frameSize * 4; + value.stateValue += batchIdx * frameSize; + value.stateActiveValue += batchIdx * frameSize; + grad.gateGrad += batchIdx * frameSize * 4; + grad.stateGrad += batchIdx * frameSize; + grad.outputGrad += batchIdx * frameSize; + } + + T rValueIn; + T rValueIg; + T rValueFg; + T rValueOg; + T rGradIn; + T rGradIg; + T rGradFg; + T rGradOg; + T rPrevState = 0; + T rPrevStateGrad; + T rState; + T rStateGrad; + T rStateAtv; + T rOutputGrad; + T rCheckI = value.checkIg[frameIdx]; + T rCheckF = value.checkFg[frameIdx]; + T rCheckO = value.checkOg[frameIdx]; + T rCheckIGrad; + T rCheckFGrad; + T rCheckOGrad; + + rValueIn = value.gateValue[frameIdx]; + rValueIg = value.gateValue[frameIdx + frameSize]; + rValueFg = value.gateValue[frameIdx + frameSize * 2]; + rValueOg = value.gateValue[frameIdx + frameSize * 3]; + rState = value.stateValue[frameIdx]; + rStateAtv = value.stateActiveValue[frameIdx]; + rOutputGrad = grad.outputGrad[frameIdx]; + rStateGrad = grad.stateGrad[frameIdx]; + + if (value.prevStateValue) { + if (isBatch) value.prevStateValue += batchIdx * frameSize; + rPrevState = value.prevStateValue[frameIdx]; + } + + hppl::gpu::BackwardAct act; + op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, rGradOg, + rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rOutputGrad, + rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, rCheckOGrad, + act(active_node), act(active_gate), act(active_state)); + + grad.gateGrad[frameIdx] = rGradIn; + grad.gateGrad[frameIdx + frameSize] = rGradIg; + grad.gateGrad[frameIdx + frameSize * 2] = rGradFg; + grad.gateGrad[frameIdx + frameSize * 3] = rGradOg; + grad.stateGrad[frameIdx] = rStateGrad; + if (grad.prevStateGrad) { + if (isBatch) grad.prevStateGrad += batchIdx * frameSize; + grad.prevStateGrad[frameIdx] = rPrevStateGrad; + } + + if (isBatch) { + if (value.prevStateValue) { + if (grad.checkIgGrad) + paddle::platform::CudaAtomicAdd(grad.checkIgGrad + frameIdx, + rCheckIGrad); + if (grad.checkFgGrad) + paddle::platform::CudaAtomicAdd(grad.checkFgGrad + frameIdx, + rCheckFGrad); + } + if (grad.checkOgGrad) + paddle::platform::CudaAtomicAdd(grad.checkOgGrad + frameIdx, rCheckOGrad); + } else { + if (value.prevStateValue) { + if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad; + if (grad.checkFgGrad) grad.checkFgGrad[frameIdx] += rCheckFGrad; + } + if (grad.checkOgGrad) grad.checkOgGrad[frameIdx] += rCheckOGrad; + } +} + +template +void gpu_lstm_forward(const platform::DeviceContext& context, Op op, + LstmMetaValue value, int frameSize, int batchSize, + activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { + dim3 threads; + dim3 grid; + if (batchSize == 1) { + int framePerBlock = frameSize <= 1024 ? frameSize : 1024; + int frameBlocks = (frameSize + 1024 - 1) / 1024; + threads = dim3(framePerBlock, 1); + grid = dim3(frameBlocks, 1); + } else { + /* framePerBlock = 32 batchPerBlock = 32 */ + threads = dim3(32, 32); + grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); + } + + auto stream = + reinterpret_cast(context).stream(); + if (batchSize == 1) { + KeLstmForward<<>>( + op, value, frameSize, batchSize, active_node, active_gate, + active_state); + } else { + KeLstmForward<<>>( + op, value, frameSize, batchSize, active_node, active_gate, + active_state); + } +} + +template +void gpu_lstm_backward(const platform::DeviceContext& context, Op op, + LstmMetaValue value, LstmMetaGrad grad, + int frameSize, int batchSize, + activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { + dim3 threads; + dim3 grid; + if (batchSize == 1) { + int framePerBlock = frameSize <= 1024 ? frameSize : 1024; + int frameBlocks = (frameSize + 1024 - 1) / 1024; + threads = dim3(framePerBlock, 1); + grid = dim3(frameBlocks, 1); + } else { + /* framePerBlock = 32 batchPerBlock = 32 */ + threads = dim3(32, 32); + grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); + } + + auto stream = + reinterpret_cast(context).stream(); + if (batchSize == 1) { + KeLstmBackward<<>>( + op, value, grad, frameSize, batchSize, active_node, active_gate, + active_state); + } else { + KeLstmBackward<<>>( + op, value, grad, frameSize, batchSize, active_node, active_gate, + active_state); + } +} + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/detail/lstm_kernel.h b/paddle/operators/math/detail/lstm_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6f3ead2397d5131b4468d0ad288513cedb289594 --- /dev/null +++ b/paddle/operators/math/detail/lstm_kernel.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/detail/hl_activation_functions.h" +#include "paddle/platform/hostdevice.h" + +#include + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +namespace forward { + +template +class lstm { + public: + HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg, + T &prevState, T &state, T &stateAtv, T &output, + T &checkI, T &checkF, T &checkO, + typename hppl::ForwardActType::type actInput, + typename hppl::ForwardActType::type actGate, + typename hppl::ForwardActType::type actState) { + valueIn = actInput(valueIn); + valueIg = actGate(valueIg + prevState * checkI); + valueFg = actGate(valueFg + prevState * checkF); + state = valueIn * valueIg + prevState * valueFg; + valueOg = actGate(valueOg + state * checkO); + stateAtv = actState(state); + output = valueOg * stateAtv; + } +#ifndef __NVCC__ +#ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default + static const bool avx = false; +#else + // Only float support AVX optimization + static const bool avx = std::is_same::value; + + HOSTDEVICE void operator()(__m256 &valueIn, __m256 &valueIg, __m256 &valueFg, + __m256 &valueOg, __m256 &prevState, __m256 &state, + __m256 &stateAtv, __m256 &output, __m256 &checkI, + __m256 &checkF, __m256 &checkO, + hppl::Active<__m256>::forward actInput, + hppl::Active<__m256>::forward actGate, + hppl::Active<__m256>::forward actState) { + valueIn = actInput(valueIn); + valueIg = actGate(_mm256_add_ps(valueIg, _mm256_mul_ps(prevState, checkI))); + valueFg = actGate(_mm256_add_ps(valueFg, _mm256_mul_ps(prevState, checkF))); + state = _mm256_add_ps(_mm256_mul_ps(valueIn, valueIg), + _mm256_mul_ps(prevState, valueFg)); + valueOg = actGate(_mm256_add_ps(valueOg, _mm256_mul_ps(state, checkO))); + stateAtv = actState(state); + output = _mm256_mul_ps(valueOg, stateAtv); + } +#endif +#endif +}; + +} // namespace forward + +namespace backward { + +template +class lstm { + public: + HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg, + T &gradIn, T &gradIg, T &gradFg, T &gradOg, + T &prevState, T &prevStateGrad, T &state, + T &stateGrad, T &stateAtv, T &outputGrad, + T &checkI, T &checkF, T &checkO, T &checkIGrad, + T &checkFGrad, T &checkOGrad, + typename hppl::BackwardActType::type actInput, + typename hppl::BackwardActType::type actGate, + typename hppl::BackwardActType::type actState) { + gradOg = actGate(outputGrad * stateAtv, valueOg); + stateGrad += actState(outputGrad * valueOg, stateAtv) + gradOg * checkO; + gradIn = actInput(stateGrad * valueIg, valueIn); + gradIg = actGate(stateGrad * valueIn, valueIg); + gradFg = actGate(stateGrad * prevState, valueFg); + prevStateGrad = gradIg * checkI + gradFg * checkF + stateGrad * valueFg; + checkIGrad = gradIg * prevState; + checkFGrad = gradFg * prevState; + checkOGrad = gradOg * state; + } +#ifndef __NVCC__ +#ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default + static const bool avx = false; +#else + // Only float support AVX optimization + static const bool avx = std::is_same::value; + HOSTDEVICE void operator()(__m256 &valueIn, __m256 &valueIg, __m256 &valueFg, + __m256 &valueOg, __m256 &gradIn, __m256 &gradIg, + __m256 &gradFg, __m256 &gradOg, __m256 &prevState, + __m256 &prevStateGrad, __m256 &state, + __m256 &stateGrad, __m256 &stateAtv, + __m256 &outputGrad, __m256 &checkI, __m256 &checkF, + __m256 &checkO, __m256 &checkIGrad, + __m256 &checkFGrad, __m256 &checkOGrad, + hppl::Active<__m256>::backward actInput, + hppl::Active<__m256>::backward actGate, + hppl::Active<__m256>::backward actState) { + gradOg = actGate(_mm256_mul_ps(outputGrad, stateAtv), valueOg); + stateGrad = _mm256_add_ps( + actState(_mm256_mul_ps(outputGrad, valueOg), stateAtv), stateGrad); + stateGrad = _mm256_add_ps(_mm256_mul_ps(gradOg, checkO), stateGrad); + gradIn = actInput(_mm256_mul_ps(stateGrad, valueIg), valueIn); + gradIg = actGate(_mm256_mul_ps(stateGrad, valueIn), valueIg); + gradFg = actGate(_mm256_mul_ps(stateGrad, prevState), valueFg); + prevStateGrad = _mm256_add_ps(_mm256_mul_ps(gradIg, checkI), + _mm256_mul_ps(gradFg, checkF)); + prevStateGrad = + _mm256_add_ps(_mm256_mul_ps(stateGrad, valueFg), prevStateGrad); + checkIGrad = _mm256_mul_ps(gradIg, prevState); + checkFGrad = _mm256_mul_ps(gradFg, prevState); + checkOGrad = _mm256_mul_ps(gradOg, state); + } +#endif +#endif +}; + +} // namespace backward + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/im2col.cc b/paddle/operators/math/im2col.cc index c08a3380f042886cd400df0d840e61856274619c..3b1b0bd71dd3768b932864e185af8dc839b4653e 100644 --- a/paddle/operators/math/im2col.cc +++ b/paddle/operators/math/im2col.cc @@ -29,8 +29,8 @@ class Im2ColFunctor(); @@ -52,16 +68,14 @@ class Im2ColFunctor= input_height || - (im_col_idx - padding_width) < 0 || - (im_col_idx - padding_width) >= input_width) { + int im_row_idx = h * stride_height + h_offset - padding_up; + int im_col_idx = w * stride_width + w_offset - padding_left; + + if (im_row_idx < 0 || im_row_idx >= input_height || im_col_idx < 0 || + im_col_idx >= input_width) { col_data[(c * output_height + h) * output_width + w] = T(0); } else { - im_row_idx += c_im * input_height - padding_height; - im_col_idx -= padding_width; + im_row_idx += c_im * input_height; col_data[(c * output_height + h) * output_width + w] = im_data[im_row_idx * input_width + im_col_idx]; } @@ -82,7 +96,8 @@ class Col2ImFunctor(); @@ -103,14 +134,12 @@ class Col2ImFunctor= 0 && - (im_row_idx - padding_height) < input_height && - (im_col_idx - padding_width) >= 0 && - (im_col_idx - padding_width) < input_width) { - im_row_idx += c_im * input_height - padding_height; - im_col_idx -= padding_width; + int im_row_idx = h * stride_height + h_offset - padding_up; + int im_col_idx = w * stride_width + w_offset - padding_left; + + if ((im_row_idx) >= 0 && (im_row_idx) < input_height && + (im_col_idx) >= 0 && (im_col_idx) < input_width) { + im_row_idx += c_im * input_height; im_data[im_row_idx * input_width + im_col_idx] += col_data[(c * output_height + h) * output_width + w]; } @@ -140,8 +169,8 @@ class Im2ColFunctor(); T* col_data = col.data(); @@ -163,10 +207,10 @@ class Im2ColFunctor(); const T* col_data = col.data(); @@ -223,9 +283,9 @@ class Col2ImFunctor(context) .stream()>>>( im.data(), num_outputs, input_height, input_width, filter_height, - filter_width, stride_height, stride_width, padding_height, - padding_width, output_height, output_width, col.data()); + filter_width, stride_height, stride_width, padding_up, padding_left, + output_height, output_width, col.data()); } }; @@ -152,7 +161,8 @@ class Col2ImFunctor<<(context) .stream()>>>( - num_kernels, col.data(), input_height + 2 * padding_height, - input_width + 2 * padding_width, input_channels, filter_height, - filter_width, stride_height, stride_width, padding_height, - padding_width, output_height, output_width, im.data()); + num_kernels, col.data(), input_height + padding_up + padding_down, + input_width + padding_left + padding_left, input_channels, + filter_height, filter_width, stride_height, stride_width, padding_up, + padding_left, output_height, output_width, im.data()); } }; @@ -238,8 +258,8 @@ class Im2ColFunctor(context) .stream()>>>( im.data(), col.data(), input_channels, input_height, input_width, - filter_height, filter_width, stride_height, stride_width, - padding_height, padding_width, output_height, output_width); + filter_height, filter_width, stride_height, stride_width, padding_up, + padding_left, output_height, output_width); } }; @@ -322,7 +351,8 @@ class Col2ImFunctor(context) .stream()>>>( im.data(), col.data(), input_channels, input_height, input_width, - filter_height, filter_width, stride_height, stride_width, - padding_height, padding_width, output_height, output_width); + filter_height, filter_width, stride_height, stride_width, padding_up, + padding_left, output_height, output_width); } }; diff --git a/paddle/operators/math/im2col.h b/paddle/operators/math/im2col.h index 7b717e1603c94cd77c74cb0d86f1d23e2692f9d8..c736d4fa523c2af3e3dd7a11114d7f84021bc5c1 100644 --- a/paddle/operators/math/im2col.h +++ b/paddle/operators/math/im2col.h @@ -74,8 +74,8 @@ class Im2ColFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& im, framework::Tensor& col, - int stride_height, int stride_width, int padding_height, - int padding_width); + int stride_height, int stride_width, int padding_up, + int padding_down, int padding_left, int padding_right); }; template @@ -83,7 +83,8 @@ class Col2ImFunctor { public: void operator()(const platform::DeviceContext& context, framework::Tensor& im, const framework::Tensor& col, int stride_height, - int stride_width, int padding_height, int padding_width); + int stride_width, int padding_up, int padding_down, + int padding_left, int padding_right); }; } // namespace math diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index 443c94b83f0bf24837afe703b19e2ab47a0dd786..5763782c4edec87f44dabef2ccffe3097eeb2421 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -35,6 +35,12 @@ void testIm2col() { * * output_ocf = [0, 1, 3, 4 * 1, 2, 4, 5] + * + * col2im_cfo = [0, 2, 2 + * 3, 4, 5] + * + * col2im_ocf = [0, 2, 2 + * 3, 4, 5] */ int input_height = 2; int input_width = 3; @@ -59,7 +65,7 @@ void testIm2col() { new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace()); #else PADDLE_THROW("no GPU support"); -#endif // PADDLE_ONLY_CPU +#endif // PADDLE_WITH_CUDA } if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; @@ -71,6 +77,7 @@ void testIm2col() { output_ocf.mutable_data( {output_height, output_width, 1, filter_size, filter_size}, *place); + // Im2Col paddle::operators::math::Im2ColFunctor< paddle::operators::math::ColFormat::kCFO, Place, float> im2col; @@ -78,8 +85,13 @@ void testIm2col() { paddle::operators::math::ColFormat::kOCF, Place, float> im2col_ocf; - im2col(*context, input, output_cfo, stride, stride, padding, padding); - im2col_ocf(*context, input, output_ocf, stride, stride, padding, padding); + im2col(*context, input, output_cfo, stride, stride, padding, padding, padding, + padding); + im2col_ocf(*context, input, output_ocf, stride, stride, padding, padding, + padding, padding); + + float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5}; + float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5}; float* out_cfo_ptr; if (paddle::platform::is_cpu_place(*place)) { @@ -88,14 +100,9 @@ void testIm2col() { output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context); out_cfo_ptr = output_tmp.data(); } - EXPECT_EQ(out_cfo_ptr[0], 0); - EXPECT_EQ(out_cfo_ptr[1], 1); - EXPECT_EQ(out_cfo_ptr[2], 1); - EXPECT_EQ(out_cfo_ptr[3], 2); - EXPECT_EQ(out_cfo_ptr[4], 3); - EXPECT_EQ(out_cfo_ptr[5], 4); - EXPECT_EQ(out_cfo_ptr[6], 4); - EXPECT_EQ(out_cfo_ptr[7], 5); + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]); + } float* out_ocf_ptr; if (paddle::platform::is_cpu_place(*place)) { @@ -104,14 +111,60 @@ void testIm2col() { output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context); out_ocf_ptr = output_tmp.data(); } - EXPECT_EQ(out_ocf_ptr[0], 0); - EXPECT_EQ(out_ocf_ptr[1], 1); - EXPECT_EQ(out_ocf_ptr[2], 3); - EXPECT_EQ(out_ocf_ptr[3], 4); - EXPECT_EQ(out_ocf_ptr[4], 1); - EXPECT_EQ(out_ocf_ptr[5], 2); - EXPECT_EQ(out_ocf_ptr[6], 4); - EXPECT_EQ(out_ocf_ptr[7], 5); + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]); + } + + // Col2Im: kCFO + paddle::operators::math::Col2ImFunctor< + paddle::operators::math::ColFormat::kCFO, Place, float> + col2im; + paddle::operators::math::Col2ImFunctor< + paddle::operators::math::ColFormat::kOCF, Place, float> + col2im_ocf; + float col2im_data[] = {0, 2, 2, 3, 8, 5}; + + memset(input_ptr, 0, 6 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + input.CopyFrom(input_tmp, *place, *context); + } + + col2im(*context, input, output_cfo, stride, stride, padding, padding, padding, + padding); + + float* in_ptr; + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context); + in_ptr = input_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(in_ptr[i], col2im_data[i]); + } + + // Col2Im: kOCF + memset(input_ptr, 0, 6 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + input.CopyFrom(input_tmp, *place, *context); + } + + col2im_ocf(*context, input, output_ocf, stride, stride, padding, padding, + padding, padding); + + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context); + in_ptr = input_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(in_ptr[i], col2im_data[i]); + } } TEST(math, im2col) { diff --git a/paddle/operators/math/lstm_compute.cc b/paddle/operators/math/lstm_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..0febf8e3b70111d12f858cf6259a2801a42d9a90 --- /dev/null +++ b/paddle/operators/math/lstm_compute.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/lstm_compute.h" +#include "paddle/operators/math/detail/lstm_cpu_kernel.h" +#include "paddle/operators/math/detail/lstm_kernel.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct LstmUnitFunctor { + static void compute(const platform::DeviceContext& context, + LstmMetaValue value, int frame_size, int batch_size, + const std::string& gate_act, const std::string& cell_act, + const std::string& cand_act) { + for (int b = 0; b < batch_size; b++) { + detail::cpu_lstm_forward(detail::forward::lstm(), value, frame_size, + ActiveType(cand_act), ActiveType(gate_act), + ActiveType(cell_act)); + value.gateValue += frame_size * 4; + value.stateValue += frame_size; + value.stateActiveValue += frame_size; + value.outputValue += frame_size; + if (value.prevStateValue) { + value.prevStateValue += frame_size; + } + } + } +}; + +template +struct LstmUnitGradFunctor { + static void compute(const platform::DeviceContext& context, + LstmMetaValue value, LstmMetaGrad grad, + int frame_size, int batch_size, + const std::string& gate_act, const std::string& cell_act, + const std::string& cand_act) { + for (int b = 0; b < batch_size; b++) { + detail::cpu_lstm_backward(detail::backward::lstm(), value, grad, + frame_size, ActiveType(cand_act), + ActiveType(gate_act), ActiveType(cell_act)); + + value.gateValue += frame_size * 4; + value.stateValue += frame_size; + value.stateActiveValue += frame_size; + value.outputValue += frame_size; + if (value.prevStateValue) { + value.prevStateValue += frame_size; + } + + grad.gateGrad += frame_size * 4; + grad.stateGrad += frame_size; + grad.stateActiveGrad += frame_size; + grad.outputGrad += frame_size; + if (grad.prevStateGrad) { + grad.prevStateGrad += frame_size; + } + } + } +}; + +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/lstm_compute.cu b/paddle/operators/math/lstm_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..b2122f2a5c08a6d9d53293833177f0ba2c3ab860 --- /dev/null +++ b/paddle/operators/math/lstm_compute.cu @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/detail/lstm_gpu_kernel.h" +#include "paddle/operators/math/detail/lstm_kernel.h" +#include "paddle/operators/math/lstm_compute.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct LstmUnitFunctor { + static void compute(const platform::DeviceContext& context, + LstmMetaValue value, int frame_size, int batch_size, + const std::string& gate_act, const std::string& cell_act, + const std::string& cand_act) { + detail::gpu_lstm_forward(context, detail::forward::lstm(), value, + frame_size, batch_size, ActiveType(cand_act), + ActiveType(gate_act), ActiveType(cell_act)); + } +}; + +template +struct LstmUnitGradFunctor { + static void compute(const platform::DeviceContext& context, + LstmMetaValue value, LstmMetaGrad grad, + int frame_size, int batch_size, + const std::string& gate_act, const std::string& cell_act, + const std::string& cand_act) { + detail::gpu_lstm_backward(context, detail::backward::lstm(), value, grad, + frame_size, batch_size, ActiveType(cand_act), + ActiveType(gate_act), ActiveType(cell_act)); + } +}; + +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/lstm_compute.h b/paddle/operators/math/lstm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..28d2c6fd3b0d8143da90c37f241072e37397f98b --- /dev/null +++ b/paddle/operators/math/lstm_compute.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/platform/device_context.h" +#include "paddle/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace math { + +typedef enum { + HL_ACTIVATION_SIGMOID = 0, + HL_ACTIVATION_RELU = 1, + HL_ACTIVATION_TANH = 2, + HL_ACTIVATION_LINEAR = 3, + HL_ACTIVATION_END +} activation_mode_t; + +template +struct LstmMetaValue { + T *gateValue; + T *prevStateValue; + T *stateValue; + T *stateActiveValue; + T *outputValue; + T *checkIg; + T *checkFg; + T *checkOg; +}; + +template +struct LstmMetaGrad { + T *gateGrad; + T *prevStateGrad; + T *stateGrad; + T *stateActiveGrad; + T *outputGrad; + T *checkIgGrad; + T *checkFgGrad; + T *checkOgGrad; +}; + +inline activation_mode_t ActiveType(const std::string &type) { + if (type == "sigmoid") { + return HL_ACTIVATION_SIGMOID; + } else if (type == "relu") { + return HL_ACTIVATION_RELU; + } else if (type == "tanh") { + return HL_ACTIVATION_TANH; + } else if (type == "linear" || type == "identity" || type == "") { + return HL_ACTIVATION_LINEAR; + } else { + PADDLE_THROW("Do not support activation type."); + } +} + +template +class LstmUnitFunctor { + public: + static void compute(const platform::DeviceContext &context, + LstmMetaValue value, int frame_size, int batch_size, + const std::string &gate_act, const std::string &cell_act, + const std::string &cand_act); +}; + +template +class LstmUnitGradFunctor { + public: + static void compute(const platform::DeviceContext &context, + LstmMetaValue value, LstmMetaGrad grad, + int frame_size, int batch_size, + const std::string &gate_act, const std::string &cell_act, + const std::string &cand_act); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sequence2batch.cc b/paddle/operators/math/sequence2batch.cc new file mode 100644 index 0000000000000000000000000000000000000000..10c6e105b950b9d510e7a14828d72531e8eb0028 --- /dev/null +++ b/paddle/operators/math/sequence2batch.cc @@ -0,0 +1,61 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/sequence2batch.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class CopyMatrixRowsFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::LoDTensor& src, const size_t* index, + framework::LoDTensor& dst, bool is_src_index) { + auto src_dims = src.dims(); + auto dst_dims = dst.dims(); + PADDLE_ENFORCE_EQ(src_dims.size(), 2UL, + "The src must be matrix with rank 2."); + PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL, + "The dst must be matrix with rank 2."); + PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1], + "The width of src and dst must be same."); + auto height = dst_dims[0]; + auto width = dst_dims[1]; + auto* src_data = src.data(); + auto* dst_data = dst.data(); + for (int i = 0; i < height; ++i) { + if (is_src_index) { + memcpy(dst_data + i * width, src_data + index[i] * width, + width * sizeof(T)); + } else { + memcpy(dst_data + index[i] * width, src_data + i * width, + width * sizeof(T)); + } + } + } +}; + +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; + +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/operators/math/sequence2batch.cu new file mode 100644 index 0000000000000000000000000000000000000000..4f349946785171e6c59b22163ba76791c7244f88 --- /dev/null +++ b/paddle/operators/math/sequence2batch.cu @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/sequence2batch.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index, + int64_t height, int64_t width, + bool is_src_index) { + int idx = threadIdx.x; + int idy = threadIdx.y; + int id = blockIdx.x + idy * GridDimX; + while (id < height) { + int src_idx = is_src_index ? index[id] : id; + int dst_idx = is_src_index ? id : index[id]; + const T* src_data = src + src_idx * width; + T* dst_data = dst + dst_idx * width; + for (int i = idx; i < width; i += BlockDimX) { + dst_data[i] = src_data[i]; + } + id += BlockDimY * GridDimX; + } +} + +template +class CopyMatrixRowsFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::LoDTensor& src, const size_t* index, + framework::LoDTensor& dst, bool is_src_index) { + auto src_dims = src.dims(); + auto dst_dims = dst.dims(); + PADDLE_ENFORCE_EQ(src_dims.size(), 2, + "The src must be matrix with rank 2."); + PADDLE_ENFORCE_EQ(dst_dims.size(), 2, + "The dst must be matrix with rank 2."); + PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1], + "The width of src and dst must be same."); + auto height = dst_dims[0]; + auto width = dst_dims[1]; + auto* src_data = src.data(); + auto* dst_data = dst.data(); + + dim3 threads(128, 8); + dim3 grid(8, 1); + auto stream = + reinterpret_cast(context).stream(); + CopyMatrixRowsKernel<<>>( + src_data, dst_data, index, height, width, is_src_index); + } +}; + +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; + +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sequence2batch.h b/paddle/operators/math/sequence2batch.h new file mode 100644 index 0000000000000000000000000000000000000000..03cd018e46e90c9bbe689c9686377e0e998ee513 --- /dev/null +++ b/paddle/operators/math/sequence2batch.h @@ -0,0 +1,148 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/tensor.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class CopyMatrixRowsFunctor { + public: + // If is_src_index is true, + // copy the indexed rows of input src to the output dst. + // If is_src_index is false, + // copy the input src to the indexed rows of output dst. + // The indexed rows are based on the input index. + void operator()(const platform::DeviceContext& context, + const framework::LoDTensor& src, const size_t* index, + framework::LoDTensor& dst, bool is_src_index); +}; + +template +class LoDTensor2BatchFunctor { + // Calculate the length of each sequence and + // sort sequence index by the length. + // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)} + // + struct SeqInfo { + SeqInfo(int start, int length, int seq_idx) + : start(start), length(length), seq_idx(seq_idx) {} + int start; + int length; + int seq_idx; + }; + + public: + void operator()(const platform::DeviceContext& context, + const framework::LoDTensor& lod_tensor, + framework::LoDTensor& batch, bool is_reverse) const { + auto lods = lod_tensor.lod(); + PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now."); + auto lod = lods[0]; + + std::vector seq_info; + for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) { + int length = lod[seq_id + 1] - lod[seq_id]; + seq_info.emplace_back(lod[seq_id], length, seq_id); + } + + std::sort(seq_info.begin(), seq_info.end(), + [](SeqInfo a, SeqInfo b) { return a.length > b.length; }); + + // calculate the start position of each batch + // (numBatch equal the maxLength of sequences) + // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // num_batch = 5, + // batchIndex = {b0, b1, b2, b3, b4} + // b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 + // batch_start_positions[6] = {0, 3, 6, 9, 11, 12} + // batch_start_positions[0] = len(b0) + // batch_start_positions[1] = len(b0) + len(b1) + // batch_start_positions[2] = len(b0) + len(b1) + len(b2) + // ... + // seq2batch_idx[12] = {4, 0, 9, + // 5, 1, 10, + // 6, 2, 11, + // 7, 3, + // 8} + // The batch number represents batch size after rearranging the + // input LodTensor. It is also the maximum length of input sequence. + + paddle::framework::LoD batch_lods; + batch_lods.emplace_back(std::vector{0}); + batch_lods.emplace_back(std::vector{0}); + + // batch_lods[0] is the start positions for batch LoDTensor + int num_batch = seq_info[0].length; + batch_lods[0].resize(static_cast(num_batch + 1)); + // batch_lods[1] is the raw index in the input LoDTensor + auto dims = lod_tensor.dims(); + batch_lods[1].resize(static_cast(dims[0])); + + size_t* batch_starts = batch_lods[0].data(); + size_t* seq2batch_idx = batch_lods[1].data(); + batch_starts[0] = 0; + for (size_t n = 0; n < num_batch; n++) { + auto batch_id = static_cast(batch_starts[n]); + for (size_t i = 0; i < seq_info.size(); ++i) { + size_t seq_len = seq_info[i].length; + int start = seq_info[i].start; + if (n < seq_len) { + seq2batch_idx[batch_id] = + is_reverse ? start + seq_len - 1 - n : start + n; + batch_id++; + } else { + break; + } + } + batch_starts[n + 1] = static_cast(batch_id); + } + batch.set_lod(batch_lods); + + CopyMatrixRowsFunctor to_batch; + to_batch(context, lod_tensor, seq2batch_idx, batch, true); + } +}; + +template +class Batch2LoDTensorFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::LoDTensor& batch, + framework::LoDTensor& lod_tensor) const { + auto in_lod = batch.lod(); + PADDLE_ENFORCE_EQ(in_lod.size(), 2UL, + "The LoD size of input `batch` should be 2."); + auto out_lod = lod_tensor.lod()[0]; + auto num = out_lod[out_lod.size() - 1]; + PADDLE_ENFORCE_EQ(num, lod_tensor.dims()[0]); + PADDLE_ENFORCE_EQ(num, in_lod[1].size()); + PADDLE_ENFORCE_EQ(num, batch.dims()[0]); + CopyMatrixRowsFunctor to_seq; + size_t* index = in_lod[1].data(); + to_seq(context, batch, index, lod_tensor, false); + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index 065800f250d8b35a626060bac271e1bce6bb784b..b9b9cd7ca05b4373c27f672cc1ee20daab6827a8 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -49,7 +49,19 @@ class MulOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_mat_dims[1], y_mat_dims[0], "First matrix's width must be equal with second matrix's height."); - ctx->SetOutputDim("Out", {x_mat_dims[0], y_mat_dims[1]}); + std::vector output_dims; + output_dims.reserve( + static_cast(x_num_col_dims + y_dims.size() - y_num_col_dims)); + + for (int i = 0; i < x_num_col_dims; ++i) { + output_dims.push_back(x_dims[i]); + } + + for (int i = y_num_col_dims; i < y_dims.size(); ++i) { + output_dims.push_back(y_dims[i]); + } + + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); ctx->ShareLoD("X", /*->*/ "Out"); } }; @@ -109,15 +121,6 @@ class MulOpGrad : public framework::OperatorWithKernel { auto y_mat_dims = framework::flatten_to_2d( y_dims, ctx->Attrs().Get("y_num_col_dims")); - PADDLE_ENFORCE_EQ( - x_mat_dims[0], out_dims[0], - "The first dimension of Out@GRAD must equal to the first dimension of " - "the first operand."); - PADDLE_ENFORCE_EQ( - y_mat_dims[1], out_dims[1], - "The second dimension of Out@GRAD must equal to the second " - "dimension of the second operand."); - auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index 3f3e77595b701d428a728fc4727dd3ff4abee45f..bd1bdb4f81b88256822d663fe42ad314338c91ff 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -46,8 +46,15 @@ class MulKernel : public framework::OpKernel { : *y; z->mutable_data(context.GetPlace()); + auto z_dim = z->dims(); + if (z_dim.size() != 2) { + z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); + } math::matmul(context.device_context(), x_matrix, false, y_matrix, false, 1, z, 0); + if (z_dim.size() != 2) { + z->Resize(z_dim); + } } }; @@ -67,6 +74,11 @@ class MulGradKernel : public framework::OpKernel { : *y; const Tensor* dout = ctx.Input(framework::GradVarName("Out")); + Tensor dout_mat; + dout_mat.ShareDataWith(*dout); + dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0], + framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]}); + Tensor* dx = ctx.Output(framework::GradVarName("X")); Tensor* dy = ctx.Output(framework::GradVarName("Y")); if (dx) { @@ -74,9 +86,10 @@ class MulGradKernel : public framework::OpKernel { Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix(*dx, x_num_col_dims) : *dx; + // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - math::matmul(ctx.device_context(), *dout, false, y_matrix, true, - 1, &dx_matrix, 0); + math::matmul(ctx.device_context(), dout_mat, false, y_matrix, + true, 1, &dx_matrix, 0); } if (dy) { dy->mutable_data(ctx.GetPlace()); @@ -84,8 +97,8 @@ class MulGradKernel : public framework::OpKernel { ? framework::ReshapeToMatrix(*dy, y_num_col_dims) : *dy; // dy = x' * dout. dy K x N, dout : M x N, x : M x K - math::matmul(ctx.device_context(), x_matrix, true, *dout, false, - 1, &dy_matrix, 0); + math::matmul(ctx.device_context(), x_matrix, true, dout_mat, + false, 1, &dy_matrix, 0); } } }; diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc index 46f66a1370a35593d1911fc9b3ce76beb38c0956..0599daa7688a5658ebea8902c4e15e63570539fb 100644 --- a/paddle/operators/reduce_op.cc +++ b/paddle/operators/reduce_op.cc @@ -160,66 +160,6 @@ class ReduceMinOpMaker : public ReduceOpMaker { } }; -class NormOp : public NetOp { - public: - NormOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : NetOp(type, inputs, outputs, attrs) { - PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName, - "Input(X) of NormOp should not be null."); - PADDLE_ENFORCE_NE(Output("AbsOut"), framework::kEmptyVarName, - "Output(AbsOut) of NormOp should not be null."); - PADDLE_ENFORCE_NE(Output("PowOut"), framework::kEmptyVarName, - "Output(PowOut) of NormOp should not be null."); - PADDLE_ENFORCE_NE(Output("SumOut"), framework::kEmptyVarName, - "Output(SumOut) of NormOp should not be null."); - PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName, - "Output(Out) of NormOp should not be null."); - auto dim = Attr("dim"); - auto keep_dim = Attr("keep_dim"); - auto p = Attr("p"); - PADDLE_ENFORCE_GT(p, 0, "Order of the norm should be positive."); - AppendOp(framework::OpRegistry::CreateOp("abs", {{"X", {Input("X")}}}, - {{"Y", {Output("AbsOut")}}}, {})); - AppendOp(framework::OpRegistry::CreateOp("pow", {{"X", {Output("AbsOut")}}}, - {{"Y", {Output("PowOut")}}}, - {{"factor", p}})); - framework::AttributeMap sum_attr; - sum_attr["dim"] = dim; - sum_attr["keep_dim"] = keep_dim; - AppendOp(framework::OpRegistry::CreateOp( - "reduce_sum", {{"X", {Output("PowOut")}}}, - {{"Out", {Output("SumOut")}}}, sum_attr)); - AppendOp(framework::OpRegistry::CreateOp( - "pow", {{"X", {Output("SumOut")}}}, {{"Y", {Output("Out")}}}, - {{"factor", static_cast(1. / p)}})); - CompleteAddOp(false); - } -}; - -class NormOpMaker : public ReduceOpMaker { - public: - NormOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : ReduceOpMaker(proto, op_checker) { - AddOutput("AbsOut", - "(Tensor) The intermediate output of Norm operator, " - "saving the absolute value of the input tensor X.") - .AsIntermediate(); - AddOutput("PowOut", - "(Tensor) The intermediate output of Norm operator, " - "saving the p-th power of the output tensor AbsOut.") - .AsIntermediate(); - AddOutput("SumOut", - "(Tensor) the intermediate output of Norm operator, " - "saving the sum of PowOut reduced on the given dimension.") - .AsIntermediate(); - AddAttr("p", "(float, default 2) The order of Norm.").SetDefault(2); - SetComment("Norm", "vector p-norm"); - AddComment(comment_); - } -}; - } // namespace operators } // namespace paddle @@ -237,8 +177,6 @@ REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad, REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad, ops::ReduceGradOp); -REGISTER_OP_WITHOUT_GRADIENT(norm, ops::NormOp, ops::NormOpMaker); - #define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ REGISTER_OP_CPU_KERNEL( \ reduce_type, \ diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc index a4f0f37764667c43d48c6aa7646d61cdf4f3fd2d..758481943d463f22eb6c6e0be9a99ad99161da5b 100644 --- a/paddle/operators/smooth_l1_loss_op.cc +++ b/paddle/operators/smooth_l1_loss_op.cc @@ -62,11 +62,13 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("InsideWeight", "Optional input tensor of smooth l1 loss op with the same shape " "as X. If provided, the result of (X - Y) will be multiplied " - "by this tensor element by element."); + "by this tensor element by element.") + .AsDispensable(); AddInput("OutsideWeight", "Optinal input of smooth l1 loss op with the same shape as X." "If provided, the output smooth l1 loss will be multiplied by " - "this tensor element by element."); + "this tensor element by element.") + .AsDispensable(); AddOutput("Diff", "Intermediate variable to cache InsideWeight*(X-Y).") .AsIntermediate(); AddOutput("Out", "Smooth l1 loss."); diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index daf519b91d623d4369774dc4e37dcb7b1733666b..eb850b658583f2256629d63fdb64248dbf249937 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -25,3 +25,4 @@ nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) +nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context) diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt index ceb66f84b6b01892cbaf61c79a47ae60d2589164..bb3fec1be9e811c26cc6851314e960e96fc366b3 100644 --- a/paddle/platform/dynload/CMakeLists.txt +++ b/paddle/platform/dynload/CMakeLists.txt @@ -1,2 +1,3 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) -nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc DEPS dynamic_loader) +nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc + DEPS dynamic_loader nccl) diff --git a/paddle/platform/dynload/dynamic_loader.cc b/paddle/platform/dynload/dynamic_loader.cc index ae9a0a982c73de05821579d22b7f9ad99f24a92b..6feba42c0d9d618d27da12e6a6752058b296995e 100644 --- a/paddle/platform/dynload/dynamic_loader.cc +++ b/paddle/platform/dynload/dynamic_loader.cc @@ -35,6 +35,11 @@ DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); +DEFINE_string(nccl_dir, "", + "Specify path for loading nccl library, such as libcublas, " + "libcurand. For instance, /usr/local/cuda/lib64. If default, " + "dlopen will search cuda from LD_LIBRARY_PATH"); + namespace paddle { namespace platform { namespace dynload { @@ -157,6 +162,14 @@ void GetLapackDsoHandle(void** dso_handle) { #endif } +void GetNCCLDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", dso_handle); +#endif +} + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynload/dynamic_loader.h b/paddle/platform/dynload/dynamic_loader.h index a99b05443feb909f10b2c56f4d8bdf3c6fa11e3f..c0e5452e5ae723ec314ebafde86a6ff63980be00 100644 --- a/paddle/platform/dynload/dynamic_loader.h +++ b/paddle/platform/dynload/dynamic_loader.h @@ -58,6 +58,14 @@ void GetWarpCTCDsoHandle(void** dso_handle); */ void GetLapackDsoHandle(void** dso_handle); +/** + * @brief load the DSO of NVIDIA nccl + * + * @param **dso_handle dso handler + * + */ +void GetNCCLDsoHandle(void** dso_handle); + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynload/nccl.cc b/paddle/platform/dynload/nccl.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f92b8d94d56047b7d3fb43b15e3c06575c8d57b --- /dev/null +++ b/paddle/platform/dynload/nccl.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/platform/dynload/nccl.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag nccl_dso_flag; +void *nccl_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/dynload/nccl.h b/paddle/platform/dynload/nccl.h new file mode 100644 index 0000000000000000000000000000000000000000..0618c7414fd1235e81ee9d92a3a07b53d6ad6ebc --- /dev/null +++ b/paddle/platform/dynload/nccl.h @@ -0,0 +1,72 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag nccl_dso_flag; +extern void* nccl_dso_handle; + +#ifdef PADDLE_USE_DSO +#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using nccl_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(nccl_dso_flag, \ + paddle::platform::dynload::GetNCCLDsoHandle, \ + &nccl_dso_handle); \ + void* p_##__name = dlsym(nccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#else +#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + ncclResult_t operator()(Args... args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#endif + +#define NCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(ncclCommInitAll); \ + __macro(ncclGetUniqueId); \ + __macro(ncclCommInitRank); \ + __macro(ncclCommDestroy); \ + __macro(ncclCommCount); \ + __macro(ncclCommCuDevice); \ + __macro(ncclCommUserRank); \ + __macro(ncclAllReduce); \ + __macro(ncclBcast); \ + __macro(ncclAllGather); \ + __macro(ncclReduce); \ + __macro(ncclGetErrorString); + +NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h index cd906c3fa9375cd6edaed0377a596771e25043d4..bfe708748a62ff9ac5d151bc652142e1f4925c83 100644 --- a/paddle/platform/enforce.h +++ b/paddle/platform/enforce.h @@ -29,11 +29,14 @@ limitations under the License. */ #include // for __cxa_demangle #endif +#include + #ifdef PADDLE_WITH_CUDA #include "paddle/platform/dynload/cublas.h" #include "paddle/platform/dynload/cudnn.h" #include "paddle/platform/dynload/curand.h" +#include "paddle/platform/dynload/nccl.h" #include #include @@ -172,6 +175,17 @@ inline typename std::enable_if::type throw_on_error( throw std::runtime_error(err + string::Sprintf(args...)); } +template +inline typename std::enable_if::type throw_on_error( + ncclResult_t stat, const Args&... args) { + if (stat == ncclSuccess) { + return; + } else { + throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + + string::Sprintf(args...)); + } +} + #endif // PADDLE_ONLY_CPU template diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..ab8b96f7263aed83407866fedf9e529ce0affe3f --- /dev/null +++ b/paddle/platform/nccl_test.cu @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/dynload/nccl.h" +#include "paddle/platform/enforce.h" +#include "paddle/platform/gpu_info.h" + +#include +#include +#include + +static int dev_count = 0; + +namespace paddle { +namespace platform { + +TEST(NCCL, init) { + std::vector comms; + comms.resize(dev_count); + + auto status = dynload::ncclCommInitAll(comms.data(), dev_count, nullptr); + PADDLE_ENFORCE(status); + for (int i = 0; i < dev_count; ++i) { + dynload::ncclCommDestroy(comms[i]); + } +} + +template +struct PerThreadData { + thrust::device_vector send_buff; + thrust::device_vector recv_buff; + CUDADeviceContext dev_ctx; + + T* SendBuff() { return thrust::raw_pointer_cast(send_buff.data()); } + + T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); } + + PerThreadData(int gpu_id, size_t size) : dev_ctx(GPUPlace(gpu_id)) { + send_buff.resize(size); + for (size_t i = 0; i < size; ++i) { + send_buff[i] = static_cast(i); + } + recv_buff.resize(size); + } +}; + +static constexpr int ELEM_COUNT = 10000; + +TEST(NCCL, all_reduce) { + std::vector comms; + comms.resize(dev_count); + VLOG(1) << "Initializing ncclComm"; + auto status = dynload::ncclCommInitAll(comms.data(), dev_count, nullptr); + PADDLE_ENFORCE(status); + VLOG(1) << "ncclComm initialized"; + VLOG(1) << "Creating thread data"; + std::vector>> data; + data.reserve(dev_count); + for (int i = 0; i < dev_count; ++i) { + VLOG(1) << "Creating thread data for device " << i; + SetDeviceId(i); + data.emplace_back(new PerThreadData(i, ELEM_COUNT)); + } + VLOG(1) << "Thread data created"; + + VLOG(1) << "Check send_buf data"; + for (int i = 0; i < dev_count; ++i) { + VLOG(1) << "Check on device " << i; + SetDeviceId(i); + thrust::host_vector tmp = data[i]->send_buff; + for (size_t j = 0; j < tmp.size(); ++j) { + ASSERT_NEAR(static_cast(j), tmp[j], 1e-5); + } + } + + VLOG(1) << "Invoking ncclAllReduce"; + + for (int i = 0; i < dev_count; ++i) { + VLOG(1) << "Invoking ncclAllReduce with device " << i; + SetDeviceId(i); + PADDLE_ENFORCE(dynload::ncclAllReduce( + data[i]->SendBuff(), data[i]->RecvBuff(), ELEM_COUNT, ncclDouble, + ncclSum, comms[i], data[i]->dev_ctx.stream())); + VLOG(1) << "Invoked ncclAllReduce for device " << i; + } + + VLOG(1) << "Invoked ncclAllReduce"; + + VLOG(1) << "Sync devices"; + for (int i = 0; i < dev_count; ++i) { + VLOG(1) << "Sync device " << i; + SetDeviceId(i); + data[i]->dev_ctx.Wait(); + } + VLOG(1) << "device synced"; + + for (int i = 0; i < dev_count; ++i) { + SetDeviceId(i); + VLOG(1) << "Checking vector on device " << i; + thrust::host_vector tmp = data[i]->recv_buff; + for (size_t j = 0; j < tmp.size(); ++j) { + auto elem = static_cast(j); + elem *= dev_count; + ASSERT_NEAR(tmp[j], elem, 1e-4); + } + } + + for (int i = 0; i < dev_count; ++i) { + dynload::ncclCommDestroy(comms[i]); + } +} +} // namespace platform +} // namespace paddle + +int main(int argc, char** argv) { + dev_count = paddle::platform::GetCUDADeviceCount(); + if (dev_count <= 1) { + LOG(WARNING) + << "Cannot test multi-gpu nccl, because the CUDA device count is " + << dev_count; + return 0; + } + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/platform/place.h b/paddle/platform/place.h index 0efc6932349a5b3ad295d195a16737a642e18943..5370360a7de26e409a1545182a12d3df1f37658b 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -35,6 +35,7 @@ struct GPUPlace { GPUPlace() : GPUPlace(0) {} explicit GPUPlace(int d) : device(d) {} + inline int GetDeviceId() const { return device; } // needed for variant equality comparison inline bool operator==(const GPUPlace &o) const { return device == o.device; } inline bool operator!=(const GPUPlace &o) const { return !(*this == o); } diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index 46c24e2cd53c068a25e1a5c8c6df600c3111e20a..d7cd738828a10b431370c92026b89d62add1275e 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -4,3 +4,5 @@ if(WITH_PYTHON) DEPS pybind python backward proto_desc tensor_array paddle_memory executor ${GLOB_OP_LIB}) endif(WITH_PYTHON) + +cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB} tensor_array) diff --git a/paddle/pybind/print_operators_doc.cc b/paddle/pybind/print_operators_doc.cc new file mode 100644 index 0000000000000000000000000000000000000000..24f2a9383f7a069f1a8c7ed2bf3da46720470efa --- /dev/null +++ b/paddle/pybind/print_operators_doc.cc @@ -0,0 +1,132 @@ +#include +#include // std::stringstream +#include + +#include "paddle/framework/op_info.h" +#include "paddle/framework/op_registry.h" +#include "paddle/pybind/pybind.h" + +std::string Escape(const std::string& s) { + std::string r; + for (size_t i = 0; i < s.size(); i++) { + switch (s[i]) { + case '\"': + r += "\\\""; + break; + case '\\': + r += "\\\\"; + break; + case '\n': + r += "\\n"; + break; + case '\t': + r += "\\t"; + case '\r': + break; + default: + r += s[i]; + break; + } + } + return r; +} + +std::string AttrType(paddle::framework::AttrType at) { + switch (at) { + case paddle::framework::INT: + return "int"; + case paddle::framework::FLOAT: + return "float"; + case paddle::framework::STRING: + return "string"; + case paddle::framework::BOOLEAN: + return "bool"; + case paddle::framework::INTS: + return "int array"; + case paddle::framework::FLOATS: + return "float array"; + case paddle::framework::STRINGS: + return "string array"; + case paddle::framework::BOOLEANS: + return "bool array"; + case paddle::framework::BLOCK: + return "block id"; + } + return "UNKNOWN"; // not possible +} + +void PrintVar(const paddle::framework::OpProto::Var& v, std::stringstream& ss) { + ss << " { " + << "\n" + << " \"name\" : \"" << Escape(v.name()) << "\",\n" + << " \"comment\" : \"" << Escape(v.comment()) << "\",\n" + << " \"duplicable\" : " << v.duplicable() << ",\n" + << " \"intermediate\" : " << v.intermediate() << "\n" + << " },"; +} + +void PrintAttr(const paddle::framework::OpProto::Attr& a, + std::stringstream& ss) { + ss << " { " + << "\n" + << " \"name\" : \"" << Escape(a.name()) << "\",\n" + << " \"type\" : \"" << AttrType(a.type()) << "\",\n" + << " \"comment\" : \"" << Escape(a.comment()) << "\",\n" + << " \"generated\" : " << a.generated() << "\n" + << " },"; +} + +void PrintOpProto(const std::string& type, + const paddle::framework::OpInfo& opinfo, + std::stringstream& ss) { + std::cerr << "Processing " << type << "\n"; + + const paddle::framework::OpProto* p = opinfo.proto_; + if (p == nullptr) { + return; // It is possible that an operator doesn't have OpProto. + } + + ss << "{\n" + << " \"type\" : \"" << Escape(p->type()) << "\",\n" + << " \"comment\" : \"" << Escape(p->comment()) << "\",\n"; + + ss << " \"inputs\" : [ " + << "\n"; + for (int i = 0; i < p->inputs_size(); i++) { + PrintVar(p->inputs(i), ss); + } + ss.seekp(-1, ss.cur); // remove the trailing comma + ss << " ], " + << "\n"; + + ss << " \"outputs\" : [ " + << "\n"; + for (int i = 0; i < p->outputs_size(); i++) { + PrintVar(p->outputs(i), ss); + } + ss.seekp(-1, ss.cur); // remove the trailing comma + ss << " ], " + << "\n"; + + ss << " \"attrs\" : [ " + << "\n"; + for (int i = 0; i < p->attrs_size(); i++) { + PrintAttr(p->attrs(i), ss); + } + ss.seekp(-1, ss.cur); // remove the trailing comma + ss << " ] " + << "\n"; + + ss << "},"; +} + +int main() { + std::stringstream ss; + ss << "[\n"; + for (auto& iter : paddle::framework::OpInfoMap::Instance().map()) { + PrintOpProto(iter.first, iter.second, ss); + } + ss.seekp(-1, ss.cur); // remove the trailing comma + ss << "]\n"; + std::cout << ss.str(); +} diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 405ac544e10f19a33399a649f76699fefc3d49b9..5d43ecea11202fa3f9e21fbde907c9d1d7dd4025 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -257,6 +257,7 @@ void BindOpDesc(py::module &m) { .def("block_attr", &OpDescBind::GetBlockAttr) .def("check_attrs", &OpDescBind::CheckAttrs) .def("infer_shape", &OpDescBind::InferShape) + .def("infer_var_type", &OpDescBind::InferVarType) .def("serialize_to_string", [](OpDescBind &op_desc) -> py::bytes { const OpDesc *desc = op_desc.Proto(); PADDLE_ENFORCE(desc->IsInitialized(), diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 26b793a4bbf5df7a2635838a6c6a8264ca8ebb67..b6e44fdbad6e2817e3077901f58177adc4bb0c71 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -225,15 +225,16 @@ All parameter, weight, gradient are variables in Paddle. //! Python str. If you want a str object, you should cast them in Python. m.def("get_all_op_protos", []() -> std::vector { std::vector ret_values; - - OpInfoMap::Instance().IterAllInfo([&ret_values](const std::string &type, - const OpInfo &info) { - if (!info.HasOpProtoAndChecker()) return; - std::string str; - PADDLE_ENFORCE(info.Proto().SerializeToString(&str), - "Serialize OpProto Error. This could be a bug of Paddle."); - ret_values.emplace_back(str); - }); + for (auto &iter : OpInfoMap::Instance().map()) { + auto &info = iter.second; + if (info.HasOpProtoAndChecker()) { + std::string str; + PADDLE_ENFORCE( + info.Proto().SerializeToString(&str), + "Serialize OpProto Error. This could be a bug of Paddle."); + ret_values.emplace_back(str); + } + } return ret_values; }); m.def_submodule( diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 120c9d11a5ebaa72b94590e596fd4362c552f979..3821d075cba5d39b5808a39093b8570d9302b667 100644 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1457,11 +1457,13 @@ def dot_product_attention(encoded_sequence, expanded = expand_layer( input=transformed_state, - expanded_as=encoded_sequence, + expand_as=encoded_sequence, name='%s_expand' % name) m = linear_comb_layer( - weights=expanded, vectors=encoded_sequence, name='%s_dot-product') + weights=expanded, + vectors=encoded_sequence, + name='%s_dot-product' % name) attention_weight = fc_layer( input=m, diff --git a/python/paddle/v2/framework/framework.py b/python/paddle/v2/framework/framework.py index 03a3dacf25c2ad5514e914d2f6e9637493ba80f4..40b9008d67b4e42093b9b9cbdecd1dbff4150b41 100644 --- a/python/paddle/v2/framework/framework.py +++ b/python/paddle/v2/framework/framework.py @@ -53,8 +53,8 @@ class Variable(object): if is_new_var: self.desc.set_data_type(dtype) else: - old_dtype = self.data_type() - if dtype != old_shape: + old_dtype = self.data_type + if dtype != old_dtype: raise ValueError("Variable {0} has been created before. " "The previous data type is {1}; the new " "data type is {2}. They are not " @@ -113,6 +113,10 @@ class Variable(object): def lod_level(self): return self.desc.lod_level() + @property + def type(self): + return self.desc.type() + @staticmethod def _unique_var_name_(): uid = core.unique_integer() # unique during whole process. @@ -192,31 +196,32 @@ class Operator(object): self.desc.set_type(type) proto = OpProtoHolder.instance().get_op_proto(type) - if inputs is not None: - given = set() - need = set() - for n in inputs: - given.add(n) - for m in proto.inputs: - need.add(m.name) - if not given == need: - raise ValueError( - "Incorrect setting for input(s) of operator \"%s\". Need: [%s] Given: [%s]" - % (type, ", ".join(str(e) for e in need), ", ".join( - str(e) for e in given))) + def find_name(var_list, name): + for var_name in var_list: + if var_name == name: + return True + return False + if inputs is not None: for in_proto in proto.inputs: - in_argus = inputs[in_proto.name] - if not isinstance(in_argus, list): - in_argus = [in_argus] - if not in_proto.duplicable and len(in_argus) > 1: - raise ValueError( - "Input %s expects only one input, but %d are given." % - (in_proto.name, len(in_argus))) - in_argu_names = [] - for argu in in_argus: - in_argu_names.append(argu.name) - self.desc.set_input(in_proto.name, in_argu_names) + found = find_name(inputs, in_proto.name) + assert found or in_proto.dispensable, "Input {} not found".format( + in_proto.name) + + if found: + in_argus = inputs[in_proto.name] + if not isinstance(in_argus, list): + in_argus = [in_argus] + if not in_proto.duplicable and len(in_argus) > 1: + raise ValueError( + "Input %s expects only one input, but %d are given." + % (in_proto.name, len(in_argus))) + in_argu_names = [] + for argu in in_argus: + in_argu_names.append(argu.name) + self.desc.set_input(in_proto.name, in_argu_names) + else: + self.desc.set_input(in_proto.name, []) if outputs is not None: given = set() @@ -250,13 +255,14 @@ class Operator(object): attr_name = attr.name if (not attr_name in attrs) or (attrs[attr_name] is None): continue - if not isinstance(attrs[attr_name], Block): - self.desc.set_attr(attr_name, attrs[attr_name]) - else: + if isinstance(attrs[attr_name], Block): self.desc.set_block_attr(attr_name, attrs[attr_name].desc) + else: + self.desc.set_attr(attr_name, attrs[attr_name]) self.desc.check_attrs() if type not in {'feed', 'fetch'}: + self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) def __str__(self): diff --git a/python/paddle/v2/framework/layer_helper.py b/python/paddle/v2/framework/layer_helper.py index 849a6f43065ae95e908e449e9ef9300b64692e5e..f3da32f0e07a22204b3feaed5d1d8d01556e4655 100644 --- a/python/paddle/v2/framework/layer_helper.py +++ b/python/paddle/v2/framework/layer_helper.py @@ -1,8 +1,11 @@ -from paddle.v2.framework.framework import Variable, OpProtoHolder, g_program, g_init_program -import paddle.v2.framework.core as core import copy import itertools +import paddle.v2.framework.core as core + +from paddle.v2.framework.framework import Variable, g_program, \ + g_init_program + def unique_name(prefix): uid = core.unique_integer() # unique during whole process. @@ -120,10 +123,7 @@ class LayerHelper(object): if attr['name'] is None: attr['name'] = unique_name(".".join([self.name, suffix])) self.init_program.global_block().create_parameter( - name=attr['name'], - dtype=dtype, - shape=shape, - init_attr=attr['init_attr']) + dtype=dtype, shape=shape, **attr) return self.program.global_block().create_parameter( name=attr['name'], dtype=dtype, shape=shape) @@ -133,6 +133,9 @@ class LayerHelper(object): dtype=dtype, persistable=False) + def create_variable(self, *args, **kwargs): + return self.program.current_block().create_var(*args, **kwargs) + def create_global_variable(self, *args, **kwargs): return self.program.global_block().create_var( *args, persistable=False, **kwargs) diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py index ac77aefa15333b06f9803ce1d91071df803483d1..6894c40c3a6514f448133f029c4de8cc30405515 100644 --- a/python/paddle/v2/framework/layers.py +++ b/python/paddle/v2/framework/layers.py @@ -1,9 +1,12 @@ -from paddle.v2.framework.layer_helper import LayerHelper +from paddle.v2.framework.layer_helper import LayerHelper, unique_name import paddle.v2.framework.core as core -from paddle.v2.framework.framework import OpProtoHolder, Variable +from paddle.v2.framework.framework import OpProtoHolder, Variable, Program import re -__all__ = ['fc', 'data', 'cross_entropy', 'conv2d', 'pool2d'] +__all__ = [ + 'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat', + 'StaticRNN' +] def fc(input, @@ -24,7 +27,9 @@ def fc(input, mul_results = [] for input_var, param_attr in helper.iter_inputs_and_params(): input_shape = input_var.shape - param_shape = list(input_shape[num_flatten_dims:]) + [size] + param_shape = [ + reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) + ] + [size] w = helper.create_parameter( attr=param_attr, shape=param_shape, dtype=dtype) @@ -36,10 +41,8 @@ def fc(input, "Y": w, }, outputs={"Out": tmp}, - attrs={ - 'x_num_col_dims': num_flatten_dims, - 'y_num_col_dims': len(input_shape) - num_flatten_dims - }) + attrs={'x_num_col_dims': num_flatten_dims, + 'y_num_col_dims': 1}) mul_results.append(tmp) # sum @@ -55,6 +58,24 @@ def fc(input, return helper.append_activation(pre_activation) +def embedding(input, + size, + data_type='float32', + param_attr=None, + program=None, + init_program=None): + helper = LayerHelper('embedding', **locals()) + w = helper.create_parameter( + attr=helper.param_attr, shape=size, dtype=data_type) + tmp = helper.create_tmp_variable(data_type) + helper.append_op( + type='lookup_table', + inputs={'Ids': input, + 'W': w}, + outputs={'Out': tmp}) + return tmp + + def data(name, shape, data_type='float32', @@ -122,6 +143,19 @@ _create_op_func_('mean') _create_op_func_('mul') +def concat(input, axis, program=None, init_program=None): + helper = LayerHelper('concat', **locals()) + if not isinstance(input, list) and not isinstance(input, tuple): + input = [input] + out = helper.create_tmp_variable(dtype=input[0].data_type) + helper.append_op( + type='concat', + inputs={'X': input}, + outputs={'Out': [out]}, + attrs={'axis': axis}) + return out + + def cross_entropy(input, label, **kwargs): helper = LayerHelper('cross_entropy', **kwargs) out = helper.create_tmp_variable(dtype=input.data_type) @@ -240,3 +274,170 @@ def pool2d(input, }) return pool_out + + +class BlockGuard(object): + """ + BlockGuard used to create sub-block in program by using Python `with` + keyword. + """ + + def __init__(self, program): + if not isinstance(program, Program): + raise TypeError("BlockGuard takes a program") + self.program = program + + def __enter__(self): + self.program.create_block() + + def __exit__(self, exc_type, exc_val, exc_tb): + self.program.rollback() + if exc_type is not None: + return False # re-raise exception + return True + + +class StaticRNNGuard(BlockGuard): + def __init__(self, rnn): + if not isinstance(rnn, StaticRNN): + raise TypeError("StaticRNNGuard takes an StaticRNN") + super(StaticRNNGuard, self).__init__(rnn.helper.program) + self.rnn = rnn + + def __enter__(self): + self.rnn.status = StaticRNN.IN_RNN_BLOCK + return super(StaticRNNGuard, self).__enter__() + + def __exit__(self, exc_type, exc_val, exc_tb): + self.rnn.status = StaticRNN.AFTER_RNN_BLOCK + self.rnn.complete_rnn_op() + return super(StaticRNNGuard, self).__exit__(exc_type, exc_val, exc_tb) + + +class StaticRNNMemoryLink(object): + """ + :param init: the initial variable for Memory + :type init: Variable + :param pre_mem: the memory variable in previous time step + :type pre_mem: Variable + :param mem: the memory variable in current time step + :type mem: Variable + """ + + def __init__(self, init, pre_mem, mem=None): + self.init = init + self.pre_mem = pre_mem + self.mem = mem + + +class StaticRNN(object): + BEFORE_RNN_BLOCK = 0 + IN_RNN_BLOCK = 1 + AFTER_RNN_BLOCK = 2 + + def __init__(self, name=None, program=None): + self.helper = LayerHelper("static_rnn", name=name, program=program) + self.memories = {} # memory map, from pre_mem.name --> MemoryLink + self.inputs = [] # input variable list in current block + self.outputs = [] # output variable list in parent block + self.status = StaticRNN.BEFORE_RNN_BLOCK # status flag. + # sequence length, since it is a static RNN, sequence length are fixed. + self.seq_len = None + + def step(self): + return StaticRNNGuard(self) + + def _assert_in_rnn_block_(self, method): + if self.status != StaticRNN.IN_RNN_BLOCK: + raise ValueError("You must invoke {0} in rnn block".format(method)) + + def memory(self, init=None, shape=None, dtype=None, init_value=0): + self._assert_in_rnn_block_('memory') + if init is None: + if shape is None or dtype is None: + raise ValueError( + "if init is None, memory at least need shape and dtype") + parent_block = self.parent_block() + var_name = unique_name("@".join([self.helper.name, "memory_boot"])) + boot_var = parent_block.create_var( + name=var_name, shape=shape, dtype=dtype, persistable=False) + + parent_block.append_op( + type="fill_constant", + inputs={}, + outputs={'Out': [boot_var]}, + attrs={ + 'value': init_value, + 'shape': boot_var.shape, + 'data_type': boot_var.data_type + }) + + return self.memory(init=boot_var) + else: + pre_mem = self.helper.create_variable( + name=unique_name("@".join([self.helper.name, "mem"])), + dtype=init.data_type, + shape=init.shape) + self.memories[pre_mem.name] = StaticRNNMemoryLink( + init=init, pre_mem=pre_mem) + return pre_mem + + def step_input(self, x): + self._assert_in_rnn_block_('step_input') + if not isinstance(x, Variable): + raise TypeError("step input takes a Variable") + if self.seq_len is None: + self.seq_len = x.shape[1] + elif self.seq_len != x.shape[1]: + raise ValueError("Static RNN only take fix seq_len input") + + ipt = self.helper.create_variable( + name=x.name, + dtype=x.data_type, + shape=[-1] + list(x.shape[2:]), + type=x.type) + self.inputs.append(ipt) + return ipt + + def step_output(self, o): + self._assert_in_rnn_block_('step_output') + if not isinstance(o, Variable): + raise TypeError("step output takes a Variable") + + out_var = self.parent_block().create_var( + name=o.name, + shape=[-1, self.seq_len] + list(o.shape[1:]), + dtype=o.data_type) + + self.outputs.append(out_var) + + def output(self, *outputs): + for each in outputs: + self.step_output(each) + + def update_memory(self, mem, var): + if not isinstance(mem, Variable) or not isinstance(var, Variable): + raise TypeError("update memory should take variables") + self.memories[mem.name].mem = var + + def parent_block(self): + prog = self.helper.program + parent_idx = prog.current_block().parent_idx + assert parent_idx >= 0 + parent_block = prog.block(parent_idx) + return parent_block + + def __call__(self, *args, **kwargs): + if self.status != StaticRNN.AFTER_RNN_BLOCK: + raise ValueError("RNN output can only be retrieved after rnn block") + if len(self.outputs) == 0: + raise ValueError("RNN has no output") + elif len(self.outputs) == 1: + return self.outputs[0] + else: + return self.outputs + + def complete_rnn_op(self): + # TODO(yuyang18): Create RNN Op here. + # Implement this method after RNN op complete. + pass diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py index 215fa0b94e423755b7bc3f05a2b14a8c85451202..0fdc21ef5133d17b33860a0e095574d3136b2fd1 100644 --- a/python/paddle/v2/framework/tests/op_test.py +++ b/python/paddle/v2/framework/tests/op_test.py @@ -4,6 +4,8 @@ import random import itertools import paddle.v2.framework.core as core from paddle.v2.framework.op import Operator +from paddle.v2.framework.executor import Executor +from paddle.v2.framework.framework import Program, OpProtoHolder def grad_var_name(var_name): @@ -177,7 +179,12 @@ def get_backward_op(scope, op, no_grad_set): return backward_op -def get_gradient(scope, op, inputs, outputs, grad_name, place, +def get_gradient(scope, + op, + inputs, + outputs, + grad_names, + place, no_grad_set=None): ctx = core.DeviceContext.create(place) @@ -193,8 +200,52 @@ def get_gradient(scope, op, inputs, outputs, grad_name, place, backward_op.run(scope, ctx) - out = np.array(scope.find_var(grad_name).get_tensor()) - return out + return [ + np.array(scope.find_var(grad_name).get_tensor()) + for grad_name in grad_names + ] + + +def append_input_output(block, op_proto, np_list, is_input): + '''Insert VarDesc and generate Python variable instance''' + proto_list = op_proto.inputs if is_input else op_proto.outputs + + def create_var(block, name, np_list, var_proto): + if name not in np_list: + assert var_proto.intermediate, "{} not found".format(name) + shape = None + lod_level = None + else: + np_value = np_list[name] + if isinstance(np_value, tuple): + shape = list(np_value[0].shape) + lod_level = len(np_value[1]) + else: + shape = list(np_value.shape) + lod_level = 0 + return block.create_var( + dtype="float32", shape=shape, lod_level=lod_level, name=name) + + var_dict = {} + for var_proto in proto_list: + var_name = str(var_proto.name) + if is_input: + if (var_name not in np_list) and var_proto.dispensable: + continue + assert (var_name in np_list) or (var_proto.dispensable), \ + "Missing {} as input".format(var_name) + if var_proto.duplicable: + assert isinstance(np_list[var_name], list), \ + "Duplicable {} should be set as list".format(var_name) + var_list = [] + for (name, np_value) in np_list[var_name]: + var_list.append( + create_var(block, name, {name: np_value}, var_proto)) + var_dict[var_name] = var_list + else: + var_dict[var_name] = create_var(block, var_name, np_list, var_proto) + + return var_dict class OpTest(unittest.TestCase): @@ -213,48 +264,93 @@ class OpTest(unittest.TestCase): np.random.set_state(cls._np_rand_state) random.setstate(cls._py_rand_state) + def feed_var(self, input_vars, place): + feed_map = {} + for var_name in input_vars: + if isinstance(input_vars[var_name], list): + for name, np_value in self.inputs[var_name]: + tensor = core.LoDTensor() + tensor.set(np_value, place) + feed_map[name] = tensor + else: + tensor = core.LoDTensor() + if isinstance(self.inputs[var_name], tuple): + tensor.set(self.inputs[var_name][0], place) + tensor.set_lod(self.inputs[var_name][1]) + else: + tensor.set(self.inputs[var_name], place) + feed_map[var_name] = tensor + + return feed_map + def check_output_with_place(self, place, atol): - self.scope = core.Scope() - op_inputs = self.inputs if hasattr(self, "inputs") else dict() - op_outputs = self.outputs if hasattr(self, "outputs") else dict() - op_attrs = self.attrs if hasattr(self, "attrs") else dict() - self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs, - op_attrs) - if isinstance(place, core.GPUPlace) and not self.op.support_gpu(): - return - set_input(self.scope, self.op, self.inputs, place) - ctx = core.DeviceContext.create(place) - self.op.run(self.scope, ctx) + op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) + + program = Program() + block = program.global_block() + + inputs = append_input_output(block, op_proto, self.inputs, True) + outputs = append_input_output(block, op_proto, self.outputs, False) + + op = block.append_op( + type=self.op_type, + inputs=inputs, + outputs=outputs, + attrs=self.attrs if hasattr(self, "attrs") else dict()) + + fetch_list = [] + for var_name, var in outputs.iteritems(): + if var_name in self.outputs: + if isinstance(var, list): + for v in var: + fetch_list.append(v) + else: + fetch_list.append(var) - for out_name, out_dup in Operator.get_op_outputs(self.op.type()): + feed_map = self.feed_var(inputs, place) + + exe = Executor(place) + outs = exe.run(program, feed=feed_map, fetch_list=fetch_list) + + for out_name, out_dup in Operator.get_op_outputs(self.op_type): if out_name not in self.outputs: continue + def find_actual(target_name, fetch_list): + found = [ + i for i, var in enumerate(fetch_list) + if var.name == target_name + ] + self.assertTrue( + len(found) == 1, "Found {} {}".format( + len(found), target_name)) + return found[0] + if out_dup: sub_out = self.outputs[out_name] if not isinstance(sub_out, list): raise AssertionError("sub_out type %s is not list", type(sub_out)) - for sub_out_name, expect in sub_out: - actual = np.array( - self.scope.find_var(sub_out_name).get_tensor()) + idx = find_actual(sub_out_name, fetch_list) + actual = outs[idx] self.assertTrue( np.allclose( actual, expect, atol=atol), - "output name: " + out_name + " has diff.") + "Output (" + sub_out_name + ") has diff at " + + str(place)) else: - actual = np.array(self.scope.find_var(out_name).get_tensor()) + idx = find_actual(out_name, fetch_list) + actual = outs[idx] expect = self.outputs[out_name] - self.assertTrue( np.allclose( actual, expect, atol=atol), - "output name: " + out_name + " has diff.") + "Output (" + out_name + ") has diff at " + str(place)) def check_output(self, atol=1e-5): places = [core.CPUPlace()] - if core.is_compile_gpu(): + if core.is_compile_gpu() and core.op_support_gpu(self.op_type): places.append(core.GPUPlace(0)) for place in places: self.check_output_with_place(place, atol) @@ -310,11 +406,9 @@ class OpTest(unittest.TestCase): ] cpu_place = core.CPUPlace() - cpu_analytic_grads = [ - get_gradient(self.scope, self.op, self.inputs, self.outputs, - grad_name, cpu_place, no_grad_set) - for grad_name in grad_names - ] + cpu_analytic_grads = get_gradient(self.scope, self.op, self.inputs, + self.outputs, grad_names, cpu_place, + no_grad_set) self.__assert_is_close(numeric_grads, cpu_analytic_grads, grad_names, max_relative_error, @@ -322,11 +416,9 @@ class OpTest(unittest.TestCase): if core.is_compile_gpu() and self.op.support_gpu(): gpu_place = core.GPUPlace(0) - gpu_analytic_grads = [ - get_gradient(self.scope, self.op, self.inputs, self.outputs, - grad_name, gpu_place, no_grad_set) - for grad_name in grad_names - ] + gpu_analytic_grads = get_gradient(self.scope, self.op, self.inputs, + self.outputs, grad_names, + gpu_place, no_grad_set) self.__assert_is_close(numeric_grads, gpu_analytic_grads, grad_names, max_relative_error, diff --git a/python/paddle/v2/framework/tests/test_accuracy_op.py b/python/paddle/v2/framework/tests/test_accuracy_op.py index b6f3a35d6f58ba90b39e3f6296ae635220a2e965..02be9a02910bee3eae63e12cceaa51cf53591539 100644 --- a/python/paddle/v2/framework/tests/test_accuracy_op.py +++ b/python/paddle/v2/framework/tests/test_accuracy_op.py @@ -16,7 +16,9 @@ class TestAccuracyOp(OpTest): if ele == label[rowid]: num_correct += 1 break - self.outputs = {'Accuracy': [num_correct / float(n)]} + self.outputs = { + 'Accuracy': np.array([num_correct / float(n)]).astype("float32") + } def test_check_output(self): self.check_output() diff --git a/python/paddle/v2/framework/tests/test_activation_op.py b/python/paddle/v2/framework/tests/test_activation_op.py index 5831b880e4c5ef881929920e87ac64d6c87a2ab5..c1668cd00ff6c3782dd17a789e4ad93b92e5209d 100644 --- a/python/paddle/v2/framework/tests/test_activation_op.py +++ b/python/paddle/v2/framework/tests/test_activation_op.py @@ -172,8 +172,8 @@ class TestBRelu(OpTest): def setUp(self): self.op_type = "brelu" x = np.random.uniform(-1, 1, [4, 4]).astype("float32") - t_min = 1 - t_max = 4 + t_min = 1.0 + t_max = 4.0 # The same with TestAbs x[np.abs(x - t_min) < 0.005] = t_min + 0.02 x[np.abs(x - t_max) < 0.005] = t_max + 0.02 @@ -218,7 +218,7 @@ class TestSoftRelu(OpTest): def setUp(self): self.op_type = "soft_relu" x = np.random.uniform(-3, 3, [4, 4]).astype("float32") - threshold = 2 + threshold = 2.0 # The same reason with TestAbs x[np.abs(x - threshold) < 0.005] = threshold + 0.02 x[np.abs(x + threshold) < 0.005] = -threshold + 0.02 @@ -303,7 +303,7 @@ class TestPow(OpTest): def setUp(self): self.op_type = "pow" self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")} - self.attrs = {'factor': 3} + self.attrs = {'factor': 3.0} self.outputs = {'Y': np.power(self.inputs['X'], 3)} def test_check_output(self): diff --git a/python/paddle/v2/framework/tests/test_clip_op.py b/python/paddle/v2/framework/tests/test_clip_op.py index 5df6a494989017bab0416e0af962b2a85db046ba..a7e1bf174408e4139db0435d9f4bb0c885f76705 100644 --- a/python/paddle/v2/framework/tests/test_clip_op.py +++ b/python/paddle/v2/framework/tests/test_clip_op.py @@ -37,14 +37,14 @@ class TestCase1(TestClipOp): def initTestCase(self): self.shape = (8, 16, 8) self.max = 0.7 - self.min = 0 + self.min = 0.0 class TestCase2(TestClipOp): def initTestCase(self): self.shape = (8, 16) - self.max = 1 - self.min = 0 + self.max = 1.0 + self.min = 0.0 class TestCase3(TestClipOp): diff --git a/python/paddle/v2/framework/tests/test_conv2dtranspose_op.py b/python/paddle/v2/framework/tests/test_conv2dtranspose_op.py new file mode 100644 index 0000000000000000000000000000000000000000..71ca262f00378381d2d65e87d198d6b1755e9a2b --- /dev/null +++ b/python/paddle/v2/framework/tests/test_conv2dtranspose_op.py @@ -0,0 +1,102 @@ +import unittest +import numpy as np +from op_test import OpTest + + +def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param): + # [2, 3, 5, 5] + in_n, in_c, in_h, in_w = input_.shape + # [3, 6, 3, 3] + f_c, out_c, f_h, f_w = filter_.shape + assert in_c == f_c + + stride, pad = conv2dtranspose_param['stride'], conv2dtranspose_param['pad'] + out_h = (in_h - 1) * stride[0] + f_h + out_w = (in_w - 1) * stride[1] + f_w + + out = np.zeros((in_n, out_c, out_h, out_w)) + + for n in range(in_n): + for i in range(in_h): + for j in range(in_w): + input_masked = input_[n, :, i, j] # (c) + input_masked = np.reshape(input_masked, (in_c, 1, 1)) + input_masked = np.tile(input_masked, (1, f_h, f_w)) + + for k in range(out_c): + tmp_out = np.sum(input_masked * filter_[:, k, :, :], axis=0) + i1, i2 = i * stride[0], i * stride[0] + f_h + j1, j2 = j * stride[0], j * stride[0] + f_w + out[n, k, i1:i2, j1:j2] += tmp_out + + return out + + +class TestConv2dTransposeOp(OpTest): + def setUp(self): + # init as conv transpose + self.init_op_type() + + # [2, 3, 5, 5] -> kernel [3, 6, 3, 3] -> output [2, 6, 7, 7] + self.init_test_case() + + conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad} + input_ = np.random.random(self.input_size).astype("float32") + filter_ = np.random.random(self.filter_size).astype("float32") + output = conv2dtranspose_forward_naive(input_, filter_, + conv2dtranspose_param) + # print 'deconv output py', output, output.shape + + self.inputs = {'Input': input_, 'Filter': filter_} + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + # 'dilations': self.dilations + } + self.outputs = {'Output': output} + + def test_check_output(self): + print 'check output here' + self.check_output() + + def test_check_grad(self): + self.check_grad( + set(['Input', 'Filter']), 'Output', max_relative_error=0.05) + + def test_check_grad_no_filter(self): + self.check_grad( + ['Input'], + 'Output', + max_relative_error=0.05, + no_grad_set=set(['Filter'])) + + def test_check_grad_no_input(self): + self.check_grad( + ['Filter'], + 'Output', + max_relative_error=0.05, + no_grad_set=set(['Input'])) + + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + def init_op_type(self): + self.op_type = "conv2dtranspose" + + +""" +class TestCudnn(TestConv2dOp): + def init_group(self): + self.groups = 1 + + def init_op_type(self): + self.op_type = "conv_cudnn" +""" + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_fc_op.py b/python/paddle/v2/framework/tests/test_fc_op.py deleted file mode 100644 index 9f56fe5049c66aa5fce40ce815105e7871ebc3b2..0000000000000000000000000000000000000000 --- a/python/paddle/v2/framework/tests/test_fc_op.py +++ /dev/null @@ -1,62 +0,0 @@ -import unittest -import numpy as np -from op_test import OpTest - - -class TestFCOp1(OpTest): - def setUp(self): - x0 = np.random.random((16, 32)).astype("float32") - w0 = np.random.random((32, 10)).astype("float32") - - mul_out0 = np.dot(x0, w0) - identity_out = mul_out0 - - self.op_type = "fc" - self.inputs = {"X": [("X0", x0)], "W": [("W0", w0)]} - self.outputs = {"MulOut": [("MulOut0", mul_out0)], "Out": identity_out} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X0", "W0"], "Out", max_relative_error=0.01) - - -class TestFCOp2(OpTest): - def setUp(self): - x0 = np.random.random((16, 4, 8)).astype("float32") - x1 = np.random.random((4, 4, 32)).astype("float32") - w0 = np.random.random((32, 10)).astype("float32") - w1 = np.random.random((32, 10)).astype("float32") - b = np.random.random(10).astype("float32") - - mul_out0 = np.dot(x0.reshape(16, 4 * 8), w0) - mul_out1 = np.dot(x1.reshape(4 * 4, 32), w1) - sum_out = mul_out0 + mul_out1 - add_out = np.add(sum_out, b) - sigmoid_out = 1 / (1 + np.exp(-add_out)) - - self.op_type = "fc" - self.inputs = { - "X": [("X0", x0), ("X1", x1)], - "W": [("W0", w0), ("W1", w1)], - "B": b - } - self.attrs = {"xNumColDims": [1, 2], "activation": "sigmoid"} - self.outputs = { - "MulOut": [("MulOut0", mul_out0), ("MulOut1", mul_out1)], - "SumOut": sum_out, - "AddOut": add_out, - "Out": sigmoid_out - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad( - ["X0", "X1", "W0", "W1", "B"], "Out", max_relative_error=0.01) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/v2/framework/tests/test_identity_op.py b/python/paddle/v2/framework/tests/test_identity_op.py deleted file mode 100644 index 26cec1fcc3ad003281c9c41571d475b55bd30026..0000000000000000000000000000000000000000 --- a/python/paddle/v2/framework/tests/test_identity_op.py +++ /dev/null @@ -1,20 +0,0 @@ -import unittest -import numpy as np -from op_test import OpTest - - -class TestIdentityOp(OpTest): - def setUp(self): - self.op_type = "identity" - self.inputs = {'X': np.random.random((10, 10)).astype("float32")} - self.outputs = {'Y': self.inputs['X']} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Y') - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/v2/framework/tests/test_interp_op.py b/python/paddle/v2/framework/tests/test_interp_op.py deleted file mode 100644 index 066569b96c9611bd20e7192f8bd6caa6e467202f..0000000000000000000000000000000000000000 --- a/python/paddle/v2/framework/tests/test_interp_op.py +++ /dev/null @@ -1,28 +0,0 @@ -import unittest -import numpy as np -from op_test import OpTest - - -class TestInterpOp(OpTest): - def setUp(self): - self.op_type = "interp" - x = np.random.random((2, 3)).astype("float32") - y = np.random.random((2, 3)).astype("float32") - w = np.random.random(2).astype("float32") - - sub_out = x - y - mul_out = sub_out * w.reshape(2, 1) - out = mul_out + y - - self.inputs = {'X': x, 'Y': y, 'W': w} - self.outputs = {'Out': out, 'SubOut': sub_out, 'MulOut': mul_out} - - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out') - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/v2/framework/tests/test_layers.py b/python/paddle/v2/framework/tests/test_layers.py index 4ecc02b12d8db53e897dea10186bc053d05be303..7aedb985f98f2d8953e0968d19ece9c70d792246 100644 --- a/python/paddle/v2/framework/tests/test_layers.py +++ b/python/paddle/v2/framework/tests/test_layers.py @@ -88,6 +88,77 @@ class TestBook(unittest.TestCase): print str(program) + def test_word_embedding(self): + program = Program() + dict_size = 10000 + embed_size = 32 + first_word = layers.data( + name='firstw', shape=[1], data_type='int32', program=program) + second_word = layers.data( + name='secondw', shape=[1], data_type='int32', program=program) + third_word = layers.data( + name='thirdw', shape=[1], data_type='int32', program=program) + forth_word = layers.data( + name='forthw', shape=[1], data_type='int32', program=program) + next_word = layers.data( + name='nextw', shape=[1], data_type='int32', program=program) + + embed_param_attr_1 = { + 'name': 'shared_w', + 'init_attr': { + 'max': 1.0, + 'type': 'uniform_random', + 'min': -1.0 + } + } + embed_param_attr_2 = {'name': 'shared_w'} + + embed_first = layers.embedding( + input=first_word, + size=[dict_size, embed_size], + data_type='float32', + param_attr=embed_param_attr_1, + program=program) + embed_second = layers.embedding( + input=second_word, + size=[dict_size, embed_size], + data_type='float32', + param_attr=embed_param_attr_2, + program=program) + + embed_third = layers.embedding( + input=third_word, + size=[dict_size, embed_size], + data_type='float32', + param_attr=embed_param_attr_2, + program=program) + embed_forth = layers.embedding( + input=forth_word, + size=[dict_size, embed_size], + data_type='float32', + param_attr=embed_param_attr_2, + program=program) + + concat_embed = layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], + axis=1, + program=program) + + hidden1 = layers.fc(input=concat_embed, + size=256, + act='sigmoid', + program=program) + predict_word = layers.fc(input=hidden1, + size=dict_size, + act='softmax', + program=program) + cost = layers.cross_entropy( + input=predict_word, label=next_word, program=program) + avg_cost = layers.mean(x=cost, program=program) + self.assertIsNotNone(avg_cost) + + print str(program) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/framework/tests/test_lookup_table_op.py b/python/paddle/v2/framework/tests/test_lookup_table_op.py index b259bb67e832adcb31b0ab4e992738be2b85f884..2c48f9bf93b939aa631cd54e8fb14b5cba22f2e0 100644 --- a/python/paddle/v2/framework/tests/test_lookup_table_op.py +++ b/python/paddle/v2/framework/tests/test_lookup_table_op.py @@ -8,7 +8,8 @@ class TestLookupTableOp(OpTest): self.op_type = "lookup_table" table = np.random.random((17, 31)).astype("float32") ids = np.random.randint(0, 17, 4).astype("int32") - self.inputs = {'W': table, 'Ids': ids} + ids_expand = np.expand_dims(ids, axis=1) + self.inputs = {'W': table, 'Ids': ids_expand} self.outputs = {'Out': table[ids]} def test_check_output(self): diff --git a/python/paddle/v2/framework/tests/test_lstm_op.py b/python/paddle/v2/framework/tests/test_lstm_op.py new file mode 100644 index 0000000000000000000000000000000000000000..bcce8d32c944a39e6d6aad4c99f8aa152222c3c1 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_lstm_op.py @@ -0,0 +1,185 @@ +import unittest +import numpy as np +from op_test import OpTest + +SIGMOID_THRESHOLD_MIN = -40.0 +SIGMOID_THRESHOLD_MAX = 13.0 +EXP_MAX_INPUT = 40.0 + + +def identity(x): + return x + + +def sigmoid(x): + y = np.copy(x) + y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN + y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX + return 1. / (1. + np.exp(-y)) + + +def tanh(x): + y = -2. * x + y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT + return (2. / (1. + np.exp(y))) - 1. + + +def relu(x): + return np.maximum(x, 0) + + +ACTVATION = { + 'identity': identity, + 'sigmoid': sigmoid, + 'tanh': tanh, + 'relu': relu +} + + +def lstm( + input, # T x 4D + lod, # 1 x N + h0=None, # N x D + c0=None, # N x D + w_h=None, # D x 4D + w_b=None, # 1 x 4D + w_c=None, # 1 x 3D + is_reverse=False, + act_gate=None, + act_cell=None, + act_cand=None): + def _step(x, w_h, w_c, h_pre, c_pre, act_gate, act_cell, act_cand): + g = np.dot(h_pre, w_h) # 1 x 4D + g = g + x + g = np.reshape(g, (1, g.size)) + c_tmp, g_i, g_f, g_o = np.split(g, 4, axis=1) + if w_c is None: + g_i = act_gate(g_i) # 1 x D + g_f = act_gate(g_f) # 1 x D + else: + w_ic, w_fc, w_oc = np.split(w_c, 3, axis=1) + g_i = act_gate(g_i + w_ic * c_pre) # 1 x D + g_f = act_gate(g_f + w_fc * c_pre) # 1 x D + c = g_f * c_pre + g_i * act_cand(c_tmp) # 1 x D + + if w_c is None: + g_o = act_gate(g_o) # 1 x D + else: + _, _, w_oc = np.split(w_c, 3, axis=1) + g_o = act_gate(g_o + w_oc * c) # 1 x D + h = g_o * act_cell(c) + bg = np.concatenate((act_cand(c_tmp), g_i, g_f, g_o), axis=1) + return h, c, bg + + def _reverse(x, lod): + y = np.zeros_like(x) + for i in range(len(lod) - 1): + b, e = lod[i], lod[i + 1] + y[b:e, :] = np.flip(x[b:e, :], 0) + return y + + offset = lod[0] + batch_size = len(offset) - 1 + hidden = [] + cell = [] + gate = [] + input = _reverse(input, offset) if is_reverse else input + if w_b is not None: + input = input + np.tile(w_b, (offset[-1], 1)) + for i in range(batch_size): + # compute one sequence + seq_len = offset[i + 1] - offset[i] + x = input[offset[i]:offset[i + 1], :] + h_pre = h0[i] # 1 x D + c_pre = c0[i] # 1 x D + for j in range(seq_len): + # compute one step + h_pre, c_pre, g_pre = _step(x[j], w_h, w_c, h_pre, c_pre, act_gate, + act_cell, act_cand) + hidden.append(h_pre.flatten()) + cell.append(c_pre.flatten()) + gate.append(g_pre.flatten()) + + hidden = np.array(hidden).astype("float64") + cell = np.array(cell).astype("float64") + gate = np.array(gate).astype("float64") + + hidden = _reverse(hidden, offset) if is_reverse else hidden + cell = _reverse(cell, offset) if is_reverse else cell + + assert gate.shape == input.shape + assert hidden.shape == (input.shape[0], input.shape[1] / 4) + assert cell.shape == (input.shape[0], input.shape[1] / 4) + return hidden, cell, gate + + +class TestLstmOp(OpTest): + def set_data(self): + self.lod = [[0, 2, 6, 9]] + self.D = 64 + self.sort_idx = [2, 6, 0, 3, 7, 1, 4, 8, 5] + + self.act_gate = "sigmoid" + self.act_cell = "tanh" + self.act_cand = "tanh" + + self.is_reverse = False + + def setUp(self): + self.set_data() + self.op_type = "lstm" + + T = self.lod[0][-1] + N = len(self.lod[0]) - 1 + + x = np.random.normal(size=(T, 4 * self.D)).astype("float64") + h0 = np.zeros((N, self.D)).astype("float64") + c0 = np.zeros((N, self.D)).astype("float64") + w = np.random.normal(size=(self.D, 4 * self.D)).astype("float64") + b = np.random.normal(size=(1, 7 * self.D)).astype("float64") + + w_b = b[:, 0:4 * self.D] + w_c = b[:, 4 * self.D:] + h, c, g = lstm(x, self.lod, h0, c0, w, w_b, w_c, self.is_reverse, + ACTVATION[self.act_gate], ACTVATION[self.act_cell], + ACTVATION[self.act_cand]) + + g_sort = np.zeros_like(x) + for i, j in enumerate(self.sort_idx): + g_sort[i, :] = g[j, :] + + self.inputs = { + 'Input': (x, self.lod), + 'H0': h0, + 'C0': c0, + 'Weight': w, + 'Bias': b + } + self.outputs = {'Hidden': h, 'Cell': c, 'BatchGate': g_sort} + self.attrs = { + 'usePeepholes': True, + 'isReverse': self.is_reverse, + 'gateActivation': 'sigmoid', + 'cellActivation': 'tanh', + 'candidateActivation': 'tanh' + } + + def test_check_output(self): + self.check_output() + + +class TestLstmOpRerverse(TestLstmOp): + def set_data(self): + self.lod = [[0, 2, 6, 9]] + self.D = 64 + self.sort_idx = [2, 6, 0, 3, 7, 1, 4, 8, 5] + + self.act_gate = "sigmoid" + self.act_cell = "tanh" + self.act_cand = "tanh" + + self.is_reverse = True + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/framework/tests/test_mul_op.py index b3d95a56b88e510734da54f36ff21ccd7e1baabb..57d6d7e7e095cab2c3afb60d229fc09da98aed8b 100644 --- a/python/paddle/v2/framework/tests/test_mul_op.py +++ b/python/paddle/v2/framework/tests/test_mul_op.py @@ -35,10 +35,10 @@ class TestMulOp2(OpTest): 'Y': np.random.random((4, 30, 8, 2, 9)).astype("float32") } self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2} - self.outputs = { - 'Out': np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10), - self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9)) - } + result = np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10), + self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9)) + result = result.reshape(15, 4, 8, 2, 9) + self.outputs = {'Out': result} def test_check_output(self): self.check_output() diff --git a/python/paddle/v2/framework/tests/test_pad_op.py b/python/paddle/v2/framework/tests/test_pad_op.py index 9052e63b5683801da7c73be4de23013c949add98..55f1774e5755c846f60a2f1df3e705444a81192b 100644 --- a/python/paddle/v2/framework/tests/test_pad_op.py +++ b/python/paddle/v2/framework/tests/test_pad_op.py @@ -27,7 +27,7 @@ class TestPadOp(OpTest): def initTestCase(self): self.shape = (16, 16) self.paddings = [(0, 1), (2, 3)] - self.pad_value = 0 + self.pad_value = 0.0 class TestCase1(TestPadOp): @@ -41,7 +41,7 @@ class TestCase2(TestPadOp): def initTestCase(self): self.shape = (2, 2, 2) self.paddings = [(0, 0), (0, 0), (1, 2)] - self.pad_value = 1 + self.pad_value = 1.0 class TestCase3(TestPadOp): diff --git a/python/paddle/v2/framework/tests/test_reduce_op.py b/python/paddle/v2/framework/tests/test_reduce_op.py index 0fec31c2e22e1eda2c62aa9b38487d703815f414..70359d60cbe656150877673c63e81eae92d8ab9a 100644 --- a/python/paddle/v2/framework/tests/test_reduce_op.py +++ b/python/paddle/v2/framework/tests/test_reduce_op.py @@ -85,33 +85,5 @@ class Test1DReduce(OpTest): self.check_grad(['X'], 'Out') -class TestNorm(OpTest): - def setUp(self): - # use x away from 0 to avoid errors of numerical gradient when gradient near 0 - x = np.random.random((5, 6, 10)).astype("float32") + 0.2 - p = 2 - dim = 1 - keep_dim = False - abs_out = np.absolute(x) - pow_out = np.power(x, p) - sum_out = np.sum(pow_out, axis=dim, keepdims=keep_dim) - out = np.power(sum_out, 1. / p) - self.op_type = "norm" - self.inputs = {'X': x} - self.attrs = {"p": p, "dim": dim, "keep_dim": keep_dim} - self.outputs = { - "AbsOut": abs_out, - "PowOut": pow_out, - "SumOut": sum_out, - "Out": out - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out', max_relative_error=0.01) - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/framework/tests/test_rnn_helpers.py b/python/paddle/v2/framework/tests/test_rnn_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..be0ecfb129aa181229bc42d8d6818ad860991965 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_rnn_helpers.py @@ -0,0 +1,38 @@ +import unittest +from paddle.v2.framework.layers import * +from paddle.v2.framework.framework import g_program + + +class TestRNN(unittest.TestCase): + def test_rnn(self): + img = data( + shape=[ + 80, # sequence length + 22, # image height + 22 + ], # image width + data_type='float32', + name='image') + hidden = fc(input=img, size=100, act='sigmoid', num_flatten_dims=2) + self.assertEqual((-1, 80, 100), hidden.shape) + hidden = fc(input=hidden, size=100, act='sigmoid', num_flatten_dims=2) + self.assertEqual((-1, 80, 100), hidden.shape) + + rnn = StaticRNN() + with rnn.step(): + hidden = rnn.step_input(hidden) + self.assertEqual((-1, 100), hidden.shape) + memory = rnn.memory(shape=(-1, 32), dtype='float32', init_value=0.0) + + rnn_out = fc(input=[hidden, memory], size=32, act='sigmoid') + self.assertEqual((-1, 32), rnn_out.shape) + rnn.update_memory(memory, rnn_out) + rnn.output(rnn_out) + + out = rnn() + self.assertEqual((-1, 80, 32), out.shape) + print g_program + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_word2vec.py b/python/paddle/v2/framework/tests/test_word2vec.py new file mode 100644 index 0000000000000000000000000000000000000000..b5d98035156c425ab97d2bf75f8f09c71884368f --- /dev/null +++ b/python/paddle/v2/framework/tests/test_word2vec.py @@ -0,0 +1,165 @@ +import paddle.v2 as paddle +import paddle.v2.framework.layers as layers +import paddle.v2.framework.core as core +import paddle.v2.framework.optimizer as optimizer + +from paddle.v2.framework.framework import Program, g_program +from paddle.v2.framework.executor import Executor + +import numpy as np + +init_program = Program() +program = Program() + +embed_size = 32 +hidden_size = 256 +N = 5 +batch_size = 32 + +word_dict = paddle.dataset.imikolov.build_dict() +dict_size = len(word_dict) + +first_word = layers.data( + name='firstw', + shape=[1], + data_type='int32', + program=program, + init_program=init_program) +second_word = layers.data( + name='secondw', + shape=[1], + data_type='int32', + program=program, + init_program=init_program) +third_word = layers.data( + name='thirdw', + shape=[1], + data_type='int32', + program=program, + init_program=init_program) +forth_word = layers.data( + name='forthw', + shape=[1], + data_type='int32', + program=program, + init_program=init_program) +next_word = layers.data( + name='nextw', + shape=[1], + data_type='int32', + program=program, + init_program=init_program) + +embed_param_attr_1 = { + 'name': 'shared_w', + 'init_attr': { + 'max': 1.0, + 'type': 'uniform_random', + 'min': -1.0 + } +} +embed_param_attr_2 = {'name': 'shared_w'} + +embed_first = layers.embedding( + input=first_word, + size=[dict_size, embed_size], + data_type='float32', + param_attr=embed_param_attr_1, + program=program, + init_program=init_program) +embed_second = layers.embedding( + input=second_word, + size=[dict_size, embed_size], + data_type='float32', + param_attr=embed_param_attr_2, + program=program, + init_program=init_program) + +embed_third = layers.embedding( + input=third_word, + size=[dict_size, embed_size], + data_type='float32', + param_attr=embed_param_attr_2, + program=program, + init_program=init_program) +embed_forth = layers.embedding( + input=forth_word, + size=[dict_size, embed_size], + data_type='float32', + param_attr=embed_param_attr_2, + program=program, + init_program=init_program) + +concat_embed = layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], + axis=1, + program=program, + init_program=init_program) + +hidden1 = layers.fc(input=concat_embed, + size=hidden_size, + act='sigmoid', + program=program, + init_program=init_program) +predict_word = layers.fc(input=hidden1, + size=dict_size, + act='softmax', + program=program, + init_program=init_program) +cost = layers.cross_entropy( + input=predict_word, + label=next_word, + program=program, + init_program=init_program) +avg_cost = layers.mean(x=cost, program=program, init_program=init_program) + +sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) +opts = sgd_optimizer.minimize(avg_cost) + +train_reader = paddle.batch( + paddle.dataset.imikolov.train(word_dict, N), batch_size) + +place = core.CPUPlace() +exe = Executor(place) + +exe.run(init_program, feed={}, fetch_list=[]) +PASS_NUM = 100 +for pass_id in range(PASS_NUM): + for data in train_reader(): + input_data = [[data_idx[idx] for data_idx in data] for idx in xrange(5)] + input_data = map(lambda x: np.array(x).astype("int32"), input_data) + input_data = map(lambda x: np.expand_dims(x, axis=1), input_data) + + first_data = input_data[0] + first_tensor = core.LoDTensor() + first_tensor.set(first_data, place) + + second_data = input_data[0] + second_tensor = core.LoDTensor() + second_tensor.set(second_data, place) + + third_data = input_data[0] + third_tensor = core.LoDTensor() + third_tensor.set(third_data, place) + + forth_data = input_data[0] + forth_tensor = core.LoDTensor() + forth_tensor.set(forth_data, place) + + next_data = input_data[0] + next_tensor = core.LoDTensor() + next_tensor.set(next_data, place) + + outs = exe.run(program, + feed={ + 'firstw': first_tensor, + 'secondw': second_tensor, + 'thirdw': third_tensor, + 'forthw': forth_tensor, + 'nextw': next_tensor + }, + fetch_list=[avg_cost]) + out = np.array(outs[0]) + if out[0] < 10.0: + exit(0) # if avg cost less than 10.0, we think our code is good. +exit(1) diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py index 4cfd91882e2d5f0098d27b8897359152ddd94dda..bd97dc1199fedc8ac91c1c6086957e8cce88bdc4 100644 --- a/python/paddle/v2/parameters.py +++ b/python/paddle/v2/parameters.py @@ -101,6 +101,10 @@ class Parameters(object): self.__param_conf__[param_conf.name] = param_conf + def update_param_conf(self, model_config): + for p in model_config.parameters: + self.__param_conf__[p.name] = p + def keys(self): """ keys are the names of each parameter. @@ -322,6 +326,17 @@ class Parameters(object): self.set(name, arr.reshape(self.get_shape(name))) def to_tar(self, f): + """ + Save parameters to a tar file. + + WARNING: You should use `paddle.v2.trainer.SGD.save_parameter_to_tar(f)` + to save parameters most of the time. Otherwise, some settings such + as model average will not take effect. + + :param f: + :type f: file + :return: + """ tar = tarfile.TarFile(fileobj=f, mode='w') for nm in self.names(): buf = cStringIO.StringIO() diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt index b7791559594321a85f41b508b69efeb077d69595..b4333ed530ce464095ec38d72706949cc464fbe4 100644 --- a/python/paddle/v2/tests/CMakeLists.txt +++ b/python/paddle/v2/tests/CMakeLists.txt @@ -5,3 +5,4 @@ py_test(test_topology SRCS test_topology.py) py_test(test_rnn_layer SRCS test_rnn_layer.py) py_test(test_parameters SRCS test_parameters.py) py_test(test_data_feeder SRCS test_data_feeder.py) +py_test(test_paramconf_order SRCS test_paramconf_order.py) diff --git a/python/paddle/v2/tests/test_paramconf_order.py b/python/paddle/v2/tests/test_paramconf_order.py new file mode 100644 index 0000000000000000000000000000000000000000..41fea64122b81948d57cce07f00d764e4889da66 --- /dev/null +++ b/python/paddle/v2/tests/test_paramconf_order.py @@ -0,0 +1,85 @@ +# Copyright PaddlePaddle contributors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +import math +import paddle.v2 as paddle + + +def wordemb(inlayer): + wordemb = paddle.layer.table_projection( + input=inlayer, + size=5, + param_attr=paddle.attr.Param( + name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)) + return wordemb + + +def train(): + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) + # Every layer takes integer value of range [0, dict_size) + firstword = paddle.layer.data( + name="firstw", type=paddle.data_type.integer_value(dict_size)) + secondword = paddle.layer.data( + name="secondw", type=paddle.data_type.integer_value(dict_size)) + thirdword = paddle.layer.data( + name="thirdw", type=paddle.data_type.integer_value(dict_size)) + fourthword = paddle.layer.data( + name="fourthw", type=paddle.data_type.integer_value(dict_size)) + nextword = paddle.layer.data( + name="fifthw", type=paddle.data_type.integer_value(dict_size)) + + Efirst = wordemb(firstword) + Esecond = wordemb(secondword) + Ethird = wordemb(thirdword) + Efourth = wordemb(fourthword) + + contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) + hidden1 = paddle.layer.fc(name="fc1", + input=contextemb, + size=128, + act=paddle.activation.Sigmoid(), + layer_attr=paddle.attr.Extra(drop_rate=0.5), + bias_attr=paddle.attr.Param(learning_rate=2), + param_attr=paddle.attr.Param( + initial_std=1. / math.sqrt(5 * 8), + learning_rate=1, + l2_rate=6e-4)) + predictword = paddle.layer.fc(input=hidden1, + size=dict_size, + bias_attr=paddle.attr.Param(learning_rate=2), + act=paddle.activation.Softmax()) + + return paddle.layer.classification_cost(input=predictword, label=nextword) + + +class TestParamConfOrder(unittest.TestCase): + def test_param_conf_order(self): + paddle.init() + cost = train() + parameters = paddle.parameters.create(cost) + adagrad = paddle.optimizer.AdaGrad( + learning_rate=3e-3, + regularization=paddle.optimizer.L2Regularization(rate=8e-4)) + + trainer = paddle.trainer.SGD(cost, parameters, adagrad) + for p in trainer.get_topology_proto().parameters: + if p.name == "_fc1.w0": + self.assertEqual(p.decay_rate, 6e-4) + else: + self.assertEqual(p.decay_rate, 8e-4) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py index 2db66be2505dde38a501edf45984e1f36beb351d..923ccecb0bf1236b4a3768fdc07dc3027e2863b7 100644 --- a/python/paddle/v2/topology.py +++ b/python/paddle/v2/topology.py @@ -19,6 +19,7 @@ import paddle.trainer_config_helpers as conf_helps import layer as v2_layer import config_base import cPickle +from paddle.trainer import config_parser as cp __all__ = ['Topology'] @@ -50,6 +51,35 @@ class Topology(object): assert isinstance(self.__model_config__, ModelConfig) + def update_from_default(self): + # HACK(typhoonzero): update ParameterConfig(proto) in case of + # optimizers are defined after layers, or between layers. + # Must be called from trainer.__init__() + for parameter in self.__model_config__.parameters: + if parameter.momentum == 0.0 and cp.g_default_momentum: + parameter.momentum = cp.g_default_momentum + if parameter.decay_rate == 0.0 and cp.g_default_decay_rate: + parameter.decay_rate = cp.g_default_decay_rate + if parameter.initial_mean == 0.0: + parameter.initial_mean = cp.g_default_initial_mean + if parameter.initial_std == 0.01: + parameter.initial_std = cp.g_default_initial_std + if parameter.initial_strategy == 0: + parameter.initial_strategy = cp.g_default_initial_strategy + if parameter.initial_smart == False: + parameter.initial_smart = cp.g_default_initial_smart + if parameter.num_batches_regularization == 1 and \ + cp.g_default_num_batches_regularization: + parameter.num_batches_regularization = \ + cp.g_default_num_batches_regularization + if parameter.gradient_clipping_threshold == 0.0 and \ + cp.g_default_gradient_clipping_threshold: + parameter.gradient_clipping_threshold = \ + cp.g_default_gradient_clipping_threshold + if parameter.device == -1 and cp.g_default_device: + parameter.device = cp.g_default_device + # FIXME(typhoonzero): ignored: update_hooks, g_default_compact_func + def use_sparse_updater(self): """ check if any parameter require to use sparse_update diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index 076e75593991415bc3fbcbd36a108c8c7de66932..b68fd0d5a97a7993ddd0a1d947304fa5428c01b8 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -64,6 +64,11 @@ class SGD(object): "paddle.v2.optimizer.Optimizer") import py_paddle.swig_paddle as api topology = Topology(cost, extra_layers=extra_layers) + # HACK(typhoonzero): update ParameterConfig(proto) in case of optimizers + # are defined after layers, or between layers. + topology.update_from_default() + parameters.update_param_conf(topology.proto()) + self.__optimizer__ = update_equation self.__topology__ = topology self.__parameters__ = parameters @@ -91,6 +96,9 @@ class SGD(object): self.__parameters__.append_gradient_machine(gm) self.__parameter_updater__ = None + def get_topology_proto(self): + return self.__topology_in_proto__ + def __use_remote_sparse_updater__(self): return self.__use_sparse_updater__ and not self.__is_local__