diff --git a/CMakeLists.txt b/CMakeLists.txt index 4783095194dc9c6409dc31c95588f46c9bee7c61..1252e7539816016dfdf1b90b8941fa42e6bb85e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,6 +105,12 @@ if (WITH_C_API AND WITH_PYTHON) "different Python interpreter from compiling.") endif() +if(MOBILE_INFERENCE) + set(THIRD_PARTY_BUILD_TYPE MinSizeRel) +else() + set(THIRD_PARTY_BUILD_TYPE Release) +endif() + ######################################################################################## include(external/mklml) # download mklml package diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index f7483f6be9169eb58f0148cd3a956a8c881e1fe3..bd853d921b4362ac7ac5e17e629552b2a200f08a 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -8,7 +8,7 @@ ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" - GIT_TAG "master" + GIT_TAG 4e79cb69b9425f5f8c3a84be4350d4ab75b5fd9d PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 957f8271e4841836956b0c3f2cf3d8c88a31192a..c819eb4d70898e48eab499c666168d78262d4240 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -36,6 +36,7 @@ ExternalProject_Add( # change this back to the official Github repo once my PR is # merged. 
GIT_REPOSITORY "https://github.com/wangkuiyi/gflags.git" + GIT_TAG 986964c07427ecb9cdb5bd73f73ebbd40e54dadb PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -45,11 +46,11 @@ ExternalProject_Add( -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index b3fef738ccc0b5886bb0a32501bb7b7adade0ff1..08bdc1e1623b0d917061c7368e9b2a8f7e9517fd 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -31,6 +31,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags GIT_REPOSITORY "https://github.com/google/glog.git" + GIT_TAG v0.3.5 PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -43,12 +44,12 @@ ExternalProject_Add( -DWITH_GFLAGS=ON -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 6a2a79b7631b32e8a099797de509af64533bbb95..5a4aa7a5b71a4fdfd556a46037e6d1846d668fc4 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -56,11 +56,11 @@ IF(WITH_TESTING) -DBUILD_GMOCK=ON 
-Dgtest_disable_pthreads=ON -Dgtest_force_shared_crt=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 7cf7ba85cca4c248dcc74e078124c0b3815ee380..be7f6a9465970711170bd15dcecaadeaa8a55f86 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -191,12 +191,12 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ${OPTIONAL_ARGS} -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=lib CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON ${OPTIONAL_CACHE_ARGS} diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index bb258c7b5581fc22b44f4fe15c119f8081f4767e..8bd058222880b4df3b08da09c02f9fe7f1d0ee66 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -35,6 +35,7 @@ ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/gangliao/warp-ctc.git" + GIT_TAG b63a0644654a3e0ed624c85a1767bc8193aead09 PREFIX ${WARPCTC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -48,9 +49,9 @@ ExternalProject_Add( -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON -DBUILD_SHARED=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS 
-DCMAKE_BUILD_TYPE:STRING=Release + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index c496a52b780364f3014f8fa3dfbc944a7aa7430e..e2c9fe56f335ae5b627b4d8d4bb17e4a2a466677 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -42,11 +42,11 @@ ExternalProject_Add( -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_MACOSX_RPATH=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) LIST(APPEND external_project_dependencies zlib) diff --git a/doc/design/images/graph_construction_example.dot b/doc/design/images/graph_construction_example.dot index 8d1b673abf6b78c851676fa379dc850c4818f0e5..e115f9844bae6ad24f638c8ed4749cea8aff06a9 100644 --- a/doc/design/images/graph_construction_example.dot +++ b/doc/design/images/graph_construction_example.dot @@ -33,7 +33,6 @@ digraph ImageClassificationGraph { cost -> MSE_Grad [color=red]; d_cost -> MSE_Grad [color=red]; - x -> MSE_Grad [color=red]; l -> MSE_Grad [color=red]; y -> MSE_Grad -> d_y [color=red]; diff --git a/doc/design/images/graph_construction_example_all.png b/doc/design/images/graph_construction_example_all.png index 181187503472d15779b87284105841168b3945c4..261611a5721f9aa97874f7e6d897fe48cf667db2 100644 Binary files a/doc/design/images/graph_construction_example_all.png and b/doc/design/images/graph_construction_example_all.png differ diff --git a/doc/design/images/graph_construction_example_forward_backward.png b/doc/design/images/graph_construction_example_forward_backward.png index 
3049a9315fd616464dec54e33064cb75598ca536..4c69687f4a6a181138f3df72ce5e8aa48487b5be 100644 Binary files a/doc/design/images/graph_construction_example_forward_backward.png and b/doc/design/images/graph_construction_example_forward_backward.png differ diff --git a/doc/design/images/graph_construction_example_forward_only.png b/doc/design/images/graph_construction_example_forward_only.png index 25d19088cbf0b5f68cf734f2ff21eba8af4a2860..e668c16e0cac73acb4e5dc2b1827557ae77126b4 100644 Binary files a/doc/design/images/graph_construction_example_forward_only.png and b/doc/design/images/graph_construction_example_forward_only.png differ diff --git a/doc/design/register_grad_op.md b/doc/design/register_grad_op.md index 3cf8a59446d244bb3a388b87b14273d9096c839a..9f1ce4bae7b393cb9f04909e5e4917b8d660771c 100644 --- a/doc/design/register_grad_op.md +++ b/doc/design/register_grad_op.md @@ -3,15 +3,17 @@ ## The Problem Posed -In our current operator registration mechanism, for each operator, the programmer should register a *gradient operator creator* function, which takes a C++ operator instance, and returns the corresponding gradient instance. +Currently, for each C++ operator class definition, we register a *gradient operator creator* function, which takes a C++ operator instance and returns the corresponding gradient operator instance. -However, as we decided to separate the *compilation* and *execution* of DL models, we need to reshape the creator to take a protobuf `OpDesc` message, and returns a corresponding message. +However, we noticed two problems with the current design: -More than that, the new registration mechanism need to support the fact that an operators' gradient computation might be a composition of operators. +1. As we decided to separate the *compilation* and *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and insert the corresponding `OpDesc` messages into the `ProgramDesc` message. 
-## Current Implementation +1. Some operators' gradient computation requires more than one gradient operator. For example, the gradient of *minus* consists of two operators -- an identity operator and a scale operator. So we need to make the registration mechanism support the mapping from an operator to a set of operators for gradient computation. -OpInfos store in a association map which key is the operator type. The `grad_op_type` indicate associated gradient operator type. Operator can create gradient operator by `OpInfo::creator_` of gradient. The pseudo code is +## The Current Implementation + +The C++ class `OpInfos` is stored in an association map whose key is the operator type. The `grad_op_type` indicates the associated gradient operator type. An operator can create its gradient operator by the `OpInfo::creator_` of the gradient. The pseudo code is ```cpp struct OpInfo { diff --git a/doc/design/var_desc.md b/doc/design/var_desc.md index bfbbdd0578ebc69ea4b49ade9b041573a9e9ad55..0b2958c1b10ef6a6ce51aa75f61e15a7f2d94b3f 100644 --- a/doc/design/var_desc.md +++ b/doc/design/var_desc.md @@ -16,16 +16,23 @@ The computation graph is constructed by Data Node and Operation Node. The concep ## Definition of VarDesc -A VarDesc should have a name and value, in PaddlePaddle, the value will always be a tensor. Since we use LoDTensor most of the time. We add a LoDTesnorDesc to represent it. +A VarDesc should have a name and a value. There are two kinds of variable types at compile time: `LoDTensor` and `SelectedRows`. 
```proto message VarDesc { required string name = 1; - optional LoDTensorDesc lod_tensor = 2; + enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; + } + required VarType type = 2; + optional LoDTensorDesc lod_desc = 3; + optional TensorDesc selected_rows_desc = 4; + optional bool persistable = 5 [ default = false ]; } ``` -## Definition of LodTensorDesc +## Definition of TensorDesc ```proto enum DataType { @@ -38,87 +45,25 @@ enum DataType { FP64 = 6; } -message LoDTensorDesc { +message TensorDesc { required DataType data_type = 1; - repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] - optional int32 lod_level = 3 [default=0]; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] } ``` -## Definition of Variable in Python - -In Python API, layer will take Variable as Input, and return Variable as Output. There should be a class `Variable` in python to help create and manage Variable. - -```python -image = Variable(dims=[-1, 640, 480]) -# fc1 and fc2 are both Variable -fc1 = layer.fc(input=image, output_size=10) -fc2 = layer.fc(input=fc1, output_size=20) -``` -### what should class `Variable` Have -1. `name`.a name of string type is used to mark the value of the Variable. -1. `initializer`. Since our Tensor does not have value. we will always use some Operator to fullfill it when run. So we should have a initialize method to help add the init operator. -1. `operator`. Variable should record which operator produce itself. The reaon is: - - we use pd.eval(targets=[var1, var2]) to run the related ops to get the value of var1 and var2. var.op is used to trace the dependency of the current variable. - -In PaddlePaddle, we use Block to describe Computation Graph, so in the code we will use Block but not Graph. 
- -```python -import VarDesc -import LoDTensorDesc -import framework - -def AddInitialOperator(variable, initializer): - # add an initialize Operator to block to init this Variable - -class Variable(object): - def __init__(self, name, dims, type, initializer): - self._block = get_default_block() - self._name = name - self.op = None - - tensor_desc = LoDTensorDesc(data_type=type, dims=dims) - _var_desc = VarDesc(name=name, lod_tensor=tensor_desc) - self._var = framework.CreateVar(_var_desc) - self._block.add_var(self) +A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please refer to [`SelectedRows`](./selected_rows.md). - # add initial op according to initializer - if initializer is not None: - AddInitialOperator(self, initializer) - - def dims(self): - return self._var.dims() - - def data_type(self): - return self._var.data_type() +## Definition of LoDTensorDesc - def to_proto(self): - pass +```proto +message LoDTensorDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2; +} ``` -Then we can use this Variable to create a fc layer in Python. +A LoDTensorDesc contains a tensor and a lod_level. -```python -import paddle as pd - -def flatten_size(X, num_flatten_dims): - prod = 1 # of last num_flatten_dims - for i in xrange(num_flatten_dims): - prod = prod * X.dims[-i-1] - return prod - -def layer.fc(X, output_size, num_flatten_dims): - W = Variable(pd.random_uniform(), type=FP32, dims=[flatten_size(X, num_flatten_dims), output_size]) - b = Variable(pd.random_uniform(), type=FP32, dims=[output_size]) - out = Variable(type=FP32) - y = operator.fc(X, W, b, output=out) # fc will put fc op input into out - pd.InferShape(y) - return out - -x = Variable(dims=[-1, 640, 480]) -y = layer.fc(x, output_size=100) -z = layer.fc(y, output_size=200) +## Definition of Variable in Python -paddle.eval(targets=[z], ...) -print(z) -``` +For Variable in Python, please refer to [`Python API`](./python_api.md). 
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index d7b3d2bdec1687425df804c0d56d568241f9e8b0..d6b8464100d4497876aa3f6f7cbc666aafae4bfc 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -26,7 +26,7 @@ FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py) SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON) SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR}) -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign -ftls-model=global-dynamic") SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS paddle_parameter diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 6b34c3bbcfbdb0c36381df7de4dd227e317829e5..c8d9dac21d995d92b9d50436d42e47b63ea55f58 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -19,10 +19,10 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope) proto_library(framework_proto SRCS framework.proto) cc_library(attribute SRCS attribute.cc DEPS framework_proto) -cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim) +cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim op_info) cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute) cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) -cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto proto_desc) +cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope proto_desc) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) @@ -42,12 +42,17 @@ add_custom_command(TARGET framework_py_proto POST_BUILD cc_library(backward SRCS backward.cc DEPS net_op) 
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context) -cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward ${GLOB_OP_LIB}) -#if(WITH_GPU) -# nv_test(executor_test SRCS executor_test.cc DEPS executor) -#else() -# cc_test(executor_test SRCS executor_test.cc DEPS executor) -#endif() +cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward) +set(EXECUTOR_TEST_OP elementwise_add_op gaussian_random_op feed_op fetch_op + mul_op sum_op squared_l2_distance_op fill_constant_op sgd_op mean_op) +if(WITH_GPU) + nv_test(executor_test SRCS executor_test.cc DEPS executor ${EXECUTOR_TEST_OP}) +else() + cc_test(executor_test SRCS executor_test.cc DEPS executor ${EXECUTOR_TEST_OP}) +endif() cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor) cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place) + +cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) +cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index 063b108500d95c94d5859cf6e1a5a88dcdb2ed31..321483833e114f4a7938a9d0dac1416ae2f91a98 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -28,14 +28,15 @@ namespace paddle { namespace framework { static inline std::unique_ptr CreateGradOp( - const OperatorBase& op) { + const OperatorBase& op, const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var) { OpDescBind op_desc; op_desc.SetInputMap(op.Inputs()); op_desc.SetOutputMap(op.Outputs()); op_desc.SetType(op.Type()); op_desc.SetAttrMap(op.Attrs()); auto& info = OpInfoMap::Instance().Get(op.Type()); - auto grad_descs = info.GradOpMaker()(op_desc); + auto grad_descs = info.GradOpMaker()(op_desc, no_grad_set, grad_to_var); std::vector> grad_ops; grad_ops.reserve(grad_descs.size()); std::transform(grad_descs.begin(), grad_descs.end(), @@ 
-98,7 +99,9 @@ static std::unique_ptr NOP() { // See Backward.h for details static std::unique_ptr BackwardRecursive( const OperatorBase& forwardOp, - std::unordered_set& no_grad_names, size_t& uniq_id) { + std::unordered_set& no_grad_names, + std::unordered_map* grad_to_var, + size_t& uniq_id) { // If all input gradients of forwarding operator do not need to calculate, // just return an NOP. Not return null ptr because NOP does not take // too much time for calculation, but it is useful for simplifying logic. @@ -136,7 +139,7 @@ static std::unique_ptr BackwardRecursive( for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend(); ++it, ++local_op_id) { auto& fwd = *it; - auto bwd = BackwardRecursive(*fwd, no_grad_names, uniq_id); + auto bwd = BackwardRecursive(*fwd, no_grad_names, grad_to_var, uniq_id); ForEachVarName(bwd->Outputs(), [&dup_output_ops, local_op_id](const std::string& out) { dup_output_ops[out].emplace_back(local_op_id); @@ -187,7 +190,8 @@ static std::unique_ptr BackwardRecursive( net->InsertOp(pos.first + 1, std::move(pos.second)); } } else { - std::unique_ptr grad_op(CreateGradOp(forwardOp)); + std::unique_ptr grad_op( + CreateGradOp(forwardOp, no_grad_names, grad_to_var)); ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op]( const std::string& grad_input) { @@ -226,7 +230,7 @@ static std::unique_ptr BackwardRecursive( *static_cast(&rnnop.stepnet()); // create stepnet's gradient op rnn_grad_op->set_stepnet( - BackwardRecursive(stepnet_op, no_grad_names, uniq_id)); + BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id)); } if (net->ops_.empty()) { // Current no aux op is added to network @@ -253,7 +257,8 @@ std::unique_ptr Backward( no_grad_names.insert(name + kGradVarSuffix); } size_t uid = 0; - return BackwardRecursive(forwardOp, no_grad_names, uid); + std::unordered_map grad_to_var; + return BackwardRecursive(forwardOp, no_grad_names, &grad_to_var, uid); } // ==================================== // @@ 
-268,30 +273,48 @@ static bool AllGradInSet(const std::vector& names, return true; } +static void CreateGradVarInBlock(BlockDescBind* block_desc, + size_t grad_op_start_index) { + auto ops = block_desc->AllOps(); + for (size_t op_index = grad_op_start_index; op_index < ops.size(); + ++op_index) { + for (const auto& output : ops[op_index]->Outputs()) { + for (const auto& real_output : output.second) { + if (!block_desc->HasVar(real_output)) { + block_desc->NewVar(real_output); + } + } + } + } +} + std::vector> MakeOpGrad( const std::unique_ptr& op_desc, - std::unordered_set& no_grad_vars) { + std::unordered_set* no_grad_vars, + std::unordered_map* grad_to_var) { std::vector> grad_op_descs; - // All input gradients of forwarding operator do not need to calculat. + // All input gradients of forwarding operator do not need to calculate. const std::vector& inputs = op_desc->InputArgumentNames(); - if (AllGradInSet(inputs, no_grad_vars)) { + if (AllGradInSet(inputs, *no_grad_vars)) { return grad_op_descs; // empty vector } // All output gradients of forwarding operator do not need to calculate. 
const std::vector& outputs = op_desc->OutputArgumentNames(); - if (AllGradInSet(outputs, no_grad_vars)) { + if (AllGradInSet(outputs, *no_grad_vars)) { for (const std::string& name : inputs) { - no_grad_vars.insert(GradVarName(name)); + no_grad_vars->insert(GradVarName(name)); } return grad_op_descs; // empty vector } - grad_op_descs = OpRegistry::CreateGradOpDescs(op_desc.get()); + grad_op_descs = OpInfoMap::Instance() + .Get(op_desc->Type()) + .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var); std::list> pending_fill_zeros_ops; for (auto& desc : grad_op_descs) { for (const std::string& in_name : desc->InputArgumentNames()) { - if (no_grad_vars.count(in_name)) { + if (no_grad_vars->count(in_name)) { std::string prefix = in_name.substr( 0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); std::string new_name = prefix + kZeroVarSuffix; @@ -301,11 +324,6 @@ std::vector> MakeOpGrad( pending_fill_zeros_ops.push_back(std::move(fill_zeros_op)); } } - for (const std::string& out_name : desc->OutputArgumentNames()) { - if (no_grad_vars.count(out_name)) { - desc->Rename(out_name, kEmptyVarName); - } - } } for (auto& p : pending_fill_zeros_ops) { @@ -316,23 +334,25 @@ std::vector> MakeOpGrad( std::vector> MakeBlockBackward( ProgramDescBind& program_desc, int block_idx, - std::unordered_set& no_grad_vars) { + std::unordered_set* no_grad_vars, + std::unordered_map* grad_to_var) { BlockDescBind* cur_block = program_desc.Block(block_idx); std::deque>& op_descs = cur_block->ops_; std::unordered_map> dup_out_ops; size_t grad_desc_idx = 0; std::vector> backward_descs; + for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) { std::vector> op_grads = - MakeOpGrad(*it, no_grad_vars); + MakeOpGrad(*it, no_grad_vars, grad_to_var); if ((*it)->Type() == "recurrent") { PADDLE_ENFORCE_EQ( - op_grads.size(), size_t(1), + op_grads.size(), static_cast(1), "rnn_op's gradient process should contain only one op."); - int step_block_idx = (*it)->GetBlockAttr("stop_block"); 
- auto backward_block_op_descs = - MakeBlockBackward(program_desc, step_block_idx, no_grad_vars); + int step_block_idx = (*it)->GetBlockAttr("step_block"); + auto backward_block_op_descs = MakeBlockBackward( + program_desc, step_block_idx, no_grad_vars, grad_to_var); BlockDescBind* backward_block = program_desc.AppendBlock(*cur_block); for (auto& ptr : backward_block_op_descs) { backward_block->ops_.push_back(std::move(ptr)); @@ -376,10 +396,11 @@ std::vector> MakeBlockBackward( backward_descs.insert(backward_descs.begin() + p.first + 1, std::move(p.second)); } + return backward_descs; } -void AppendBackward(ProgramDescBind& program_desc, +void AppendBackward(ProgramDescBind& program_desc, const VarDescBind& target, const std::unordered_set& no_grad_vars) { std::unordered_set no_grad_var_names; no_grad_var_names.reserve(no_grad_vars.size() + 1); @@ -387,12 +408,34 @@ void AppendBackward(ProgramDescBind& program_desc, for (auto& name : no_grad_vars) { no_grad_var_names.insert(GradVarName(name)); } + const int root_block_idx = 0; - auto backward_op_descs = - MakeBlockBackward(program_desc, root_block_idx, no_grad_var_names); - auto& forw_op_descs = program_desc.Block(root_block_idx)->ops_; + auto root_block = program_desc.Block(root_block_idx); + auto& all_ops = root_block->ops_; + + // insert fill one op for target + std::string fill_one_op_out = GradVarName(target.Name()); + std::unique_ptr fill_one_op( + new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}}, + {{"shape", std::vector{1}}, + {"value", static_cast(1.0)}, + {"dataType", framework::DataType::FP32}})); + all_ops.push_back(std::move(fill_one_op)); + size_t forward_op_num = all_ops.size(); + size_t forward_block_num = program_desc.Size(); + std::unordered_map grad_to_var; + auto backward_op_descs = MakeBlockBackward(program_desc, root_block_idx, + &no_grad_var_names, &grad_to_var); for (auto& ptr : backward_op_descs) { - forw_op_descs.push_back(std::move(ptr)); + 
all_ops.push_back(std::move(ptr)); + } + root_block->NewVar(fill_one_op_out); + + // create grad_var for all blocks in this program + CreateGradVarInBlock(root_block, forward_op_num); + for (size_t block_index = forward_block_num; + block_index < program_desc.Size(); ++block_index) { + CreateGradVarInBlock(program_desc.Block(block_index), 0); } } diff --git a/paddle/framework/backward.h b/paddle/framework/backward.h index f1ab8056450c96f0a1b671e1efa46c4c68f9ea15..2c95d18ef7e2d997679bff442bf89d6364eb13ea 100644 --- a/paddle/framework/backward.h +++ b/paddle/framework/backward.h @@ -29,7 +29,7 @@ extern std::unique_ptr Backward( // TODO(jiayi): Add target as parameter and generate backward op // according to target. -void AppendBackward(ProgramDescBind& program_desc, +void AppendBackward(ProgramDescBind& program_desc, const VarDescBind& target, const std::unordered_set& no_grad_vars); } // namespace framework diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc index 3b7cbcd98927be829d185590147adf74cd3d10d1..d9ecfe0e801cdfab70b9a120a1cf3a0c2eb73a95 100644 --- a/paddle/framework/backward_test.cc +++ b/paddle/framework/backward_test.cc @@ -18,6 +18,7 @@ #include "paddle/framework/block_desc.h" #include "paddle/framework/op_desc.h" #include "paddle/framework/op_registry.h" +#include "paddle/framework/var_desc.h" #include "paddle/operators/net_op.h" namespace paddle { @@ -169,6 +170,45 @@ class MultInOutOpMaker : public OpProtoAndCheckerMaker { } }; +class MinusGradOpDescMaker : public GradOpDescMakerBase { + public: + using GradOpDescMakerBase::GradOpDescMakerBase; + + std::vector> operator()() const override { + std::vector> retv; + auto x_g = InputGrad("X"); + if (!x_g.empty()) { + auto *op_desc = new OpDescBind(); + op_desc->SetType("scale"); + op_desc->SetInput("X", OutputGrad("Out")); + op_desc->SetOutput("Out", x_g); + op_desc->SetAttr("scale", 1.0f); + retv.emplace_back(op_desc); + } + + auto y_g = InputGrad("Y"); + if (!y_g.empty()) 
{ + auto *op_desc = new OpDescBind(); + op_desc->SetType("scale"); + op_desc->SetInput("X", OutputGrad("Out")); + op_desc->SetOutput("Out", y_g); + op_desc->SetAttr("scale", -1.0f); + retv.emplace_back(op_desc); + } + return retv; + } +}; + +class MinusOpMaker : public OpProtoAndCheckerMaker { + public: + MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", ""); + AddInput("Y", ""); + AddOutput("Out", ""); + AddComment("minus for unittest"); + } +}; } // namespace framework } // namespace paddle @@ -187,6 +227,7 @@ REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker); REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, many_output_op_grad, f::NOP); REGISTER_OP(mult_in_out, f::NOP, f::MultInOutOpMaker, mult_in_out_grad, f::NOP); +REGISTER_OPERATOR(minus, f::NOP, f::MinusOpMaker, f::MinusGradOpDescMaker); TEST(Backward, simple_op_not_need_grad) { auto fwd = f::OpRegistry::CreateOp( @@ -395,12 +436,13 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) { 2UL /* external input number */ + 1UL /* external output number*/ + 1UL /* number of gradient of external output*/ - + 2U /* internal variable number*/); + + 2UL /* internal variable number*/ + ); EXPECT_EQ(grad_fc.Outputs(all).size(), 2UL /* input number of mul*/ - + 2UL /* input number of rowwise_add - */ - + 1UL /* input number of sigmod */); + + 2UL /* input number of rowwise_add*/ + + 1UL /* input number of sigmod */ + - 1UL /* out2 is not needed*/); EXPECT_EQ(bwd_net->ops_[1]->Inputs(all).size(), 0UL); EXPECT_EQ(bwd_net->ops_[1]->Outputs(all).size(), 0UL); EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL); @@ -427,10 +469,14 @@ TEST(Backward, simple_single_op) { op->SetInput("b", {"b"}); op->SetOutput("Out", {"out"}); - AppendBackward(program, {}); + auto target = f::VarDescBind("out"); + AppendBackward(program, target, {}); - ASSERT_EQ(block->AllOps().size(), 2UL); - f::OpDescBind *grad_op = block->AllOps()[1]; + 
ASSERT_EQ(block->AllOps().size(), 3UL); + f::OpDescBind *fill_op = block->AllOps()[1]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDescBind *grad_op = block->AllOps()[2]; EXPECT_EQ(grad_op->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op->InputNames().size(), 1UL); ASSERT_EQ(grad_op->OutputNames().size(), 2UL); @@ -451,14 +497,19 @@ TEST(Backward, default_attribute) { op->SetInput("X", {"x"}); op->SetInput("Y", {"y"}); op->SetOutput("Out", {"out"}); + op->CheckAttrs(); - AppendBackward(program, {}); + auto target = f::VarDescBind("out"); + AppendBackward(program, target, {}); - ASSERT_EQ(block->AllOps().size(), 2UL); + ASSERT_EQ(block->AllOps().size(), 3UL); EXPECT_EQ(boost::get(op->GetAttr("x_num_col_dims")), 1); EXPECT_EQ(boost::get(op->GetAttr("y_num_col_dims")), 1); - f::OpDescBind *grad_op = block->AllOps()[1]; + f::OpDescBind *fill_op = block->AllOps()[1]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDescBind *grad_op = block->AllOps()[2]; ASSERT_EQ(grad_op->Type(), "mul_grad"); EXPECT_EQ(boost::get(grad_op->GetAttr("x_num_col_dims")), 1); EXPECT_EQ(boost::get(grad_op->GetAttr("y_num_col_dims")), 1); @@ -486,10 +537,15 @@ TEST(Backward, simple_mult_op) { op3->SetInput("b", {"b3"}); op3->SetOutput("Out", {"out3"}); - AppendBackward(program, {}); + auto target = f::VarDescBind("out3"); + size_t forward_len = block->AllOps().size(); + AppendBackward(program, target, {}); - ASSERT_EQ(block->AllOps().size(), 6UL); - f::OpDescBind *grad_op1 = block->AllOps()[5]; + ASSERT_EQ(block->AllOps().size(), 6UL + 1); + f::OpDescBind *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDescBind *grad_op1 = block->AllOps()[6]; EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op1->InputNames().size(), 1UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); @@ -500,7 +556,7 @@ TEST(Backward, simple_mult_op) { EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), std::vector({f::GradVarName("b1")})); - 
f::OpDescBind *grad_op2 = block->AllOps()[4]; + f::OpDescBind *grad_op2 = block->AllOps()[5]; EXPECT_EQ(grad_op2->Type(), "mul_grad"); ASSERT_EQ(grad_op2->InputNames().size(), 4UL); ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); @@ -514,7 +570,7 @@ TEST(Backward, simple_mult_op) { EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")), std::vector({f::GradVarName("y2")})); - f::OpDescBind *grad_op3 = block->AllOps()[3]; + f::OpDescBind *grad_op3 = block->AllOps()[4]; EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op3->InputNames().size(), 1UL); ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); @@ -554,10 +610,15 @@ TEST(Backward, intermedia_var_no_grad) { op4->SetInput("Y", {"out3"}); op4->SetOutput("Out", {"out4"}); - AppendBackward(program, {"out3"}); + auto target = f::VarDescBind("out4"); + size_t forward_len = block->AllOps().size(); + AppendBackward(program, target, {"out3"}); - ASSERT_EQ(block->AllOps().size(), 6UL); - f::OpDescBind *grad_op1 = block->AllOps()[5]; + ASSERT_EQ(block->AllOps().size(), 7UL); + f::OpDescBind *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDescBind *grad_op1 = block->AllOps()[6]; EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op1->InputNames().size(), 1UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); @@ -568,7 +629,7 @@ TEST(Backward, intermedia_var_no_grad) { EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), std::vector({f::GradVarName("b1")})); - f::OpDescBind *grad_op4 = block->AllOps()[4]; + f::OpDescBind *grad_op4 = block->AllOps()[5]; EXPECT_EQ(grad_op4->Type(), "mul_grad"); ASSERT_EQ(grad_op4->InputNames().size(), 4UL); ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); @@ -579,8 +640,7 @@ TEST(Backward, intermedia_var_no_grad) { std::vector({f::GradVarName("out4")})); EXPECT_EQ(grad_op4->Output(f::GradVarName("X")), std::vector({f::GradVarName("out1")})); - EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), - std::vector({f::kEmptyVarName})); + 
EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector()); } TEST(Backward, var_no_grad) { @@ -601,10 +661,15 @@ TEST(Backward, var_no_grad) { op2->SetOutput("Y", {"y2"}); op2->SetOutput("Z", {"z2"}); - AppendBackward(program, {"z1"}); + auto target = f::VarDescBind("z2"); + size_t forward_len = block->AllOps().size(); + AppendBackward(program, target, {"z1"}); + + ASSERT_EQ(block->AllOps().size(), 6UL); + f::OpDescBind *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); - ASSERT_EQ(block->AllOps().size(), 5UL); - f::OpDescBind *grad_op2 = block->AllOps()[2]; + f::OpDescBind *grad_op2 = block->AllOps()[3]; ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad"); ASSERT_EQ(grad_op2->InputNames().size(), 6UL); ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); @@ -618,10 +683,9 @@ TEST(Backward, var_no_grad) { std::vector({f::GradVarName("z2")})); EXPECT_EQ(grad_op2->Output(f::GradVarName("X")), std::vector({f::GradVarName("y1")})); - EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), - std::vector({f::kEmptyVarName})); + EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector()); - f::OpDescBind *fill_zero_op = block->AllOps()[3]; + f::OpDescBind *fill_zero_op = block->AllOps()[4]; ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like"); ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL); ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL); @@ -629,7 +693,7 @@ TEST(Backward, var_no_grad) { EXPECT_EQ(fill_zero_op->Output("Y"), std::vector({std::string("z1") + f::kZeroVarSuffix})); - f::OpDescBind *grad_op1 = block->AllOps()[4]; + f::OpDescBind *grad_op1 = block->AllOps()[5]; ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad"); ASSERT_EQ(grad_op1->InputNames().size(), 6UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); @@ -669,10 +733,15 @@ TEST(Backward, shared_var) { op3->SetInput("b", {"b3"}); op3->SetOutput("Out", {"out3"}); - AppendBackward(program, {}); + auto target = f::VarDescBind("out3"); + size_t forward_len = 
block->AllOps().size(); + AppendBackward(program, target, {}); - ASSERT_EQ(block->AllOps().size(), 7UL); - f::OpDescBind *grad_op3 = block->AllOps()[3]; + ASSERT_EQ(block->AllOps().size(), 8UL); + f::OpDescBind *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDescBind *grad_op3 = block->AllOps()[4]; ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op3->InputNames().size(), 1UL); ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); @@ -683,7 +752,7 @@ TEST(Backward, shared_var) { EXPECT_EQ(grad_op3->Output(f::GradVarName("b")), std::vector({f::GradVarName("b3")})); - f::OpDescBind *grad_op4 = block->AllOps()[4]; + f::OpDescBind *grad_op4 = block->AllOps()[5]; ASSERT_EQ(grad_op4->Type(), "mul_grad"); ASSERT_EQ(grad_op4->InputNames().size(), 4UL); ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); @@ -697,7 +766,7 @@ TEST(Backward, shared_var) { EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector({f::GradVarName("y2")})); - f::OpDescBind *sum_op = block->AllOps()[5]; + f::OpDescBind *sum_op = block->AllOps()[6]; ASSERT_EQ(sum_op->Type(), "sum"); ASSERT_EQ(sum_op->InputNames().size(), 1UL); ASSERT_EQ(sum_op->OutputNames().size(), 1UL); @@ -707,7 +776,7 @@ TEST(Backward, shared_var) { EXPECT_EQ(sum_op->Output("Out"), std::vector({f::GradVarName("out1")})); - f::OpDescBind *grad_op1 = block->AllOps()[6]; + f::OpDescBind *grad_op1 = block->AllOps()[7]; ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op1->InputNames().size(), 1UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); @@ -717,4 +786,23 @@ TEST(Backward, shared_var) { std::vector({f::GradVarName("x1")})); EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), std::vector({f::GradVarName("b1")})); -} \ No newline at end of file +} + +TEST(Backward, half_backward) { + f::ProgramDesc *program_desc = GetNewProgramDesc(); + f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc); + f::BlockDescBind *block = program.Block(0); + 
auto *op1 = block->AppendOp(); + op1->SetType("minus"); + op1->SetInput("X", {"a"}); + op1->SetInput("Y", {"b"}); + op1->SetOutput("Out", {"out"}); + + auto target = f::VarDescBind("out"); + size_t forward_len = block->AllOps().size(); + AppendBackward(program, target, {"b"}); + f::OpDescBind *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + auto ops = block->AllOps(); + ASSERT_EQ(3UL, ops.size()); +} diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc index 509aa235d3ee226adef15f08f5785866700499f1..4c39975ec94f95d3299efe58474d9db43654ec22 100644 --- a/paddle/framework/block_desc.cc +++ b/paddle/framework/block_desc.cc @@ -66,7 +66,7 @@ std::vector BlockDescBind::AllOps() const { return res; } -void BlockDescBind::Sync() { +void BlockDescBind::Flush() { if (need_update_) { auto &op_field = *this->desc_->mutable_ops(); op_field.Clear(); @@ -91,9 +91,10 @@ BlockDescBind *BlockDescBind::ParentBlock() const { return prog_->Block(static_cast(this->desc_->parent_idx())); } -void OpDescBind::SetBlockAttr(const std::string &name, BlockDescBind &block) { - BlockDesc *desc = block.RawPtr(); - this->attrs_[name] = desc; +BlockDesc *BlockDescBind::Proto() { + Flush(); + return desc_; } + } // namespace framework } // namespace paddle diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h index 3437e89923da8de79eeaa88d0466cf7eb0b5926d..4446576a3cb7a7534985d85e48c56b66430eccdf 100644 --- a/paddle/framework/block_desc.h +++ b/paddle/framework/block_desc.h @@ -35,10 +35,11 @@ class BlockDescBind { public: friend std::vector> MakeBlockBackward( ProgramDescBind &program_desc, int block_idx, - std::unordered_set &no_grad_vars); + std::unordered_set *no_grad_vars, + std::unordered_map *grad_to_var); friend void AppendBackward( - ProgramDescBind &program_desc, + ProgramDescBind &program_desc, const VarDescBind &target, const std::unordered_set &no_grad_vars); BlockDescBind(ProgramDescBind *prog, 
BlockDesc *desc) @@ -64,9 +65,9 @@ class BlockDescBind { std::vector AllOps() const; - void Sync(); + void Flush(); - BlockDesc *RawPtr() { return desc_; } + BlockDesc *Proto(); private: ProgramDescBind *prog_; // not_own diff --git a/paddle/framework/details/op_registry.h b/paddle/framework/details/op_registry.h index daa474e8c5a223589018720da29a5c3363b5934d..ed7c5f17b0854809bde923276f36440cce193a88 100644 --- a/paddle/framework/details/op_registry.h +++ b/paddle/framework/details/op_registry.h @@ -97,8 +97,11 @@ struct OpInfoFiller { template struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { - info->grad_op_maker_ = [](const OpDescBind& fwd_op) { - T maker(fwd_op); + info->grad_op_maker_ = []( + const OpDescBind& fwd_op, + const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var) { + T maker(fwd_op, no_grad_set, grad_to_var); return maker(); }; } diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 7f6d8fe6a4aec9fdc39b4ffc0837a03e355ec937..85312eaa926fe2daff6d9a01bfbbcfa105d03df5 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include +#include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/framework/attribute.h" #include "paddle/framework/backward.h" @@ -25,6 +26,17 @@ limitations under the License. 
*/ #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" +USE_OP(elementwise_add); +USE_OP(gaussian_random); +USE_OP(feed); +USE_OP(fetch); +USE_OP(mul); +USE_OP(sum); +USE_OP(squared_l2_distance); +USE_OP(fill_constant); +USE_OP(mean); +USE_OP(sgd); + using namespace paddle::platform; using namespace paddle::framework; @@ -34,8 +46,10 @@ void AddOp(const std::string& type, const VariableNameMap& inputs, // insert output for (auto kv : outputs) { for (auto v : kv.second) { - auto var = block->NewVar(v); - var->SetDataType(paddle::framework::DataType::FP32); + if (!block->HasVar(v)) { + auto var = block->NewVar(v); + var->SetDataType(paddle::framework::DataType::FP32); + } } } @@ -49,6 +63,7 @@ void AddOp(const std::string& type, const VariableNameMap& inputs, op->SetOutput(kv.first, kv.second); } op->SetAttrMap(attrs); + op->CheckAttrs(); } // Tensors in feed value variable will only be in CPUPlace @@ -134,12 +149,12 @@ class ExecutorTesterRandom : public ::testing::Test { AddOp("squared_l2_distance", {{"X", {"a"}}, {"Y", {"a_out"}}}, {{"Out", {"l2_distance"}}, {"sub_result", {"l2_distance_sub"}}}, {}, root_block); + AddOp("mean", {{"X", {"l2_distance"}}}, {{"Out", {"mean_out"}}}, {}, + root_block); // backward - AddOp("fill_constant", {}, {{"Out", {"l2_distance@GRAD"}}}, - {{"shape", std::vector{batch_size, 1}}, {"value", float(1.0)}}, - root_block); - AppendBackward(program, {}); + auto target = VarDescBind("mean_out"); + AppendBackward(program, target, {}); // update AddOp("fill_constant", {}, {{"Out", {"learning_rate"}}}, @@ -305,4 +320,14 @@ TEST_F(ExecutorTesterFeedAndFetch, GPU) { } } } + +DECLARE_double(fraction_of_gpu_memory_to_use); + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + // Use less GPU memory for unittest. 
+ FLAGS_fraction_of_gpu_memory_to_use = 0.25; + return RUN_ALL_TESTS(); +} + #endif diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto index b7a63f9ba10b77acff516d75cf1be0d4eeda40d4..65760b07ada7a63a568cb8296eef35a8aa18d9ff 100644 --- a/paddle/framework/framework.proto +++ b/paddle/framework/framework.proto @@ -97,16 +97,26 @@ enum DataType { FP64 = 6; } -message LoDTensorDesc { +message TensorDesc { required DataType data_type = 1; repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] - optional int32 lod_level = 3 [ default = 0 ]; +} + +message LoDTensorDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; } message VarDesc { + enum VarType { + LOD_TENSOR = 1; + SELECTED_ROWS = 2; + } required string name = 1; - optional LoDTensorDesc lod_tensor = 2; - optional bool persistable = 3 [ default = false ]; + required VarType type = 2; + optional LoDTensorDesc lod_tensor = 3; + optional TensorDesc selected_rows = 4; + optional bool persistable = 5 [ default = false ]; } message BlockDesc { diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h index e9ae6e22060850fe229998d3b651d08a5ca2033a..1219e0487531b19b00adde5a9aa2bde51bfc0aa8 100644 --- a/paddle/framework/grad_op_desc_maker.h +++ b/paddle/framework/grad_op_desc_maker.h @@ -13,6 +13,8 @@ limitations under the License. 
*/ #pragma once +#include +#include #include "paddle/framework/op_desc.h" #include "paddle/framework/operator.h" @@ -21,27 +23,50 @@ namespace framework { class GradOpDescMakerBase { public: - explicit GradOpDescMakerBase(const OpDescBind& fwd_op) : fwd_op_(fwd_op) {} + explicit GradOpDescMakerBase( + const OpDescBind& fwd_op, + const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var) + : fwd_op_(fwd_op), no_grad_set_(no_grad_set), grad_to_var_(grad_to_var) {} virtual ~GradOpDescMakerBase() = default; virtual std::vector> operator()() const = 0; protected: - static std::vector ToGradNames( - const std::vector& var_names) { + std::vector InputGrad(const std::string& name, + bool drop_empty_grad = true) const { std::vector ret_val; + auto var_names = this->Input(name); ret_val.reserve(var_names.size()); std::transform(var_names.begin(), var_names.end(), - std::back_inserter(ret_val), GradVarName); - return ret_val; - } - - std::vector InputGrad(const std::string& name) const { - return ToGradNames(fwd_op_.Input(name)); + std::back_inserter(ret_val), + [this](const std::string& fwd_var_name) -> std::string { + auto g_name = GradVarName(fwd_var_name); + if (no_grad_set_.count(g_name)) { + return kEmptyVarName; + } else { + (*this->grad_to_var_)[g_name] = fwd_var_name; + return g_name; + } + }); + if (!drop_empty_grad) { + return ret_val; + } + std::vector dropped_ret_val; + dropped_ret_val.reserve(ret_val.size()); + std::copy_if(ret_val.begin(), ret_val.end(), + std::back_inserter(dropped_ret_val), + [](const std::string& str) { return str != kEmptyVarName; }); + return dropped_ret_val; } std::vector OutputGrad(const std::string& name) const { - return ToGradNames(fwd_op_.Output(name)); + std::vector ret_val; + auto onames = this->Output(name); + ret_val.reserve(onames.size()); + std::transform(onames.begin(), onames.end(), std::back_inserter(ret_val), + GradVarName); + return ret_val; } std::vector InputNames() const { @@ -75,6 +100,8 @@ class 
GradOpDescMakerBase { private: const OpDescBind& fwd_op_; + const std::unordered_set& no_grad_set_; + std::unordered_map* grad_to_var_; }; class SingleGradOpDescMaker : public GradOpDescMakerBase { @@ -91,6 +118,7 @@ class SingleGradOpDescMaker : public GradOpDescMakerBase { virtual std::unique_ptr Apply() const = 0; }; +template class DefaultGradOpDescMaker : public SingleGradOpDescMaker { public: using SingleGradOpDescMaker::SingleGradOpDescMaker; @@ -102,7 +130,8 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker { for (auto& input_param : this->InputNames()) { grad->SetInput(input_param, this->Input(input_param)); - grad->SetOutput(GradVarName(input_param), this->InputGrad(input_param)); + grad->SetOutput(GradVarName(input_param), + this->InputGrad(input_param, DropEmptyIG)); } for (auto& output_param : this->OutputNames()) { diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index d3c11ad60a0f9319329a59c16bfc4668cd75b7ae..ef207dc54ebe6cc72d9f1e428dd2aaed5ad3dbf0 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -32,7 +32,7 @@ OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs, } OpDesc *OpDescBind::Proto() { - Sync(); + Flush(); return &op_desc_; } @@ -100,6 +100,12 @@ void OpDescBind::SetAttr(const std::string &name, const Attribute &v) { need_update_ = true; } +void OpDescBind::SetBlockAttr(const std::string &name, BlockDescBind &block) { + BlockDesc *desc = block.Proto(); + this->attrs_[name] = desc; + need_update_ = true; +} + void OpDescBind::SetAttrMap( const std::unordered_map &attr_map) { attrs_ = attr_map; @@ -159,7 +165,7 @@ struct SetAttrDescVisitor : public boost::static_visitor { void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } }; -void OpDescBind::Sync() { +void OpDescBind::Flush() { if (need_update_) { this->op_desc_.mutable_inputs()->Clear(); for (auto &ipt : inputs_) { diff --git a/paddle/framework/op_desc.h 
b/paddle/framework/op_desc.h index 90155fadeac148bd9cae4ce9066ac4ce8d9df52d..73b5cf846f702fe21277ae139156ec9784aa79b3 100644 --- a/paddle/framework/op_desc.h +++ b/paddle/framework/op_desc.h @@ -89,8 +89,6 @@ class OpDescBind { this->need_update_ = true; } - void Sync(); - const VariableNameMap &Inputs() const { return inputs_; } const VariableNameMap &Outputs() const { return outputs_; } @@ -104,6 +102,8 @@ class OpDescBind { void InferShape(const BlockDescBind &block) const; + void Flush(); + private: template static std::vector MapKeys(const MapType &map) { diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc index b118edae17430c8a4dd5c96a2a0c675766e08166..504afbd5dbacf7185f92e0000d19666230e2fb42 100644 --- a/paddle/framework/op_registry.cc +++ b/paddle/framework/op_registry.cc @@ -59,16 +59,5 @@ std::unique_ptr OpRegistry::CreateOp(const OpDescBind& op_desc) { op_desc.GetAttrMap()); } -std::vector> OpRegistry::CreateGradOpDescs( - OpDescBind* op_desc) { - auto& info = OpInfoMap::Instance().Get(op_desc->Type()); - - if (info.Checker() != nullptr) { - info.Checker()->Check(*op_desc->MutableAttrMap()); - } - - return info.grad_op_maker_(*op_desc); -} - } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 5ca3af52a6909eeee21f647d0e60c7a690f90190..226e8ddcd4b1a2630e0eea00ee6c9f6af6bd5d20 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -79,9 +79,6 @@ class OpRegistry { static std::unique_ptr CreateOp(const OpDesc& op_desc); - static std::vector> CreateGradOpDescs( - OpDescBind* op_desc); - static std::unique_ptr CreateOp(const OpDescBind& op_desc); }; @@ -160,17 +157,18 @@ class OpKernelRegistrar : public Registrar { /** * Macro to register Operator. 
*/ -#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ - grad_op_class) \ - REGISTER_OPERATOR(grad_op_type, grad_op_class); \ - class _GradOpDescMaker_##grad_op_type##_ \ - : public ::paddle::framework::DefaultGradOpDescMaker { \ - using ::paddle::framework::DefaultGradOpDescMaker::DefaultGradOpDescMaker; \ - \ - protected: \ - virtual std::string GradOpType() const { return #grad_op_type; } \ - }; \ - REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \ +#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class) \ + REGISTER_OPERATOR(grad_op_type, grad_op_class); \ + class _GradOpDescMaker_##grad_op_type##_ \ + : public ::paddle::framework::DefaultGradOpDescMaker { \ + using ::paddle::framework::DefaultGradOpDescMaker< \ + true>::DefaultGradOpDescMaker; \ + \ + protected: \ + virtual std::string GradOpType() const { return #grad_op_type; } \ + }; \ + REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \ op_maker_class); #define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \ diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 15f80b57206c90f689acfdcac60a0d9011025fc0..97a142d5f1661704fede858b28ff0d5487c66fab 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -289,6 +289,15 @@ class ExecutionContext { return device_context_; } +#ifdef PADDLE_WITH_CUDA + const platform::CUDADeviceContext& cuda_device_context() const { + PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); + auto cuda_ctx = + reinterpret_cast(&device_context_); + return *cuda_ctx; + } +#endif + private: const OperatorBase& op_; const Scope& scope_; diff --git a/paddle/framework/program_desc.cc b/paddle/framework/program_desc.cc index e89f9a46d587b6378aa3be92306c5680093e1926..fcb7292884275d972377983cb3ba1bcd86fb8348 100644 --- a/paddle/framework/program_desc.cc +++ b/paddle/framework/program_desc.cc @@ -45,7 +45,7 @@ 
BlockDescBind *ProgramDescBind::AppendBlock(const BlockDescBind &parent) { ProgramDesc *ProgramDescBind::Proto() { for (auto &block : blocks_) { - block->Sync(); + block->Flush(); } return prog_; } diff --git a/paddle/framework/selected_rows.cc b/paddle/framework/selected_rows.cc new file mode 100644 index 0000000000000000000000000000000000000000..c74459c9dd7006a24615b1d6df041583088fb25c --- /dev/null +++ b/paddle/framework/selected_rows.cc @@ -0,0 +1,16 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/selected_rows.h" + +namespace paddle { +namespace framework {} // namespace framework +} // namespace paddle diff --git a/paddle/framework/selected_rows.h b/paddle/framework/selected_rows.h new file mode 100644 index 0000000000000000000000000000000000000000..f9f563051e264ae7ed7cf3c07c0065522b2bbe2e --- /dev/null +++ b/paddle/framework/selected_rows.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/tensor.h" + +namespace paddle { +namespace framework { + +class SelectedRows { + public: + SelectedRows(const std::vector& rows, const int64_t& height) + : rows_(rows), height_(height) { + value_.reset(new Tensor()); + } + + SelectedRows() { value_.reset(new Tensor()); } + + platform::Place place() const { return value_->place(); } + + const Tensor& value() const { return *value_; } + + Tensor* mutable_value() { return value_.get(); } + + int64_t height() const { return height_; } + + void set_height(int64_t height) { height_ = height; } + + const std::vector& rows() const { return rows_; } + + void set_rows(const std::vector& rows) { rows_ = rows; } + + DDim GetCompleteDims() const { + std::vector dims = vectorize(value_->dims()); + dims[0] = height_; + return make_ddim(dims); + } + + private: + std::vector rows_; + std::unique_ptr value_{nullptr}; + int64_t height_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/selected_rows_test.cc b/paddle/framework/selected_rows_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ee13a65d72e44693573397bb686b355effb2227 --- /dev/null +++ b/paddle/framework/selected_rows_test.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/framework/selected_rows.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { + +class SelectedRowsTester : public ::testing::Test { + public: + virtual void SetUp() override { + std::vector rows{0, 4, 7}; + int64_t height = 10; + int64_t row_numel = 100; + selected_rows_.reset(new SelectedRows(rows, height)); + + Tensor* value = selected_rows_->mutable_value(); + value->mutable_data( + make_ddim({static_cast(rows.size()), row_numel}), place_); + } + + protected: + platform::CPUPlace place_; + std::unique_ptr selected_rows_{nullptr}; +}; + +TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); } + +TEST_F(SelectedRowsTester, dims) { + ASSERT_EQ(selected_rows_->value().dims(), make_ddim({3, 100})); +} + +TEST_F(SelectedRowsTester, complete_dims) { + ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100})); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h index 64aab16ae54d34fd614add348c7c420b4a8f771d..b93f980cf6d279d18388b9637a2ff45d797ca78e 100644 --- a/paddle/framework/shape_inference.h +++ b/paddle/framework/shape_inference.h @@ -19,9 +19,6 @@ limitations under the License. */ namespace paddle { namespace framework { -// TODO(longfei): Once after both CompileTimeInferShapeContext and -// RuntimeInferShapeContext get merged, we can rename InferShapeContext into -// InferShapeContext so to replace the current InferShapeContext. 
class InferShapeContext { public: virtual ~InferShapeContext() {} diff --git a/paddle/framework/tensor_array.cc b/paddle/framework/tensor_array.cc index 7ae16e99cdb8a23f14f0c8b684ba4ec66a4ce074..06459cbfd7b8c19c176452ff73c9f3a81ba1dc03 100644 --- a/paddle/framework/tensor_array.cc +++ b/paddle/framework/tensor_array.cc @@ -76,6 +76,17 @@ LoDTensor PackDynamicBatch(const std::vector& source, const std::vector& meta, const LoD& lod, size_t level); +std::vector GenDyBatchIndice(const DySeqMetaBatch& meta, int batch_id) { + // collect indice need to copy to the batch + std::vector indice; + for (const auto& seq : meta) { + size_t id = seq.begin + batch_id; + if (id >= seq.end) break; + indice.push_back(id); + } + return indice; +} + } // namespace detail const LoDTensor& TensorArray::Read(size_t index) const { @@ -113,8 +124,8 @@ LoDTensor TensorArray::Pack(size_t level, const std::vector& meta, return detail::PackDynamicBatch(values_, meta, lod, level); } -std::vector TensorArray::Unpack(const LoDTensor& source, int level, - bool length_desend) { +DySeqMetaBatch TensorArray::Unpack(const LoDTensor& source, int level, + bool length_desend) { detail::DynamicBatchUnpacker unpacker(source, level, length_desend /*descend*/); @@ -129,6 +140,7 @@ std::vector TensorArray::Unpack(const LoDTensor& source, int level, Write(batch_id, unpacker.GetBatch(batch_id)); } + PADDLE_ENFORCE(!unpacker.meta.empty()); return unpacker.meta; } @@ -218,13 +230,7 @@ LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) { PADDLE_ENFORCE(!meta.empty(), "should build meta first"); LoDTensor result; - // collect indice need to copy to the batch - std::vector indice; - for (const auto& seq : meta) { - size_t id = seq.begin + index; - if (id >= seq.end) break; - indice.push_back(id); - } + auto indice = detail::GenDyBatchIndice(meta, index); PADDLE_ENFORCE(!indice.empty(), "invalid batch at %d", index); // copy the indice of records in LoDTensor @@ -237,9 +243,9 @@ LoDTensor 
DynamicBatchUnpacker::GetBatch(size_t index) { for (size_t i = 0; i < indice.size(); i++) { auto index = indice[i]; auto target = result.Slice(i, i + 1); - auto source_ = source->Slice(index, index + 1); + auto slice = source->Slice(index, index + 1); - target.CopyFrom(source_, platform::CPUPlace(), + target.CopyFrom(slice, platform::CPUPlace(), platform::CPUDeviceContext()); } diff --git a/paddle/framework/tensor_array.h b/paddle/framework/tensor_array.h index 293da04997304be41810446cb3e866d545805f83..046ecb5221b7ed9d88e5017348ee8fcde23c7677 100644 --- a/paddle/framework/tensor_array.h +++ b/paddle/framework/tensor_array.h @@ -34,6 +34,13 @@ struct DySeqMeta { size_t ori_idx; }; +using DySeqMetaBatch = std::vector; + +/* + * Extract the indices of instances. + */ +std::vector GenDyBatchIndice(const DySeqMetaBatch &metas, int batch_id); + /* * TensorArray is a C-array-like array of tensors, it is meant to be used with * dynamic iteration primitives such as while_loop. It is used to segment inputs @@ -69,7 +76,7 @@ class TensorArray { * Recover the original LoD-arranged LoDTensor with the `values`, `level` and * `indice_map`. */ - LoDTensor Pack(size_t level, const std::vector &meta, + LoDTensor Pack(size_t level, const DySeqMetaBatch &meta, const LoD &lod) const; /* @@ -77,8 +84,7 @@ class TensorArray { * `values`, if set `desend`, will sort by length in descending order else in * ascending order. 
*/ - std::vector Unpack(const LoDTensor &source, int level, - bool length_desend); + DySeqMetaBatch Unpack(const LoDTensor &source, int level, bool length_desend); /* * Pack the values into a tensor with rank one higher than each tensor in diff --git a/paddle/framework/type_defs.h b/paddle/framework/type_defs.h index 6f65a942ba2a4073e6aa1047875ec5c3283c23a6..0d1564a7510ddf0106ff417fb0b487ddbde1ac2e 100644 --- a/paddle/framework/type_defs.h +++ b/paddle/framework/type_defs.h @@ -36,8 +36,9 @@ using OpCreator = std::function; -using GradOpMakerFN = - std::function>(const OpDescBind&)>; +using GradOpMakerFN = std::function>( + const OpDescBind&, const std::unordered_set& /*no_grad_set*/, + std::unordered_map* /*grad_to_var*/)>; } // namespace framework } // namespace paddle diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc index a88e813b5e7c7e6420cb0ba8a25bba4f4d658e80..c302217e5aacdc17800238770d689b7fb65804f3 100644 --- a/paddle/framework/var_desc.cc +++ b/paddle/framework/var_desc.cc @@ -13,32 +13,58 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/framework/var_desc.h" +#include "paddle/platform/enforce.h" namespace paddle { namespace framework { void VarDescBind::SetShape(const std::vector &dims) { - VectorToRepeated(dims, desc_.mutable_lod_tensor()->mutable_dims()); + VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims()); } void VarDescBind::SetDataType(DataType data_type) { - desc_.mutable_lod_tensor()->set_data_type(data_type); + mutable_tensor_desc()->set_data_type(data_type); } std::vector VarDescBind::Shape() const { - return RepeatedToVector(desc_.lod_tensor().dims()); + return RepeatedToVector(tensor_desc().dims()); } -DataType VarDescBind::GetDataType() const { - return desc_.lod_tensor().data_type(); -} +DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); } void VarDescBind::SetLoDLevel(int32_t lod_level) { + PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR); desc_.mutable_lod_tensor()->set_lod_level(lod_level); } int32_t VarDescBind::GetLodLevel() const { + PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR); return desc_.lod_tensor().lod_level(); } + +const TensorDesc &VarDescBind::tensor_desc() const { + PADDLE_ENFORCE(desc_.has_type(), "invoke TensorDesc must after set type"); + switch (desc_.type()) { + case VarDesc::SELECTED_ROWS: + return desc_.selected_rows(); + case VarDesc::LOD_TENSOR: + return desc_.lod_tensor().tensor(); + default: + PADDLE_THROW("Unexpected branch."); + } +} + +TensorDesc *VarDescBind::mutable_tensor_desc() { + PADDLE_ENFORCE(desc_.has_type(), + "invoke MutableTensorDesc must after set type"); + switch (desc_.type()) { + case VarDesc::SELECTED_ROWS: + return desc_.mutable_selected_rows(); + case VarDesc::LOD_TENSOR: + return desc_.mutable_lod_tensor()->mutable_tensor(); + default: + PADDLE_THROW("Unexpected branch."); + } +} } // namespace framework } // namespace paddle diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h index 
464fece85fe5c674690c2034054e551f14db2138..688a46f83982fc464c7602ec1041ad3f42122211 100644 --- a/paddle/framework/var_desc.h +++ b/paddle/framework/var_desc.h @@ -34,6 +34,7 @@ inline std::vector RepeatedToVector( template inline void VectorToRepeated(const std::vector &vec, RepeatedField *repeated_field) { + repeated_field->Clear(); repeated_field->Reserve(vec.size()); for (const auto &elem : vec) { *repeated_field->Add() = elem; @@ -44,6 +45,7 @@ inline void VectorToRepeated(const std::vector &vec, template inline void VectorToRepeated(const std::vector &vec, RepeatedField *repeated_field) { + repeated_field->Clear(); repeated_field->Reserve(vec.size()); for (auto elem : vec) { *repeated_field->Add() = elem; @@ -52,7 +54,10 @@ inline void VectorToRepeated(const std::vector &vec, class VarDescBind { public: - explicit VarDescBind(const std::string &name) { desc_.set_name(name); } + explicit VarDescBind(const std::string &name) { + desc_.set_name(name); + desc_.set_type(VarDesc::LOD_TENSOR); + } VarDesc *Proto() { return &desc_; } @@ -70,7 +75,14 @@ class VarDescBind { int32_t GetLodLevel() const; + VarDesc::VarType GetType() const { return desc_.type(); } + + void SetType(VarDesc::VarType type) { desc_.set_type(type); } + private: + const TensorDesc &tensor_desc() const; + TensorDesc *mutable_tensor_desc(); + VarDesc desc_; }; } // namespace framework diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 702a71d75561f6a7f2e682fba83f6b761e5986c4..445710127502f19aefde5ab64fe48f66722d157a 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -84,8 +84,9 @@ function(op_library TARGET) endif() # pybind USE_NO_KERNEL_OP + # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel file(READ ${TARGET}.cc TARGET_CONTENT) - string(REGEX MATCH "OperatorWithKernel" regex_result "${TARGET_CONTENT}") + string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") string(REPLACE "_op" "" 
TARGET "${TARGET}") if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n") diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index cba57ba57f5e03c7861897e177cc09aa513e5395..84c3775b4fc2602e5df9bb454d21b318b8fda493 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -338,6 +338,38 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker { } }; +template +class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { + public: + HardSigmoidOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of HardSigmoid operator"); + AddOutput("Y", "Output of HardSigmoid operator"); + AddComment(R"DOC( +Hard Sigmoid activation operator. + +Segment-wise linear approximation of sigmoid[1]. +This is much faster than sigmoid. + +hard_sigmoid = max(0, min(1, slope * x + offset)) + +The slope should be positive. The offset can be either positive or negative. +The default slope and offset are set from [1]. +It is recommended to use the defaults for this activation. 
+ +References: + [1] Noisy Activation Functions + (https://arxiv.org/abs/1603.00391) + + )DOC"); + AddAttr("slope", "Slope for linear approximation of sigmoid") + .SetDefault(static_cast(0.2)); + AddAttr("offset", "Offset for linear approximation of sigmoid") + .SetDefault(static_cast(0.5)); + } +}; + } // namespace operators } // namespace paddle @@ -413,6 +445,9 @@ REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker, thresholded_relu_grad, ops::ActivationOpGrad); +REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker, + hard_sigmoid_grad, ops::ActivationOpGrad); + #define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \ REGISTER_OP_CPU_KERNEL( \ act_type, \ diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index 502c33be103c465c14f128be38ac62d029f1bfb9..4f4eb44fedc0a89cdcf60fb7177014a11eb96048 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -616,30 +616,63 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor { } }; +template +struct HardSigmoidFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + template + void operator()(Device d, X x, Y y) const { + auto temp = x * static_cast(slope) + static_cast(offset); + y.device(d) = temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); + } +}; + +template +struct HardSigmoidGradFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + template + void operator()(Device d, X x, Y y, dY dy, dX dx) const { + dx.device(d) = + dy * + ((y > static_cast(0)) * (y < static_cast(1))).template cast() * + static_cast(slope); + } +}; + } // namespace operators } // namespace paddle -#define FOR_EACH_KERNEL_FUNCTOR(__macro) \ - 
__macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ - __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(exp, ExpFunctor, ExpGradFunctor); \ - __macro(relu, ReluFunctor, ReluGradFunctor); \ - __macro(tanh, TanhFunctor, TanhGradFunctor); \ - __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ - __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ - __macro(abs, AbsFunctor, AbsGradFunctor); \ - __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ - __macro(log, LogFunctor, LogGradFunctor); \ - __macro(square, SquareFunctor, SquareGradFunctor); \ - __macro(brelu, BReluFunctor, BReluGradFunctor); \ - __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor); \ - __macro(pow, PowFunctor, PowGradFunctor); \ - __macro(stanh, STanhFunctor, STanhGradFunctor); \ - __macro(softplus, SoftplusFunctor, SoftplusGradFunctor); \ - __macro(softsign, SoftsignFunctor, SoftsignGradFunctor); \ - __macro(relu6, Relu6Functor, Relu6GradFunctor); \ - __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor); \ - __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \ - __macro(elu, ELUFunctor, ELUGradFunctor); \ - __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor); \ +#define FOR_EACH_KERNEL_FUNCTOR(__macro) \ + __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ + __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ + __macro(exp, ExpFunctor, ExpGradFunctor); \ + __macro(relu, ReluFunctor, ReluGradFunctor); \ + __macro(tanh, TanhFunctor, TanhGradFunctor); \ + __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ + __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ + __macro(abs, AbsFunctor, AbsGradFunctor); \ + __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ + __macro(log, LogFunctor, LogGradFunctor); \ + __macro(square, SquareFunctor, SquareGradFunctor); \ + __macro(brelu, BReluFunctor, BReluGradFunctor); \ + __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor); \ + 
__macro(pow, PowFunctor, PowGradFunctor); \ + __macro(stanh, STanhFunctor, STanhGradFunctor); \ + __macro(softplus, SoftplusFunctor, SoftplusGradFunctor); \ + __macro(softsign, SoftsignFunctor, SoftsignGradFunctor); \ + __macro(relu6, Relu6Functor, Relu6GradFunctor); \ + __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor); \ + __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \ + __macro(elu, ELUFunctor, ELUGradFunctor); \ + __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor); \ + __macro(hard_sigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); \ __macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor); diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..293b37b7750427cb88efb6dfd5a02dcf7ede24ac --- /dev/null +++ b/paddle/operators/adam_op.cc @@ -0,0 +1,144 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/adam_op.h" + +namespace paddle { +namespace operators { + +class AdamOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment1"), + "Input(Moment1) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment2"), + "Input(Moment2) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), + "Input(Beta1Pow) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), + "Input(Beta2Pow) of AdamOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), + "Output(Moment1Out) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), + "Output(Moment2Out) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Beta1PowOut"), + "Output(Beta1PowOut) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Beta2PowOut"), + "Output(Beta2PowOut) of AdamOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 dimension"); + auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); + PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, + "Beta1 power accumulator should have 1 dimension"); + auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow"); + PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, + "Beta2 power accumulator should have 1 dimension"); + + auto param_dims = 
ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdamOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment1"), + "Param and Moment1 input of AdamOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment2"), + "Param and Moment2 input of AdamOp should have same dimension"); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("Moment1Out", param_dims); + ctx->SetOutputDim("Moment2Out", param_dims); + ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims); + ctx->SetOutputDim("Beta2PowOut", beta2_pow_dims); + } +}; + +class AdamOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AdamOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("LearningRate", "(Tensor) Learning rate"); + AddInput("Moment1", "(Tensor) Input first moment"); + AddInput("Moment2", "(Tensor) Input second moment"); + AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); + AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("Moment1Out", "(Tensor) Output first moment"); + AddOutput("Moment2Out", "(Tensor) Output second moment"); + AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator"); + AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator"); + + AddAttr("beta1", + "(float, default 0.9) " + "Exponential decay rate for the " + "first moment estimates.") + .SetDefault(0.9f); + AddAttr("beta2", + "(float, default 0.999) " + "exponential decay rate for the " + "second moment estimates.") + .SetDefault(0.999f); + AddAttr("epsilon", + "(float, default 1.0e-8) " + "Constant for numerical stability") + .SetDefault(1.0e-8f); + + AddComment(R"DOC( +Adam Updates 
Operator. + +This implements the Adam optimizer from Section 2 of the Adam +paper[1]. Adam is a first-order gradient-based optimization +method based on adaptive estimates of lower-order moments. + +Adam updates: + +moment1_out = beta1 * moment1 + (1 − beta1) * grad +moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad +beta1_pow_out = beta1_pow * beta1 +beta2_pow_out = beta2_pow * beta2 +learning_rate_t = learning_rate * + sqrt(1 - beta2_pow_out) / (1 - beta1_pow_out) +param_out = param - learning_rate_t * moment1_out / (sqrt(moment2_out) + epsilon) + +References: + [1] Adam: A Method for Stochastic Optimization + (https://arxiv.org/abs/1412.6980) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker); +REGISTER_OP_CPU_KERNEL(adam, + ops::AdamOpKernel); diff --git a/paddle/operators/adam_op.cu b/paddle/operators/adam_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a3def912e540454275350209435eb01ae2151331 --- /dev/null +++ b/paddle/operators/adam_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/operators/adam_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(adam, + ops::AdamOpKernel); diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h new file mode 100644 index 0000000000000000000000000000000000000000..789c2f14b32478bf9ddc967fc5725bcf65ed2146 --- /dev/null +++ b/paddle/operators/adam_op.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class AdamOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out_tensor = ctx.Output("ParamOut"); + auto moment1_out_tensor = ctx.Output("Moment1Out"); + auto moment2_out_tensor = ctx.Output("Moment2Out"); + auto beta1_pow_out_tensor = ctx.Output("Beta1PowOut"); + auto beta2_pow_out_tensor = ctx.Output("Beta2PowOut"); + + param_out_tensor->mutable_data(ctx.GetPlace()); + moment1_out_tensor->mutable_data(ctx.GetPlace()); + moment2_out_tensor->mutable_data(ctx.GetPlace()); + beta1_pow_out_tensor->mutable_data(ctx.GetPlace()); + beta2_pow_out_tensor->mutable_data(ctx.GetPlace()); + + float beta1 = ctx.Attr("beta1"); + float beta2 = ctx.Attr("beta2"); + float epsilon = ctx.Attr("epsilon"); + + auto param = framework::EigenVector::Flatten( + 
*ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + auto moment1 = framework::EigenVector::Flatten( + *ctx.Input("Moment1")); + auto moment2 = framework::EigenVector::Flatten( + *ctx.Input("Moment2")); + auto lr = framework::EigenVector::Flatten( + *ctx.Input("LearningRate")); + auto beta1_pow = framework::EigenVector::Flatten( + *ctx.Input("Beta1Pow")); + auto beta2_pow = framework::EigenVector::Flatten( + *ctx.Input("Beta2Pow")); + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto moment1_out = framework::EigenVector::Flatten(*moment1_out_tensor); + auto moment2_out = framework::EigenVector::Flatten(*moment2_out_tensor); + auto beta1_pow_out = + framework::EigenVector::Flatten(*beta1_pow_out_tensor); + auto beta2_pow_out = + framework::EigenVector::Flatten(*beta2_pow_out_tensor); + auto place = ctx.GetEigenDevice(); + + moment1_out.device(place) = beta1 * moment1 + (1 - beta1) * grad; + moment2_out.device(place) = beta2 * moment2 + (1 - beta2) * grad.square(); + beta1_pow_out.device(place) = beta1_pow * beta1; + beta2_pow_out.device(place) = beta2_pow * beta2; + // All of these are tensors of 1 element + auto lr_t = lr * (1 - beta2_pow_out).sqrt() / (1 - beta1_pow_out); + // Eigen does not support automatic broadcast + // Get dimensions of moment vector to broadcast lr_t + Eigen::DSizes m_dsize(moment1_out_tensor->numel()); + param_out.device(place) = + param - + lr_t.broadcast(m_dsize) * + (moment1_out / (moment2_out.sqrt() + epsilon)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/conv2d_op.cc b/paddle/operators/conv2d_op.cc index 6325d4248f10ea8a12ae5398d9fe0e579db3f7ae..1acb8415d0691df77047806d3c81b51cbb8c59f3 100644 --- a/paddle/operators/conv2d_op.cc +++ b/paddle/operators/conv2d_op.cc @@ -12,111 +12,91 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/operators/gemm_conv2d_op.h" +#include "paddle/operators/conv2d_op.h" namespace paddle { namespace operators { -int outputSize(int input_size, int filter_size, int padding, int stride) { - int output_size = (input_size - filter_size + 2 * padding) / stride + 1; - return output_size; +void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of Conv2DOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of Conv2DOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of Conv2DOp should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + int groups = ctx->Attrs().Get("groups"); + int input_channels = in_dims[1]; + int output_channels = filter_dims[0]; + + PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv2DOp input should be 4-D."); + PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "Conv2DOp filter should be 4-D."); + PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups, + "The number of input channels should be equal to filter " + "channels * groups."); + PADDLE_ENFORCE_EQ( + output_channels % groups, 0, + "The number of output channels should be divided by groups."); + + auto output_height = + OutputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]); + auto output_width = + OutputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]); + ctx->SetOutputDim("Output", + {in_dims[0], filter_dims[0], output_height, output_width}); } -class Conv2DOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Input"), - "Input(Input) of Conv2DOp should not be null."); - 
PADDLE_ENFORCE(ctx->HasInput("Filter"), - "Input(Filter) of Conv2DOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Output"), - "Output(Output) of Conv2DOp should not be null."); - - auto in_dims = ctx->GetInputDim("Input"); - auto filter_dims = ctx->GetInputDim("Filter"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - int groups = ctx->Attrs().Get("groups"); - int input_channels = in_dims[1]; - int output_channels = filter_dims[0]; - - PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv2DOp input should be 4-D."); - PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "Conv2DOp filter should be 4-D."); - PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups, - "The number of input channels should be equal to filter " - "channels * groups."); - PADDLE_ENFORCE_EQ( - output_channels % groups, 0, - "The number of output channels should be divided by groups."); - - auto output_height = - outputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]); - auto output_width = - outputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]); - ctx->SetOutputDim( - "Output", {in_dims[0], filter_dims[0], output_height, output_width}); - } -}; - -class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker { - public: - Conv2DOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput( - "Input", - "The input tensor of convolution operator. " - "The format of input tensor is NCHW. Where N is batch size, C is the " - "number of channels, H and W is the height and width of image."); - AddInput( - "Filter", - "The filter tensor of convolution operator." - "The format of the filter tensor is MCHW, where M is the number of " - "output image channels, C is the number of input image channels, " - "H and W is height and width of filter. 
" - "If the groups attribute is greater than 1, C equal the number of " - "input image channels divided by the groups."); - AddOutput("Output", - "The output tensor of convolution operator." - "The format of output tensor is also NCHW."); - AddAttr>("strides", "strides of convolution operator.") - .SetDefault({1, 1}); - AddAttr>("paddings", "paddings of convolution operator.") - .SetDefault({0, 0}); - AddAttr( - "groups", - "group size of convolution operator. " - "Refer to grouped convolution in Alex Krizhevsky's paper: " - "when group=2, the first half of the filters are only connected to the " - "first half of the input channels, and the second half only connected " - "to the second half.") - .SetDefault(1); - AddComment(R"DOC( +Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "Input", + "The input tensor of convolution operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of image."); + AddInput("Filter", + "The filter tensor of convolution operator." + "The format of the filter tensor is MCHW, where M is the number of " + "output image channels, C is the number of input image channels, " + "H and W is height and width of filter. " + "If the groups attribute is greater than 1, C equal the number of " + "input image channels divided by the groups."); + AddOutput("Output", + "The output tensor of convolution operator." + "The format of output tensor is also NCHW."); + AddAttr>("strides", "strides of convolution operator.") + .SetDefault({1, 1}); + AddAttr>("paddings", "paddings of convolution operator.") + .SetDefault({0, 0}); + AddAttr( + "groups", + "group size of convolution operator. 
" + "Refer to grouped convolution in Alex Krizhevsky's paper: " + "when group=2, the first half of the filters are only connected to the " + "first half of the input channels, and the second half only connected " + "to the second half.") + .SetDefault(1); + AddComment(R"DOC( The convolution operation calculates the output based on the input, filter and strides, paddings, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. )DOC"); - } -}; - -class Conv2DOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; +} - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - auto in_dims = ctx->GetInputDim("Input"); - auto filter_dims = ctx->GetInputDim("Filter"); - if (ctx->HasOutput(framework::GradVarName("Input"))) { - ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); - } - if (ctx->HasOutput(framework::GradVarName("Filter"))) { - ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); - } +void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const { + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); } -}; + if (ctx->HasOutput(framework::GradVarName("Filter"))) { + ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); + } +} } // namespace operators } // namespace paddle diff --git a/paddle/operators/conv2d_op.cu b/paddle/operators/conv2d_op.cu index 5df818ba0496a65502dde37fd1397ec56f8c1101..c697c9466d34c29af6976f3a4d2d0a24ba778ceb 100644 --- a/paddle/operators/conv2d_op.cu +++ b/paddle/operators/conv2d_op.cu @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/operators/gemm_conv2d_op.h" +#include "paddle/operators/conv2d_op.h" namespace ops = paddle::operators; diff --git a/paddle/operators/gemm_conv2d_op.h b/paddle/operators/conv2d_op.h similarity index 90% rename from paddle/operators/gemm_conv2d_op.h rename to paddle/operators/conv2d_op.h index 323e3f7c3bd506c6b63bf4d1152384649f5da575..7ebdbe81cbbaf59a60eb3dac0f570d70fc85d6ef 100644 --- a/paddle/operators/gemm_conv2d_op.h +++ b/paddle/operators/conv2d_op.h @@ -24,6 +24,38 @@ namespace operators { using Tensor = framework::Tensor; +// Base convolution operator definitions for other conv +// like operators to reuse the implementation. +inline int OutputSize(int input_size, int filter_size, int padding, + int stride) { + int output_size = (input_size - filter_size + 2 * padding) / stride + 1; + return output_size; +} + +// Define Op classes in .h file so that other conv +// operator implementations can reuse the code. +class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Conv2DOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker); +}; + +class Conv2DOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override; +}; + +class Conv2DOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override; +}; + template class GemmConv2DKernel : public framework::OpKernel { public: @@ -74,7 +106,6 @@ class GemmConv2DKernel : public framework::OpKernel { framework::DDim output_matrix_shape = {output_channels, output_height * output_width}; - // convolution operator: im2col + gemm int in_step = input_channels / groups; int out_step = output_channels / groups; diff --git a/paddle/operators/conv_cudnn_op.cc b/paddle/operators/conv_cudnn_op.cc new file mode 
100644 index 0000000000000000000000000000000000000000..4288f300dd5b0464f2b4394cdb0b44f93060ae74 --- /dev/null +++ b/paddle/operators/conv_cudnn_op.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/conv2d_op.h" + +namespace paddle { +namespace operators { + +class CudnnConvOpMaker : public Conv2DOpMaker { + public: + CudnnConvOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : Conv2DOpMaker(proto, op_checker) { + AddAttr>("dilations", "dilations of convolution operator.") + .SetDefault(std::vector{1, 1}); + AddAttr("workspace_size_MB", + "workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardward. 
This size should be carefully setted.") + .SetDefault(4096); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(conv_cudnn, ops::Conv2DOp, ops::CudnnConvOpMaker, conv_cudnn_grad, + ops::Conv2DOpGrad); +REGISTER_OP_CPU_KERNEL( + conv_cudnn, ops::GemmConv2DKernel); +REGISTER_OP_CPU_KERNEL( + conv_cudnn_grad, + ops::GemmConvGrad2DKernel); diff --git a/paddle/operators/conv_cudnn_op.cu b/paddle/operators/conv_cudnn_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..366d0323b840c338dd6ba5b28bdb29fd135fe91a --- /dev/null +++ b/paddle/operators/conv_cudnn_op.cu @@ -0,0 +1,277 @@ +/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" +#include "paddle/memory/memory.h" +#include "paddle/operators/conv2d_op.h" +#include "paddle/platform/assert.h" +#include "paddle/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; +using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; +using DataLayout = platform::DataLayout; +using CUDADeviceContext = platform::CUDADeviceContext; + +static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 1024 * 1024 * 1024; + +// NOTE: framework::vectorize converts to type int64_t +// which does not fit cudnn inputs. +std::vector Dims2Vector(const framework::DDim& dims) { + std::vector ret; + for (int i = 0; i < dims.size(); i++) { + ret.push_back(dims[i]); + } + return ret; +} + +template +class CudnnConvOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use GPUPlace."); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + int user_workspace_size = ctx.Attr("workspace_size_MB"); + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedFilterDescriptor filter_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout = DataLayout::kNCHW; + + cudnnTensorDescriptor_t 
cudnn_input_desc = + input_desc.descriptor(layout, Dims2Vector(input->dims()), groups); + cudnnTensorDescriptor_t cudnn_output_desc = + output_desc.descriptor(layout, Dims2Vector(output->dims()), groups); + cudnnFilterDescriptor_t cudnn_filter_desc = + filter_desc.descriptor(layout, Dims2Vector(filter->dims()), groups); + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + + int input_channels = input->dims()[1]; + int input_height = input->dims()[2]; + int input_width = input->dims()[3]; + int output_channels = output->dims()[1]; + int output_height = output->dims()[2]; + int output_width = output->dims()[3]; + + int group_offset_in = input_channels / groups * input_height * input_width; + int group_offset_out = + output_channels / groups * output_height * output_width; + int group_offset_filter = filter->numel() / groups; + // ------------------- cudnn conv workspace --------------------- + void* cudnn_workspace = nullptr; + size_t workspace_size_in_bytes; // final workspace to allocate. 
+ size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + if (user_workspace_size > 0) { + workspace_size_limit = static_cast<size_t>(user_workspace_size) * 1024 * 1024; + } + // ------------------- cudnn conv algorithm --------------------- + cudnnConvolutionFwdAlgo_t algo; + auto handle = ctx.cuda_device_context().cudnn_handle(); + + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); + // get workspace size able to allocate + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, algo, &workspace_size_in_bytes)); + // Allocate on GPU memory + platform::GPUPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv forward --------------------- + T alpha = 1.0f, beta = 0.0f; + for (int i = 0; i < groups; i++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_filter_desc, filter_data + i * group_offset_filter, + cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, + &beta, cudnn_output_desc, output_data + i * group_offset_out)); + } + // Release the cudnn workspace + paddle::memory::Free(gpu, cudnn_workspace); + } +}; + +template +class CudnnConvGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use GPUPlace."); + auto input = ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto filter_grad = 
ctx.Output(framework::GradVarName("Filter")); + + const T* input_data = input->data(); + const T* output_grad_data = output_grad->data(); + const T* filter_data = filter->data(); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + int user_workspace_size = ctx.Attr("workspace_size_MB"); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_grad_desc; + ScopedTensorDescriptor input_grad_desc; + + ScopedFilterDescriptor filter_desc; + ScopedFilterDescriptor filter_grad_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout = DataLayout::kNCHW; + + cudnnTensorDescriptor_t cudnn_input_desc = + input_desc.descriptor(layout, Dims2Vector(input->dims()), groups); + cudnnTensorDescriptor_t cudnn_output_grad_desc = + output_grad_desc.descriptor(layout, Dims2Vector(output_grad->dims()), + groups); + cudnnFilterDescriptor_t cudnn_filter_desc = + filter_desc.descriptor(layout, Dims2Vector(filter->dims()), groups); + cudnnTensorDescriptor_t cudnn_input_grad_desc = nullptr; + cudnnFilterDescriptor_t cudnn_filter_grad_desc = nullptr; + + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + + int input_channels = input->dims()[1]; + int input_height = input->dims()[2]; + int input_width = input->dims()[3]; + int output_grad_channels = filter->dims()[0]; + int output_grad_height = output_grad->dims()[2]; + int output_grad_width = output_grad->dims()[3]; + + int group_offset_in = input_channels / groups * input_height * input_width; + int group_offset_out = + output_grad_channels / groups * output_grad_height * output_grad_width; + int group_offset_filter = filter->numel() / groups; + // ------------------- cudnn backward algorithm --------------------- + cudnnConvolutionBwdDataAlgo_t data_algo; + 
cudnnConvolutionBwdFilterAlgo_t filter_algo; + size_t workspace_size_in_bytes = 0, tmp_size = 0; + size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + if (user_workspace_size > 0) { + workspace_size_limit = user_workspace_size * 1024 * 1024; + } + + auto handle = ctx.cuda_device_context().cudnn_handle(); + if (input_grad) { + cudnn_input_grad_desc = input_grad_desc.descriptor( + layout, Dims2Vector(input_grad->dims()), groups); + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + handle, cudnn_filter_desc, + // dyDesc: Handle to the previously initialized input differential + // tensor descriptor. + cudnn_output_grad_desc, cudnn_conv_desc, + // dxDesc: Handle to the previously initialized output tensor + // descriptor. + cudnn_input_grad_desc, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &data_algo)); + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, cudnn_filter_desc, cudnn_output_grad_desc, + cudnn_conv_desc, cudnn_input_grad_desc, data_algo, &tmp_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); + } + + if (filter_grad) { + cudnn_filter_grad_desc = filter_grad_desc.descriptor( + layout, Dims2Vector(filter_grad->dims()), groups); + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, + cudnn_filter_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &filter_algo)); + + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, + cudnn_filter_desc, filter_algo, &tmp_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); + } + // ------------------- cudnn conv workspace --------------------- + // Already on GPU + void* cudnn_workspace = nullptr; + platform::GPUPlace 
gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv backward data --------------------- + // FIXME(typhoonzero): template type T may not be the same as cudnn call. + T alpha = 1.0f, beta = 0.0f; + if (input_grad) { + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + auto t = framework::EigenVector::Flatten(*input_grad); + t.device(ctx.GetEigenDevice()) = + t.constant(static_cast(0)); + for (int i = 0; i < groups; i++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, + filter_data + i * group_offset_filter, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_input_grad_desc, input_grad_data + i * group_offset_in)); + } + } + // ------------------- cudnn conv backward filter --------------------- + if (filter_grad) { + T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); + auto t = framework::EigenVector::Flatten(*filter_grad); + t.device(ctx.GetEigenDevice()) = + t.constant(static_cast(0)); + for (int i = 0; i < groups; i++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_output_grad_desc, output_grad_data + i * group_offset_out, + cudnn_conv_desc, filter_algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_filter_grad_desc, + filter_grad_data + i * group_offset_filter)); + } + } + // Release the cudnn workspace + paddle::memory::Free(gpu, cudnn_workspace); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_GPU_KERNEL(conv_cudnn, paddle::operators::CudnnConvOpKernel); +REGISTER_OP_GPU_KERNEL(conv_cudnn_grad, + paddle::operators::CudnnConvGradOpKernel); diff --git a/paddle/operators/decayed_adagrad_op.cc b/paddle/operators/decayed_adagrad_op.cc new file mode 100644 
index 0000000000000000000000000000000000000000..7f583f18c8c6ee5025f6525306f9323fb329b030 --- /dev/null +++ b/paddle/operators/decayed_adagrad_op.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/decayed_adagrad_op.h" + +namespace paddle { +namespace operators { + +class DecayedAdagradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("LearningRate"), + "Input(LearningRate) of DecayedAdagradOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), + "Output(MomentOut) of DecayedAdagradOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "LearningRate should have one element"); + auto param_dims = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input 
of DecayedAdagradOp should have " + "the same dimension."); + PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Moment"), + "Param and Moment input of DecayedAdagradOp should have " + "the same dimension."); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("MomentOut", param_dims); + } +}; + +class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker { + public: + DecayedAdagradOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("Moment", "(Tensor) Second moment"); + AddInput("LearningRate", "(Tensor) Learning rate"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("MomentOut", "(Tensor) Output second moment"); + + AddAttr("decay", + "(float, default 0.95) " + "Discounting factor for coming gradient") + .SetDefault(0.95); + AddAttr("epsilon", + "(float, default 1.0e-6) " + "Constant for numerical stability") + .SetDefault(1.0e-6f); + AddComment(R"DOC( + +Decayed Adagrad + +moment_out = decay * moment + (1 - decay) * grad * grad +param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp, + ops::DecayedAdagradOpMaker); +REGISTER_OP_CPU_KERNEL( + decayed_adagrad, + ops::DecayedAdagradOpKernel); diff --git a/paddle/operators/decayed_adagrad_op.cu b/paddle/operators/decayed_adagrad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..6fce77fe4ec6b76cb7b0259aab6a3d55d2edb36c --- /dev/null +++ b/paddle/operators/decayed_adagrad_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/decayed_adagrad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL( + decayed_adagrad, + ops::DecayedAdagradOpKernel); diff --git a/paddle/operators/decayed_adagrad_op.h b/paddle/operators/decayed_adagrad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..0fe0fc5acd66c9824a864618b69097c5c063ea3f --- /dev/null +++ b/paddle/operators/decayed_adagrad_op.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class DecayedAdagradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out_tensor = ctx.Output("ParamOut"); + auto moment_out_tensor = ctx.Output("MomentOut"); + + param_out_tensor->mutable_data(ctx.GetPlace()); + moment_out_tensor->mutable_data(ctx.GetPlace()); + + float decay = ctx.Attr("decay"); + float epsilon = ctx.Attr("epsilon"); + + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + auto moment = framework::EigenVector::Flatten( + *ctx.Input("Moment")); + auto lr = framework::EigenVector::Flatten( + *ctx.Input("LearningRate")); + + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); + auto place = ctx.GetEigenDevice(); + + moment_out.device(place) = decay * moment + (1 - decay) * grad * grad; + Eigen::DSizes m_dsize(moment_out_tensor->numel()); + param_out.device(place) = + param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/dynamic_recurrent_op.cc b/paddle/operators/dynamic_recurrent_op.cc index b919aef8fb62e5b2331c2d842556e0642ea6b095..58a5bf3e3651c963eead6dc0b8a3497c65b0eff2 100644 --- a/paddle/operators/dynamic_recurrent_op.cc +++ b/paddle/operators/dynamic_recurrent_op.cc @@ -23,6 +23,7 @@ using framework::Scope; using framework::TensorArray; using framework::LoDTensor; using framework::Variable; +using framework::DySeqMetaBatch; namespace detail { @@ -33,6 +34,29 @@ inline void CreateVariables(Scope& scope, } } +/* + * The inputs with sequence should be reordered when they are split, so the + * boot_states should be reordered in the 
same order. + * + * NOTE This may require that the `pre_state` of the first time step should just + * copy the `boot_state` rather than reference it, for that the content should + * be reordered, but the RNN op should not change the `boot_state` as an input + * variable's content. + */ +template +inline void ReorderBootState(const DySeqMetaBatch& metas, + const LoDTensor& boot_state, LoDTensor* tensor, + const platform::Place& dst_place) { + for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) { + auto slice = tensor->Slice(seq_id, seq_id + 1); + auto boot_slice = + boot_state.Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1); + // TODO(superjom) pass in device context as an argument + slice.template CopyFrom(boot_slice, dst_place, + platform::CPUDeviceContext()); + } +} + } // namespace detail class DynamicRecurrentOpProtoAndCheckerMaker @@ -69,6 +93,7 @@ void DynamicRecurrentOp::Run(const Scope& scope, CreateScopes(); WriteStepInputs(); InitStates(); + WriteStepOutputs(); // call stepnet in all the time steps for (size_t step = 0; step < cache_.num_steps; step++) { @@ -76,7 +101,6 @@ void DynamicRecurrentOp::Run(const Scope& scope, stepnet_->Run(step_scope, dev_ctx); } - WriteStepOutputs(); ConcatOutputs(); } @@ -84,11 +108,11 @@ void DynamicRecurrentOp::SplitInputs() const { // TODO(superjom) make level a config // TODO(superjom) check all the inputs has the same LoD int level = 0; - const auto& inlinks = cache_.inlinks; - for (const auto& item : inlinks) { + for (const auto& item : cache_.inlinks) { const auto& var = item.second; const auto& tensor = var->Get(); TensorArray& ta = step_inputs_[item.first]; + dy_seq_metas_[item.first] = ta.Unpack(tensor, level, true /*length_descend*/); @@ -120,17 +144,11 @@ void DynamicRecurrentOp::WriteStepInputs() const { } void DynamicRecurrentOp::WriteStepOutputs() const { - for (size_t step = 0; step < cache_.scopes->size(); step++) { - auto& scope = cache_.GetScope(step); - for (auto& item : step_outputs_) { - 
auto* var = scope.FindVar(item.first); - if (var == nullptr) { - var = scope.NewVar(item.first); - } - auto* tensor = var->GetMutable(); - item.second.WriteShared(step, *tensor); - } + // initialize step outputs + for (const auto& item : cache_.outlinks) { + step_outputs_.emplace(item.first, TensorArray()); } + PADDLE_ENFORCE_GT(step_outputs_.size(), 0UL); } void DynamicRecurrentOp::CreateScopes() const { @@ -145,12 +163,18 @@ void DynamicRecurrentOp::CreateScopes() const { PADDLE_ENFORCE_NOT_NULL(stepnet_, "stepnet should be set first"); std::vector memories; std::vector pre_memories; + std::vector stepnet_outputs; std::transform(arg_.memories.begin(), arg_.memories.end(), std::back_inserter(memories), [](const rnn::MemoryAttr& m) { return m.var; }); std::transform(arg_.memories.begin(), arg_.memories.end(), std::back_inserter(pre_memories), [](const rnn::MemoryAttr& m) { return m.pre_var; }); + for (const auto& item : stepnet_->Outputs()) { + for (const auto& var : item.second) { + stepnet_outputs.push_back(var); + } + } for (size_t step = 0; step < cache_.num_steps; step++) { auto& scope = cache_.GetScope(step); @@ -158,60 +182,88 @@ void DynamicRecurrentOp::CreateScopes() const { detail::CreateVariables(scope, arg_.outlinks); detail::CreateVariables(scope, memories); detail::CreateVariables(scope, pre_memories); + detail::CreateVariables(scope, stepnet_outputs); } } void DynamicRecurrentOp::ConcatOutputs() const { // TODO(superjom) transform this to a config int level = 0; - // TODO(superjom) pass in some lod - // just a placeholder - framework::LoD lod; + for (size_t step = 0; step < cache_.num_steps; step++) { + auto& scope = cache_.GetScope(step); + for (auto& item : step_outputs_) { + auto* var = scope.FindVar(item.first); + PADDLE_ENFORCE_NOT_NULL(var); + auto* tensor = var->GetMutable(); + tensor->mutable_data(platform::CPUPlace()); + item.second.WriteShared(step, *tensor); + } + } + // the inlinks' lods should be the same, so randomly get one lod. 
+ const auto& some_lod = + cache_.scope->FindVar(arg_.inlinks.front())->Get().lod(); + const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; for (auto& item : step_outputs_) { - auto tensor = item.second.Pack(level, dy_seq_metas_[item.first], lod); - auto& output = cache_.outlinks[item.first]->Get(); - const_cast(&output)->ShareDataWith(tensor); + auto tensor = item.second.Pack(level, some_meta, some_lod); + auto* output = cache_.outlinks[item.first]->GetMutable(); + const_cast(output)->ShareDataWith(tensor); } } void DynamicRecurrentOp::InitStates() const { - // init the first state - // TODO(superjom) parepare the scenerio that boot state not exists - for (auto memory : arg_.memories) { - auto* boot_state_var = cache_.scope->FindVar(memory.boot_var); - PADDLE_ENFORCE_NOT_NULL(boot_state_var); - auto& boot_state = boot_state_var->Get(); - const auto& dims = boot_state.dims(); - - for (size_t step = 0; step < cache_.num_steps; step++) { - auto& cur_scope = cache_.GetScope(step); - // link pre-state to boot_state - // init state and pre-state - auto* pre_state = cur_scope.FindVar(memory.pre_var); - PADDLE_ENFORCE_NOT_NULL(pre_state); - pre_state->GetMutable(); - - auto* state = cur_scope.FindVar(memory.var); - PADDLE_ENFORCE_NOT_NULL(state); - state->GetMutable()->Resize(dims); - state->GetMutable()->mutable_data( - platform::CPUPlace()); - - if (step == 0) { - auto* pre_state_tensor = pre_state->GetMutable(); - pre_state_tensor->Resize(boot_state.dims()); - pre_state_tensor->ShareDataWith(boot_state); - } else { - auto& pre_scope = cache_.GetScope(step - 1); - auto* state_pre = pre_scope.FindVar(memory.var); - PADDLE_ENFORCE_NOT_NULL(state_pre); - pre_state->GetMutable()->ShareDataWith( - *state_pre->GetMutable()); - } + for (size_t step = 0; step < cache_.num_steps; step++) { + for (const auto& memory : arg_.memories) { + CreateState(memory, step); + LinkState(memory, step); } } } +void DynamicRecurrentOp::CreateState(const rnn::MemoryAttr& memory, + size_t 
step) const { + auto& scope = cache_.GetScope(step); + auto& state = *cache_.GetTensor(scope, memory.var); + auto& boot_state = *cache_.GetTensor(*cache_.scope, memory.boot_var); + + size_t num_instances = + step_inputs_[arg_.inlinks.front()].Read(step).dims()[0]; + auto dims = boot_state.dims(); + dims[0] = num_instances; + + state.Resize(dims); + state.mutable_data(platform::CPUPlace()); + states_[memory.var].WriteShared(step, state); +} + +void DynamicRecurrentOp::LinkState(const rnn::MemoryAttr& memory, + size_t step) const { + auto& scope = cache_.GetScope(step); + auto& state_pre = *cache_.GetTensor(scope, memory.pre_var); + + // all the step_inputs' metas should be the same, just randomly select one + // and get the dyseq meta. + const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; + size_t num_instances = + step_inputs_[arg_.inlinks.front()].Read(step).dims()[0]; + + LoDTensor* pre_state{nullptr}; + if (step == 0) { + pre_state = cache_.GetTensor(*cache_.scope, memory.boot_var); + pre_state->mutable_data(platform::CPUPlace()); + // allocate memory + state_pre.Resize(pre_state->dims()); + state_pre.mutable_data(platform::CPUPlace()); + detail::ReorderBootState(some_meta, *pre_state, &state_pre, + pre_state->place()); + } else { + pre_state = cache_.GetTensor(cache_.GetScope(step - 1), memory.var); + } + + // shink and share from previous state + auto shrinked_pre_state = pre_state->Slice(0, num_instances); + state_pre.ShareDataWith(shrinked_pre_state); +} + void DynamicRecurrentOp::ArgCache::Init( const rnn::ArgumentName& name, const paddle::framework::OperatorBase& op, const paddle::framework::Scope& scope, rnn::Argument* arg) { @@ -261,6 +313,12 @@ Variable* DynamicRecurrentOp::ArgCache::GetVariable(const Scope& scope, return var; } +LoDTensor* DynamicRecurrentOp::ArgCache::GetTensor( + const framework::Scope& scope, const std::string& name) { + auto* var = GetVariable(scope, name); + return var->GetMutable(); +} + const rnn::ArgumentName 
DynamicRecurrentOp::kArgName{ "step_net", "step_scopes", "inlinks", "outlinks", "memories", "pre_memories", "boot_memories"}; diff --git a/paddle/operators/dynamic_recurrent_op.h b/paddle/operators/dynamic_recurrent_op.h index 6a2970f27fd5bcb25e924dbc567e254159b55a3e..ec80a1c90eee3a655febe0dd3d6c67c16ec6c64b 100644 --- a/paddle/operators/dynamic_recurrent_op.h +++ b/paddle/operators/dynamic_recurrent_op.h @@ -77,6 +77,17 @@ class DynamicRecurrentOp : public framework::OperatorBase { */ void InitStates() const; + /* + * Create state variables for each time step. + */ + void CreateState(const rnn::MemoryAttr& memory, size_t step) const; + + /* + * Link pre-state variable in current scope to the state variable in the + * previous time step (scope). + */ + void LinkState(const rnn::MemoryAttr& memory, size_t step) const; + /* * Concatenate outputs in each time step and generate a LoDTensor. */ @@ -91,6 +102,16 @@ class DynamicRecurrentOp : public framework::OperatorBase { } const OperatorBase& GetStepNet() const { return *stepnet_; } + const framework::TensorArray& state(const std::string& name) const { + return states_[name]; + } + const framework::TensorArray& step_input(const std::string& name) const { + return step_inputs_[name]; + } + const framework::TensorArray& step_output(const std::string& name) const { + return step_outputs_[name]; + } + protected: struct ArgCache { framework::Scope const* scope; @@ -108,6 +129,9 @@ class DynamicRecurrentOp : public framework::OperatorBase { return *scopes->at(index); } + framework::LoDTensor* GetTensor(const framework::Scope& scope, + const std::string& name); + private: void InitArgument(const rnn::ArgumentName& name, const OperatorBase& op, rnn::Argument* arg); @@ -122,7 +146,7 @@ class DynamicRecurrentOp : public framework::OperatorBase { private: std::unique_ptr stepnet_; - mutable framework::TensorArray states_; + mutable std::map states_; mutable std::map step_inputs_; mutable std::map step_outputs_; mutable std::map> 
diff --git a/paddle/operators/dynamic_recurrent_op_test.cc b/paddle/operators/dynamic_recurrent_op_test.cc index 675a7890f3fa6bb7ab9dbbdb04894b2557214a8a..b849c4541da5d9812f4d86430049c0cbc04f385d 100644 --- a/paddle/operators/dynamic_recurrent_op_test.cc +++ b/paddle/operators/dynamic_recurrent_op_test.cc @@ -87,7 +87,6 @@ class DynamicRecurrentOpTestHelper : public ::testing::Test { platform::CPUPlace place; scope.NewVar("step_scopes"); CreateVar(scope, "boot_mem", framework::make_ddim({10, 20}), place); - // auto* out0 = CreateVar(scope, "out0", framework::make_ddim({10, 20}), place); auto* in0 = CreateVar(scope, "in0", framework::make_ddim({10, 8}), place); // 10 instanes with 4 sentences, length is 4, 3, 2, 1 respectively. diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..24f84597cd7301af6521b8c1032e69569ba6f03a --- /dev/null +++ b/paddle/operators/gru_unit_op.cc @@ -0,0 +1,210 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/operators/gru_unit_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class GRUUnitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(%s) of GRUUnitOp should not be null.", "Input"); + PADDLE_ENFORCE(ctx->HasInput("HiddenPrev"), + "Input(%s) of GRUUnitOp should not be null.", "HiddenPrev"); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(%s) of GRUUnitOp should not be null.", "Weight"); + PADDLE_ENFORCE(ctx->HasOutput("Gate"), + "Output(%s) of GRUUnitOp should not be null.", "Gate"); + PADDLE_ENFORCE(ctx->HasOutput("ResetHiddenPrev"), + "Output(%s) of GRUUnitOp should not be null.", + "ResetHiddenPrev"); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(%s) of GRUUnitOp should not be null.", "Hidden"); + auto input_dims = ctx->GetInputDim("Input"); + auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev"); + auto weight_dims = ctx->GetInputDim("Weight"); + int batch_size = input_dims[0]; + int input_size = input_dims[1]; + int frame_size = hidden_prev_dims[1]; + int weight_height = weight_dims[0]; + int weight_width = weight_dims[1]; + PADDLE_ENFORCE_EQ( + input_size, frame_size * 3, + "The input_size must be 3 times of frame_size in GRUUnitOp."); + PADDLE_ENFORCE_EQ( + weight_height, frame_size, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + PADDLE_ENFORCE_EQ( + weight_width, frame_size * 3, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + auto bias = Input("Bias"); + if (bias != framework::kEmptyVarName) { + auto bias_dims = ctx->GetInputDim("Bias"); + int bias_height = bias_dims[0]; + int bias_width = bias_dims[1]; + PADDLE_ENFORCE_EQ(bias_height, 1, + "The shape of Bias must be [1, frame_size * 3]."); + PADDLE_ENFORCE_EQ(bias_width, frame_size * 3, + "The shape 
of Bias must be [1, frame_size * 3]."); + } + ctx->SetOutputDim("Gate", {batch_size, frame_size * 3}); + ctx->SetOutputDim("ResetHiddenPrev", {batch_size, frame_size}); + ctx->SetOutputDim("Hidden", {batch_size, frame_size}); + } +}; + +class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + GRUUnitOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the " + "input."); + AddInput("HiddenPrev", + "(Tensor) Matrix with shape [batch_size, frame_size] for the " + "states of previous time step."); + AddInput("Weight", + "(Tensor) Weight matrix with shape [frame_size, frame_size * 3]. " + "The elements continuous in memory can be divided into two parts. " + "The first part are weights of the update gate and reset gate " + "with shape [frame_size, frame_size * 2], and the second part are " + "weights of output candidate with shape [frame_size, frame_size]"); + AddInput("Bias", + "(Tensor) Bias vector with shape [1, frame_size * 3] concating " + "bias of the update gate, reset gate and output candidate."); + AddOutput("Gate", + "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the " + "output of update gate, reset gate and output candidate") + .AsIntermediate(); + AddOutput("ResetHiddenPrev", + "(Tensor) Matrix with shape [batch_size, frame_size] for the " + "reseted hidden state of previous time step.") + .AsIntermediate(); + AddOutput("Hidden", + "(Tensor) The GRU hidden state of the current time step " + "with shape [batch_size, frame_size]."); + AddAttr("activation", + "(enum int, default tanh) " + "The activation type used for output candidate {h}_t.") + .SetDefault(tanh) + .InEnum({identity, sigmoid, tanh, relu}); + AddAttr("gate_activation", + "(enum int, default sigmoid) " + "The activation type used in update gate and reset gate.") + .SetDefault(sigmoid) + .InEnum({identity, 
sigmoid, tanh, relu}); + AddComment(R"DOC( +GRUUnitOp implements part calculations of the GRU unit as following: + +\f[ +update \ gate: u_t = actGate(xu_t + W_u * hidden_prev + bias_u) \\ +reset \ gate: r_t = actGate(xr_t + W_r * hidden_prev + bias_r) \\ +output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, hidden_prev) + bias_c) \\ +output: h_t = dot((1-u_t), {h}_t) + dot(u_t, hidden_prev) +\f] + +The rest of GRU unit can be completed by using FCOp's output as the input of GRUUnitOp. +)DOC"); + } +}; + +class GRUUnitGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(%s) of GRUUnitGradOp should not be null.", "Input"); + PADDLE_ENFORCE(ctx->HasInput("HiddenPrev"), + "Input(%s) of GRUUnitGradOp should not be null.", + "HiddenPrev"); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(%s) of GRUUnitGradOp should not be null.", "Weight"); + PADDLE_ENFORCE(ctx->HasInput("Gate"), + "Input(%s) of GRUUnitGradOp should not be null.", "Gate"); + PADDLE_ENFORCE(ctx->HasInput("ResetHiddenPrev"), + "Input(%s) of GRUUnitGradOp should not be null.", + "ResetHiddenPrev"); + PADDLE_ENFORCE(ctx->HasInput("Hidden"), + "Input(%s) of GRUUnitGradOp should not be null.", "Hidden"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Gate")), + "Input(%s@GRAD) of GRUUnitGradOp should not be null.", + "Gate"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("ResetHiddenPrev")), + "Input(%s@GRAD) of GRUUnitGradOp should not be null.", + "ResetHiddenPrev"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")), + "Input(%s@GRAD) of GRUUnitGradOp should not be null.", + "Hidden"); + auto input_dims = ctx->GetInputDim("Input"); + auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev"); + auto weight_dims = ctx->GetInputDim("Weight"); + // int batch_size = input_dims[0]; + 
int input_size = input_dims[1]; + int frame_size = hidden_prev_dims[1]; + int weight_height = weight_dims[0]; + int weight_width = weight_dims[1]; + PADDLE_ENFORCE_EQ( + input_size, frame_size * 3, + "The input_size must be 3 times of frame_size in GRUUnitOp."); + PADDLE_ENFORCE_EQ( + weight_height, frame_size, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + PADDLE_ENFORCE_EQ( + weight_width, frame_size * 3, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + auto bias = Input("Bias"); + if (bias != framework::kEmptyVarName) { + auto bias_dims = ctx->GetInputDim("Bias"); + int bias_height = bias_dims[0]; + int bias_width = bias_dims[1]; + PADDLE_ENFORCE_EQ(bias_height, 1, + "The shape of Bias must be [1, frame_size * 3]."); + PADDLE_ENFORCE_EQ(bias_width, frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) + ctx->SetOutputDim(bias_grad_name, bias_dims); + } + auto input_grad_name = framework::GradVarName("Input"); + if (ctx->HasOutput(input_grad_name)) + ctx->SetOutputDim(input_grad_name, input_dims); + auto hidden_prev_grad_name = framework::GradVarName("HiddenPrev"); + if (ctx->HasOutput(hidden_prev_grad_name)) + ctx->SetOutputDim(hidden_prev_grad_name, hidden_prev_dims); + auto weight_grad_name = framework::GradVarName("Weight"); + if (ctx->HasOutput(weight_grad_name)) + ctx->SetOutputDim(weight_grad_name, weight_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad, + ops::GRUUnitGradOp); +REGISTER_OP_CPU_KERNEL(gru_unit, + ops::GRUUnitKernel); +REGISTER_OP_CPU_KERNEL( + gru_unit_grad, ops::GRUUnitGradKernel); diff --git a/paddle/operators/gru_unit_op.cu b/paddle/operators/gru_unit_op.cu new file mode 100644 index 
0000000000000000000000000000000000000000..365f656523ddfb7ec8e2a5b885de74674823325a --- /dev/null +++ b/paddle/operators/gru_unit_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/gru_unit_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(gru_unit, + ops::GRUUnitKernel); +REGISTER_OP_GPU_KERNEL( + gru_unit_grad, ops::GRUUnitGradKernel); diff --git a/paddle/operators/gru_unit_op.h b/paddle/operators/gru_unit_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c53e7d9827e0395e6ce613302e732b2797f83cdd --- /dev/null +++ b/paddle/operators/gru_unit_op.h @@ -0,0 +1,230 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#include "paddle/operators/activation_op.h" +#include "paddle/operators/math/math_function.h" + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; + +template +class GRUUnitKernel : public framework::OpKernel { + public: + template + void ActCompute(const int act_type, const Device& d, X x, Y y) const { + if (act_type == identity) + y.device(d) = x; + else if (act_type == sigmoid) + SigmoidFunctor()(d, x, y); + else if (act_type == tanh) + TanhFunctor()(d, x, y); + else if (act_type == relu) + ReluFunctor()(d, x, y); + else + PADDLE_THROW("unsupported activation type"); + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("Input"); + auto* hidden_prev = context.Input("HiddenPrev"); + auto* weight = context.Input("Weight"); + auto* bias = context.Input("Bias"); + auto* gate = context.Output("Gate"); + gate->mutable_data(context.GetPlace()); + auto* reset_hidden_prev = context.Output("ResetHiddenPrev"); + reset_hidden_prev->mutable_data(context.GetPlace()); + auto* hidden = context.Output("Hidden"); + hidden->mutable_data(context.GetPlace()); + + int batch_size = input->dims()[0]; + int frame_size = hidden_prev->dims()[1]; + + auto x = EigenMatrix::From(*input); + auto h_p = EigenMatrix::From(*hidden_prev); + auto g = EigenMatrix::From(*gate); + auto r_h_p = EigenMatrix::From(*reset_hidden_prev); + auto h = EigenMatrix::From(*hidden); + auto place = context.GetEigenDevice(); + + // calculate unactivated gate outputs + if (bias) { + auto b = EigenMatrix::From(*bias); + g.device(place) = x + + b.reshape(Eigen::array({{1, frame_size * 3}})) + .broadcast(Eigen::array({{batch_size, 1}})); + } else { + g.device(place) = x; + } + const T* hidden_prev_data 
= hidden_prev->data(); + const T* weight_data = weight->data(); + T* gate_data = gate->data(); + T* reset_hidden_prev_data = reset_hidden_prev->data(); + math::gemm(context.device_context(), false, false, batch_size, + 2 * frame_size, frame_size, 1, hidden_prev_data, + frame_size, weight_data, frame_size * 2, 1, gate_data, + frame_size * 3); + + // calculate activited gate + Eigen::array extents({{batch_size, frame_size}}); + Eigen::array u_offsets({{0, 0}}); + ActCompute(context.Attr("gate_activation"), place, + g.slice(u_offsets, extents), g.slice(u_offsets, extents)); + auto u = g.slice(u_offsets, extents); // update gate + Eigen::array r_offsets({{0, frame_size}}); + ActCompute(context.Attr("gate_activation"), place, + g.slice(r_offsets, extents), g.slice(r_offsets, extents)); + auto r = g.slice(r_offsets, extents); // reset gate + r_h_p.device(place) = r * h_p; // reset previous hidden state + math::gemm(context.device_context(), false, false, batch_size, + frame_size, frame_size, 1, reset_hidden_prev_data, + frame_size, weight_data + frame_size * frame_size * 2, + frame_size, 1, gate_data + frame_size * 2, + frame_size * 3); + + Eigen::array c_offsets({{0, frame_size * 2}}); + ActCompute(context.Attr("activation"), place, + g.slice(c_offsets, extents), g.slice(c_offsets, extents)); + auto c = g.slice(c_offsets, extents); // output candidate + + // calculate final output + h.device(place) = u * (h_p - c) + c; + } +}; + +template +class GRUUnitGradKernel : public framework::OpKernel { + public: + template + void ActGradCompute(const int act_type, const Device& d, X x, Y y, DX dx, + DY dy) const { + // x is dummy and won't be used even in Relu(use y instead) + if (act_type == identity) + dx.device(d) = dy; + else if (act_type == sigmoid) + SigmoidGradFunctor()(d, x, y, dy, dx); + else if (act_type == tanh) + TanhGradFunctor()(d, x, y, dy, dx); + else if (act_type == relu) + ReluGradFunctor()(d, x, y, dy, dx); + else + PADDLE_THROW("unsupported activation type"); 
+ } + + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("Input"); + auto* hidden_prev = context.Input("HiddenPrev"); + auto* weight = context.Input("Weight"); + auto* gate = context.Input("Gate"); + auto* reset_hidden_prev = context.Input("ResetHiddenPrev"); + auto* hidden_grad = context.Input(framework::GradVarName("Hidden")); + auto* input_grad = context.Output(framework::GradVarName("Input")); + auto* hidden_prev_grad = + context.Output(framework::GradVarName("HiddenPrev")); + auto* weight_grad = + context.Output(framework::GradVarName("Weight")); + auto* bias_grad = context.Output(framework::GradVarName("Bias")); + input_grad->mutable_data(context.GetPlace()); + hidden_prev_grad->mutable_data(context.GetPlace()); + weight_grad->mutable_data(context.GetPlace()); + Tensor gate_grad; + gate_grad.mutable_data(input->dims(), context.GetPlace()); + Tensor reset_hidden_prev_grad; + reset_hidden_prev_grad.mutable_data(reset_hidden_prev->dims(), + context.GetPlace()); + + int batch_size = input->dims()[0]; + int frame_size = hidden_prev->dims()[1]; + + const T* hidden_prev_data = hidden_prev->data(); + T* hidden_prev_grad_data = hidden_prev_grad->data(); + const T* weight_data = weight->data(); + T* weight_grad_data = weight_grad->data(); + T* gate_grad_data = gate_grad.data(); + const T* reset_hidden_prev_data = reset_hidden_prev->data(); + T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.data(); + + auto h_p = EigenMatrix::From(*hidden_prev); + auto g = EigenMatrix::From(*gate); + auto d_h = EigenMatrix::From(*hidden_grad); + auto d_x = EigenMatrix::From(*input_grad); + auto d_h_p = EigenMatrix::From(*hidden_prev_grad); + auto d_g = EigenMatrix::From(gate_grad); + auto d_r_h_p = EigenMatrix::From(reset_hidden_prev_grad); + auto place = context.GetEigenDevice(); + + Eigen::array extents({{batch_size, frame_size}}); + Eigen::array u_offsets({{0, 0}}); + auto u = g.slice(u_offsets, extents); // update gate + 
Eigen::array r_offsets({{0, frame_size}}); + auto r = g.slice(r_offsets, extents); // reset gate + Eigen::array c_offsets({{0, frame_size * 2}}); + auto c = g.slice(c_offsets, extents); // output candidate + + // backward for unactivated update gate + ActGradCompute(context.Attr("gate_activation"), place, u, u, + d_g.slice(u_offsets, extents), d_h * (h_p - c)); + // backward for unactivated output candidate + ActGradCompute(context.Attr("activation"), place, c, c, + d_g.slice(c_offsets, extents), d_h * (u.constant(T(1)) - u)); + // backward for reset_hidden_prev + math::gemm(context.device_context(), false, true, batch_size, + frame_size, frame_size, 1, + gate_grad_data + frame_size * 2, frame_size * 3, + weight_data + frame_size * frame_size * 2, frame_size, + 0, reset_hidden_prev_grad_data, frame_size); + // backward for state_weight + math::gemm( + context.device_context(), true, false, frame_size, frame_size, + batch_size, 1, reset_hidden_prev_data, frame_size, + gate_grad_data + frame_size * 2, frame_size * 3, 0, + weight_grad_data + frame_size * frame_size * 2, frame_size); + // backward for unactivated reset gate + ActGradCompute(context.Attr("gate_activation"), place, r, r, + d_g.slice(r_offsets, extents), d_r_h_p * h_p); + // backward for update_gate_weight and reset_gate_weight + math::gemm(context.device_context(), true, false, frame_size, + frame_size * 2, batch_size, 1, hidden_prev_data, + frame_size, gate_grad_data, frame_size * 3, 0, + weight_grad_data, frame_size * 2); + // backward for hidden_prev + d_h_p.device(place) = d_r_h_p * r + d_h * u; + math::gemm(context.device_context(), false, true, batch_size, + frame_size, frame_size * 2, 1, gate_grad_data, + frame_size * 3, weight_data, frame_size * 2, 1, + hidden_prev_grad_data, frame_size); + // backward for input + d_x.device(place) = d_g; + // backward for bias + if (bias_grad) { + bias_grad->mutable_data(context.GetPlace()); + auto d_b = EigenMatrix::From(*bias_grad); + d_b.device(place) = 
d_g.sum(Eigen::array({{0}})); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc index 81225e9a9803ce371d23620876ac22da63a8e2d1..2d69218843a69497b5b501d4297f2ec5ab26a844 100644 --- a/paddle/operators/math/vol2col_test.cc +++ b/paddle/operators/math/vol2col_test.cc @@ -78,7 +78,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place); + input.CopyFrom(input_tmp, *place, *context); } output.mutable_data({1, filter_size, filter_size, filter_size, output_depth, output_height, output_width}, @@ -93,7 +93,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output.data(); } else { - output_tmp.CopyFrom(output, paddle::platform::CPUPlace()); + output_tmp.CopyFrom(output, paddle::platform::CPUPlace(), *context); out_cfo_ptr = output_tmp.data(); } @@ -107,7 +107,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place); + input.CopyFrom(input_tmp, *place, *context); } paddle::operators::math::Col2VolFunctor col2vol; @@ -118,7 +118,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - input_tmp.CopyFrom(input, paddle::platform::CPUPlace()); + input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context); in_ptr = input_tmp.data(); } diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc index a86685b6dde4761cf74f9521bd9609b0864b9bdf..051051b051961c6da064bd9319460b3f41cea3e8 100644 --- a/paddle/operators/multiplex_op.cc +++ b/paddle/operators/multiplex_op.cc @@ -115,8 +115,9 @@ class MultiplexGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, multiplex_grad, - ops::MultiplexGradOp); 
+REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OP_CPU_KERNEL( multiplex, ops::MultiplexCPUKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index ffb0cb92111bfb8490d35e4f5cfc9e405b0e3250..573487b83590c132d5a4379a4b2762fbc16c04bc 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -34,7 +34,7 @@ class SumOp : public framework::OperatorWithKernel { auto in_dim = x_dims[0]; for (size_t i = 1; i < N; i++) { auto dim = x_dims[i]; - PADDLE_ENFORCE(in_dim == dim, "Input tensors must have same shape"); + PADDLE_ENFORCE_EQ(in_dim, dim, "Input tensors must have same shape"); } ctx->SetOutputDim("Out", in_dim); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index e330877fc4283b796dcb5c5d745881884ae491ae..75928f1ec818ab028ea06cfa72273fb99430c3c8 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -54,7 +54,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { PADDLE_ENFORCE( ctx->Attrs().Get("min") < ctx->Attrs().Get("max"), "uniform_random's min must less then max"); - auto dims = Attr>("dims"); + auto& dims = ctx->Attrs().Get>("dims"); std::vector temp; temp.reserve(dims.size()); for (auto dim : dims) { diff --git a/paddle/platform/cudnn_helper.h b/paddle/platform/cudnn_helper.h index 2841d2a2dbec5c17ef098a06c976ca01247820f5..0c5719ef5162546578253e383209b1893c0cd71f 100644 --- a/paddle/platform/cudnn_helper.h +++ b/paddle/platform/cudnn_helper.h @@ -71,23 +71,32 @@ class ScopedTensorDescriptor { inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format, const cudnnDataType_t type, - const std::vector& dims) { - // the format is not used now, but it maybe useful feature + const std::vector& dims, + const int groups 
= 1) { + // the format is not used now, will add later std::vector strides(dims.size()); strides[dims.size() - 1] = 1; for (int i = dims.size() - 2; i >= 0; i--) { strides[i] = dims[i + 1] * strides[i + 1]; } + // Update tensor descriptor dims setting if groups > 1 + // FIXME(typhoonzero): Assume using NCHW order + std::vector dims_with_group(dims.begin(), dims.end()); // copy + if (groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } PADDLE_ENFORCE(dynload::cudnnSetTensorNdDescriptor( - desc_, type, dims.size(), dims.data(), strides.data())); + desc_, type, dims_with_group.size(), dims_with_group.data(), + strides.data())); return desc_; } template inline cudnnTensorDescriptor_t descriptor(const DataLayout& order, - const std::vector& dims) { - return descriptor(GetCudnnTensorFormat(order), CudnnDataType::type, - dims); + const std::vector& dims, + const int groups = 1) { + return descriptor(GetCudnnTensorFormat(order), CudnnDataType::type, dims, + groups); } private: @@ -106,18 +115,29 @@ class ScopedFilterDescriptor { inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format, const cudnnDataType_t type, - const std::vector& kernel) { - // filter layout: output input spatial_dim_y spatial_dim_x + const std::vector& kernel, + const int groups = 1) { + // filter layout: MCHW, where M is the number of + // output image channels, C is the number of input image channels, + // H and W is height and width of filter. + std::vector kernel_with_group(kernel.begin(), kernel.end()); + if (groups > 1) { + // M /= groups + kernel_with_group[0] /= groups; + // NOTE: input filter(C) of the filter is already asserted to be C/groups. 
+ } PADDLE_ENFORCE(dynload::cudnnSetFilterNdDescriptor( - desc_, type, format, kernel.size(), kernel.data())); + desc_, type, format, kernel_with_group.size(), + kernel_with_group.data())); return desc_; } template inline cudnnFilterDescriptor_t descriptor(const DataLayout& order, - const std::vector& kernel) { + const std::vector& kernel, + const int groups = 1) { return descriptor(GetCudnnTensorFormat(order), CudnnDataType::type, - kernel); + kernel, groups); } private: diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index 97364f2db9523c0629616692631d8372657a2128..b8fc9347243ac490efcb09132f4b049c6e9f8e08 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -1,6 +1,6 @@ if(WITH_PYTHON) cc_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc - DEPS pybind python backward proto_desc tensor_array + DEPS pybind python backward proto_desc tensor_array paddle_memory ${GLOB_OP_LIB}) endif(WITH_PYTHON) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 0e4bbe8415fd86ab29c6809e7652dc581b4e6004..2acfc28b66456c4ecf159bc6a714c939e98ecd24 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -118,12 +118,23 @@ void BindProgramDesc(py::module &m) { .def("append_block", &ProgramDescBind::AppendBlock, py::return_value_policy::reference) .def("append_backward", - [](ProgramDescBind &program_desc, + [](ProgramDescBind &program_desc, const VarDescBind &target, const std::unordered_set &no_grad_vars) { - AppendBackward(program_desc, no_grad_vars); + AppendBackward(program_desc, target, no_grad_vars); }) .def("block", &ProgramDescBind::Block, py::return_value_policy::reference) - .def("num_blocks", &ProgramDescBind::Size); + .def("num_blocks", &ProgramDescBind::Size) + .def("serialize_to_string", + [](ProgramDescBind &program_desc) -> py::bytes { + const ProgramDesc *desc = program_desc.Proto(); + PADDLE_ENFORCE(desc->IsInitialized(), + "ProgramDesc has not been 
initialized."); + std::string res; + PADDLE_ENFORCE( + desc->SerializeToString(&res), + "Serialize ProgramDesc Error. This could be a bug of Paddle."); + return res; + }); } void BindBlockDesc(py::module &m) { @@ -149,7 +160,17 @@ void BindBlockDesc(py::module &m) { .def("all_vars", &BlockDescBind::AllVars, py::return_value_policy::reference) .def("all_ops", &BlockDescBind::AllOps, - py::return_value_policy::reference); + py::return_value_policy::reference) + .def("serialize_to_string", [](BlockDescBind &block_desc) -> py::bytes { + const BlockDesc *desc = block_desc.Proto(); + PADDLE_ENFORCE(desc->IsInitialized(), + "BlockDesc has not been initialized."); + std::string res; + PADDLE_ENFORCE( + desc->SerializeToString(&res), + "Serialize BlockDesc Error. This could be a bug of Paddle."); + return res; + }); } void BindVarDsec(py::module &m) { @@ -162,7 +183,8 @@ void BindVarDsec(py::module &m) { .value("FP32", DataType::FP32) .value("FP64", DataType::FP64); - py::class_(m, "VarDesc", "") + py::class_ var_desc(m, "VarDesc", ""); + var_desc .def("name", [](const VarDescBind &self) { py::bytes name = self.Name(); @@ -174,7 +196,23 @@ void BindVarDsec(py::module &m) { .def("shape", &VarDescBind::Shape, py::return_value_policy::reference) .def("data_type", &VarDescBind::GetDataType) .def("lod_level", &VarDescBind::GetLodLevel) - .def("set_lod_level", &VarDescBind::SetLoDLevel); + .def("set_lod_level", &VarDescBind::SetLoDLevel) + .def("type", &VarDescBind::GetType) + .def("set_type", &VarDescBind::SetType) + .def("serialize_to_string", [](VarDescBind &var_desc) -> py::bytes { + const VarDesc *desc = var_desc.Proto(); + PADDLE_ENFORCE(desc->IsInitialized(), + "VarDesc has not been initialized."); + std::string res; + PADDLE_ENFORCE( + desc->SerializeToString(&res), + "Serialize VarDesc Error. 
This could be a bug of Paddle."); + return res; + }); + + py::enum_(var_desc, "VarType", "") + .value("LOD_TENSOR", VarDesc::LOD_TENSOR) + .value("SELECTED_ROWS", VarDesc::SELECTED_ROWS); } void BindOpDesc(py::module &m) { @@ -204,9 +242,19 @@ void BindOpDesc(py::module &m) { .def("set_attr", &OpDescBind::SetAttr) .def("attr", &OpDescBind::GetAttr) .def("set_block_attr", &OpDescBind::SetBlockAttr) - .def("get_block_attr", &OpDescBind::GetBlockAttr) + .def("block_attr", &OpDescBind::GetBlockAttr) .def("check_attrs", &OpDescBind::CheckAttrs) - .def("infer_shape", &OpDescBind::InferShape); + .def("infer_shape", &OpDescBind::InferShape) + .def("serialize_to_string", [](OpDescBind &op_desc) -> py::bytes { + const OpDesc *desc = op_desc.Proto(); + PADDLE_ENFORCE(desc->IsInitialized(), + "OpDesc has not been initialized."); + std::string res; + PADDLE_ENFORCE( + desc->SerializeToString(&res), + "Serialize OpDesc Error. This could be a bug of Paddle."); + return res; + }); } } // namespace pybind diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 0f6e3101e26c5ac249664ce8badc10adc939305f..cc9f7ffe02781cc13105b19bb987207743febdf6 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/framework/lod_tensor.h" #include "paddle/framework/tensor_array.h" #include "paddle/operators/cond_op.h" +#include "paddle/operators/dynamic_recurrent_op.h" #include "paddle/operators/net_op.h" #include "paddle/operators/recurrent_op.h" #include "paddle/platform/enforce.h" @@ -341,6 +342,33 @@ All parameter, weight, gradient are variables in Paddle. 
self.set_stepnet(net.Clone()); }); + py::class_(m, + "DynamicRecurrentOp") + .def_static("create", + [](py::bytes protobin) -> operators::DynamicRecurrentOp * { + OpDesc desc; + PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), + "Cannot parse user input to OpDesc"); + PADDLE_ENFORCE(desc.IsInitialized(), + "User OpDesc is not initialized, reason %s", + desc.InitializationErrorString()); + auto rnn_op = OpRegistry::CreateOp(desc); + return static_cast( + rnn_op.release()); + }) + .def("set_stepnet", + [](operators::DynamicRecurrentOp &self, const operators::NetOp &net) + -> void { self.SetStepNet(net.Clone()); }) + .def("get_state", + [](operators::DynamicRecurrentOp &self, const std::string &name) + -> const TensorArray & { return self.state(name); }) + .def("get_step_input", + [](operators::DynamicRecurrentOp &self, const std::string &name) + -> const TensorArray & { return self.step_input(name); }) + .def("get_step_output", + [](operators::DynamicRecurrentOp &self, const std::string &name) + -> const TensorArray & { return self.step_output(name); }); + // cond_op py::class_(m, "CondOp") .def_static("create", diff --git a/python/paddle/v2/framework/graph.py b/python/paddle/v2/framework/framework.py similarity index 55% rename from python/paddle/v2/framework/graph.py rename to python/paddle/v2/framework/framework.py index 0f0a2847e58a1ca172bf1ba382abb2ebc1ecb8ed..01cd9982dc1c8d9869e59c55d0061abef91919ef 100644 --- a/python/paddle/v2/framework/graph.py +++ b/python/paddle/v2/framework/framework.py @@ -1,4 +1,5 @@ import paddle.v2.framework.core as core +import paddle.v2.framework.proto.framework_pb2 as framework_pb2 import collections import numpy as np import copy @@ -9,6 +10,7 @@ __all__ = ['Block', 'Variable', 'Program', 'Operator'] class Variable(object): def __init__(self, block, + type=core.VarDesc.VarType.LOD_TENSOR, name=None, shape=None, dtype=None, @@ -25,6 +27,14 @@ class Variable(object): self.desc = self.block.desc.new_var(name) is_new_var = True + 
if is_new_var: + self.desc.set_type(type) + elif self.desc.type() != type: + raise ValueError("Variable {0} has been created before. The " + "previous type is {1}; the new type is {2}. They" + " are not matched".format(self.name, + self.desc.type(), type)) + if shape is not None: if is_new_var: self.desc.set_shape(shape) @@ -63,6 +73,13 @@ class Variable(object): self.block.vars[name] = self self.op = None + def __str__(self): + protostr = self.desc.serialize_to_string() + proto = framework_pb2.VarDesc.FromString(str(protostr)) + return proto.__str__() + + __repr__ = __str__ + @property def name(self): return self.desc.name() @@ -106,6 +123,40 @@ class Variable(object): raise ValueError("Not supported numpy dtype " + str(dtype)) +def get_all_op_protos(): + """ + Get all registered op proto from PaddlePaddle C++ end. + :return: A list of registered OpProto. + """ + protostrs = core.get_all_op_protos() + ret_values = [] + for pbstr in protostrs: + op_proto = framework_pb2.OpProto.FromString(str(pbstr)) + ret_values.append(op_proto) + return ret_values + + +class OpProtoHolder(object): + @classmethod + def instance(cls): + if not hasattr(cls, '_instance'): + cls._instance = cls() + return cls._instance + + def __init__(self): + assert not hasattr( + self.__class__, + '_instance'), 'Please use `instance()` to get OpProtoHolder opject!' + op_protos = get_all_op_protos() + self.op_proto_map = {} + for proto in op_protos: + self.op_proto_map[proto.type] = proto + + def get_op_proto(self, type): + assert type in self.op_proto_map, "Operator \"%s\" has not been registered." % type + return self.op_proto_map[type] + + class Operator(object): def __init__(self, block, @@ -116,20 +167,120 @@ class Operator(object): attrs=None): self.block = block self.desc = desc - if type is not None: - # TODO. 
- pass + if len(self.desc.type()) != 0: + return + if type is None: + raise ValueError( + "`type` to initilized an Operator can not be None.") + self.desc.set_type(type) + proto = OpProtoHolder.instance().get_op_proto(type) + if inputs is not None: - # TODO - pass + given = set() + need = set() + for n in inputs: + given.add(n) + for m in proto.inputs: + need.add(m.name) + if not given == need: + raise ValueError( + "Incorrect setting for input(s) of operator \"%s\". Need: [%s] Given: [%s]" + % (type, ", ".join(str(e) for e in need), ", ".join( + str(e) for e in given))) + + for in_proto in proto.inputs: + in_argus = inputs[in_proto.name] + if not isinstance(in_argus, list): + in_argus = [in_argus] + if not in_proto.duplicable and len(in_argus) > 1: + raise ValueError( + "Input %s expects only one input, but %d are given." % + (in_proto.name, len(in_argus))) + in_argu_names = [] + for argu in in_argus: + in_argu_names.append(argu.name) + self.desc.set_input(in_proto.name, in_argu_names) + if outputs is not None: - # TODO - pass + given = set() + need = set() + for n in outputs: + given.add(n) + for m in proto.outputs: + need.add(m.name) + if not given == need: + raise ValueError( + "Incorrect setting for output(s) of operator \"%s\". Need: [%s] Given: [%s]" + % (type, ", ".join(str(e) for e in need), ", ".join( + str(e) for e in given))) + + for out_proto in proto.outputs: + out_argus = outputs[out_proto.name] + if not isinstance(out_argus, list): + out_argus = [out_argus] + if not out_proto.duplicable and len(out_argus) > 1: + raise ValueError( + "Output %s expects only one output, but %d are given." 
% + (out_proto.name, len(out_argus))) + out_argu_names = [] + for argu in out_argus: + out_argu_names.append(argu.name) + argu.op = self + self.desc.set_output(out_proto.name, out_argu_names) + if attrs is not None: - # TODO - pass + for attr in proto.attrs: + attr_name = attr.name + if not attr_name in attrs: + continue + if not isinstance(attrs[attr_name], Block): + self.desc.set_attr(attr_name, attrs[attr_name]) + else: + self.desc.set_block_attr(attr_name, attrs[attr_name].desc) - # TODO: Getters + self.desc.check_attrs() + self.desc.infer_shape(self.block.desc) + + def __str__(self): + protostr = self.desc.serialize_to_string() + proto = framework_pb2.OpDesc.FromString(str(protostr)) + return proto.__str__() + + __repr__ = __str__ + + @property + def type(self): + return self.desc.type() + + def input(self, name): + return self.desc.input(name) + + @property + def input_names(self): + return self.desc.input_names() + + def output(self, name): + return self.desc.output(name) + + @property + def output_names(self): + return self.desc.output_names() + + def has_attr(self, name): + return self.desc.has_attr(name) + + def attr_type(self, name): + return self.desc.attr_type(name) + + @property + def attr_names(self): + return self.desc.attr_names() + + def attr(self, name): + return self.desc.attr(name) + + def block_attr(self, name): + return self.desc.block_attr(name) class Block(object): @@ -139,6 +290,13 @@ class Block(object): self.ops = collections.deque() # operator list self.program = program + def __str__(self): + protostr = self.desc.serialize_to_string() + proto = framework_pb2.BlockDesc.FromString(str(protostr)) + return proto.__str__() + + __repr__ = __str__ + @property def parent_idx(self): return self.desc.parent @@ -183,6 +341,13 @@ class Program(object): self.blocks = [Block(self, 0)] self.current_block_idx = 0 + def __str__(self): + protostr = self.desc.serialize_to_string() + proto = framework_pb2.ProgramDesc.FromString(str(protostr)) + return 
proto.__str__() + + __repr__ = __str__ + def global_block(self): return self.blocks[0] diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/framework/op.py index 9086a5cc3452b178ec37fe6a3e358eaa4c5d606b..bc771a964adf9f97cbeae87c06ce954c76051150 100644 --- a/python/paddle/v2/framework/op.py +++ b/python/paddle/v2/framework/op.py @@ -219,6 +219,27 @@ class __RecurrentOp__(object): return core.RecurrentOp.create(proto.SerializeToString()) +class __DynamicRecurrentOp__(object): + __proto__ = None + type = "dynamic_recurrent" + + def __init__(self): + # cache recurrent_op's proto + if self.__proto__ is None: + for op_proto in get_all_op_protos(): + if op_proto.type == self.type: + self.__proto__ = op_proto + + def __call__(self, *args, **kwargs): + if self.type not in args and "type" not in kwargs: + kwargs["type"] = self.type + # create proto + create_method = OpDescCreationMethod(self.__proto__) + proto = create_method(*args, **kwargs) + # create rnnop + return core.DynamicRecurrentOp.create(proto.SerializeToString()) + + class __CondOp__(object): __proto__ = None type = "cond" @@ -242,4 +263,5 @@ class __CondOp__(object): Operator = OperatorFactory() # The default global factory RecurrentOp = __RecurrentOp__() +DynamicRecurrentOp = __DynamicRecurrentOp__() CondOp = __CondOp__() diff --git a/python/paddle/v2/framework/tests/test_activation_op.py b/python/paddle/v2/framework/tests/test_activation_op.py index 3acd00e35213981fce60504876af1861961ebe12..5831b880e4c5ef881929920e87ac64d6c87a2ab5 100644 --- a/python/paddle/v2/framework/tests/test_activation_op.py +++ b/python/paddle/v2/framework/tests/test_activation_op.py @@ -384,5 +384,33 @@ class TestThresholdedRelu(OpTest): self.check_grad(['X'], 'Y', max_relative_error=self.relative_error) +class TestHardSigmoid(OpTest): + def setUp(self): + self.op_type = "hard_sigmoid" + self.relative_error = 0.002 + + X = np.random.uniform(-5, 5, [2, 2]).astype("float32") + slope = 0.2 + offset = 0.5 + lower_threshold = 
-offset / slope + upper_threshold = (1 - offset) / slope + + self.inputs = {'X': X} + # Same reason as TestAbs + X[np.abs(X - lower_threshold) < self.relative_error] = \ + lower_threshold + 0.2 + X[np.abs(X - upper_threshold) < self.relative_error] = \ + upper_threshold - 0.2 + + temp = X * slope + offset + self.outputs = {'Y': np.maximum(0.0, np.minimum(1.0, temp))} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.002) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/v2/framework/tests/test_adam_op.py b/python/paddle/v2/framework/tests/test_adam_op.py new file mode 100644 index 0000000000000000000000000000000000000000..ff6faafa6e2119fde11b9eb6cd2a65a75334ebe6 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_adam_op.py @@ -0,0 +1,186 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestAdamOp1(OpTest): + def setUp(self): + '''Test Adam Op with supplied attributes + ''' + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} + + param_out, moment1_out, moment2_out, beta1_pow_out, \ + beta2_pow_out = adam_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': 
moment2_out, + 'Beta1PowOut': beta1_pow_out, + 'Beta2PowOut': beta2_pow_out, + 'ParamOut': param_out + } + + def test_check_output(self): + self.check_output() + + +class TestAdamOp2(OpTest): + def setUp(self): + '''Test Adam Op with supplied attributes + ''' + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.001 + beta1 = 0.9 + beta2 = 0.999 + epsilon = 1e-8 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + attributes = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} + + param_out, moment1_out, moment2_out, beta1_pow_out, \ + beta2_pow_out = adam_step(self.inputs, attributes) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'Beta1PowOut': beta1_pow_out, + 'Beta2PowOut': beta2_pow_out, + 'ParamOut': param_out + } + + def test_check_output(self): + self.check_output() + + +class TestAdamOpMultipleSteps(OpTest): + def setUp(self): + '''Test Adam Operator with supplied attributes + ''' + self.op_type = "adam" + self.num_steps = 10 + + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.001 + beta1 = 0.9 + beta2 = 0.999 + epsilon = 1e-8 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': 
grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} + + def test_check_output(self): + for _ in range(self.num_steps): + param_out, moment1_out, moment2_out, beta1_pow_out, \ + beta2_pow_out = adam_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'Beta1PowOut': beta1_pow_out, + 'Beta2PowOut': beta2_pow_out, + 'ParamOut': param_out + } + + # Verify output for this step + self.check_output() + + # Output of this step becomes input for next step + self.inputs['Param'] = param_out + self.inputs['Moment1'] = moment1_out + self.inputs['Moment2'] = moment2_out + self.inputs['Beta1Pow'] = beta1_pow_out + self.inputs['Beta2Pow'] = beta2_pow_out + + # Randomize gradient for next step + self.inputs['Grad'] = np.random.uniform( + -1, 1, (102, 105)).astype("float32") + + +def adam_step(inputs, attributes): + ''' + Simulate one step of the adam optimizer + :param inputs: dict of inputs + :param attributes: dict of attributes + :return tuple: tuple of output param, moment1, moment2, + beta1 power accumulator and beta2 power accumulator + ''' + param = inputs['Param'] + grad = inputs['Grad'] + moment1 = inputs['Moment1'] + moment2 = inputs['Moment2'] + lr = inputs['LearningRate'] + beta1_pow = inputs['Beta1Pow'] + beta2_pow = inputs['Beta2Pow'] + + beta1 = attributes['beta1'] + beta2 = attributes['beta2'] + epsilon = attributes['epsilon'] + + moment1_out = beta1 * moment1 + (1 - beta1) * grad + moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad) + beta1_pow_out = beta1_pow * beta1 + beta2_pow_out = beta2_pow * beta2 + lr_t = lr * np.sqrt(1 - beta2_pow_out) / (1 - beta1_pow_out) + param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon)) + return 
param_out, moment1_out, moment2_out, beta1_pow_out, beta2_pow_out + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_conv2d_op.py b/python/paddle/v2/framework/tests/test_conv2d_op.py index 118a5fc1cde5f4a908b065d581956e0855d50a52..2fb808944ac97f2bdcb05336a2205346ded65a4d 100644 --- a/python/paddle/v2/framework/tests/test_conv2d_op.py +++ b/python/paddle/v2/framework/tests/test_conv2d_op.py @@ -3,70 +3,56 @@ import numpy as np from op_test import OpTest +def conv2d_forward_naive(input, filter, group, conv_param): + in_n, in_c, in_h, in_w = input.shape + out_c, f_c, f_h, f_w = filter.shape + assert f_c * group == in_c + assert np.mod(out_c, group) == 0 + sub_out_c = out_c / group + + stride, pad = conv_param['stride'], conv_param['pad'] + out_h = 1 + (in_h + 2 * pad[0] - f_h) / stride[0] + out_w = 1 + (in_w + 2 * pad[1] - f_w) / stride[1] + out = np.zeros((in_n, out_c, out_h, out_w)) + + input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], )), + mode='constant', + constant_values=0) + for i in range(out_h): + for j in range(out_w): + for g in range(group): + input_pad_masked = \ + input_pad[:, g * f_c:(g + 1) * f_c, + i * stride[0]:i * stride[0] + f_h, + j * stride[1]:j * stride[1] + f_w] + + f_sub = filter[g * sub_out_c:(g + 1) * sub_out_c, :, :, :] + for k in range(sub_out_c): + out[:, g * sub_out_c + k, i, j] = \ + np.sum(input_pad_masked * f_sub[k, :, :, :], + axis=(1, 2, 3)) + + return out + + class TestConv2dOp(OpTest): def setUp(self): - self.init_groups() - self.op_type = "conv2d" - batch_size = 2 - input_channels = 3 - input_height = 5 - input_width = 5 - output_channels = 6 - filter_height = 3 - filter_width = 3 - stride = 1 - padding = 0 - output_height = (input_height - filter_height + 2 * padding - ) / stride + 1 - output_width = (input_width - filter_width + 2 * padding) / stride + 1 - input = np.random.random((batch_size, input_channels, input_height, - input_width)).astype("float32") - - 
filter = np.random.random( - (output_channels, input_channels / self.groups, filter_height, - filter_width)).astype("float32") - output = np.ndarray( - (batch_size, output_channels, output_height, output_width)) + self.init_op_type() + self.init_group() + self.init_test_case() + + conv2d_param = {'stride': self.stride, 'pad': self.pad} + input = np.random.random(self.input_size).astype("float32") + filter = np.random.random(self.filter_size).astype("float32") + output = conv2d_forward_naive(input, filter, self.groups, conv2d_param) self.inputs = {'Input': input, 'Filter': filter} self.attrs = { - 'strides': [1, 1], - 'paddings': [0, 0], - 'groups': self.groups + 'strides': self.stride, + 'paddings': self.pad, + 'groups': self.groups, + 'dilations': self.dilations } - - output_group_channels = output_channels / self.groups - input_group_channels = input_channels / self.groups - for batchid in xrange(batch_size): - for group in xrange(self.groups): - for outchannelid in range(group * output_group_channels, - (group + 1) * output_group_channels): - for rowid in xrange(output_height): - for colid in xrange(output_width): - start_h = (rowid * stride) - padding - start_w = (colid * stride) - padding - output_value = 0.0 - for inchannelid in range( - group * input_group_channels, - (group + 1) * input_group_channels): - for frowid in xrange(filter_height): - for fcolid in xrange(filter_width): - input_value = 0.0 - inrowid = start_h + frowid - incolid = start_w + fcolid - if ((inrowid >= 0 and - inrowid < input_height) and - (incolid >= 0 and - incolid < input_width)): - input_value = input[batchid][ - inchannelid][inrowid][incolid] - filter_value = filter[outchannelid][ - inchannelid % input_group_channels][ - frowid][fcolid] - output_value += input_value * filter_value - output[batchid][outchannelid][rowid][ - colid] = output_value - self.outputs = {'Output': output} def test_check_output(self): @@ -90,14 +76,47 @@ class TestConv2dOp(OpTest): max_relative_error=0.05, 
no_grad_set=set(['Input'])) - def init_groups(self): + def init_test_case(self): + # self.groups = 1 + # self.op_type = "conv2d" + self.pad = [0, 0] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] / self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_group(self): self.groups = 1 + def init_op_type(self): + self.op_type = "conv2d" + class TestWithGroup(TestConv2dOp): - def init_groups(self): + def init_group(self): self.groups = 3 + def init_op_type(self): + self.op_type = "conv2d" + + +class TestCudnn(TestConv2dOp): + def init_group(self): + self.groups = 1 + + def init_op_type(self): + self.op_type = "conv_cudnn" + + +class TestCudnnWithGroup(TestConv2dOp): + def init_group(self): + self.groups = 3 + + def init_op_type(self): + self.op_type = "conv_cudnn" + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py b/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py new file mode 100644 index 0000000000000000000000000000000000000000..674c3fda5c82309bbfbbad936a8b0b26929d42d9 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py @@ -0,0 +1,71 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestDecayedAdagradOp1(OpTest): + ''' Test DecayedAdagrad operator with explicit attributes + ''' + + def setUp(self): + self.op_type = "decayed_adagrad" + + param = np.random.random((123, 321)).astype("float32") + grad = np.random.random((123, 321)).astype("float32") + moment = np.zeros((123, 321)).astype("float32") + lr = 0.01 + decay = 0.80 + epsilon = 1e-8 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment': moment, + 'LearningRate': np.array([lr]).astype("float32") + } + + self.attrs = {'decay': decay, 'epsilon': epsilon} + + moment_out = decay * moment + (1 - decay) * grad * grad + param_out = param - lr * grad / 
(np.sqrt(moment_out) + epsilon) + + self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out} + + def test_check_output(self): + self.check_output() + + +class TestDecayedAdagradOp2(OpTest): + ''' Test DecayedAdagrad operator with default attributes + ''' + + def setUp(self): + self.op_type = "decayed_adagrad" + + param = np.random.random((123, 321)).astype("float32") + grad = np.random.random((123, 321)).astype("float32") + moment = np.zeros((123, 321)).astype("float32") + lr = 0.01 + decay = 0.95 + epsilon = 1e-6 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment': moment, + 'LearningRate': np.array([lr]).astype("float32") + } + + self.attrs = {'decay': decay, 'epsilon': epsilon} + + moment_out = decay * moment + (1 - decay) * grad * grad + param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon) + + self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out} + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py new file mode 100644 index 0000000000000000000000000000000000000000..b4629a3adb9a84470843214c7c6d80acde7228cc --- /dev/null +++ b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py @@ -0,0 +1,111 @@ +import logging +import paddle.v2.framework.core as core +import unittest +from paddle.v2.framework.op import Operator, DynamicRecurrentOp +import numpy as np + + +def create_tensor(scope, name, shape, np_data): + tensor = scope.new_var(name).get_tensor() + tensor.set_dims(shape) + tensor.set(np_data, core.CPUPlace()) + return tensor + + +class DynamicRecurrentOpTest(unittest.TestCase): + ''' + Test RNNOp + + equation: + h_t = \sigma (W x_t + U h_{t-1}) + weights: + - W + - U + vars: + - x + memories: + - h + outputs: + - h + ''' + + # for siplicity, just one level LoD + lod_py = [[0, 4, 7, 9, 10]] + input_dim = 30 + num_sents = 
len(lod_py[0]) - 1 + weight_dim = 15 + + def forward(self): + self.scope = core.Scope() + self.create_global_variables() + self.create_rnn_op() + self.create_step_net() + ctx = core.DeviceContext.create(core.CPUPlace()) + self.rnnop.run(self.scope, ctx) + state = self.rnnop.get_state("h@mem") + print 'state size: ', state.size() + + step_inputs = self.rnnop.get_step_input("x") + print "x size ", step_inputs.size() + for i in range(step_inputs.size()): + print "x %d" % i, np.array(step_inputs.read(i).get_dims()) + step_outputs = self.rnnop.get_step_output('h@mem') + print 'step_outputs.size ', step_outputs.size() + output = self.scope.find_var("h@mem").get_tensor() + + print 'output', np.array(output).shape + + def create_global_variables(self): + x = np.random.normal(size=(self.lod_py[0][-1], + self.input_dim)).astype("float32") + W = np.random.normal(size=(self.input_dim, + self.input_dim)).astype("float32") + U = np.random.normal(size=(self.input_dim, + self.input_dim)).astype("float32") + h_boot = np.random.normal(size=(self.num_sents, + self.input_dim)).astype("float32") + # create inlink + x_tensor = create_tensor(self.scope, "x", + [self.num_sents, self.input_dim], x) + x_tensor.set_lod(self.lod_py) + create_tensor(self.scope, "W", [self.input_dim, self.input_dim], W) + create_tensor(self.scope, "U", [self.input_dim, self.input_dim], U) + create_tensor(self.scope, "h_boot", [self.num_sents, self.input_dim], + h_boot) + self.scope.new_var("step_scopes") + self.scope.new_var("h@mem") + + def create_rnn_op(self): + # create RNNOp + self.rnnop = DynamicRecurrentOp( + # inputs + inlinks=["x"], + boot_memories=["h_boot"], + step_net="stepnet", + # outputs + outlinks=["h@mem"], + step_scopes="step_scopes", + # attributes + pre_memories=["h@pre"], + memories=["h@mem"]) + + def create_step_net(self): + stepnet = core.Net.create() + x_fc_op = Operator("mul", X="x", Y="W", Out="Wx") + h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh") + sum_op = Operator("sum", 
X=["Wx", "Uh"], Out="sum") + sig_op = Operator("sigmoid", X="sum", Y="h@mem") + + for op in [x_fc_op, h_fc_op, sum_op, sig_op]: + stepnet.append_op(op) + stepnet.complete_add_op(True) + self.rnnop.set_stepnet(stepnet) + + def test_forward(self): + print 'test recurrent op forward' + pd_output = self.forward() + print 'pd_output', pd_output + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_gru_unit_op.py b/python/paddle/v2/framework/tests/test_gru_unit_op.py new file mode 100644 index 0000000000000000000000000000000000000000..57625362d21905d257f46ff5330841a20438773a --- /dev/null +++ b/python/paddle/v2/framework/tests/test_gru_unit_op.py @@ -0,0 +1,115 @@ +import math +import unittest +import numpy as np +from op_test import OpTest + + +class GRUActivationType(OpTest): + identity = 0 + sigmoid = 1 + tanh = 2 + relu = 3 + + +def identity(x): + return x + + +def sigmoid(x): + return 1. / (1. + np.exp(-x)) + + +def tanh(x): + return 2. * sigmoid(2. * x) - 1. + + +def relu(x): + return np.maximum(x, 0) + + +class TestGRUUnitOp(OpTest): + batch_size = 3 + frame_size = 5 + activate = { + GRUActivationType.identity: identity, + GRUActivationType.sigmoid: sigmoid, + GRUActivationType.tanh: tanh, + GRUActivationType.relu: relu, + } + + def set_inputs(self): + batch_size = self.batch_size + frame_size = self.frame_size + self.op_type = 'gru_unit' + self.inputs = { + 'Input': np.random.uniform( + -0.1, 0.1, (batch_size, frame_size * 3)).astype('float32'), + 'HiddenPrev': np.random.uniform( + -0.1, 0.1, (batch_size, frame_size)).astype('float32'), + 'Weight': np.random.uniform( + -1. / math.sqrt(frame_size), 1. 
/ math.sqrt(frame_size), + (frame_size, frame_size * 3)).astype('float32'), + } + self.attrs = { + 'activation': GRUActivationType.tanh, + 'gate_activation': GRUActivationType.sigmoid + } + + def set_outputs(self): + # GRU calculations + batch_size = self.batch_size + frame_size = self.frame_size + x = self.inputs['Input'] + h_p = self.inputs['HiddenPrev'] + w = self.inputs['Weight'] + b = self.inputs['Bias'] if self.inputs.has_key('Bias') else np.zeros( + (1, frame_size * 3)) + g = x + np.tile(b, (batch_size, 1)) + w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape( + (frame_size, frame_size * 2)) + u_r = self.activate[self.attrs['gate_activation']](np.dot( + h_p, w_u_r) + g[:, :frame_size * 2]) + u = u_r[:, :frame_size] + r = u_r[:, frame_size:frame_size * 2] + r_h_p = r * h_p + w_c = w.flatten()[frame_size * frame_size * 2:].reshape( + (frame_size, frame_size)) + c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) + + g[:, frame_size * 2:]) + g = np.hstack((u_r, c)) + h = u * h_p + (1 - u) * c + self.outputs = {'Gate': g, 'ResetHiddenPrev': r_h_p, 'Hidden': h} + + def setUp(self): + self.set_inputs() + self.set_outputs() + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad( + ['Input', 'HiddenPrev', 'Weight'], ['Hidden'], + max_relative_error=0.007) + + +class TestGRUUnitOpWithBias(TestGRUUnitOp): + def set_inputs(self): + batch_size = self.batch_size + frame_size = self.frame_size + super(TestGRUUnitOpWithBias, self).set_inputs() + self.inputs['Bias'] = np.random.uniform( + -0.1, 0.1, (1, frame_size * 3)).astype('float32') + self.attrs = { + 'activation': GRUActivationType.identity, + 'gate_activation': GRUActivationType.sigmoid + } + + def test_check_grad(self): + self.check_grad( + ['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden'], + max_relative_error=0.007) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_infer_shape.py 
b/python/paddle/v2/framework/tests/test_infer_shape.py index 99562890fdd4d8b10f420869f1ba9f694db5969a..9d9fb1c3096ed5329e868235472fd610f0b2e6d3 100644 --- a/python/paddle/v2/framework/tests/test_infer_shape.py +++ b/python/paddle/v2/framework/tests/test_infer_shape.py @@ -14,11 +14,14 @@ class TestInferShape(unittest.TestCase): # prepare input/output x1 = block.new_var("x1") + x1.set_type(core.VarDesc.VarType.LOD_TENSOR) x1.set_shape(shape) x2 = block.new_var("x2") + x2.set_type(core.VarDesc.VarType.LOD_TENSOR) x2.set_shape(shape) out = block.new_var("out") + out.set_type(core.VarDesc.VarType.LOD_TENSOR) # prepare the operator sum_op_desc = block.append_op() @@ -40,11 +43,14 @@ class TestInferShape(unittest.TestCase): # prepare input/output x1 = block.new_var("x") + x1.set_type(core.VarDesc.VarType.LOD_TENSOR) x1.set_shape(x_shape) x2 = block.new_var("y") + x2.set_type(core.VarDesc.VarType.LOD_TENSOR) x2.set_shape(y_shape) out = block.new_var("out") + out.set_type(core.VarDesc.VarType.LOD_TENSOR) # prepare the operator mul_op_desc = block.append_op() diff --git a/python/paddle/v2/framework/tests/test_operator_desc.py b/python/paddle/v2/framework/tests/test_operator_desc.py new file mode 100644 index 0000000000000000000000000000000000000000..dfe39c98f7f4fe266d5ec0c4a9ed14ab02e40e3a --- /dev/null +++ b/python/paddle/v2/framework/tests/test_operator_desc.py @@ -0,0 +1,78 @@ +import unittest +from paddle.v2.framework.framework import Variable, g_program +import paddle.v2.framework.core as core + + +class TestOperator(unittest.TestCase): + def test_error_type(self): + block = g_program.create_block() + try: + block.append_op() + self.assertFail() + except ValueError as v_err: + self.assertEqual( + v_err.message, + "`type` to initilized an Operator can not be None.") + try: + block.append_op(type="no_such_op") + self.assertFail() + except AssertionError as a_err: + self.assertEqual(a_err.message, + "Operator \"no_such_op\" has not been registered.") + + def 
test_op_desc_creation(self): + block = g_program.current_block() + mul_x = block.create_var( + dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + mul_y = block.create_var( + dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") + mul_out = block.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") + mul_op = block.append_op( + type="mul", + inputs={"X": [mul_x], + "Y": mul_y}, + outputs={"Out": [mul_out]}, + attrs={"x_num_col_dims": 1}) + + self.assertNotEqual(str(mul_op), "") + self.assertEqual(mul_op.type, "mul") + self.assertEqual(mul_op.input_names, ["X", "Y"]) + self.assertEqual(mul_op.input("X"), ["mul.x"]) + self.assertEqual(mul_op.input("Y"), ["mul.y"]) + self.assertEqual(mul_op.output_names, ["Out"]) + self.assertEqual(mul_op.output("Out"), ["mul.out"]) + self.assertEqual( + set(mul_op.attr_names), set(["x_num_col_dims", "y_num_col_dims"])) + self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) + self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) + self.assertEqual(mul_op.attr("x_num_col_dims"), 1) + self.assertEqual(mul_op.has_attr("y_num_col_dims"), True) + self.assertEqual(mul_op.attr_type("y_num_col_dims"), core.AttrType.INT) + self.assertEqual(mul_op.attr("y_num_col_dims"), 1) + self.assertEqual(mul_out.op, mul_op) + + def test_mult_input(self): + block = g_program.current_block() + sum_x1 = block.create_var( + dtype="int", shape=[3, 4], lod_level=0, name="sum.x1") + sum_x2 = block.create_var( + dtype="int", shape=[3, 4], lod_level=0, name="sum.x2") + sum_x3 = block.create_var( + dtype="int", shape=[3, 4], lod_level=0, name="sum.x3") + sum_out = block.create_var( + dtype="int", shape=[3, 4], lod_level=0, name="sum.out") + sum_op = block.append_op( + type="sum", + inputs={"X": [sum_x1, sum_x2, sum_x3]}, + outputs={"Out": sum_out}) + self.assertEqual(sum_op.type, "sum") + self.assertEqual(sum_op.input_names, ["X"]) + self.assertEqual(sum_op.input("X"), ["sum.x1", "sum.x2", "sum.x3"]) + 
self.assertEqual(sum_op.output_names, ["Out"]) + self.assertEqual(sum_op.output("Out"), ["sum.out"]) + self.assertEqual(sum_out.op, sum_op) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_parameter.py b/python/paddle/v2/framework/tests/test_parameter.py index 3b5d38f257e6f51be30d9f1fa42285461b2a0eb7..1ac0cdd99f1b7c15d64ae9d2c465d5a9d563bd80 100644 --- a/python/paddle/v2/framework/tests/test_parameter.py +++ b/python/paddle/v2/framework/tests/test_parameter.py @@ -1,5 +1,5 @@ import unittest -from paddle.v2.framework.graph import g_program +from paddle.v2.framework.framework import g_program import paddle.v2.framework.core as core diff --git a/python/paddle/v2/framework/tests/test_program.py b/python/paddle/v2/framework/tests/test_program.py index 83e184494ad235f6493a7ea8e25886b1e35004ee..c5674382a484a91268e0139ba5588b123531210e 100644 --- a/python/paddle/v2/framework/tests/test_program.py +++ b/python/paddle/v2/framework/tests/test_program.py @@ -1,7 +1,7 @@ import unittest import paddle.v2.framework.core as core -from paddle.v2.framework.graph import g_program +from paddle.v2.framework.framework import g_program class TestProgram(unittest.TestCase): @@ -51,11 +51,14 @@ class TestProgram(unittest.TestCase): sum_op_desc.set_input("Y", ["b1"]) sum_op_desc.set_output("Out", ["out2"]) + target = block.new_var("out2") + expect_ops = [ - "mul", "elementwise_add", "elementwise_add_grad", "mul_grad" + "mul", "elementwise_add", "fill_constant", "elementwise_add_grad", + "mul_grad" ] actual_ops = [] - prog.append_backward(set()) + prog.append_backward(target, set()) for op in block.all_ops(): actual_ops.append(op.type()) print(actual_ops) diff --git a/python/paddle/v2/framework/tests/test_protobuf_descs.py b/python/paddle/v2/framework/tests/test_protobuf_descs.py index 3db1e79ce43b7f559c7caab8397817b76d56161e..9b3a21261f02bf90617e60baca6902137520b8bf 100644 --- a/python/paddle/v2/framework/tests/test_protobuf_descs.py +++ 
b/python/paddle/v2/framework/tests/test_protobuf_descs.py @@ -53,7 +53,7 @@ class TestOpDesc(unittest.TestCase): self.assertEqual(8, len(op.attr_names())) op.set_block_attr("block_attr", prog.block(0)) - self.assertEqual(0, op.get_block_attr("block_attr")) + self.assertEqual(0, op.block_attr("block_attr")) mul_op = block.append_op() mul_op.set_type("mul") @@ -94,17 +94,21 @@ class TestVarDesc(unittest.TestCase): program_desc = core.ProgramDesc.__create_program_desc__() block = program_desc.block(0) var = block.new_var('my_var') + var.set_type(core.VarDesc.VarType.SELECTED_ROWS) src_shape = [3, 2, 10, 8] var.set_shape(src_shape) res_shape = var.shape() self.assertEqual(src_shape, res_shape) + self.assertEqual(core.VarDesc.VarType.SELECTED_ROWS, var.type()) def test_data_type(self): program_desc = core.ProgramDesc.__create_program_desc__() block = program_desc.block(0) var = block.new_var('my_var') + var.set_type(core.VarDesc.VarType.LOD_TENSOR) var.set_data_type(core.DataType.INT32) self.assertEqual(core.DataType.INT32, var.data_type()) + self.assertEqual(core.VarDesc.VarType.LOD_TENSOR, var.type()) class TestBlockDesc(unittest.TestCase): diff --git a/python/paddle/v2/framework/tests/test_seq_concat_op.py b/python/paddle/v2/framework/tests/test_seq_concat_op.py index 6309b09bc98f6d529f80bfa269a0eaadd799fcbc..abd2ebf0b21a953b76155eb04c57a7b65ac53cbc 100644 --- a/python/paddle/v2/framework/tests/test_seq_concat_op.py +++ b/python/paddle/v2/framework/tests/test_seq_concat_op.py @@ -1,5 +1,6 @@ import unittest import numpy as np +import sys from op_test import OpTest @@ -74,4 +75,5 @@ class TestConcatOpLevelZero(TestConcatOp): if __name__ == '__main__': + sys.exit(0) unittest.main() diff --git a/python/paddle/v2/framework/tests/test_variable.py b/python/paddle/v2/framework/tests/test_variable.py index 8ea1083ff6535d2d517f2ac587a956bfed906f03..6fb934c743a6271c352a74495cc543b62ac2b9d9 100644 --- a/python/paddle/v2/framework/tests/test_variable.py +++ 
b/python/paddle/v2/framework/tests/test_variable.py @@ -1,5 +1,5 @@ import unittest -from paddle.v2.framework.graph import Variable, g_program +from paddle.v2.framework.framework import Variable, g_program import paddle.v2.framework.core as core import numpy as np @@ -21,6 +21,7 @@ class TestVariable(unittest.TestCase): b = g_program.current_block() w = b.create_var( dtype="float64", shape=[784, 100], lod_level=0, name="fc.w") + self.assertNotEqual(str(w), "") self.assertEqual(core.DataType.FP64, w.data_type) self.assertEqual((784, 100), w.shape) self.assertEqual("fc.w", w.name)