diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index d81481ca819c13ee0e299c204f998f3915c34bd4..ddf0b055a92d80295b24255a5462d477e0d9c796 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -252,6 +252,11 @@ first_seq
 .. autoclass:: paddle.v2.layer.first_seq
     :noindex:
 
+sub_seq
+---------
+.. autoclass:: paddle.v2.layer.sub_seq
+    :noindex:
+
 concat
 ------
 .. autoclass:: paddle.v2.layer.concat
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index 939731c0f3438a702e947ba1a7abeb5e3e6a8f53..004ee2d8c85ce7661886179570e693d7d61bc6d8 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -68,12 +68,6 @@ scale
     :noindex:
 
 
-reshape
----------
-.. autofunction:: paddle.v2.fluid.layers.reshape
-    :noindex:
-
-
 transpose
 ---------
 .. autofunction:: paddle.v2.fluid.layers.transpose
diff --git a/paddle/framework/data_transform.h b/paddle/framework/data_transform.h
index 73f894a3e20ab779f8607e63a67139b0e8cce79a..2191dd3783d5ed7bb59b96c70d38a72bb0b2fee7 100644
--- a/paddle/framework/data_transform.h
+++ b/paddle/framework/data_transform.h
@@ -27,7 +27,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-using DataTransformFN =
+using DataTransformFn =
     std::function<void(const std::vector<platform::DeviceContext*> ctx,
                        const Variable& in, Variable* out)>;
 using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
@@ -47,7 +47,7 @@ struct KernelTypePairHash {
 };
 
 using DataTransformMap =
-    std::unordered_map<KernelTypePair, DataTransformFN, KernelTypePairHash>;
+    std::unordered_map<KernelTypePair, DataTransformFn, KernelTypePairHash>;
 
 class DataTransformFnMap {
  public:
@@ -58,25 +58,25 @@ class DataTransformFnMap {
   }
 
   void Insert(const OpKernelType& left, const OpKernelType& right,
-              const DataTransformFN& data_tranform_fn) {
+              const DataTransformFn& data_tranform_fn) {
     Insert(std::make_pair(left, right), data_tranform_fn);
   }
 
   void Insert(const KernelTypePair& kernel_type_pair,
-              const DataTransformFN& data_tranform_fn) {
+              const DataTransformFn& data_tranform_fn) {
     PADDLE_ENFORCE(!Has(kernel_type_pair),
                    "KernelTypePair %s has been registered", "");
     map_.insert({kernel_type_pair, data_tranform_fn});
   }
 
-  const DataTransformFN& Get(const KernelTypePair& key_pair) const {
+  const DataTransformFn& Get(const KernelTypePair& key_pair) const {
     auto data_transformer = GetNullable(key_pair);
     PADDLE_ENFORCE_NOT_NULL(data_transformer,
-                            "DataTransformFN should not be NULL");
+                            "DataTransformFn should not be NULL");
     return *data_transformer;
   }
 
-  const DataTransformFN* GetNullable(const KernelTypePair& key_pair) const {
+  const DataTransformFn* GetNullable(const KernelTypePair& key_pair) const {
     auto it = map_.find(key_pair);
     if (it == map_.end()) {
       return nullptr;
diff --git a/paddle/framework/op_kernel_type.h b/paddle/framework/op_kernel_type.h
index 97b542e345feab0bab701dd967558ce23375dc7f..b06002096fb109da806809f7b908d9768cf095ba 100644
--- a/paddle/framework/op_kernel_type.h
+++ b/paddle/framework/op_kernel_type.h
@@ -68,6 +68,8 @@ struct OpKernelType {
            data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
            library_type_ == o.library_type_;
   }
+
+  bool operator!=(const OpKernelType& o) const { return !(*this == o); }
 };
 
 inline std::ostream& operator<<(std::ostream& os,
@@ -78,5 +80,11 @@ inline std::ostream& operator<<(std::ostream& os,
   return os;
 }
 
+inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
+  std::ostringstream stream;
+  stream << kernel_key;
+  return stream.str();
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/op_kernel_type_test.cc b/paddle/framework/op_kernel_type_test.cc
index dd048405007974667bbb8a052b77ab8b3aa4580e..649afeee8a846b0579545f2edff77e9dbe3b4dd8 100644
--- a/paddle/framework/op_kernel_type_test.cc
+++ b/paddle/framework/op_kernel_type_test.cc
@@ -26,10 +26,8 @@ TEST(OpKernelType, ToString) {
   OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
                               LibraryType::kCUDNN);
 
-  std::ostringstream stream;
-  stream << op_kernel_type;
   ASSERT_EQ(
-      stream.str(),
+      paddle::framework::KernelTypeToString(op_kernel_type),
       "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]");
 }
 
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 886f73e7b81c35cac573bd041e6462eb2111bf85..f48512b5c682698dae86593fb89a720eea503f7d 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -413,37 +413,51 @@ void OperatorWithKernel::Run(const Scope& scope,
   }
 
   if (actual_kernel_key == expected_kernel_key) {
-    kernel_iter->second->Compute(ctx);
+    PADDLE_ENFORCE_EQ(actual_kernel_key.place_, expected_kernel_key.place_,
+                      "Currently, model parallelism is only supported between "
+                      "CPU and other devices. For example, multi-GPU model "
+                      "parallelism will fail.");
   } else {
-    Scope& op_scope = scope.NewScope();
-    auto input_vars = this->InputVars();
-    for (auto var_name : input_vars) {
-      op_scope.Var(var_name);
-    }
-
-    // TODO(qijun) get appropriate DeviceContext from DeviceContext pool
-    platform::DeviceContext* trans_dev_ctx = nullptr;
-    std::vector<platform::DeviceContext*> trans_dev_ctx_vec{trans_dev_ctx};
+    const DataTransformFn* trans_fun =
+        DataTransformFnMap::Instance().GetNullable(
+            std::make_pair(actual_kernel_key, expected_kernel_key));
+    if (trans_fun) {
+      auto input_vars = this->InputVars();
+      // TODO(qijun) filter the input vars that do not need to be transformed
+
+      // filter vars that have been transformed
+      std::vector<std::string> need_trans;
+      for (auto var_name : input_vars) {
+        auto var_name_trans =
+            var_name + framework::KernelTypeToString(expected_kernel_key);
+        if (!scope.FindVar(var_name_trans)) {
+          const_cast<Scope&>(scope).Var(var_name_trans);
+          need_trans.push_back(var_name);
+        }
+      }
 
-    // TODO(qijun) get appropriate DataTransformFN from global map
-    framework::DataTransformFN trans_fun = nullptr;
+      if (!need_trans.empty()) {
+        // TODO(qijun) get appropriate DeviceContext from DeviceContext pool
+        platform::DeviceContext* trans_dev_ctx = nullptr;
+        std::vector<platform::DeviceContext*> trans_dev_ctx_vec{trans_dev_ctx};
 
-    // Wait for transform starting
-    dev_ctx->Wait();
+        // Wait for transform starting
+        dev_ctx->Wait();
 
-    for (auto var_name : input_vars) {
-      trans_fun(trans_dev_ctx_vec, *(scope.FindVar(var_name)),
-                op_scope.FindVar(var_name));
-    }
-    // Wait for data transform finishing
-    for (auto ctx : trans_dev_ctx_vec) {
-      ctx->Wait();
+        for (auto var_name : need_trans) {
+          (*trans_fun)(trans_dev_ctx_vec, *(scope.FindVar(var_name)),
+                       scope.FindVar(var_name + framework::KernelTypeToString(
+                                                    expected_kernel_key)));
+        }
+        // Wait for data transform finishing
+        for (auto ctx : trans_dev_ctx_vec) {
+          ctx->Wait();
+        }
+      }
     }
-
-    // Create a new ExecutionContext
-    ExecutionContext op_ctx(*this, op_scope, *dev_ctx);
-    kernel_iter->second->Compute(op_ctx);
   }
+
+  kernel_iter->second->Compute(ctx);
 }
 
 OpKernelType OperatorWithKernel::GetActualKernelType(
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 5aaaf993323c2d4dbef688d0977ec6374fde6512..3e686b1c415e61a24e0f6729555e672721cf806f 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -1,5 +1,6 @@
 file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
 string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
+set(DEPS_OPS "")
 set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/pybind/pybind.h)
 file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
 function(op_library TARGET)
@@ -48,6 +49,11 @@ function(op_library TARGET)
     message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
   endif()
 
+  list(LENGTH op_library_DEPS op_library_DEPS_len)
+  if (${op_library_DEPS_len} GREATER 0)
+    set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
+  endif()
+
   if (WITH_GPU)
     nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
       ${op_common_deps})
@@ -181,55 +187,26 @@ endfunction()
 add_subdirectory(math)
 add_subdirectory(nccl)
 
-set(DEPS_OPS
-    cond_op
-    cross_entropy_op
-    recurrent_op
-    softmax_with_cross_entropy_op
-    softmax_op
-    sequence_softmax_op
-    sum_op
-    pool_op
-    maxout_op
-    unpool_op
-    pool_with_index_op
-    conv_op
-    conv_transpose_op
-    nccl_op
-    sequence_conv_op
-    sequence_pool_op
-    lod_rank_table_op
-    lod_tensor_to_array_op
-    array_to_lod_tensor_op
-    max_sequence_len_op
-    lstm_op
-    tensor_array_read_write_op
-    gru_op
-    adagrad_op
-    sgd_op
-    save_op
-    load_op
-    send_op
-    recv_op)
+if(WITH_GPU)
+  op_library(nccl_op DEPS nccl_common)
+else()
+  set(DEPS_OPS ${DEPS_OPS} nccl_op)
+endif()
 
 if(WITH_DISTRIBUTE)
-add_subdirectory(detail)
-op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
-set_source_files_properties(
-    send_op.cc
-    PROPERTIES
-    COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-
-op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
-set_source_files_properties(
-    recv_op.cc
-    PROPERTIES
-    COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-
-cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
+  add_subdirectory(detail)
+  set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  op_library(send_op DEPS ${DISTRIBUTE_DEPS})
+  set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
+  set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
+else()
+  set(DEPS_OPS ${DEPS_OPS} send_op recv_op)
 endif()
 
-op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
+op_library(cond_op DEPS framework_proto tensor net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
@@ -242,21 +219,16 @@ op_library(pool_op DEPS pooling)
 op_library(maxout_op DEPS maxouting)
 op_library(unpool_op DEPS unpooling)
 op_library(pool_with_index_op DEPS pooling)
-op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
-op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
-op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op)
-op_library(max_sequence_len_op SRCS max_sequence_len_op.cc DEPS lod_rank_table)
-op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
-if(WITH_GPU)
-op_library(nccl_op DEPS nccl_common)
-endif()
+op_library(lod_rank_table_op DEPS lod_rank_table)
+op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
+op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
+op_library(max_sequence_len_op DEPS lod_rank_table)
 op_library(sequence_conv_op DEPS context_project)
 op_library(sequence_pool_op DEPS sequence_pooling)
 op_library(lstm_op DEPS sequence2batch lstm_compute)
 op_library(conv_transpose_op DEPS vol2col)
 op_library(gru_op DEPS sequence2batch gru_compute)
-op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
-
+op_library(recurrent_op DEPS executor)
 # FIXME(typhoonzero): save/load depends lodtensor serialization functions
 op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
@@ -269,13 +241,12 @@ endforeach()
 
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
-
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
 if(WITH_GPU)
-cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
+  cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)