diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index d7b3d2bdec1687425df804c0d56d568241f9e8b0..d6b8464100d4497876aa3f6f7cbc666aafae4bfc 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -26,7 +26,7 @@ FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py) SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON) SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR}) -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign -ftls-model=global-dynamic") SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS paddle_parameter diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 6b34c3bbcfbdb0c36381df7de4dd227e317829e5..184ec65d3fa5526b9ec32b376f1a10ca8ca69a6d 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -42,12 +42,14 @@ add_custom_command(TARGET framework_py_proto POST_BUILD cc_library(backward SRCS backward.cc DEPS net_op) cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context) -cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward ${GLOB_OP_LIB}) -#if(WITH_GPU) -# nv_test(executor_test SRCS executor_test.cc DEPS executor) -#else() -# cc_test(executor_test SRCS executor_test.cc DEPS executor) -#endif() +cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward) +set(EXECUTOR_TEST_OP elementwise_add_op gaussian_random_op feed_op fetch_op + mul_op sum_op squared_l2_distance_op fill_constant_op sgd_op) +if(WITH_GPU) + nv_test(executor_test SRCS executor_test.cc DEPS executor ${EXECUTOR_TEST_OP}) +else() + cc_test(executor_test SRCS executor_test.cc DEPS executor ${EXECUTOR_TEST_OP}) +endif() cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor) cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place) diff --git a/paddle/framework/backward.h b/paddle/framework/backward.h index 7ffe4c28103f9d6a9f179422d1beb86106ef786e..f1ab8056450c96f0a1b671e1efa46c4c68f9ea15 100644 --- a/paddle/framework/backward.h +++ b/paddle/framework/backward.h @@ -27,6 +27,8 @@ extern std::unique_ptr Backward( const OperatorBase& forwardOp, const std::unordered_set& no_grad_vars); +// TODO(jiayi): Add target as parameter and generate backward op +// according to target. void AppendBackward(ProgramDescBind& program_desc, const std::unordered_set& no_grad_vars); diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 7f6d8fe6a4aec9fdc39b4ffc0837a03e355ec937..137e53d849542e48080228e0002931867c4d7fb2 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -25,6 +25,16 @@ limitations under the License. */ #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" +USE_OP(elementwise_add); +USE_OP(gaussian_random); +USE_OP(feed); +USE_OP(fetch); +USE_OP(mul); +USE_OP(sum); +USE_OP(squared_l2_distance); +USE_OP(fill_constant); +USE_OP(sgd); + using namespace paddle::platform; using namespace paddle::framework; diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index e7538b4af3429e566a439d5a0db8496efcd94969..d3c11ad60a0f9319329a59c16bfc4668cd75b7ae 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -211,6 +211,15 @@ static InferShapeFuncMap &InferShapeFuncs() { return *g_map; } +void OpDescBind::CheckAttrs() { + PADDLE_ENFORCE(!Type().empty(), + "CheckAttr() can not be called before type is setted."); + const auto *checker = OpInfoMap::Instance().Get(Type()).Checker(); + PADDLE_ENFORCE_NOT_NULL(checker, "Operator \"%s\" has no registered checker.", + Type()); + checker->Check(attrs_); +} + void OpDescBind::InferShape(const BlockDescBind &block) const { auto &funcs = InferShapeFuncs(); auto it = funcs.find(this->Type()); diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h index 81c4225041157ac600d1db73ef2363ebcd4abfc0..90155fadeac148bd9cae4ce9066ac4ce8d9df52d 100644 --- a/paddle/framework/op_desc.h +++ b/paddle/framework/op_desc.h @@ -100,6 +100,8 @@ class OpDescBind { return &this->attrs_; } + void CheckAttrs(); + void InferShape(const BlockDescBind &block) const; private: diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 15f80b57206c90f689acfdcac60a0d9011025fc0..97a142d5f1661704fede858b28ff0d5487c66fab 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -289,6 +289,15 @@ class ExecutionContext { return device_context_; } +#ifdef PADDLE_WITH_CUDA + const platform::CUDADeviceContext& cuda_device_context() const { + PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); + auto cuda_ctx = + reinterpret_cast(&device_context_); + return *cuda_ctx; + } +#endif + private: const OperatorBase& op_; const Scope& scope_; diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index ba82127d9c028eb39b9dc1a7f34fcf546524142b..3304d857ae2600bd94013b6672b88d43d1d188c6 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -87,26 +87,31 @@ class Tensor { /** * @brief Copy the content of external tensor to a new place. * - * @param[in] src The external tensor. - * @param[in] ctx The device context contains place where to store. + * @param[in] src The external tensor. + * @param[in] dst_place The dst place. + * @param[in] ctx The device context contains device resources. * * @note CopyFrom supports CPU <-> GPU, GPU <-> GPU. */ + // TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647 + // Remove `CopyFrom` and `CopyFromVector` from Tensor interface + // and make them global functions template - inline void CopyFrom(const Tensor& src, const platform::Place& dst_place); + inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, + const platform::DeviceContext& ctx); /** * @brief Copy the content of an external vector to a tensor. * - * @param[in] src The external vector. - * @param[in] ctx The device context contains place where to store. + * @param[in] src The external tensor. + * @param[in] ctx The device context contains device resources. * * * @note CopyFromVector assumes that the tensor has been resized * before invoking. */ template inline void CopyFromVector(const std::vector& src, - const platform::Place& dst_place); + const platform::DeviceContext& ctx); /** * @brief Return the slice of the tensor. diff --git a/paddle/framework/tensor_array.cc b/paddle/framework/tensor_array.cc index 2728bce1c1af848285e80d8ee8b3b61ec046342e..7ae16e99cdb8a23f14f0c8b684ba4ec66a4ce074 100644 --- a/paddle/framework/tensor_array.cc +++ b/paddle/framework/tensor_array.cc @@ -95,7 +95,8 @@ void TensorArray::Write(size_t index, const LoDTensor& value) { values_[index].Resize(value.dims()); values_[index].mutable_data(platform::CPUPlace()); - values_[index].CopyFrom(value, platform::CPUPlace()); + values_[index].CopyFrom(value, platform::CPUPlace(), + platform::CPUDeviceContext()); } void TensorArray::WriteShared(size_t index, const LoDTensor& value) { @@ -151,7 +152,8 @@ LoDTensor TensorArray::Stack() const { for (size_t idx = 0; idx < size(); idx++) { result.Slice(idx, idx + 1) - .CopyFrom(Read(idx), platform::CPUPlace()); + .CopyFrom(Read(idx), platform::CPUPlace(), + platform::CPUDeviceContext()); } return result; } @@ -182,7 +184,8 @@ void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const { // copy value.Resize(value_dims); value.CopyFrom(source.Slice(elem, elem + 1), - platform::CPUPlace()); + platform::CPUPlace(), + platform::CPUDeviceContext()); } } } @@ -236,7 +239,8 @@ LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) { auto target = result.Slice(i, i + 1); auto source_ = source->Slice(index, index + 1); - target.CopyFrom(source_, platform::CPUPlace()); + target.CopyFrom(source_, platform::CPUPlace(), + platform::CPUDeviceContext()); } return result; @@ -269,7 +273,8 @@ LoDTensor PackDynamicBatch(const std::vector& source, if (index >= seq_meta.end) break; auto source_ = source[batch_id].Slice(seq_id, seq_id + 1); auto target = result.Slice(index, index + 1); - target.CopyFrom(source_, platform::CPUPlace()); + target.CopyFrom(source_, platform::CPUPlace(), + platform::CPUDeviceContext()); } } diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 8ee9941982cdd8f78fdbace9dca085097b08eeb8..ce73e0a9edbe340f1165e2dbcba8c976c55df348 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -88,7 +88,8 @@ inline Tensor& Tensor::ShareDataWith(const Tensor& src) { template inline void Tensor::CopyFrom(const Tensor& src, - const platform::Place& dst_place) { + const platform::Place& dst_place, + const platform::DeviceContext& ctx) { src.check_memory_size(); Resize(src.dims()); @@ -106,26 +107,45 @@ inline void Tensor::CopyFrom(const Tensor& src, #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(src_place) && platform::is_cpu_place(dst_place)) { - memory::Copy(boost::get(dst_place), dst_ptr, - boost::get(src_place), src_ptr, size, 0); + auto src_gpu_place = boost::get(src_place); + auto dst_cpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); + memory::Copy( + dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); } else if (platform::is_cpu_place(src_place) && platform::is_gpu_place(dst_place)) { - memory::Copy(boost::get(dst_place), dst_ptr, - boost::get(src_place), src_ptr, size, 0); + auto src_cpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); + memory::Copy( + dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { - memory::Copy(boost::get(dst_place), dst_ptr, - boost::get(src_place), src_ptr, size, 0); + auto src_gpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); + memory::Copy( + dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); } - PADDLE_ENFORCE(cudaStreamSynchronize(0), - "cudaStreamSynchronize failed in Tensor CopyFrom"); - #endif } template inline void Tensor::CopyFromVector(const std::vector& src, - const platform::Place& dst_place) { + const platform::DeviceContext& ctx) { + auto dst_place = ctx.GetPlace(); auto src_ptr = static_cast(src.data()); platform::CPUPlace src_place; auto dst_ptr = static_cast(mutable_data(dst_place)); @@ -137,12 +157,11 @@ inline void Tensor::CopyFromVector(const std::vector& src, } #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(dst_place)) { - memory::Copy(boost::get(dst_place), dst_ptr, src_place, - src_ptr, size, 0); + memory::Copy( + boost::get(dst_place), dst_ptr, src_place, src_ptr, + size, + reinterpret_cast(ctx).stream()); } - PADDLE_ENFORCE(cudaStreamSynchronize(0), - "cudaStreamSynchronize failed in Tensor CopyFromVector"); - #endif } diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 492eba69e1ea483eca1da782004231af61fc60be..0b62fe08ce9e592384e55432861a943403453bb7 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -194,6 +194,7 @@ TEST(Tensor, CopyFrom) { { Tensor src_tensor; Tensor dst_tensor; + CPUDeviceContext cpu_ctx((CPUPlace())); int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); @@ -201,7 +202,7 @@ TEST(Tensor, CopyFrom) { memcpy(src_ptr, arr, 9 * sizeof(int)); auto cpu_place = new paddle::platform::CPUPlace(); - dst_tensor.CopyFrom(src_tensor, *cpu_place); + dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx); const int* dst_ptr = dst_tensor.data(); ASSERT_NE(src_ptr, dst_ptr); @@ -210,7 +211,7 @@ TEST(Tensor, CopyFrom) { } Tensor slice_tensor = src_tensor.Slice(1, 2); - dst_tensor.CopyFrom(slice_tensor, *cpu_place); + dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx); const int* slice_ptr = slice_tensor.data(); dst_ptr = dst_tensor.data(); ASSERT_NE(dst_ptr, slice_ptr); @@ -231,13 +232,15 @@ TEST(Tensor, CopyFrom) { // CPU Tensor to GPU Tensor auto gpu_place = new paddle::platform::GPUPlace(0); - gpu_tensor.CopyFrom(src_tensor, *gpu_place); + CUDADeviceContext gpu_ctx(*gpu_place); + gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx); // GPU Tensor to CPU Tensor auto cpu_place = new paddle::platform::CPUPlace(); - dst_tensor.CopyFrom(gpu_tensor, *cpu_place); + dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); - // Compare Tensors + // Sync before Compare Tensors + gpu_ctx.Wait(); const int* dst_ptr = dst_tensor.data(); ASSERT_NE(src_ptr, dst_ptr); for (size_t i = 0; i < 9; ++i) { @@ -247,12 +250,13 @@ TEST(Tensor, CopyFrom) { Tensor slice_tensor = src_tensor.Slice(1, 2); // CPU Slice Tensor to GPU Tensor - gpu_tensor.CopyFrom(slice_tensor, *gpu_place); + gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx); // GPU Tensor to CPU Tensor - dst_tensor.CopyFrom(gpu_tensor, *cpu_place); + dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); - // Compare Slice Tensors + // Sync before Compare Slice Tensors + gpu_ctx.Wait(); const int* slice_ptr = slice_tensor.data(); dst_ptr = dst_tensor.data(); ASSERT_NE(dst_ptr, slice_ptr); @@ -273,7 +277,8 @@ TEST(Tensor, CopyFromVector) { // Copy to CPU Tensor cpu_tensor.Resize(make_ddim({3, 3})); auto cpu_place = new paddle::platform::CPUPlace(); - cpu_tensor.CopyFromVector(src_vec, *cpu_place); + CPUDeviceContext cpu_ctx(*cpu_place); + cpu_tensor.CopyFromVector(src_vec, cpu_ctx); // Compare Tensors const int* cpu_ptr = cpu_tensor.data(); @@ -285,7 +290,7 @@ TEST(Tensor, CopyFromVector) { src_vec.erase(src_vec.begin(), src_vec.begin() + 5); cpu_tensor.Resize(make_ddim({2, 2})); - cpu_tensor.CopyFromVector(src_vec, *cpu_place); + cpu_tensor.CopyFromVector(src_vec, cpu_ctx); cpu_ptr = cpu_tensor.data(); src_ptr = src_vec.data(); ASSERT_NE(src_ptr, cpu_ptr); @@ -306,16 +311,19 @@ TEST(Tensor, CopyFromVector) { // Copy to CPU Tensor cpu_tensor.Resize(make_ddim({3, 3})); auto cpu_place = new paddle::platform::CPUPlace(); - cpu_tensor.CopyFromVector(src_vec, *cpu_place); + CPUDeviceContext cpu_ctx(*cpu_place); + cpu_tensor.CopyFromVector(src_vec, cpu_ctx); // Copy to GPUTensor gpu_tensor.Resize(make_ddim({3, 3})); auto gpu_place = new paddle::platform::GPUPlace(); - gpu_tensor.CopyFromVector(src_vec, *gpu_place); + CUDADeviceContext gpu_ctx(*gpu_place); + gpu_tensor.CopyFromVector(src_vec, gpu_ctx); // Copy from GPU to CPU tensor for comparison - dst_tensor.CopyFrom(gpu_tensor, *cpu_place); + dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); - // Compare Tensors + // Sync before Compare Tensors + gpu_ctx.Wait(); const int* src_ptr = src_vec.data(); const int* cpu_ptr = cpu_tensor.data(); const int* dst_ptr = dst_tensor.data(); @@ -329,11 +337,13 @@ TEST(Tensor, CopyFromVector) { src_vec.erase(src_vec.begin(), src_vec.begin() + 5); cpu_tensor.Resize(make_ddim({2, 2})); - cpu_tensor.CopyFromVector(src_vec, *cpu_place); + cpu_tensor.CopyFromVector(src_vec, cpu_ctx); gpu_tensor.Resize(make_ddim({2, 2})); - gpu_tensor.CopyFromVector(src_vec, *gpu_place); - dst_tensor.CopyFrom(gpu_tensor, *cpu_place); + gpu_tensor.CopyFromVector(src_vec, gpu_ctx); + dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); + // Sync before Compare Tensors + gpu_ctx.Wait(); src_ptr = src_vec.data(); cpu_ptr = cpu_tensor.data(); dst_ptr = dst_tensor.data(); diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index ced14a8923140ec6b08e3e6725a5780b61033daf..cba57ba57f5e03c7861897e177cc09aa513e5395 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -321,6 +321,23 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker { } }; +template +class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ThresholdedReluOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of ThresholdedRelu operator"); + AddOutput("Y", "Output of ThresholdedRelu operator"); + AddComment( + "ThresholdedRelu activation operator, " + "thresholded_relu = x for x > threshold, " + "thresholded_relu = 0 otherwise."); + AddAttr("threshold", "The threshold location of activation") + .SetDefault(static_cast(1.0)); + } +}; + } // namespace operators } // namespace paddle @@ -392,6 +409,10 @@ REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad, REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker, hard_shrink_grad, ops::ActivationOpGrad); +REGISTER_OP(thresholded_relu, ops::ActivationOp, + ops::ThresholdedReluOpMaker, thresholded_relu_grad, + ops::ActivationOpGrad); + #define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \ REGISTER_OP_CPU_KERNEL( \ act_type, \ diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index f88c9c48eb9fcb779de5a99a45a832e582d76ab0..502c33be103c465c14f128be38ac62d029f1bfb9 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -590,6 +590,32 @@ struct STanhGradFunctor : public BaseActivationFunctor { } }; +template +struct ThresholdedReluFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Y y) const { + y.device(d) = (x > static_cast(threshold)).template cast() * x; + } +}; + +template +struct ThresholdedReluGradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Y y, dY dy, dX dx) const { + dx.device(d) = dy * (x > static_cast(threshold)).template cast(); + } +}; + } // namespace operators } // namespace paddle @@ -615,4 +641,5 @@ struct STanhGradFunctor : public BaseActivationFunctor { __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor); \ __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \ __macro(elu, ELUFunctor, ELUGradFunctor); \ - __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor) + __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor); \ + __macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor); diff --git a/paddle/operators/conv2d_op.cc b/paddle/operators/conv2d_op.cc index 6325d4248f10ea8a12ae5398d9fe0e579db3f7ae..1acb8415d0691df77047806d3c81b51cbb8c59f3 100644 --- a/paddle/operators/conv2d_op.cc +++ b/paddle/operators/conv2d_op.cc @@ -12,111 +12,91 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/operators/gemm_conv2d_op.h" +#include "paddle/operators/conv2d_op.h" namespace paddle { namespace operators { -int outputSize(int input_size, int filter_size, int padding, int stride) { - int output_size = (input_size - filter_size + 2 * padding) / stride + 1; - return output_size; +void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of Conv2DOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of Conv2DOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of Conv2DOp should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + int groups = ctx->Attrs().Get("groups"); + int input_channels = in_dims[1]; + int output_channels = filter_dims[0]; + + PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv2DOp input should be 4-D."); + PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "Conv2DOp filter should be 4-D."); + PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups, + "The number of input channels should be equal to filter " + "channels * groups."); + PADDLE_ENFORCE_EQ( + output_channels % groups, 0, + "The number of output channels should be divided by groups."); + + auto output_height = + OutputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]); + auto output_width = + OutputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]); + ctx->SetOutputDim("Output", + {in_dims[0], filter_dims[0], output_height, output_width}); } -class Conv2DOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Input"), - "Input(Input) of Conv2DOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Filter"), - "Input(Filter) of Conv2DOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Output"), - "Output(Output) of Conv2DOp should not be null."); - - auto in_dims = ctx->GetInputDim("Input"); - auto filter_dims = ctx->GetInputDim("Filter"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - int groups = ctx->Attrs().Get("groups"); - int input_channels = in_dims[1]; - int output_channels = filter_dims[0]; - - PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv2DOp input should be 4-D."); - PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "Conv2DOp filter should be 4-D."); - PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups, - "The number of input channels should be equal to filter " - "channels * groups."); - PADDLE_ENFORCE_EQ( - output_channels % groups, 0, - "The number of output channels should be divided by groups."); - - auto output_height = - outputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]); - auto output_width = - outputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]); - ctx->SetOutputDim( - "Output", {in_dims[0], filter_dims[0], output_height, output_width}); - } -}; - -class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker { - public: - Conv2DOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput( - "Input", - "The input tensor of convolution operator. " - "The format of input tensor is NCHW. Where N is batch size, C is the " - "number of channels, H and W is the height and width of image."); - AddInput( - "Filter", - "The filter tensor of convolution operator." - "The format of the filter tensor is MCHW, where M is the number of " - "output image channels, C is the number of input image channels, " - "H and W is height and width of filter. " - "If the groups attribute is greater than 1, C equal the number of " - "input image channels divided by the groups."); - AddOutput("Output", - "The output tensor of convolution operator." - "The format of output tensor is also NCHW."); - AddAttr>("strides", "strides of convolution operator.") - .SetDefault({1, 1}); - AddAttr>("paddings", "paddings of convolution operator.") - .SetDefault({0, 0}); - AddAttr( - "groups", - "group size of convolution operator. " - "Refer to grouped convolution in Alex Krizhevsky's paper: " - "when group=2, the first half of the filters are only connected to the " - "first half of the input channels, and the second half only connected " - "to the second half.") - .SetDefault(1); - AddComment(R"DOC( +Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "Input", + "The input tensor of convolution operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of image."); + AddInput("Filter", + "The filter tensor of convolution operator." + "The format of the filter tensor is MCHW, where M is the number of " + "output image channels, C is the number of input image channels, " + "H and W is height and width of filter. " + "If the groups attribute is greater than 1, C equal the number of " + "input image channels divided by the groups."); + AddOutput("Output", + "The output tensor of convolution operator." + "The format of output tensor is also NCHW."); + AddAttr>("strides", "strides of convolution operator.") + .SetDefault({1, 1}); + AddAttr>("paddings", "paddings of convolution operator.") + .SetDefault({0, 0}); + AddAttr( + "groups", + "group size of convolution operator. " + "Refer to grouped convolution in Alex Krizhevsky's paper: " + "when group=2, the first half of the filters are only connected to the " + "first half of the input channels, and the second half only connected " + "to the second half.") + .SetDefault(1); + AddComment(R"DOC( The convolution operation calculates the output based on the input, filter and strides, paddings, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. )DOC"); - } -}; - -class Conv2DOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; +} - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - auto in_dims = ctx->GetInputDim("Input"); - auto filter_dims = ctx->GetInputDim("Filter"); - if (ctx->HasOutput(framework::GradVarName("Input"))) { - ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); - } - if (ctx->HasOutput(framework::GradVarName("Filter"))) { - ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); - } +void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const { + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); } -}; + if (ctx->HasOutput(framework::GradVarName("Filter"))) { + ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); + } +} } // namespace operators } // namespace paddle diff --git a/paddle/operators/conv2d_op.cu b/paddle/operators/conv2d_op.cu index 5df818ba0496a65502dde37fd1397ec56f8c1101..c697c9466d34c29af6976f3a4d2d0a24ba778ceb 100644 --- a/paddle/operators/conv2d_op.cu +++ b/paddle/operators/conv2d_op.cu @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/operators/gemm_conv2d_op.h" +#include "paddle/operators/conv2d_op.h" namespace ops = paddle::operators; diff --git a/paddle/operators/gemm_conv2d_op.h b/paddle/operators/conv2d_op.h similarity index 90% rename from paddle/operators/gemm_conv2d_op.h rename to paddle/operators/conv2d_op.h index 323e3f7c3bd506c6b63bf4d1152384649f5da575..7ebdbe81cbbaf59a60eb3dac0f570d70fc85d6ef 100644 --- a/paddle/operators/gemm_conv2d_op.h +++ b/paddle/operators/conv2d_op.h @@ -24,6 +24,38 @@ namespace operators { using Tensor = framework::Tensor; +// Base convolution operator definations for other conv +// like operators to reuse the implementation. +inline int OutputSize(int input_size, int filter_size, int padding, + int stride) { + int output_size = (input_size - filter_size + 2 * padding) / stride + 1; + return output_size; +} + +// Define Op classes in .h file so that other conv +// operator implementations can reuse the code. +class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Conv2DOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker); +}; + +class Conv2DOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override; +}; + +class Conv2DOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override; +}; + template class GemmConv2DKernel : public framework::OpKernel { public: @@ -74,7 +106,6 @@ class GemmConv2DKernel : public framework::OpKernel { framework::DDim output_matrix_shape = {output_channels, output_height * output_width}; - // convolution operator: im2col + gemm int in_step = input_channels / groups; int out_step = output_channels / groups; diff --git a/paddle/operators/conv_cudnn_op.cc b/paddle/operators/conv_cudnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4288f300dd5b0464f2b4394cdb0b44f93060ae74 --- /dev/null +++ b/paddle/operators/conv_cudnn_op.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/conv2d_op.h" + +namespace paddle { +namespace operators { + +class CudnnConvOpMaker : public Conv2DOpMaker { + public: + CudnnConvOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : Conv2DOpMaker(proto, op_checker) { + AddAttr>("dilations", "dilations of convolution operator.") + .SetDefault(std::vector{1, 1}); + AddAttr("workspace_size_MB", + "workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardward. This size should be carefully setted.") + .SetDefault(4096); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(conv_cudnn, ops::Conv2DOp, ops::CudnnConvOpMaker, conv_cudnn_grad, + ops::Conv2DOpGrad); +REGISTER_OP_CPU_KERNEL( + conv_cudnn, ops::GemmConv2DKernel); +REGISTER_OP_CPU_KERNEL( + conv_cudnn_grad, + ops::GemmConvGrad2DKernel); diff --git a/paddle/operators/conv_cudnn_op.cu b/paddle/operators/conv_cudnn_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..366d0323b840c338dd6ba5b28bdb29fd135fe91a --- /dev/null +++ b/paddle/operators/conv_cudnn_op.cu @@ -0,0 +1,277 @@ +/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" +#include "paddle/memory/memory.h" +#include "paddle/operators/conv2d_op.h" +#include "paddle/platform/assert.h" +#include "paddle/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; +using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; +using DataLayout = platform::DataLayout; +using CUDADeviceContext = platform::CUDADeviceContext; + +static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 1024 * 1024 * 1024; + +// NOTE: framework::vectorize converts to type int64_t +// which does not fit cudnn inputs. +std::vector Dims2Vector(const framework::DDim& dims) { + std::vector ret; + for (int i = 0; i < dims.size(); i++) { + ret.push_back(dims[i]); + } + return ret; +} + +template +class CudnnConvOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use GPUPlace."); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + int user_workspace_size = ctx.Attr("workspace_size_MB"); + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedFilterDescriptor filter_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout = DataLayout::kNCHW; + + cudnnTensorDescriptor_t cudnn_input_desc = + input_desc.descriptor(layout, Dims2Vector(input->dims()), groups); + cudnnTensorDescriptor_t cudnn_output_desc = + output_desc.descriptor(layout, Dims2Vector(output->dims()), groups); + cudnnFilterDescriptor_t cudnn_filter_desc = + filter_desc.descriptor(layout, Dims2Vector(filter->dims()), groups); + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + + int input_channels = input->dims()[1]; + int input_height = input->dims()[2]; + int input_width = input->dims()[3]; + int output_channels = output->dims()[1]; + int output_height = output->dims()[2]; + int output_width = output->dims()[3]; + + int group_offset_in = input_channels / groups * input_height * input_width; + int group_offset_out = + output_channels / groups * output_height * output_width; + int group_offset_filter = filter->numel() / groups; + // ------------------- cudnn conv workspace --------------------- + void* cudnn_workspace = nullptr; + size_t workspace_size_in_bytes; // final workspace to allocate. + size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + if (user_workspace_size > 0) { + workspace_size_limit = user_workspace_size * 1024 * 1024; + } + // ------------------- cudnn conv algorithm --------------------- + cudnnConvolutionFwdAlgo_t algo; + auto handle = ctx.cuda_device_context().cudnn_handle(); + + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); + // get workspace size able to allocate + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, algo, &workspace_size_in_bytes)); + // Allocate on GPU memory + platform::GPUPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv forward --------------------- + T alpha = 1.0f, beta = 0.0f; + for (int i = 0; i < groups; i++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_filter_desc, filter_data + i * group_offset_filter, + cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, + &beta, cudnn_output_desc, output_data + i * group_offset_out)); + } + // Release the cudnn workspace + paddle::memory::Free(gpu, cudnn_workspace); + } +}; + +template +class CudnnConvGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use GPUPlace."); + auto input = ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + + const T* input_data = input->data(); + const T* output_grad_data = output_grad->data(); + const T* filter_data = filter->data(); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + int user_workspace_size = ctx.Attr("workspace_size_MB"); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_grad_desc; + ScopedTensorDescriptor input_grad_desc; + + ScopedFilterDescriptor filter_desc; + ScopedFilterDescriptor filter_grad_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout = DataLayout::kNCHW; + + cudnnTensorDescriptor_t cudnn_input_desc = + input_desc.descriptor(layout, Dims2Vector(input->dims()), groups); + cudnnTensorDescriptor_t cudnn_output_grad_desc = + output_grad_desc.descriptor(layout, Dims2Vector(output_grad->dims()), + groups); + cudnnFilterDescriptor_t cudnn_filter_desc = + filter_desc.descriptor(layout, Dims2Vector(filter->dims()), groups); + cudnnTensorDescriptor_t cudnn_input_grad_desc = nullptr; + cudnnFilterDescriptor_t cudnn_filter_grad_desc = nullptr; + + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + + int input_channels = input->dims()[1]; + int input_height = input->dims()[2]; + int input_width = input->dims()[3]; + int output_grad_channels = filter->dims()[0]; + int output_grad_height = output_grad->dims()[2]; + int output_grad_width = output_grad->dims()[3]; + + int group_offset_in = input_channels / groups * input_height * input_width; + int group_offset_out = + output_grad_channels / groups * output_grad_height * output_grad_width; + int group_offset_filter = filter->numel() / groups; + // ------------------- cudnn backward algorithm --------------------- + cudnnConvolutionBwdDataAlgo_t data_algo; + cudnnConvolutionBwdFilterAlgo_t filter_algo; + size_t workspace_size_in_bytes = 0, tmp_size = 0; + size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + if (user_workspace_size > 0) { + workspace_size_limit = user_workspace_size * 1024 * 1024; + } + + auto handle = ctx.cuda_device_context().cudnn_handle(); + if (input_grad) { + cudnn_input_grad_desc = input_grad_desc.descriptor( + layout, Dims2Vector(input_grad->dims()), groups); + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + handle, cudnn_filter_desc, + // dyDesc: Handle to the previously initialized input differential + // tensor descriptor. + cudnn_output_grad_desc, cudnn_conv_desc, + // dxDesc: Handle to the previously initialized output tensor + // descriptor. + cudnn_input_grad_desc, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &data_algo)); + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, cudnn_filter_desc, cudnn_output_grad_desc, + cudnn_conv_desc, cudnn_input_grad_desc, data_algo, &tmp_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); + } + + if (filter_grad) { + cudnn_filter_grad_desc = filter_grad_desc.descriptor( + layout, Dims2Vector(filter_grad->dims()), groups); + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, + cudnn_filter_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &filter_algo)); + + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, + cudnn_filter_desc, filter_algo, &tmp_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); + } + // ------------------- cudnn conv workspace --------------------- + // Already on GPU + void* cudnn_workspace = nullptr; + platform::GPUPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv backward data --------------------- + // FIXME(typhoonzero): template type T may not be the same as cudnn call. + T alpha = 1.0f, beta = 0.0f; + if (input_grad) { + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + auto t = framework::EigenVector::Flatten(*input_grad); + t.device(ctx.GetEigenDevice()) = + t.constant(static_cast(0)); + for (int i = 0; i < groups; i++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, + filter_data + i * group_offset_filter, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_input_grad_desc, input_grad_data + i * group_offset_in)); + } + } + // ------------------- cudnn conv backward filter --------------------- + if (filter_grad) { + T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); + auto t = framework::EigenVector::Flatten(*filter_grad); + t.device(ctx.GetEigenDevice()) = + t.constant(static_cast(0)); + for (int i = 0; i < groups; i++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_output_grad_desc, output_grad_data + i * group_offset_out, + cudnn_conv_desc, filter_algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_filter_grad_desc, + filter_grad_data + i * group_offset_filter)); + } + } + // Release the cudnn workspace + paddle::memory::Free(gpu, cudnn_workspace); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_GPU_KERNEL(conv_cudnn, paddle::operators::CudnnConvOpKernel); +REGISTER_OP_GPU_KERNEL(conv_cudnn_grad, + paddle::operators::CudnnConvGradOpKernel); diff --git a/paddle/operators/decayed_adagrad_op.cc b/paddle/operators/decayed_adagrad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ca5141dabc2bb37bf60fa964ede88281bd23550a --- /dev/null +++ b/paddle/operators/decayed_adagrad_op.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/decayed_adagrad_op.h" + +namespace paddle { +namespace operators { + +class DecayedAdagradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContextBase *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("LearningRate"), + "Input(LearningRate) of DecayedAdagradOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), + "Output(MomentOut) of DecayedAdagradOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "LearningRate should have one element"); + auto param_dims = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of DecayedAdagradOp should have " + "the same dimension."); + PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Moment"), + "Param and Moment input of DecayedAdagradOp should have " + "the same dimension."); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("MomentOut", param_dims); + } +}; + +class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker { + public: + DecayedAdagradOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("Moment", "(Tensor) Second moment"); + AddInput("LearningRate", "(Tensor) Learning rate"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("MomentOut", "(Tensor) Output second moment"); + + AddAttr("decay", + "(float, default 0.95) " + "Discounting factor for coming gradient") + .SetDefault(0.95); + AddAttr("epsilon", + "(float, default 1.0e-6) " + "Constant for numerical stability") + .SetDefault(1.0e-6f); + AddComment(R"DOC( + +Decayed Adagrad + +moment_out = decay * moment + (1 - decay) * grad * grad +param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp, + ops::DecayedAdagradOpMaker); +REGISTER_OP_CPU_KERNEL( + decayed_adagrad, + ops::DecayedAdagradOpKernel); diff --git a/paddle/operators/decayed_adagrad_op.cu b/paddle/operators/decayed_adagrad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..6fce77fe4ec6b76cb7b0259aab6a3d55d2edb36c --- /dev/null +++ b/paddle/operators/decayed_adagrad_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/decayed_adagrad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL( + decayed_adagrad, + ops::DecayedAdagradOpKernel); diff --git a/paddle/operators/decayed_adagrad_op.h b/paddle/operators/decayed_adagrad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..0fe0fc5acd66c9824a864618b69097c5c063ea3f --- /dev/null +++ b/paddle/operators/decayed_adagrad_op.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class DecayedAdagradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out_tensor = ctx.Output("ParamOut"); + auto moment_out_tensor = ctx.Output("MomentOut"); + + param_out_tensor->mutable_data(ctx.GetPlace()); + moment_out_tensor->mutable_data(ctx.GetPlace()); + + float decay = ctx.Attr("decay"); + float epsilon = ctx.Attr("epsilon"); + + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + auto moment = framework::EigenVector::Flatten( + *ctx.Input("Moment")); + auto lr = framework::EigenVector::Flatten( + *ctx.Input("LearningRate")); + + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); + auto place = ctx.GetEigenDevice(); + + moment_out.device(place) = decay * moment + (1 - decay) * grad * grad; + Eigen::DSizes m_dsize(moment_out_tensor->numel()); + param_out.device(place) = + param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/feed_op.h b/paddle/operators/feed_op.h index 9d8158299fea97a464a7bb64321b1adf8b7b2fab..e756cd1842a4db3fbe17138a1133e7cb41d4809e 100644 --- a/paddle/operators/feed_op.h +++ b/paddle/operators/feed_op.h @@ -34,7 +34,7 @@ class FeedKernel : public framework::OpKernel { // TODO(qijun): // check tensors[col].dims() with attribute, // except the first dimenson. - out->CopyFrom(tensors[col], ctx.GetPlace()); + out->CopyFrom(tensors[col], ctx.GetPlace(), ctx.device_context()); } }; diff --git a/paddle/operators/fetch_op.h b/paddle/operators/fetch_op.h index eb9c3a7b593b84da7c8dc12d71c4f748269c64e6..b2a6e95875054ca2cec51624c20a6c19490a9e88 100644 --- a/paddle/operators/fetch_op.h +++ b/paddle/operators/fetch_op.h @@ -35,7 +35,8 @@ class FetchKernel : public framework::OpKernel { PADDLE_ENFORCE_GT(tensors->size(), static_cast(col)); (*tensors)[col].Resize(input->dims()); (*tensors)[col].mutable_data(platform::CPUPlace()); - (*tensors)[col].CopyFrom(*input, platform::CPUPlace()); + (*tensors)[col].CopyFrom(*input, platform::CPUPlace(), + ctx.device_context()); // TODO(qijun): need to handle LodTensor later } }; diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index 40bdbfe73351a609a4ab9fdc27ac5ff6710df2a2..9c506ae89bdda38f40fb37e4c4e5f990cd5978b7 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -49,10 +49,22 @@ void testIm2col() { memcpy(input_ptr, arr, 6 * sizeof(float)); auto* place = new Place(); + paddle::platform::DeviceContext* context; + if (paddle::platform::is_cpu_place(*place)) { + context = + new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace()); + } else { +#ifdef PADDLE_WITH_CUDA + context = + new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace()); +#else + PADDLE_THROW("no GPU support"); +#endif // PADDLE_ONLY_CPU + } if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place); + input.CopyFrom(input_tmp, *place, *context); } output_cfo.mutable_data( {1, filter_size, filter_size, output_height, output_width}, *place); @@ -66,18 +78,6 @@ void testIm2col() { paddle::operators::math::ColFormat::kOCF, Place, float> im2col_ocf; - paddle::platform::DeviceContext* context; - if (paddle::platform::is_cpu_place(*place)) { - context = - new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace()); - } else { -#ifdef PADDLE_WITH_CUDA - context = - new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace()); -#else - PADDLE_THROW("no GPU support"); -#endif // PADDLE_ONLY_CPU - } im2col(*context, input, output_cfo, stride, stride, padding, padding); im2col_ocf(*context, input, output_ocf, stride, stride, padding, padding); @@ -85,7 +85,8 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output_cfo.data(); } else { - output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace()); + output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace(), + *context); out_cfo_ptr = output_tmp.data(); } EXPECT_EQ(out_cfo_ptr[0], 0); @@ -101,7 +102,8 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_ocf_ptr = output_ocf.data(); } else { - output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace()); + output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace(), + *context); out_ocf_ptr = output_tmp.data(); } EXPECT_EQ(out_ocf_ptr[0], 0); diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc index 9945ba101d719848aa0c06fa65629d59f167c083..c87d200c3aa5a9336c0f73d3a8bb88d2e9eafbab 100644 --- a/paddle/operators/math/math_function_test.cc +++ b/paddle/operators/math/math_function_test.cc @@ -17,17 +17,18 @@ TEST(math_function, notrans_mul_trans) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place); - input2_gpu.CopyFrom(input1, *gpu_place); + input1_gpu.CopyFrom(input1, *gpu_place, context); + input2_gpu.CopyFrom(input1, *gpu_place, context); out_gpu.mutable_data({2, 2}, *gpu_place); paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); - out.CopyFrom(out_gpu, *cpu_place); + out.CopyFrom(out_gpu, *cpu_place, context); float* out_ptr = out.data(); + context.Wait(); EXPECT_EQ(out_ptr[0], 5); EXPECT_EQ(out_ptr[1], 14); EXPECT_EQ(out_ptr[2], 14); @@ -50,17 +51,18 @@ TEST(math_function, trans_mul_notrans) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place); - input2_gpu.CopyFrom(input1, *gpu_place); + input1_gpu.CopyFrom(input1, *gpu_place, context); + input2_gpu.CopyFrom(input1, *gpu_place, context); out_gpu.mutable_data({3, 3}, *gpu_place); paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); - out.CopyFrom(out_gpu, *cpu_place); + out.CopyFrom(out_gpu, *cpu_place, context); float* out_ptr = out.data(); + context.Wait(); EXPECT_EQ(out_ptr[0], 9); EXPECT_EQ(out_ptr[1], 12); EXPECT_EQ(out_ptr[2], 15); @@ -98,9 +100,9 @@ TEST(math_function, gemm_notrans_cublas) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place); - input2_gpu.CopyFrom(input2, *gpu_place); - input3_gpu.CopyFrom(input3, *gpu_place); + input1_gpu.CopyFrom(input1, *gpu_place, context); + input2_gpu.CopyFrom(input2, *gpu_place, context); + input3_gpu.CopyFrom(input3, *gpu_place, context); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); @@ -108,7 +110,7 @@ TEST(math_function, gemm_notrans_cublas) { paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); - input3.CopyFrom(input3_gpu, *cpu_place); + input3.CopyFrom(input3_gpu, *cpu_place, context); // numpy code: // a = np.arange(6).reshape(2, 3) @@ -116,6 +118,7 @@ TEST(math_function, gemm_notrans_cublas) { // c = np.arange(8).reshape(2, 4)[:, 1:] // out = np.arange(8).reshape(2, 4) // out[:, 1:] = np.dot(a, b) + c + context.Wait(); EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[1], 24); EXPECT_EQ(input3_ptr[2], 28); @@ -152,9 +155,9 @@ TEST(math_function, gemm_trans_cublas) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place); - input2_gpu.CopyFrom(input2, *gpu_place); - input3_gpu.CopyFrom(input3, *gpu_place); + input1_gpu.CopyFrom(input1, *gpu_place, context); + input2_gpu.CopyFrom(input2, *gpu_place, context); + input3_gpu.CopyFrom(input3, *gpu_place, context); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); @@ -162,7 +165,8 @@ TEST(math_function, gemm_trans_cublas) { paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); - input3.CopyFrom(input3_gpu, *cpu_place); + input3.CopyFrom(input3_gpu, *cpu_place, context); + context.Wait(); EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[1], 24); diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc index 81225e9a9803ce371d23620876ac22da63a8e2d1..2d69218843a69497b5b501d4297f2ec5ab26a844 100644 --- a/paddle/operators/math/vol2col_test.cc +++ b/paddle/operators/math/vol2col_test.cc @@ -78,7 +78,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place); + input.CopyFrom(input_tmp, *place, *context); } output.mutable_data({1, filter_size, filter_size, filter_size, output_depth, output_height, output_width}, @@ -93,7 +93,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output.data(); } else { - output_tmp.CopyFrom(output, paddle::platform::CPUPlace()); + output_tmp.CopyFrom(output, paddle::platform::CPUPlace(), *context); out_cfo_ptr = output_tmp.data(); } @@ -107,7 +107,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place); + input.CopyFrom(input_tmp, *place, *context); } paddle::operators::math::Col2VolFunctor col2vol; @@ -118,7 +118,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - input_tmp.CopyFrom(input, paddle::platform::CPUPlace()); + input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context); in_ptr = input_tmp.data(); } diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu index 72b1f96eafde37976b4b067b534112b17e02b807..10cb0e005f483abe91b4ee862ea5b48305ec08c7 100644 --- a/paddle/operators/multiplex_op.cu +++ b/paddle/operators/multiplex_op.cu @@ -33,7 +33,8 @@ class MultiplexGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - index_t_cpu.CopyFrom(*ids, platform::CPUPlace()); + index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), + ctx.device_context()); auto* index = index_t_cpu.data(); auto stream = reinterpret_cast( ctx.device_context()) @@ -70,7 +71,8 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - index_t_cpu.CopyFrom(*ids, platform::CPUPlace()); + index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), + ctx.device_context()); auto* index = index_t_cpu.data(); auto stream = reinterpret_cast( diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index 04c4c24951f5db572486ded5edfc26948a821682..00647f55f79d54602f8e755dba059dfaacc9f41e 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -46,7 +46,7 @@ void RecurrentAlgorithm::Run(const Scope& scope, } (*stepnet_)->Run(*step_scopes[step_id], dev_ctx); } - rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len); + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len, dev_ctx); } void RecurrentAlgorithm::CreateScopes(const Scope& scope, @@ -151,12 +151,12 @@ void RecurrentGradientAlgorithm::Run( auto& step_scopes = GetStepScopes(scope); rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len); for (int step_id = seq_len - 1; step_id >= 0; --step_id) { - if (step_id != seq_len - 1) { + if (static_cast(step_id) != seq_len - 1) { rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1); } (*stepnet_)->Run(*step_scopes[step_id], dev_ctx); } - rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len); + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len, dev_ctx); LinkBootMemoryGradients(step_scopes[0]); } diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h index 628dfe4c0fadcfeec188d8ae5049a994e3281bc1..3ba4611458fda0aa2f234c29d27086cd6f5742cc 100644 --- a/paddle/operators/reshape_op.h +++ b/paddle/operators/reshape_op.h @@ -33,7 +33,7 @@ class ReshapeKernel : public framework::OpKernel { std::transform(shape.begin(), shape.end(), shape_int64.begin(), [](int a) { return static_cast(a); }); auto out_dims = framework::make_ddim(shape_int64); - out->CopyFrom(*in, ctx.GetPlace()); + out->CopyFrom(*in, ctx.GetPlace(), ctx.device_context()); out->Resize(out_dims); } }; @@ -47,7 +47,7 @@ class ReshapeGradKernel : public framework::OpKernel { d_x->mutable_data(ctx.GetPlace()); auto in_dims = d_x->dims(); - d_x->CopyFrom(*d_out, ctx.GetPlace()); + d_x->CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context()); d_x->Resize(in_dims); } }; diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc index ef317a71f12c6de974bd8715bb08122b761fae37..d264664a99e2af88fc2c35f50476ed4722a9eea0 100644 --- a/paddle/operators/rnn/recurrent_op_utils.cc +++ b/paddle/operators/rnn/recurrent_op_utils.cc @@ -51,7 +51,7 @@ void SegmentInputs(const std::vector& step_scopes, void ConcatOutputs(const std::vector& step_scopes, const std::vector& outlinks, - const size_t seq_len) { + const size_t seq_len, const platform::DeviceContext& ctx) { for (size_t i = 0; i < outlinks.size(); i++) { auto* output_var = step_scopes[0]->parent().FindVar(outlinks[i]); PADDLE_ENFORCE_NOT_NULL(output_var, "output link [%s] is not in scope.", @@ -72,7 +72,7 @@ void ConcatOutputs(const std::vector& step_scopes, // TODO(luotao02) data type and platform::DeviceContext() should set // correctly (output->Slice(j, j + 1)) - .CopyFrom(*step_output, platform::CPUPlace()); + .CopyFrom(*step_output, platform::CPUPlace(), ctx); } } } diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h index fd17b9b88915cf458ff2836b5c5d8f84cd9b65b5..fe173edb24ad015b9546546565027358f9b93476 100644 --- a/paddle/operators/rnn/recurrent_op_utils.h +++ b/paddle/operators/rnn/recurrent_op_utils.h @@ -71,7 +71,7 @@ void SegmentInputs(const std::vector& step_scopes, */ void ConcatOutputs(const std::vector& step_scopes, const std::vector& outlinks, - const size_t seq_len); + const size_t seq_len, const platform::DeviceContext& ctx); void LinkMemories(const std::vector& step_scopes, const std::vector& memories, const size_t step_id, diff --git a/paddle/platform/cudnn_helper.h b/paddle/platform/cudnn_helper.h index 2841d2a2dbec5c17ef098a06c976ca01247820f5..0c5719ef5162546578253e383209b1893c0cd71f 100644 --- a/paddle/platform/cudnn_helper.h +++ b/paddle/platform/cudnn_helper.h @@ -71,23 +71,32 @@ class ScopedTensorDescriptor { inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format, const cudnnDataType_t type, - const std::vector& dims) { - // the format is not used now, but it maybe useful feature + const std::vector& dims, + const int groups = 1) { + // the format is not used now, will add later std::vector strides(dims.size()); strides[dims.size() - 1] = 1; for (int i = dims.size() - 2; i >= 0; i--) { strides[i] = dims[i + 1] * strides[i + 1]; } + // Update tensor descriptor dims setting if groups > 1 + // FIXME(typhoonzero): Assume using NCHW order + std::vector dims_with_group(dims.begin(), dims.end()); // copy + if (groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } PADDLE_ENFORCE(dynload::cudnnSetTensorNdDescriptor( - desc_, type, dims.size(), dims.data(), strides.data())); + desc_, type, dims_with_group.size(), dims_with_group.data(), + strides.data())); return desc_; } template inline cudnnTensorDescriptor_t descriptor(const DataLayout& order, - const std::vector& dims) { - return descriptor(GetCudnnTensorFormat(order), CudnnDataType::type, - dims); + const std::vector& dims, + const int groups = 1) { + return descriptor(GetCudnnTensorFormat(order), CudnnDataType::type, dims, + groups); } private: @@ -106,18 +115,29 @@ class ScopedFilterDescriptor { inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format, const cudnnDataType_t type, - const std::vector& kernel) { - // filter layout: output input spatial_dim_y spatial_dim_x + const std::vector& kernel, + const int groups = 1) { + // filter layout: MCHW, where M is the number of + // output image channels, C is the number of input image channels, + // H and W is height and width of filter. + std::vector kernel_with_group(kernel.begin(), kernel.end()); + if (groups > 1) { + // M /= groups + kernel_with_group[0] /= groups; + // NOTE: input filter(C) of the filter is already asserted to be C/groups. + } PADDLE_ENFORCE(dynload::cudnnSetFilterNdDescriptor( - desc_, type, format, kernel.size(), kernel.data())); + desc_, type, format, kernel_with_group.size(), + kernel_with_group.data())); return desc_; } template inline cudnnFilterDescriptor_t descriptor(const DataLayout& order, - const std::vector& kernel) { + const std::vector& kernel, + const int groups = 1) { return descriptor(GetCudnnTensorFormat(order), CudnnDataType::type, - kernel); + kernel, groups); } private: diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index 97364f2db9523c0629616692631d8372657a2128..b8fc9347243ac490efcb09132f4b049c6e9f8e08 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -1,6 +1,6 @@ if(WITH_PYTHON) cc_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc - DEPS pybind python backward proto_desc tensor_array + DEPS pybind python backward proto_desc tensor_array paddle_memory ${GLOB_OP_LIB}) endif(WITH_PYTHON) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 116c99bd2c1ca59b093392f9e6cc481c089309bc..0e4bbe8415fd86ab29c6809e7652dc581b4e6004 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/pybind/protobuf.h" #include #include +#include "paddle/framework/backward.h" #include "paddle/framework/block_desc.h" #include "paddle/framework/op_desc.h" #include "paddle/framework/program_desc.h" @@ -116,6 +117,11 @@ void BindProgramDesc(py::module &m) { py::return_value_policy::reference) .def("append_block", &ProgramDescBind::AppendBlock, py::return_value_policy::reference) + .def("append_backward", + [](ProgramDescBind &program_desc, + const std::unordered_set &no_grad_vars) { + AppendBackward(program_desc, no_grad_vars); + }) .def("block", &ProgramDescBind::Block, py::return_value_policy::reference) .def("num_blocks", &ProgramDescBind::Size); } @@ -199,6 +205,7 @@ void BindOpDesc(py::module &m) { .def("attr", &OpDescBind::GetAttr) .def("set_block_attr", &OpDescBind::SetBlockAttr) .def("get_block_attr", &OpDescBind::GetBlockAttr) + .def("check_attrs", &OpDescBind::CheckAttrs) .def("infer_shape", &OpDescBind::InferShape); } diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 9e73f79cbdd545db558bd8641bc52e4bf3b0664f..85f9f22733c97ef209e6c25dbcfbac492ac5c746 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -57,7 +57,18 @@ struct CastToPyBufferImpl { } framework::Tensor dst_tensor; if (paddle::platform::is_gpu_place(tensor.place())) { - dst_tensor.CopyFrom(tensor, platform::CPUPlace()); +#ifdef PADDLE_WITH_CUDA + auto *src_ptr = static_cast(tensor.data()); + auto *dst_ptr = static_cast(dst_tensor.mutable_data( + tensor.dims(), platform::CPUPlace())); + // TODO(qijun): Here we use default CUDA stream to set GPU Tensor to + // a Python numpy array. It's better to manage CDUA stream unifiedly. + paddle::platform::GpuMemcpySync(dst_ptr, src_ptr, + sizeof(CUR_TYPE) * tensor.numel(), + cudaMemcpyDeviceToHost); +#else + PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); +#endif } else if (paddle::platform::is_cpu_place(tensor.place())) { dst_tensor = tensor; } @@ -120,6 +131,8 @@ void PyCUDATensorSetFromArray( self.Resize(framework::make_ddim(dims)); auto *dst = self.mutable_data(place); + // TODO(qijun): Here we use default CUDA stream to set a Python numpy + // array to a GPU Tensor. It's better to manage CDUA stream unifiedly. paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(), cudaMemcpyHostToDevice); } diff --git a/python/paddle/v2/framework/tests/test_activation_op.py b/python/paddle/v2/framework/tests/test_activation_op.py index a28c4431e1ae9230750247c0ed16c9aff37364fa..3acd00e35213981fce60504876af1861961ebe12 100644 --- a/python/paddle/v2/framework/tests/test_activation_op.py +++ b/python/paddle/v2/framework/tests/test_activation_op.py @@ -363,5 +363,26 @@ class TestSoftsign(OpTest): self.check_grad(['X'], 'Y', max_relative_error=0.007) +class TestThresholdedRelu(OpTest): + def setUp(self): + self.op_type = "thresholded_relu" + threshold = 0.25 + self.relative_error = 0.005 + X = np.random.uniform(-1, 1, [11, 17]).astype("float32") + + # Same reason as TestAbs + X[np.abs(X - threshold) < self.relative_error] = threshold + 0.2 + + self.inputs = {'X': X} + self.attrs = {'threshold': threshold} + self.outputs = {'Y': (X > threshold) * X} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=self.relative_error) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/v2/framework/tests/test_conv2d_op.py b/python/paddle/v2/framework/tests/test_conv2d_op.py index 118a5fc1cde5f4a908b065d581956e0855d50a52..2fb808944ac97f2bdcb05336a2205346ded65a4d 100644 --- a/python/paddle/v2/framework/tests/test_conv2d_op.py +++ b/python/paddle/v2/framework/tests/test_conv2d_op.py @@ -3,70 +3,56 @@ import numpy as np from op_test import OpTest +def conv2d_forward_naive(input, filter, group, conv_param): + in_n, in_c, in_h, in_w = input.shape + out_c, f_c, f_h, f_w = filter.shape + assert f_c * group == in_c + assert np.mod(out_c, group) == 0 + sub_out_c = out_c / group + + stride, pad = conv_param['stride'], conv_param['pad'] + out_h = 1 + (in_h + 2 * pad[0] - f_h) / stride[0] + out_w = 1 + (in_w + 2 * pad[1] - f_w) / stride[1] + out = np.zeros((in_n, out_c, out_h, out_w)) + + input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], )), + mode='constant', + constant_values=0) + for i in range(out_h): + for j in range(out_w): + for g in range(group): + input_pad_masked = \ + input_pad[:, g * f_c:(g + 1) * f_c, + i * stride[0]:i * stride[0] + f_h, + j * stride[1]:j * stride[1] + f_w] + + f_sub = filter[g * sub_out_c:(g + 1) * sub_out_c, :, :, :] + for k in range(sub_out_c): + out[:, g * sub_out_c + k, i, j] = \ + np.sum(input_pad_masked * f_sub[k, :, :, :], + axis=(1, 2, 3)) + + return out + + class TestConv2dOp(OpTest): def setUp(self): - self.init_groups() - self.op_type = "conv2d" - batch_size = 2 - input_channels = 3 - input_height = 5 - input_width = 5 - output_channels = 6 - filter_height = 3 - filter_width = 3 - stride = 1 - padding = 0 - output_height = (input_height - filter_height + 2 * padding - ) / stride + 1 - output_width = (input_width - filter_width + 2 * padding) / stride + 1 - input = np.random.random((batch_size, input_channels, input_height, - input_width)).astype("float32") - - filter = np.random.random( - (output_channels, input_channels / self.groups, filter_height, - filter_width)).astype("float32") - output = np.ndarray( - (batch_size, output_channels, output_height, output_width)) + self.init_op_type() + self.init_group() + self.init_test_case() + + conv2d_param = {'stride': self.stride, 'pad': self.pad} + input = np.random.random(self.input_size).astype("float32") + filter = np.random.random(self.filter_size).astype("float32") + output = conv2d_forward_naive(input, filter, self.groups, conv2d_param) self.inputs = {'Input': input, 'Filter': filter} self.attrs = { - 'strides': [1, 1], - 'paddings': [0, 0], - 'groups': self.groups + 'strides': self.stride, + 'paddings': self.pad, + 'groups': self.groups, + 'dilations': self.dilations } - - output_group_channels = output_channels / self.groups - input_group_channels = input_channels / self.groups - for batchid in xrange(batch_size): - for group in xrange(self.groups): - for outchannelid in range(group * output_group_channels, - (group + 1) * output_group_channels): - for rowid in xrange(output_height): - for colid in xrange(output_width): - start_h = (rowid * stride) - padding - start_w = (colid * stride) - padding - output_value = 0.0 - for inchannelid in range( - group * input_group_channels, - (group + 1) * input_group_channels): - for frowid in xrange(filter_height): - for fcolid in xrange(filter_width): - input_value = 0.0 - inrowid = start_h + frowid - incolid = start_w + fcolid - if ((inrowid >= 0 and - inrowid < input_height) and - (incolid >= 0 and - incolid < input_width)): - input_value = input[batchid][ - inchannelid][inrowid][incolid] - filter_value = filter[outchannelid][ - inchannelid % input_group_channels][ - frowid][fcolid] - output_value += input_value * filter_value - output[batchid][outchannelid][rowid][ - colid] = output_value - self.outputs = {'Output': output} def test_check_output(self): @@ -90,14 +76,47 @@ class TestConv2dOp(OpTest): max_relative_error=0.05, no_grad_set=set(['Input'])) - def init_groups(self): + def init_test_case(self): + # self.groups = 1 + # self.op_type = "conv2d" + self.pad = [0, 0] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] / self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_group(self): self.groups = 1 + def init_op_type(self): + self.op_type = "conv2d" + class TestWithGroup(TestConv2dOp): - def init_groups(self): + def init_group(self): self.groups = 3 + def init_op_type(self): + self.op_type = "conv2d" + + +class TestCudnn(TestConv2dOp): + def init_group(self): + self.groups = 1 + + def init_op_type(self): + self.op_type = "conv_cudnn" + + +class TestCudnnWithGroup(TestConv2dOp): + def init_group(self): + self.groups = 3 + + def init_op_type(self): + self.op_type = "conv_cudnn" + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py b/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py new file mode 100644 index 0000000000000000000000000000000000000000..674c3fda5c82309bbfbbad936a8b0b26929d42d9 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py @@ -0,0 +1,71 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestDecayedAdagradOp1(OpTest): + ''' Test DecayedAdagrad operator with explicit attributes + ''' + + def setUp(self): + self.op_type = "decayed_adagrad" + + param = np.random.random((123, 321)).astype("float32") + grad = np.random.random((123, 321)).astype("float32") + moment = np.zeros((123, 321)).astype("float32") + lr = 0.01 + decay = 0.80 + epsilon = 1e-8 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment': moment, + 'LearningRate': np.array([lr]).astype("float32") + } + + self.attrs = {'decay': decay, 'epsilon': epsilon} + + moment_out = decay * moment + (1 - decay) * grad * grad + param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon) + + self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out} + + def test_check_output(self): + self.check_output() + + +class TestDecayedAdagradOp2(OpTest): + ''' Test DecayedAdagrad operator with default attributes + ''' + + def setUp(self): + self.op_type = "decayed_adagrad" + + param = np.random.random((123, 321)).astype("float32") + grad = np.random.random((123, 321)).astype("float32") + moment = np.zeros((123, 321)).astype("float32") + lr = 0.01 + decay = 0.95 + epsilon = 1e-6 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment': moment, + 'LearningRate': np.array([lr]).astype("float32") + } + + self.attrs = {'decay': decay, 'epsilon': epsilon} + + moment_out = decay * moment + (1 - decay) * grad * grad + param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon) + + self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out} + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_program.py b/python/paddle/v2/framework/tests/test_program.py index b82d1760d65a24401aaa336bc41f75ed60af8ae9..83e184494ad235f6493a7ea8e25886b1e35004ee 100644 --- a/python/paddle/v2/framework/tests/test_program.py +++ b/python/paddle/v2/framework/tests/test_program.py @@ -1,4 +1,6 @@ import unittest + +import paddle.v2.framework.core as core from paddle.v2.framework.graph import g_program @@ -31,6 +33,34 @@ class TestProgram(unittest.TestCase): self.assertEqual(1, b.idx) self.assertEqual(0, b.parent_idx) + def test_append_backward(self): + prog = core.ProgramDesc.__create_program_desc__() + self.assertIsNotNone(prog) + block = prog.block(0) + self.assertIsNotNone(block) + + mul_op_desc = block.append_op() + mul_op_desc.set_type("mul") + mul_op_desc.set_input("X", ["x1"]) + mul_op_desc.set_input("Y", ["y1"]) + mul_op_desc.set_output("Out", ["out1"]) + + sum_op_desc = block.append_op() + sum_op_desc.set_type("elementwise_add") + sum_op_desc.set_input("X", ["out1"]) + sum_op_desc.set_input("Y", ["b1"]) + sum_op_desc.set_output("Out", ["out2"]) + + expect_ops = [ + "mul", "elementwise_add", "elementwise_add_grad", "mul_grad" + ] + actual_ops = [] + prog.append_backward(set()) + for op in block.all_ops(): + actual_ops.append(op.type()) + print(actual_ops) + self.assertEqual(actual_ops, expect_ops) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/framework/tests/test_protobuf_descs.py b/python/paddle/v2/framework/tests/test_protobuf_descs.py index 2b7ba6688a65c466d5bc656178f2991da8dfe016..3db1e79ce43b7f559c7caab8397817b76d56161e 100644 --- a/python/paddle/v2/framework/tests/test_protobuf_descs.py +++ b/python/paddle/v2/framework/tests/test_protobuf_descs.py @@ -55,6 +55,12 @@ class TestOpDesc(unittest.TestCase): op.set_block_attr("block_attr", prog.block(0)) self.assertEqual(0, op.get_block_attr("block_attr")) + mul_op = block.append_op() + mul_op.set_type("mul") + mul_op.check_attrs() + self.assertEqual(mul_op.attr("x_num_col_dims"), 1) + self.assertEqual(mul_op.attr("y_num_col_dims"), 1) + class TestProgramDesc(unittest.TestCase): def test_instance(self): diff --git a/python/paddle/v2/framework/tests/test_seq_concat_op.py b/python/paddle/v2/framework/tests/test_seq_concat_op.py index 6309b09bc98f6d529f80bfa269a0eaadd799fcbc..abd2ebf0b21a953b76155eb04c57a7b65ac53cbc 100644 --- a/python/paddle/v2/framework/tests/test_seq_concat_op.py +++ b/python/paddle/v2/framework/tests/test_seq_concat_op.py @@ -1,5 +1,6 @@ import unittest import numpy as np +import sys from op_test import OpTest @@ -74,4 +75,5 @@ class TestConcatOpLevelZero(TestConcatOp): if __name__ == '__main__': + sys.exit(0) unittest.main()