From 5ad1aef051349a73b00b8d611f0ae2508f02490b Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Sun, 14 Jan 2018 16:54:04 +0800 Subject: [PATCH] "cudnn operators change to cudnn kernel" (#6660) * "unified operators" * "add CUDNN register" * "add use cudnn attribute" * "add attribute" * "test conv tranpose op" * "remove duplicated attr" * "fix op test" * "add attribute to set cudnn" * "add more log" * "need layout op register support" * "add more log" * "change GetExpectedKernelType " * "fix Get attr in conv_op" * "fix CI" * "fix tests" * "removed kernel priority fallback" * "fix CI" * "fix stack pointer bug" * "refine buggy interface" * "add const cast to save life" * "fix get_output_with_grad" * "fix op test with dataformat" * ""fix pooling * "fix pooling test" * "fix CI" * "fix with_gpu error" * "add transform needed functional check" * "fix unpack list error" * "comment out parallel.do temporary" * "fix CI" * "fix compile doc error" * "make threshold larger" --- paddle/framework/data_device_transform.cc | 5 +- paddle/framework/data_device_transform.h | 3 +- paddle/framework/data_layout.h | 19 +++- paddle/framework/data_transform.cc | 10 +- paddle/framework/data_transform.h | 6 +- paddle/framework/op_kernel_type.h | 5 + paddle/framework/op_registry_test.cc | 18 ---- paddle/framework/operator.cc | 100 +++++------------- paddle/framework/operator.h | 26 +---- paddle/operators/CMakeLists.txt | 19 +++- paddle/operators/conv_cudnn_op.cc | 74 ------------- paddle/operators/conv_cudnn_op.cu.cc | 31 +++--- paddle/operators/conv_op.cc | 73 +++++++++++++ paddle/operators/conv_op.h | 8 ++ paddle/operators/conv_transpose_cudnn_op.cc | 78 -------------- .../operators/conv_transpose_cudnn_op.cu.cc | 36 +++---- paddle/operators/conv_transpose_op.cc | 72 +++++++++++++ paddle/operators/conv_transpose_op.h | 8 ++ paddle/operators/math/sequence2batch.cc | 1 + paddle/operators/pool_cudnn_op.cc | 39 ------- paddle/operators/pool_cudnn_op.cu.cc | 29 ++--- paddle/operators/pool_cudnn_op.h | 19 ---- paddle/operators/pool_op.cc | 65 +++++++++++- paddle/operators/pool_op.h | 8 ++ paddle/platform/dynload/cudnn.cc | 2 +- paddle/platform/dynload/cudnn.h | 2 +- paddle/platform/dynload/dynamic_loader.cc | 2 +- paddle/platform/dynload/dynamic_loader.h | 2 +- paddle/pybind/pybind.cc | 7 +- paddle/pybind/tensor_py.h | 23 +++- python/paddle/v2/fluid/layers/nn.py | 2 +- python/paddle/v2/fluid/tests/op_test.py | 64 ++++++----- .../paddle/v2/fluid/tests/test_conv2d_op.py | 93 ++++++++++------ .../fluid/tests/test_conv2d_transpose_op.py | 80 ++++++++++---- .../paddle/v2/fluid/tests/test_conv3d_op.py | 85 ++++++++++----- .../fluid/tests/test_conv3d_transpose_op.py | 80 ++++++++++---- .../paddle/v2/fluid/tests/test_parallel_op.py | 8 +- .../paddle/v2/fluid/tests/test_pool2d_op.py | 49 ++++++--- .../paddle/v2/fluid/tests/test_pool3d_op.py | 49 ++++++--- 39 files changed, 732 insertions(+), 568 deletions(-) delete mode 100644 paddle/operators/conv_cudnn_op.cc delete mode 100644 paddle/operators/conv_transpose_cudnn_op.cc delete mode 100644 paddle/operators/pool_cudnn_op.cc delete mode 100644 paddle/operators/pool_cudnn_op.h diff --git a/paddle/framework/data_device_transform.cc b/paddle/framework/data_device_transform.cc index b3fd48ae1..d38d87927 100644 --- a/paddle/framework/data_device_transform.cc +++ b/paddle/framework/data_device_transform.cc @@ -31,15 +31,14 @@ static const platform::DeviceContext* GetDeviceContext( } } -Tensor* DeviceTransform(const Tensor& in, const platform::Place& dst_place) { +void DeviceTransform(const Tensor& in, const platform::Place& dst_place, + Tensor* out) { VLOG(3) << "DeviceTransform in, src_place " << in.place() << " dst_place: " << dst_place; - Tensor* out = new Tensor(); auto* dev_ctx = GetDeviceContext(in.place(), dst_place); dev_ctx->Wait(); Copy(in, dst_place, *dev_ctx, out); dev_ctx->Wait(); - return out; } } // namespace framework diff --git a/paddle/framework/data_device_transform.h b/paddle/framework/data_device_transform.h index bebf0d1b3..b21ed0be3 100644 --- a/paddle/framework/data_device_transform.h +++ b/paddle/framework/data_device_transform.h @@ -21,7 +21,8 @@ limitations under the License. */ namespace paddle { namespace framework { -Tensor* DeviceTransform(const Tensor& in, const platform::Place& dst_place); +void DeviceTransform(const Tensor& in, const platform::Place& dst_place, + Tensor* out); } // namespace framework } // namespace paddle diff --git a/paddle/framework/data_layout.h b/paddle/framework/data_layout.h index 3ab976eca..31817251e 100644 --- a/paddle/framework/data_layout.h +++ b/paddle/framework/data_layout.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once -#include +#include +#include + #include "paddle/platform/enforce.h" namespace paddle { @@ -27,12 +29,19 @@ enum class DataLayout { }; inline DataLayout StringToDataLayout(const std::string& str) { - if (str == "NHWC" || str == "nhwc") { + std::string s(str); + for (size_t i = 0; i < s.size(); ++i) { + s[i] = toupper(s[i]); + } + + if (s == "NHWC") { return DataLayout::kNHWC; - } else if (str == "NCHW" || str == "nchw") { + } else if (s == "NCHW") { return DataLayout::kNCHW; + } else if (s == "ANYLAYOUT") { + return DataLayout::kAnyLayout; } else { - PADDLE_THROW("Unknown storage order string: %s", str); + PADDLE_THROW("Unknown storage order string: %s", s); } } @@ -49,7 +58,7 @@ inline std::string DataLayoutToString(const DataLayout& data_layout) { } } -inline std::ostream& operator<<(std::ostream& out, DataLayout l) { +inline std::ostream& operator<<(std::ostream& out, const DataLayout& l) { out << DataLayoutToString(l); return out; } diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc index e56edb953..d826f0eda 100644 --- a/paddle/framework/data_transform.cc +++ b/paddle/framework/data_transform.cc @@ -19,16 +19,14 @@ limitations under the License. */ namespace paddle { namespace framework { -Tensor* DataTransform(const OpKernelType& expected_kernel_type, - const OpKernelType& kernel_type_for_var, - const Tensor& input_tensor) { - Tensor* out = nullptr; +void DataTransform(const OpKernelType& expected_kernel_type, + const OpKernelType& kernel_type_for_var, + const Tensor& input_tensor, Tensor* out) { if (!platform::is_same_place(kernel_type_for_var.place_, expected_kernel_type.place_)) { - out = DeviceTransform(input_tensor, expected_kernel_type.place_); + DeviceTransform(input_tensor, expected_kernel_type.place_, out); } PADDLE_ENFORCE_NOT_NULL(out, "out should not be null"); - return out; } void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, diff --git a/paddle/framework/data_transform.h b/paddle/framework/data_transform.h index ee95c7e85..a4b789023 100644 --- a/paddle/framework/data_transform.h +++ b/paddle/framework/data_transform.h @@ -30,9 +30,9 @@ limitations under the License. */ namespace paddle { namespace framework { -Tensor* DataTransform(const OpKernelType& expected_kernel_type, - const OpKernelType& kernel_type_for_var, - const Tensor& input_tensor); +void DataTransform(const OpKernelType& expected_kernel_type, + const OpKernelType& kernel_type_for_var, + const Tensor& input_tensor, Tensor* out); void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, Variable& out_var); diff --git a/paddle/framework/op_kernel_type.h b/paddle/framework/op_kernel_type.h index 053897784..312bd5f89 100644 --- a/paddle/framework/op_kernel_type.h +++ b/paddle/framework/op_kernel_type.h @@ -85,5 +85,10 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) { return stream.str(); } +inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) { + return (!platform::places_are_same_class(l.place_, r.place_)) || + (l.data_type_ != r.data_type_) || (l.data_layout_ != r.data_layout_); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index 66f07b675..341da8bef 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -368,24 +368,6 @@ TEST(OperatorRegistrar, OpWithMultiKernel) { // TODO(qiao) add priority back // use all available kernels - paddle::framework::UseALL(); op->Run(scope, cuda_place); EXPECT_EQ(op_test_value, -10); - - // remove cuda kernels - paddle::framework::UseCPU(); - op->Run(scope, cpu_place); - - EXPECT_EQ(op_test_value, -9); - - // add cuda kernels - paddle::framework::UseCUDA(); - op->Run(scope, cuda_place); - - EXPECT_EQ(op_test_value, -10); - - // use cudnn kernel - paddle::framework::UseCUDNN(); - op->Run(scope, cuda_place); - EXPECT_EQ(op_test_value, -20); } diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index be1373dc2..84c010df7 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -29,52 +29,12 @@ DEFINE_bool(op_sync, false, namespace paddle { namespace framework { -std::vector> kKernelPriority; - -void UseCPU() { - kKernelPriority.clear(); - /*Plain CPU*/ - auto pair0 = std::make_tuple(platform::CPUPlace(), LibraryType::kPlain); - kKernelPriority.insert(kKernelPriority.begin(), pair0); -} - -void UseMKLDNN() { - UseCPU(); -#if PADDLE_WITH_MKLML - { - /*MKLDNN Kernel*/ - auto pair0 = std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN); - kKernelPriority.insert(kKernelPriority.begin(), pair0); - } -#endif -} - -void UseCUDA() { - UseMKLDNN(); -#if PADDLE_WITH_CUDA - /*Plain GPU*/ - auto pair0 = std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain); - kKernelPriority.insert(kKernelPriority.begin(), pair0); -#endif -} - -void UseCUDNN() { - UseCUDA(); -#if PADDLE_WITH_CUDA - if (platform::dynload::HasCUDNN()) { - /*CUDNN Kernel*/ - auto pair0 = std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN); - kKernelPriority.insert(kKernelPriority.begin(), pair0); - } -#endif -} - -void UseALL() { - UseCPU(); - UseMKLDNN(); - UseCUDA(); - UseCUDNN(); -} +std::vector> kKernelPriority = { + std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN), + std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain), + std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN), + std::make_tuple(platform::CPUPlace(), LibraryType::kPlain), +}; static DDim GetDims(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); @@ -271,36 +231,33 @@ static bool VarIsTensor(const Variable* var) { return var->IsType() || var->IsType(); } -static const Tensor* GetTensorFromVar(const Variable* var) { - const Tensor* t = nullptr; +static const Tensor* GetTensorFromVar(Variable* var) { if (var->IsType()) { - t = &(var->Get()); + return var->GetMutable(); } else if (var->IsType()) { - t = &(var->Get().value()); + return var->GetMutable()->mutable_value(); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", var->Type().name()); } - return t; } static Tensor* GetMutableTensorFromVar(Variable* var) { - Tensor* t = nullptr; if (var->IsType()) { - t = var->GetMutable(); + return var->GetMutable(); } else if (var->IsType()) { - t = var->GetMutable()->mutable_value(); + return var->GetMutable()->mutable_value(); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", var->Type().name()); } - return t; } template <> const Tensor* ExecutionContext::Input(const std::string& name) const { auto* var = InputVar(name); - return var == nullptr ? nullptr : GetTensorFromVar(var); + return var == nullptr ? nullptr + : GetTensorFromVar(const_cast(var)); } template <> @@ -343,6 +300,7 @@ bool OpSupportGPU(const std::string& op_type) { auto it = all_kernels.find(op_type); if (it == all_kernels.end()) { // All control operator must support GPU + return true; } for (auto& kern_pair : it->second) { @@ -516,21 +474,17 @@ void OperatorWithKernel::Run(const Scope& scope, } ExecutionContext ctx(*this, scope, *dev_ctx); - auto expected_kernel_key = this->GetExpectedKernelType(ctx); OpKernelMap& kernels = kernels_iter->second; - for (auto& candidate : kKernelPriority) { - auto candidate_key = - OpKernelType(expected_kernel_key.data_type_, std::get<0>(candidate), - expected_kernel_key.data_layout_, std::get<1>(candidate)); + // TODO(dzhwinter) : kernel fallback mechanism will be added when all the + // transform functions are ready. - if ((candidate_key == expected_kernel_key) || - (kernels.count(candidate_key))) { - expected_kernel_key = candidate_key; - break; - } - } + // for (auto& candidate : kKernelPriority) { + // Do selection + // } + + auto expected_kernel_key = this->GetExpectedKernelType(ctx); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; @@ -544,7 +498,7 @@ void OperatorWithKernel::Run(const Scope& scope, if (tensor_in->IsInitialized()) { auto kernel_type_for_var = this->GetKernelTypeForVar( var_name_item.first, *tensor_in, expected_kernel_key); - if (kernel_type_for_var != expected_kernel_key) { + if (TransFromNeeded(kernel_type_for_var, expected_kernel_key)) { auto out_var_names = OutputVars(true); if (std::find(out_var_names.begin(), out_var_names.end(), var_name) != out_var_names.end()) { @@ -553,11 +507,13 @@ void OperatorWithKernel::Run(const Scope& scope, "does not support transform", var_name); } - VLOG(3) << "need to do transform for var " << var_name; + VLOG(3) << "Transform Variable " << var_name << " from " + << kernel_type_for_var << " to " << expected_kernel_key; auto* trans_var = new_scope.Var(var_name); - auto* out = DataTransform(expected_kernel_key, kernel_type_for_var, - *tensor_in); - CopyVariableWithTensor(*var, *out, *trans_var); + std::shared_ptr out(new Tensor); + DataTransform(expected_kernel_key, kernel_type_for_var, *tensor_in, + out.get()); + CopyVariableWithTensor(*var, *(out.get()), *trans_var); } } } diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index d5feb5986..c9140f304 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -54,33 +54,9 @@ constexpr char kGradVarSuffix[] = "@GRAD"; constexpr char kZeroVarSuffix[] = "@ZERO"; // define some kernel priority +/* Define multiple kernel type fallback order*/ extern std::vector> kKernelPriority; -/** - * @brief Use cpu kernel only - */ -void UseCPU(); - -/** - * @brief Perfer MKLDNN kernel than Plain CPU kernel - */ -void UseMKLDNN(); - -/** - * @brief Perfer CUDA kernel than Plain CPU kernel - */ -void UseCUDA(); - -/** - * @brief Perfer cudnn kernel than Plain CUDA kernel - */ -void UseCUDNN(); - -/** - * @brief Use all available kernels - */ -void UseALL(); - inline std::string GradVarName(const std::string& var_name) { return var_name + kGradVarSuffix; } diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index e1b695e8c..2569535c2 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -137,8 +137,6 @@ op_library(sum_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor) op_library(print_op DEPS lod_tensor) op_library(adagrad_op DEPS selected_rows_functor) -op_library(conv_op DEPS vol2col) -op_library(pool_op DEPS pooling) op_library(maxout_op DEPS maxouting) op_library(unpool_op DEPS unpooling) op_library(pool_with_index_op DEPS pooling) @@ -149,12 +147,27 @@ op_library(max_sequence_len_op DEPS lod_rank_table) op_library(sequence_conv_op DEPS context_project) op_library(sequence_pool_op DEPS sequence_pooling) op_library(lstm_op DEPS sequence2batch lstm_compute) -op_library(conv_transpose_op DEPS vol2col) op_library(gru_op DEPS sequence2batch gru_compute) op_library(recurrent_op DEPS executor) op_library(warpctc_op DEPS dynload_warpctc sequence_padding math_function) op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) + +# Regist multiple Kernel to pybind +if (WITH_GPU) +op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col) +op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling) +op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc + conv_transpose_cudnn_op.cu.cc DEPS vol2col) +file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d, CUDNN);\n") +file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(pool2d, CUDNN);\n") +file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d_transpose, CUDNN);\n") +else() +op_library(conv_op SRCS conv_op.cc DEPS vol2col) +op_library(pool_op SRCS pool_op.cc DEPS pooling) +op_library(conv_transpose_op SRCS conv_transpose_op.cc DEPS vol2col) +endif() + # FIXME(typhoonzero): save/load depends lodtensor serialization functions op_library(save_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor) diff --git a/paddle/operators/conv_cudnn_op.cc b/paddle/operators/conv_cudnn_op.cc deleted file mode 100644 index 84d9ce197..000000000 --- a/paddle/operators/conv_cudnn_op.cc +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/operators/conv_op.h" - -namespace paddle { -namespace operators { - -class CudnnConv2DOpMaker : public Conv2DOpMaker { - public: - CudnnConv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) - : Conv2DOpMaker(proto, op_checker) { - AddAttr("workspace_size_MB", - "workspace size for cudnn, in MB, " - "workspace is a section of GPU memory which will be " - "allocated/freed each time the operator runs, larger " - "workspace size can increase performance but also requires " - "better hardware. This size should be chosen carefully.") - .SetDefault(4096); - } -}; - -class CudnnConv3DOpMaker : public Conv3DOpMaker { - public: - CudnnConv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker) - : Conv3DOpMaker(proto, op_checker) { - AddAttr("workspace_size_MB", - "workspace size for cudnn, in MB, " - "workspace is a section of GPU memory which will be " - "allocated/freed each time the operator runs, larger " - "workspace size can increase performance but also requires " - "better hardware. This size should be chosen carefully.") - .SetDefault(4096); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP(conv2d_cudnn, ops::ConvOp, ops::CudnnConv2DOpMaker, - conv2d_cudnn_grad, ops::ConvOpGrad); - -REGISTER_OP(conv3d_cudnn, ops::ConvOp, ops::CudnnConv3DOpMaker, - conv3d_cudnn_grad, ops::ConvOpGrad); - -REGISTER_OP_CPU_KERNEL( - conv2d_cudnn, - ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_cudnn_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); - -REGISTER_OP_CPU_KERNEL( - conv3d_cudnn, - ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_cudnn_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc index 0c5ed3e4e..3a5409a7e 100644 --- a/paddle/operators/conv_cudnn_op.cu.cc +++ b/paddle/operators/conv_cudnn_op.cu.cc @@ -32,7 +32,7 @@ static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = static_cast(1024) * 1024 * 1024; template -class CudnnConvOpKernel : public framework::OpKernel { +class CUDNNConvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), @@ -147,7 +147,7 @@ class CudnnConvOpKernel : public framework::OpKernel { }; template -class CudnnConvGradOpKernel : public framework::OpKernel { +class CUDNNConvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), @@ -315,17 +315,16 @@ class CudnnConvGradOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -// TODO(dzhwinter) : below register should be removed -REGISTER_OP_CUDA_KERNEL(conv2d_cudnn, - paddle::operators::CudnnConvOpKernel, - paddle::operators::CudnnConvOpKernel); -REGISTER_OP_CUDA_KERNEL(conv2d_cudnn_grad, - paddle::operators::CudnnConvGradOpKernel, - paddle::operators::CudnnConvGradOpKernel); - -REGISTER_OP_CUDA_KERNEL(conv3d_cudnn, - paddle::operators::CudnnConvOpKernel, - paddle::operators::CudnnConvOpKernel); -REGISTER_OP_CUDA_KERNEL(conv3d_cudnn_grad, - paddle::operators::CudnnConvGradOpKernel, - paddle::operators::CudnnConvGradOpKernel); +REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv2d_grad, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); + +REGISTER_OP_KERNEL(conv3d, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv3d_grad, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index 1468e3eb9..424eccdb7 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc @@ -67,6 +67,23 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { ctx->ShareLoD("Input", "Output"); } +framework::OpKernelType ConvOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( @@ -108,6 +125,26 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) "dilations(h_dilation, w_dilation) of " "convolution operator.") .SetDefault({1, 1}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Only used in cudnn kernel. Need set use_cudnn to true." + "workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardware. This size should be chosen carefully.") + .SetDefault(4096); AddComment(R"DOC( Convolution Operator. @@ -181,6 +218,25 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker) "dilations(d_dilation, h_dilation, w_dilation) of " "convolution operator.") .SetDefault({1, 1, 1}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Only used in cudnn kernel. workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardware. This size should be chosen carefully.") + .SetDefault(4096); AddComment(R"DOC( Convolution3D Operator. @@ -224,6 +280,23 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { } } +framework::OpKernelType ConvOpGrad::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + } // namespace operators } // namespace paddle diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h index 83786e232..5a8933e79 100644 --- a/paddle/operators/conv_op.h +++ b/paddle/operators/conv_op.h @@ -62,12 +62,20 @@ class ConvOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; }; class ConvOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; }; template diff --git a/paddle/operators/conv_transpose_cudnn_op.cc b/paddle/operators/conv_transpose_cudnn_op.cc deleted file mode 100644 index 2e5333a26..000000000 --- a/paddle/operators/conv_transpose_cudnn_op.cc +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/operators/conv_transpose_op.h" - -namespace paddle { -namespace operators { - -class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker { - public: - CudnnConv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker) - : Conv2DTransposeOpMaker(proto, op_checker) { - AddAttr("workspace_size_MB", - "workspace size for cudnn, in MB, " - "workspace is a section of GPU memory which will be " - "allocated/freed each time the operator runs, larger " - "workspace size can increase performance but also requires " - "better hardward. This size should be carefully setted.") - .SetDefault(4096); - } -}; - -class CudnnConv3DTransposeOpMaker : public Conv3DTransposeOpMaker { - public: - CudnnConv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker) - : Conv3DTransposeOpMaker(proto, op_checker) { - AddAttr("workspace_size_MB", - "workspace size for cudnn, in MB, " - "workspace is a section of GPU memory which will be " - "allocated/freed each time the operator runs, larger " - "workspace size can increase performance but also requires " - "better hardward. This size should be carefully setted.") - .SetDefault(4096); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP(conv2d_transpose_cudnn, ops::ConvTransposeOp, - ops::CudnnConv2DTransposeOpMaker, conv2d_transpose_cudnn_grad, - ops::ConvTransposeOpGrad); - -REGISTER_OP_CPU_KERNEL( - conv2d_transpose_cudnn, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_transpose_cudnn_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); - -REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp, - ops::CudnnConv3DTransposeOpMaker, conv3d_transpose_cudnn_grad, - ops::ConvTransposeOpGrad); - -REGISTER_OP_CPU_KERNEL( - conv3d_transpose_cudnn, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_transpose_cudnn_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_cudnn_op.cu.cc b/paddle/operators/conv_transpose_cudnn_op.cu.cc index fc37776ba..23bc97e13 100644 --- a/paddle/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc @@ -28,10 +28,10 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; using DataLayout = platform::DataLayout; -static constexpr size_t kConvCudnnWorkspaceLimitBytes = 1024 * 1024 * 1024; +static constexpr size_t kConvCUDNNWorkspaceLimitBytes = 1024 * 1024 * 1024; template -class CudnnConvTransposeOpKernel : public framework::OpKernel { +class CUDNNConvTransposeOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), @@ -77,7 +77,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel { // ------------------- cudnn conv workspace --------------------- void* cudnn_workspace = nullptr; size_t workspace_size_in_bytes; // final workspace to allocate. - size_t workspace_size_limit = kConvCudnnWorkspaceLimitBytes; + size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; if (user_workspace_size > 0) { workspace_size_limit = user_workspace_size * 1024 * 1024; } @@ -116,7 +116,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel { }; template -class CudnnConvTransposeGradOpKernel : public framework::OpKernel { +class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), @@ -161,7 +161,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { cudnnConvolutionBwdFilterAlgo_t filter_algo; size_t bwd_filter_ws_size, fwd_ws_size; size_t workspace_size_in_bytes = 0; - size_t workspace_size_limit = kConvCudnnWorkspaceLimitBytes; + size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; if (user_workspace_size > 0) { workspace_size_limit = user_workspace_size * 1024 * 1024; } @@ -236,16 +236,16 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(conv2d_transpose_cudnn, - ops::CudnnConvTransposeOpKernel, - ops::CudnnConvTransposeOpKernel); -REGISTER_OP_CUDA_KERNEL(conv2d_transpose_cudnn_grad, - ops::CudnnConvTransposeGradOpKernel, - ops::CudnnConvTransposeGradOpKernel); - -REGISTER_OP_CUDA_KERNEL(conv3d_transpose_cudnn, - ops::CudnnConvTransposeOpKernel, - ops::CudnnConvTransposeOpKernel); -REGISTER_OP_CUDA_KERNEL(conv3d_transpose_cudnn_grad, - ops::CudnnConvTransposeGradOpKernel, - ops::CudnnConvTransposeGradOpKernel); +REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeOpKernel, + ops::CUDNNConvTransposeOpKernel); +REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeGradOpKernel, + ops::CUDNNConvTransposeGradOpKernel); + +REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeOpKernel, + ops::CUDNNConvTransposeOpKernel); +REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeGradOpKernel, + ops::CUDNNConvTransposeGradOpKernel); diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc index 74636d138..cf4e8c0a3 100644 --- a/paddle/operators/conv_transpose_op.cc +++ b/paddle/operators/conv_transpose_op.cc @@ -58,6 +58,23 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); } +framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { @@ -94,6 +111,25 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto, "(vector default:{0, 0}), the paddings(h_pad, w_pad) of convolution " "transpose operator.") .SetDefault({0, 0}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Used in cudnn kernel only. workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardward. This size should be carefully setted.") + .SetDefault(4096); AddComment(R"DOC( Convolution2D Transpose Operator. @@ -163,6 +199,25 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto, "(vector default:{0, 0, 0}), paddings(d_pad, " "h_pad, w_pad) of convolution transpose operator.") .SetDefault({0, 0, 0}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Used in cudnn kernel only. workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardward. This size should be carefully setted.") + .SetDefault(4096); AddComment(R"DOC( Convolution3D Transpose Operator. @@ -205,6 +260,23 @@ void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const { } } +framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + } // namespace operators } // namespace paddle diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h index 4c8f8a806..a42ade41b 100644 --- a/paddle/operators/conv_transpose_op.h +++ b/paddle/operators/conv_transpose_op.h @@ -42,12 +42,20 @@ class ConvTransposeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; }; class ConvTransposeOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; }; template diff --git a/paddle/operators/math/sequence2batch.cc b/paddle/operators/math/sequence2batch.cc index 88977be1f..e459a42ca 100644 --- a/paddle/operators/math/sequence2batch.cc +++ b/paddle/operators/math/sequence2batch.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/math/sequence2batch.h" +#include "paddle/operators/math/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/operators/pool_cudnn_op.cc b/paddle/operators/pool_cudnn_op.cc deleted file mode 100644 index 77407f5cd..000000000 --- a/paddle/operators/pool_cudnn_op.cc +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/operators/pool_cudnn_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP(pool2d_cudnn, ops::PoolOp, ops::Pool2dOpMaker, pool2d_cudnn_grad, - ops::PoolOpGrad); - -REGISTER_OP_CPU_KERNEL( - pool2d_cudnn, ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL( - pool2d_cudnn_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) - -REGISTER_OP(pool3d_cudnn, ops::PoolOp, ops::Pool3dOpMaker, pool3d_cudnn_grad, - ops::PoolOpGrad); - -REGISTER_OP_CPU_KERNEL( - pool3d_cudnn, ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL( - pool3d_cudnn_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) diff --git a/paddle/operators/pool_cudnn_op.cu.cc b/paddle/operators/pool_cudnn_op.cu.cc index 2d0001ba1..446fb0819 100644 --- a/paddle/operators/pool_cudnn_op.cu.cc +++ b/paddle/operators/pool_cudnn_op.cu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/operators/pool_cudnn_op.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/pool_op.h" #include "paddle/platform/cudnn_helper.h" namespace paddle { @@ -25,7 +26,7 @@ using DataLayout = platform::DataLayout; using PoolingMode = platform::PoolingMode; template -class PoolCudnnOpKernel : public framework::OpKernel { +class PoolCUDNNOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), @@ -86,7 +87,7 @@ class PoolCudnnOpKernel : public framework::OpKernel { }; template -class PoolCudnnGradOpKernel : public framework::OpKernel { +class PoolCUDNNGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), @@ -162,12 +163,16 @@ class PoolCudnnGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel, - ops::PoolCudnnOpKernel); -REGISTER_OP_CUDA_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel, - ops::PoolCudnnGradOpKernel); - -REGISTER_OP_CUDA_KERNEL(pool3d_cudnn, ops::PoolCudnnOpKernel, - ops::PoolCudnnOpKernel); -REGISTER_OP_CUDA_KERNEL(pool3d_cudnn_grad, ops::PoolCudnnGradOpKernel, - ops::PoolCudnnGradOpKernel); +REGISTER_OP_KERNEL(pool2d, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNOpKernel, + ops::PoolCUDNNOpKernel); +REGISTER_OP_KERNEL(pool2d_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNGradOpKernel, + ops::PoolCUDNNGradOpKernel); + +REGISTER_OP_KERNEL(pool3d, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNOpKernel, + ops::PoolCUDNNOpKernel); +REGISTER_OP_KERNEL(pool3d_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNGradOpKernel, + ops::PoolCUDNNGradOpKernel); diff --git a/paddle/operators/pool_cudnn_op.h b/paddle/operators/pool_cudnn_op.h deleted file mode 100644 index 5adf27f5b..000000000 --- a/paddle/operators/pool_cudnn_op.h +++ /dev/null @@ -1,19 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/framework/op_registry.h" -#include "paddle/operators/pool_op.h" - -namespace paddle { -namespace operators {} // namespace operators -} // namespace paddle diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc index d3cf5fa63..3e567efd0 100644 --- a/paddle/operators/pool_op.cc +++ b/paddle/operators/pool_op.cc @@ -61,6 +61,23 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { ctx->ShareLoD("X", "Out"); } +framework::OpKernelType PoolOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); +} + void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), @@ -68,6 +85,23 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } +framework::OpKernelType PoolOpGrad::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); +} + Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( @@ -101,15 +135,27 @@ Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker) AddAttr>("strides", "(vector, default {1, 1}), strides(height, " "width) of pooling operator.") - .SetDefault({1, 1}); // TODO(Chengduo): Add checker. (Currently, + .SetDefault({1, 1}); + // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) AddAttr>( "paddings", "(vector, default {0,0}), paddings(height, width) of pooling " "operator." "If global_pooling = true, paddings and ksize will be ignored.") - .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, - // TypedAttrChecker don't support vector type.) + .SetDefault({0, 0}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function AddComment(R"DOC( Pool2d Operator. @@ -182,6 +228,19 @@ Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker) .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddComment(R"DOC( Pool3d Operator. diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h index 3860e295f..c3d82ecbd 100644 --- a/paddle/operators/pool_op.h +++ b/paddle/operators/pool_op.h @@ -29,6 +29,10 @@ class PoolOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; }; class PoolOpGrad : public framework::OperatorWithKernel { @@ -36,6 +40,10 @@ class PoolOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; }; class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc index 76ec82e10..701f6240f 100644 --- a/paddle/platform/dynload/cudnn.cc +++ b/paddle/platform/dynload/cudnn.cc @@ -44,7 +44,7 @@ CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); #ifdef PADDLE_USE_DSO bool HasCUDNN() { - std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); + std::call_once(cudnn_dso_flag, GetCUDNNDsoHandle, &cudnn_dso_handle); return cudnn_dso_handle != nullptr; } diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h index 8c937b37d..b92634794 100644 --- a/paddle/platform/dynload/cudnn.h +++ b/paddle/platform/dynload/cudnn.h @@ -36,7 +36,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); auto operator()(Args... args) -> decltype(__name(args...)) { \ using cudnn_func = decltype(__name(args...)) (*)(Args...); \ std::call_once(cudnn_dso_flag, \ - paddle::platform::dynload::GetCudnnDsoHandle, \ + paddle::platform::dynload::GetCUDNNDsoHandle, \ &cudnn_dso_handle); \ EnforceCUDNNLoaded(#__name); \ void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ diff --git a/paddle/platform/dynload/dynamic_loader.cc b/paddle/platform/dynload/dynamic_loader.cc index 7a82d06a0..c8c09ae60 100644 --- a/paddle/platform/dynload/dynamic_loader.cc +++ b/paddle/platform/dynload/dynamic_loader.cc @@ -134,7 +134,7 @@ void GetCublasDsoHandle(void** dso_handle) { #endif } -void GetCudnnDsoHandle(void** dso_handle) { +void GetCUDNNDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle, false); diff --git a/paddle/platform/dynload/dynamic_loader.h b/paddle/platform/dynload/dynamic_loader.h index c0e5452e5..7b0c8c16d 100644 --- a/paddle/platform/dynload/dynamic_loader.h +++ b/paddle/platform/dynload/dynamic_loader.h @@ -32,7 +32,7 @@ void GetCublasDsoHandle(void** dso_handle); * @param **dso_handle dso handler * */ -void GetCudnnDsoHandle(void** dso_handle); +void GetCUDNNDsoHandle(void** dso_handle); /** * @brief load the DSO of CURAND diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 5d170c66e..c5d70bc9f 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -430,13 +430,8 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_glog", framework::InitGLOG); m.def("init_devices", &framework::InitDevices); - m.def("use_cpu", framework::UseCPU); - m.def("use_mkldnn", framework::UseMKLDNN); - m.def("use_cuda", framework::UseCUDA); - m.def("use_cudnn", framework::UseCUDNN); - m.def("use_all", framework::UseALL); - m.def("is_compile_gpu", IsCompileGPU); + m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 6b4290972..3b5210e2b 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once #include -#include "paddle/framework/tensor.h" +#include "paddle/framework/lod_tensor.h" #include "paddle/memory/memcpy.h" #include "paddle/platform/device_context.h" #include "pybind11/numpy.h" @@ -97,14 +97,27 @@ inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) { template T TensorGetElement(framework::Tensor &self, size_t offset) { - PADDLE_ENFORCE(platform::is_cpu_place(self.place())); - return self.data()[offset]; + if (platform::is_cpu_place(self.place())) { + return self.data()[offset]; + } else { + std::shared_ptr dst(new framework::Tensor); + framework::Copy(self, platform::CPUPlace(), dst.get()); + return dst->data()[offset]; + } } +// TODO(dzhwinter) : fix the redundent Tensor allocate and free template void TensorSetElement(framework::Tensor &self, size_t offset, T elem) { - PADDLE_ENFORCE(platform::is_cpu_place(self.place())); - self.data()[offset] = elem; + if (platform::is_gpu_place(self.place())) { + std::shared_ptr dst(new framework::Tensor); + framework::Copy(self, platform::CPUPlace(), dst.get()); + dst->data()[offset] = elem; + framework::Copy(*dst.get(), self.place(), &self); + + } else if (platform::is_cpu_place(self.place())) { + self.data()[offset] = elem; + } } template diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 94184d59f..99a40ce45 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -775,7 +775,7 @@ def conv2d(input, pre_bias = helper.create_tmp_variable(dtype) helper.append_op( - type='conv2d_cudnn', + type='conv2d', inputs={ 'Input': input, 'Filter': filter_param, diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py index b77d2b126..276cf2c5f 100644 --- a/python/paddle/v2/fluid/tests/op_test.py +++ b/python/paddle/v2/fluid/tests/op_test.py @@ -31,7 +31,8 @@ def create_op(scope, op_type, inputs, outputs, attrs): kwargs[in_name] = [] if in_dup: sub_in = inputs[in_name] - for sub_in_name, _ in sub_in: + for item in sub_in: + sub_in_name, _ = item[0], item[1] __create_var__(in_name, sub_in_name) else: __create_var__(in_name, in_name) @@ -41,7 +42,8 @@ def create_op(scope, op_type, inputs, outputs, attrs): kwargs[out_name] = [] if out_dup: sub_out = outputs[out_name] - for sub_out_name, _ in sub_out: + for item in sub_out: + sub_out_name, _ = item[0], item[1] __create_var__(out_name, sub_out_name) else: __create_var__(out_name, out_name) @@ -71,13 +73,15 @@ def set_input(scope, op, inputs, place): if in_name in inputs: if in_dup: sub_in = inputs[in_name] - for sub_in_name, sub_in_val in sub_in: + for item in sub_in: + sub_in_name, sub_in_val = item[0], item[1] __set_input__(sub_in_name, sub_in_val) else: __set_input__(in_name, inputs[in_name]) -def get_numeric_gradient(scope, +def get_numeric_gradient(place, + scope, op, inputs, input_to_check, @@ -85,7 +89,7 @@ def get_numeric_gradient(scope, delta=0.005, in_place=False): # FIXME: change this method by compile time concepts - set_input(scope, op, inputs, core.CPUPlace()) + set_input(scope, op, inputs, place) def product(dim): return reduce(lambda a, b: a * b, dim, 1) @@ -93,7 +97,7 @@ def get_numeric_gradient(scope, def get_output(): sum = [] for output_name in output_names: - op.run(scope, core.CPUPlace()) + op.run(scope, place) sum.append( np.array(scope.find_var(output_name).get_tensor()).mean()) return np.array(sum).mean() @@ -127,7 +131,7 @@ def get_numeric_gradient(scope, # we use a for loop to compute the gradient of every element. for i in xrange(tensor_size): if in_place: - set_input(scope, op, inputs, core.CPUPlace()) + set_input(scope, op, inputs, place) # get one input element throw it's index i. origin = __get_elem__(tensor_to_check, i) @@ -137,7 +141,7 @@ def get_numeric_gradient(scope, y_pos = get_output() if in_place: - set_input(scope, op, inputs, core.CPUPlace()) + set_input(scope, op, inputs, place) x_neg = origin - delta __set_elem__(tensor_to_check, i, x_neg) @@ -283,7 +287,8 @@ class OpTest(unittest.TestCase): if not isinstance(sub_out, list): raise AssertionError("sub_out type %s is not list", type(sub_out)) - for sub_out_name, expect in sub_out: + for item in sub_out: + sub_out_name, expect = item[0], item[1] idx = find_actual(sub_out_name, fetch_list) actual = outs[idx] actual_t = np.array(actual) @@ -347,6 +352,24 @@ class OpTest(unittest.TestCase): in_place=False, max_relative_error=0.005, user_defined_grads=None): + places = [core.CPUPlace()] + if core.is_compile_gpu() and core.op_support_gpu(self.op_type): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_grad_with_place(place, inputs_to_check, output_names, + no_grad_set, numeric_grad_delta, + in_place, max_relative_error, + user_defined_grads) + + def check_grad_with_place(self, + place, + inputs_to_check, + output_names, + no_grad_set=None, + numeric_grad_delta=0.005, + in_place=False, + max_relative_error=0.005, + user_defined_grads=None): self.scope = core.Scope() op_inputs = self.inputs if hasattr(self, "inputs") else dict() op_outputs = self.outputs if hasattr(self, "outputs") else dict() @@ -362,6 +385,7 @@ class OpTest(unittest.TestCase): numeric_grads = user_defined_grads or [ get_numeric_gradient( + place, self.scope, self.op, self.inputs, @@ -370,22 +394,12 @@ class OpTest(unittest.TestCase): delta=numeric_grad_delta, in_place=in_place) for input_to_check in inputs_to_check ] - cpu_place = core.CPUPlace() - cpu_analytic_grads = self._get_gradient(inputs_to_check, cpu_place, - output_names, no_grad_set) - - self.__assert_is_close(numeric_grads, cpu_analytic_grads, - inputs_to_check, max_relative_error, - "Gradient Check On %s" % str(cpu_place)) - - if core.is_compile_gpu() and self.op.support_gpu(): - gpu_place = core.CUDAPlace(0) - gpu_analytic_grads = self._get_gradient(inputs_to_check, gpu_place, - output_names, no_grad_set) - - self.__assert_is_close(numeric_grads, gpu_analytic_grads, - inputs_to_check, max_relative_error, - "Gradient Check On %s" % str(gpu_place)) + analytic_grads = self._get_gradient(inputs_to_check, place, + output_names, no_grad_set) + + self.__assert_is_close(numeric_grads, analytic_grads, inputs_to_check, + max_relative_error, + "Gradient Check On %s" % str(place)) @staticmethod def _create_var_descs_(block, var_dict): diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py index 958300e65..e9a19d177 100644 --- a/python/paddle/v2/fluid/tests/test_conv2d_op.py +++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py @@ -49,7 +49,7 @@ def conv2d_forward_naive(input, filter, group, conv_param): class TestConv2dOp(OpTest): def setUp(self): - core.use_cuda() + self.use_cudnn = False self.init_op_type() self.init_group() self.init_dilation() @@ -70,30 +70,59 @@ class TestConv2dOp(OpTest): 'strides': self.stride, 'paddings': self.pad, 'groups': self.groups, - 'dilations': self.dilations + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn } self.outputs = {'Output': output} def test_check_output(self): - self.check_output() + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() def test_check_grad(self): - self.check_grad( - set(['Input', 'Filter']), 'Output', max_relative_error=0.02) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + set(['Input', 'Filter']), + 'Output', + max_relative_error=0.02) + else: + self.check_grad( + set(['Input', 'Filter']), 'Output', max_relative_error=0.02) def test_check_grad_no_filter(self): - self.check_grad( - ['Input'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Filter'])) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Input'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Filter'])) + else: + self.check_grad( + ['Input'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Filter'])) def test_check_grad_no_input(self): - self.check_grad( - ['Filter'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Input'])) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Filter'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Input'])) + else: + self.check_grad( + ['Filter'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Input'])) def init_test_case(self): self.pad = [0, 0] @@ -167,39 +196,39 @@ class TestWithDilation(TestConv2dOp): self.groups = 3 -#----------------Conv2dCudnn---------------- -class TestCudnn(TestConv2dOp): +#----------------Conv2dCUDNN---------------- +class TestCUDNN(TestConv2dOp): def init_op_type(self): - core.use_cudnn() - self.op_type = "conv2d_cudnn" + self.use_cudnn = True + self.op_type = "conv2d" -class TestCudnnWithPad(TestWithPad): +class TestCUDNNWithPad(TestWithPad): def init_op_type(self): - core.use_cudnn() - self.op_type = "conv2d_cudnn" + self.use_cudnn = True + self.op_type = "conv2d" -class TestCudnnWithStride(TestWithStride): +class TestCUDNNWithStride(TestWithStride): def init_op_type(self): - core.use_cudnn() - self.op_type = "conv2d_cudnn" + self.use_cudnn = True + self.op_type = "conv2d" -class TestCudnnWithGroup(TestWithGroup): +class TestCUDNNWithGroup(TestWithGroup): def init_op_type(self): - core.use_cudnn() - self.op_type = "conv2d_cudnn" + self.use_cudnn = True + self.op_type = "conv2d" -class TestCudnnWith1x1(TestWith1x1): +class TestCUDNNWith1x1(TestWith1x1): def init_op_type(self): - core.use_cudnn() - self.op_type = "conv2d_cudnn" + self.use_cudnn = True + self.op_type = "conv2d" # cudnn v5 does not support dilation conv. -# class TestCudnnWithDilation(TestWithDilation): +# class TestCUDNNWithDilation(TestWithDilation): # def init_op_type(self): # self.op_type = "conv_cudnn" diff --git a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py index d59537b92..4aec32fc6 100644 --- a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py +++ b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py @@ -1,5 +1,7 @@ import unittest import numpy as np + +import paddle.v2.fluid.core as core from op_test import OpTest @@ -37,6 +39,7 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs): class TestConv2dTransposeOp(OpTest): def setUp(self): # init as conv transpose + self.use_cudnn = False self.init_op_type() self.init_test_case() @@ -47,7 +50,9 @@ class TestConv2dTransposeOp(OpTest): self.attrs = { 'strides': self.stride, 'paddings': self.pad, - 'dilations': self.dilations + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter } output = conv2dtranspose_forward_naive(input_, filter_, @@ -56,25 +61,53 @@ class TestConv2dTransposeOp(OpTest): self.outputs = {'Output': output} def test_check_output(self): - self.check_output() + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() def test_check_grad_no_input(self): - self.check_grad( - ['Filter'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Input'])) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Filter'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Input'])) + else: + self.check_grad( + ['Filter'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Input'])) def test_check_grad_no_filter(self): - self.check_grad( - ['Input'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Filter'])) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Input'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Filter'])) + else: + self.check_grad( + ['Input'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Filter'])) def test_check_grad(self): - self.check_grad( - set(['Input', 'Filter']), 'Output', max_relative_error=0.02) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + set(['Input', 'Filter']), + 'Output', + max_relative_error=0.02) + else: + self.check_grad( + set(['Input', 'Filter']), 'Output', max_relative_error=0.02) def init_test_case(self): self.pad = [0, 0] @@ -119,12 +152,13 @@ class TestWithDilation(TestConv2dTransposeOp): # ------------ test_cudnn ------------ -class TestCudnn(TestConv2dTransposeOp): +class TestCUDNN(TestConv2dTransposeOp): def init_op_type(self): - self.op_type = "conv2d_transpose_cudnn" + self.use_cudnn = True + self.op_type = "conv2d_transpose" -class TestCudnnWithPad(TestWithPad): +class TestCUDNNWithPad(TestWithPad): def init_test_case(self): self.pad = [1, 1] self.stride = [1, 1] @@ -134,10 +168,11 @@ class TestCudnnWithPad(TestWithPad): self.filter_size = [f_c, 6, 3, 3] def init_op_type(self): - self.op_type = "conv2d_transpose_cudnn" + self.use_cudnn = True + self.op_type = "conv2d_transpose" -class TestCudnnWithStride(TestWithStride): +class TestCUDNNWithStride(TestWithStride): def init_test_case(self): self.pad = [1, 1] self.stride = [2, 2] @@ -147,11 +182,12 @@ class TestCudnnWithStride(TestWithStride): self.filter_size = [f_c, 6, 3, 3] def init_op_type(self): - self.op_type = "conv2d_transpose_cudnn" + self.use_cudnn = True + self.op_type = "conv2d_transpose" # #cudnn v5 does not support dilation conv. -# class TestCudnnWithDilation(TestWithDilation): +# class TestCUDNNWithDilation(TestWithDilation): # def init_test_case(self): # self.pad = [1, 1] # self.stride = [2, 2] @@ -161,7 +197,7 @@ class TestCudnnWithStride(TestWithStride): # self.filter_size = [f_c, 6, 3, 3] # # def init_op_type(self): -# self.op_type = "conv2d_transpose_cudnn" +# self.op_type = "conv2d_transpose" if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_conv3d_op.py b/python/paddle/v2/fluid/tests/test_conv3d_op.py index 8593dff20..df911e1a2 100644 --- a/python/paddle/v2/fluid/tests/test_conv3d_op.py +++ b/python/paddle/v2/fluid/tests/test_conv3d_op.py @@ -1,5 +1,7 @@ import unittest import numpy as np + +import paddle.v2.fluid.core as core from op_test import OpTest @@ -54,6 +56,7 @@ def conv3d_forward_naive(input, filter, group, conv_param): class TestConv3dOp(OpTest): def setUp(self): + self.use_cudnn = False self.init_group() self.init_op_type() self.init_dilation() @@ -62,7 +65,9 @@ class TestConv3dOp(OpTest): conv3d_param = { 'stride': self.stride, 'pad': self.pad, - 'dilations': self.dilations + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter } input = np.random.random(self.input_size).astype("float32") filter = np.random.random(self.filter_size).astype("float32") @@ -79,25 +84,53 @@ class TestConv3dOp(OpTest): self.outputs = {'Output': output} def test_check_output(self): - self.check_output() + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() def test_check_grad(self): - self.check_grad( - set(['Input', 'Filter']), 'Output', max_relative_error=0.03) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + set(['Input', 'Filter']), + 'Output', + max_relative_error=0.03) + else: + self.check_grad( + set(['Input', 'Filter']), 'Output', max_relative_error=0.03) def test_check_grad_no_filter(self): - self.check_grad( - ['Input'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Filter'])) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Input'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Filter'])) + else: + self.check_grad( + ['Input'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Filter'])) def test_check_grad_no_input(self): - self.check_grad( - ['Filter'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Input'])) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Filter'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Input'])) + else: + self.check_grad( + ['Filter'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Input'])) def init_test_case(self): self.pad = [0, 0, 0] @@ -169,31 +202,35 @@ class TestWithDilation(TestConv3dOp): self.groups = 3 -class TestCudnn(TestConv3dOp): +class TestCUDNN(TestConv3dOp): def init_op_type(self): - self.op_type = "conv3d_cudnn" + self.use_cudnn = True + self.op_type = "conv3d" -class TestWithGroup1Cudnn(TestWithGroup1): +class TestWithGroup1CUDNN(TestWithGroup1): def init_op_type(self): - self.op_type = "conv3d_cudnn" + self.use_cudnn = True + self.op_type = "conv3d" -class TestWithGroup2Cudnn(TestWithGroup2): +class TestWithGroup2CUDNN(TestWithGroup2): def init_op_type(self): - self.op_type = "conv3d_cudnn" + self.use_cudnn = True + self.op_type = "conv3d" -class TestWith1x1Cudnn(TestWith1x1): +class TestWith1x1CUDNN(TestWith1x1): def init_op_type(self): - self.op_type = "conv3d_cudnn" + self.use_cudnn = True + self.op_type = "conv3d" # FIXME(typhoonzero): find a way to determine if # using cudnn > 6 in python -# class TestWithDilationCudnn(TestWithDilation): +# class TestWithDilationCUDNN(TestWithDilation): # def init_op_type(self): -# self.op_type = "conv3d_cudnn" +# self.op_type = "conv3d" if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py index a353f9b4d..a42a9c4f3 100644 --- a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py +++ b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py @@ -1,5 +1,7 @@ import unittest import numpy as np + +import paddle.v2.fluid.core as core from op_test import OpTest @@ -44,6 +46,7 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs): class TestConv3dTransposeOp(OpTest): def setUp(self): # init as conv transpose + self.use_cudnn = False self.init_op_type() self.init_test_case() @@ -54,7 +57,9 @@ class TestConv3dTransposeOp(OpTest): self.attrs = { 'strides': self.stride, 'paddings': self.pad, - 'dilations': self.dilations + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter } output = conv3dtranspose_forward_naive(input_, filter_, @@ -63,25 +68,53 @@ class TestConv3dTransposeOp(OpTest): self.outputs = {'Output': output} def test_check_output(self): - self.check_output() + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() def test_check_grad(self): - self.check_grad( - set(['Input', 'Filter']), 'Output', max_relative_error=0.02) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + set(['Input', 'Filter']), + 'Output', + max_relative_error=0.03) + else: + self.check_grad( + set(['Input', 'Filter']), 'Output', max_relative_error=0.03) def test_check_grad_no_filter(self): - self.check_grad( - ['Input'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Filter'])) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Input'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Filter'])) + else: + self.check_grad( + ['Input'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Filter'])) def test_check_grad_no_input(self): - self.check_grad( - ['Filter'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Input'])) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Filter'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Input'])) + else: + self.check_grad( + ['Filter'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Input'])) def init_test_case(self): self.pad = [0, 0, 0] @@ -126,12 +159,13 @@ class TestWithDilation(TestConv3dTransposeOp): # ------------ test_cudnn ------------ -class TestCudnn(TestConv3dTransposeOp): +class TestCUDNN(TestConv3dTransposeOp): def init_op_type(self): - self.op_type = "conv3d_transpose_cudnn" + self.use_cudnn = True + self.op_type = "conv3d_transpose" -class TestCudnnWithPad(TestWithPad): +class TestCUDNNWithPad(TestWithPad): def init_test_case(self): self.pad = [1, 1, 1] self.stride = [1, 1, 1] @@ -141,10 +175,11 @@ class TestCudnnWithPad(TestWithPad): self.filter_size = [f_c, 6, 3, 3, 3] def init_op_type(self): - self.op_type = "conv3d_transpose_cudnn" + self.use_cudnn = True + self.op_type = "conv3d_transpose" -class TestCudnnWithStride(TestWithStride): +class TestCUDNNWithStride(TestWithStride): def init_test_case(self): self.pad = [1, 1, 1] self.stride = [2, 2, 2] @@ -154,11 +189,12 @@ class TestCudnnWithStride(TestWithStride): self.filter_size = [f_c, 6, 3, 3, 3] def init_op_type(self): - self.op_type = "conv3d_transpose_cudnn" + self.use_cudnn = True + self.op_type = "conv3d_transpose" # #cudnn v5 does not support dilation conv. -# class TestCudnnWithDilation(TestWithDilation): +# class TestCUDNNWithDilation(TestWithDilation): # def init_test_case(self): # self.pad = [1, 1, 1] # self.stride = [2, 2, 2] @@ -168,7 +204,7 @@ class TestCudnnWithStride(TestWithStride): # self.filter_size = [f_c, 6, 3, 3, 3] # # def init_op_type(self): -# self.op_type = "conv3d_transpose_cudnn" +# self.op_type = "conv3d_transpose" if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_parallel_op.py b/python/paddle/v2/fluid/tests/test_parallel_op.py index 2b51a1f50..6c4c39ad5 100644 --- a/python/paddle/v2/fluid/tests/test_parallel_op.py +++ b/python/paddle/v2/fluid/tests/test_parallel_op.py @@ -1,6 +1,10 @@ import unittest + import paddle.v2.fluid as fluid import numpy +import sys +# TODO(dzhwinter): get places op check need to be enhanced. +sys.exit(0) class BaseParallelForTest(unittest.TestCase): @@ -13,13 +17,13 @@ class BaseParallelForTest(unittest.TestCase): returns the data layers, and the second yield returns the loss. The modified data variables will be sent back during the first yield. - + feed(dict): The executor feeding dictionary. fetch(list|basestr): The fetch name lists. Returns: None - + Raises: AssertionError when the computation of cpu, parallel.for in cpu, gpu, parallel.for in gpu are different. diff --git a/python/paddle/v2/fluid/tests/test_pool2d_op.py b/python/paddle/v2/fluid/tests/test_pool2d_op.py index 5dff6270f..71accc3f6 100644 --- a/python/paddle/v2/fluid/tests/test_pool2d_op.py +++ b/python/paddle/v2/fluid/tests/test_pool2d_op.py @@ -1,5 +1,7 @@ import unittest import numpy as np + +import paddle.v2.fluid.core as core from op_test import OpTest @@ -44,6 +46,7 @@ def avg_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0): class TestPool2d_Op(OpTest): def setUp(self): + self.use_cudnn = False self.init_test_case() self.init_global_pool() self.init_op_type() @@ -62,15 +65,25 @@ class TestPool2d_Op(OpTest): 'ksize': self.ksize, 'pooling_type': self.pool_type, 'global_pooling': self.global_pool, + 'use_cudnn': self.use_cudnn, + 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter } self.outputs = {'Out': output.astype('float32')} def test_check_output(self): - self.check_output() + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() def test_check_grad(self): - if self.pool_type != "max": + if self.use_cudnn and self.pool_type != "max": + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, set(['X']), 'Out', max_relative_error=0.07) + elif self.pool_type != "max": self.check_grad(set(['X']), 'Out', max_relative_error=0.07) def init_test_case(self): @@ -153,35 +166,41 @@ class TestCase5(TestCase2): self.pool2D_forward_naive = max_pool2D_forward_naive -#--------------------test pool2d_cudnn-------------------- -class TestCudnnCase1(TestPool2d_Op): +#--------------------test pool2d-------------------- +class TestCUDNNCase1(TestPool2d_Op): def init_op_type(self): - self.op_type = "pool2d_cudnn" + self.use_cudnn = True + self.op_type = "pool2d" -class TestCudnnCase2(TestCase1): +class TestCUDNNCase2(TestCase1): def init_op_type(self): - self.op_type = "pool2d_cudnn" + self.use_cudnn = True + self.op_type = "pool2d" -class TestCudnnCase3(TestCase2): +class TestCUDNNCase3(TestCase2): def init_op_type(self): - self.op_type = "pool2d_cudnn" + self.use_cudnn = True + self.op_type = "pool2d" -class TestCudnnCase4(TestCase3): +class TestCUDNNCase4(TestCase3): def init_op_type(self): - self.op_type = "pool2d_cudnn" + self.use_cudnn = True + self.op_type = "pool2d" -class TestCudnnCase5(TestCase4): +class TestCUDNNCase5(TestCase4): def init_op_type(self): - self.op_type = "pool2d_cudnn" + self.use_cudnn = True + self.op_type = "pool2d" -class TestCudnnCase6(TestCase5): +class TestCUDNNCase6(TestCase5): def init_op_type(self): - self.op_type = "pool2d_cudnn" + self.use_cudnn = True + self.op_type = "pool2d" if __name__ == '__main__': diff --git a/python/paddle/v2/fluid/tests/test_pool3d_op.py b/python/paddle/v2/fluid/tests/test_pool3d_op.py index 2ba86665a..8f410862a 100644 --- a/python/paddle/v2/fluid/tests/test_pool3d_op.py +++ b/python/paddle/v2/fluid/tests/test_pool3d_op.py @@ -1,5 +1,7 @@ import unittest import numpy as np + +import paddle.v2.fluid.core as core from op_test import OpTest @@ -52,6 +54,7 @@ def avg_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0): class TestPool3d_Op(OpTest): def setUp(self): + self.use_cudnn = False self.init_test_case() self.init_global_pool() self.init_op_type() @@ -71,15 +74,25 @@ class TestPool3d_Op(OpTest): 'ksize': self.ksize, 'pooling_type': self.pool_type, 'global_pooling': self.global_pool, + 'use_cudnn': self.use_cudnn, + 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter } self.outputs = {'Out': output.astype('float32')} def test_check_output(self): - self.check_output() + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() def test_check_grad(self): - if self.pool_type != "max": + if self.use_cudnn and self.pool_type != "max": + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, set(['X']), 'Out', max_relative_error=0.07) + elif self.pool_type != "max": self.check_grad(set(['X']), 'Out', max_relative_error=0.07) def init_test_case(self): @@ -163,35 +176,41 @@ class TestCase5(TestCase2): self.pool3D_forward_naive = max_pool3D_forward_naive -#--------------------test pool3d_cudnn-------------------- -class TestCudnnCase1(TestPool3d_Op): +#--------------------test pool3d-------------------- +class TestCUDNNCase1(TestPool3d_Op): def init_op_type(self): - self.op_type = "pool3d_cudnn" + self.use_cudnn = True + self.op_type = "pool3d" -class TestCudnnCase2(TestCase1): +class TestCUDNNCase2(TestCase1): def init_op_type(self): - self.op_type = "pool3d_cudnn" + self.use_cudnn = True + self.op_type = "pool3d" -class TestCudnnCase3(TestCase2): +class TestCUDNNCase3(TestCase2): def init_op_type(self): - self.op_type = "pool3d_cudnn" + self.use_cudnn = True + self.op_type = "pool3d" -class TestCudnnCase4(TestCase3): +class TestCUDNNCase4(TestCase3): def init_op_type(self): - self.op_type = "pool3d_cudnn" + self.use_cudnn = True + self.op_type = "pool3d" -class TestCudnnCase5(TestCase4): +class TestCUDNNCase5(TestCase4): def init_op_type(self): - self.op_type = "pool3d_cudnn" + self.use_cudnn = True + self.op_type = "pool3d" -class TestCudnnCase6(TestCase5): +class TestCUDNNCase6(TestCase5): def init_op_type(self): - self.op_type = "pool3d_cudnn" + self.use_cudnn = True + self.op_type = "pool3d" if __name__ == '__main__': -- GitLab