diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index fb8c9ab96d372bde1fb4e1d86488cd5b831b93e0..528e45b51099d97a1f6f0dfc971b6231f928af94 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -73,8 +73,7 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) - -cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece) +cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator) cc_test(init_test SRCS init_test.cc DEPS init) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc index ac6e40a3ae8f6655eecc8279543d6afe6bbb1399..6b1780968867067fe6d1e7fc576811f0a07340b3 100644 --- a/paddle/framework/data_transform.cc +++ b/paddle/framework/data_transform.cc @@ -37,6 +37,28 @@ auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(), auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(), DataLayout::kNCHW, LibraryType::kPlain); +// TODO(dzhwinter): Only for testing multiple op kernel. +// Dummy transform function for library_type +// should be removed. +auto KernelPlain = OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0), + DataLayout::kAnyLayout, LibraryType::kPlain); + +auto KernelCUDNN = OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0), + DataLayout::kAnyLayout, LibraryType::kCUDNN); + +void DummyTrans(const platform::DeviceContext* ctx, + const KernelTypePair& kernel_pair, const Variable& in, + Variable* out) { + PADDLE_ENFORCE(in.IsType(), "Only Support Tensor transform!."); + PADDLE_ENFORCE( + platform::places_are_same_class(kernel_pair.first.place_, + kernel_pair.second.place_), + "TransDataType Only Support DataType transform on same place!"); + auto src = in.Get(); + auto* dst = out->GetMutable(); + *dst = src; +} + void TransDataType(const platform::DeviceContext* ctx, const KernelTypePair& kernel_pair, const Variable& in, Variable* out) { @@ -121,6 +143,8 @@ std::vector NCHW2NHWC = {0, 2, 3, 1}; } REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType); +REGISTER_DATA_TRANSFORM_FN(f::KernelPlain, f::KernelCUDNN, f::DummyTrans); +REGISTER_DATA_TRANSFORM_FN(f::KernelCUDNN, f::KernelPlain, f::DummyTrans); REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW, std::bind(f::TransDataLayout, NHWC2NCHW, std::placeholders::_1, diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc index 3bea8f3d0a31a54ef9eebe5731696b64ceed13f8..7ec8d18b0e886948f4fb951e17875584413771db 100644 --- a/paddle/framework/init.cc +++ b/paddle/framework/init.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include "paddle/framework/init.h" +#include "paddle/framework/operator.h" #include "paddle/platform/device_context.h" #include "paddle/platform/place.h" #include "paddle/string/piece.h" @@ -24,7 +25,6 @@ namespace framework { std::once_flag gflags_init_flag; -// TODO(qijun) move init gflags to init.cc void InitGflags(std::vector &argv) { std::call_once(gflags_init_flag, [&]() { int argc = argv.size(); @@ -72,6 +72,7 @@ bool InitDevices(const std::vector &devices) { LOG(WARNING) << "Not specified CPU device, create CPU by Default."; } platform::DeviceContextPool::Init(places); + framework::UseALL(); return true; } diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index cef530c6e639f6e2188869fa57d114ec6b885aa8..a286925bbe4cc455a5956b4ac1800a2bafa3bfdb 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -12,13 +12,16 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/framework/op_registry.h" +#include #include +#include "paddle/framework/op_registry.h" + namespace pd = paddle::framework; namespace paddle { namespace framework { + class CosineOp : public OperatorBase { public: using OperatorBase::OperatorBase; @@ -252,7 +255,6 @@ TEST(OperatorRegistrar, CPU) { op->Run(scope, cpu_place); } -#ifdef PADDLE_WITH_CUDA TEST(OperatorRegistrar, CUDA) { paddle::framework::proto::OpDesc op_desc; paddle::platform::CUDAPlace cuda_place(0); @@ -263,4 +265,131 @@ TEST(OperatorRegistrar, CUDA) { op->Run(scope, cuda_place); } -#endif + +static int op_test_value = 0; + +using paddle::platform::DeviceContext; +using paddle::platform::CPUDeviceContext; +using paddle::platform::CUDADeviceContext; + +namespace paddle { +namespace framework { + +class OpWithMultiKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(InferShapeContext* ctx) const override {} + + framework::OpKernelType GetActualKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(proto::DataType::FP32, ctx.device_context()); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::OpKernelType& kernel) const override { + return framework::OpKernelType(kernel.data_type_, platform::CUDAPlace(0), + kernel.data_layout_, + framework::LibraryType::kCUDNN); + } +}; + +template +class OpMultiKernelTest : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const; +}; + +template +class OpMultiKernelTest + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + ++op_test_value; + } +}; + +template +class OpMultiKernelTest + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + --op_test_value; + } +}; + +template +class OpMultiKernelTest2 : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const; +}; + +template +class OpMultiKernelTest2 + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + op_test_value += 10; + } +}; + +template +class OpMultiKernelTest2 + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + op_test_value -= 10; + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(op_with_multi_kernel, + paddle::framework::OpWithMultiKernelTest, + paddle::framework::OpKernelTestMaker); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CPU, paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL( + op_with_multi_kernel, MKLDNN, paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest2); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CUDA, paddle::platform::CUDAPlace, + paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CUDNN, paddle::platform::CUDAPlace, + paddle::framework::OpMultiKernelTest2); + +TEST(OperatorRegistrar, OpWithMultiKernel) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CUDAPlace cuda_place(0); + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + op_desc.set_type("op_with_multi_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + // use all available kernels + paddle::framework::UseALL(); + op->Run(scope, cuda_place); + EXPECT_EQ(op_test_value, -10); + + // remove cuda kernels + paddle::framework::UseCPU(); + op->Run(scope, cpu_place); + + EXPECT_EQ(op_test_value, -9); + + // add cuda kernels + paddle::framework::UseCUDA(); + op->Run(scope, cuda_place); + + EXPECT_EQ(op_test_value, -10); + + // use cudnn kernel + paddle::framework::UseCUDNN(); + op->Run(scope, cuda_place); + EXPECT_EQ(op_test_value, -20); +} diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index fc7091f1c89f8b3f998f6d1b68f032b76bad2197..70a9c4b5554e28592299cccc4b6dcfd8636b970c 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include @@ -25,6 +26,53 @@ limitations under the License. */ namespace paddle { namespace framework { +std::vector> kKernelPriority; + +void UseCPU() { + kKernelPriority.clear(); + /*Plain CPU*/ + auto pair0 = std::make_tuple(platform::CPUPlace(), LibraryType::kPlain); + kKernelPriority.insert(kKernelPriority.begin(), pair0); +} + +void UseMKLDNN() { + UseCPU(); +#if PADDLE_WITH_MKLML + { + /*MKLDNN Kernel*/ + auto pair0 = std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN); + kKernelPriority.insert(kKernelPriority.begin(), pair0); + } +#endif +} + +void UseCUDA() { + UseMKLDNN(); +#if PADDLE_WITH_CUDA + /*Plain GPU*/ + auto pair0 = std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain); + kKernelPriority.insert(kKernelPriority.begin(), pair0); +#endif +} + +void UseCUDNN() { + UseCUDA(); +#if PADDLE_WITH_CUDA + if (platform::dynload::HasCUDNN()) { + /*CUDNN Kernel*/ + auto pair0 = std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN); + kKernelPriority.insert(kKernelPriority.begin(), pair0); + } +#endif +} + +void UseALL() { + UseCPU(); + UseMKLDNN(); + UseCUDA(); + UseCUDNN(); +} + std::string OperatorBase::Input(const std::string& name) const { auto& ins = Inputs(name); PADDLE_ENFORCE_LE(ins.size(), 1UL, @@ -402,6 +450,12 @@ const platform::DeviceContext* GetDeviceContext( } } +const platform::DeviceContext* GetDeviceContext( + const framework::OpKernelType& kernel) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + return pool.Get(kernel.place_); +} + void OperatorWithKernel::Run(const Scope& scope, const platform::Place& place) const { RuntimeInferShapeContext infer_shape_ctx(*this, scope); @@ -422,13 +476,8 @@ void OperatorWithKernel::Run(const Scope& scope, ExecutionContext ctx(*this, scope, *dev_ctx); auto actual_kernel_key = GetActualKernelType(ctx); - auto expected_kernel_key = GetExpectedKernelType(actual_kernel_key); - auto kernel_iter = kernels.find(expected_kernel_key); - if (kernel_iter == kernels.end()) { - PADDLE_THROW("The operator %s does not support %s", type_, - expected_kernel_key); - } + auto expected_kernel_key = GetExpectedKernelType(actual_kernel_key); if (actual_kernel_key == expected_kernel_key) { PADDLE_ENFORCE_EQ(actual_kernel_key.place_, expected_kernel_key.place_, @@ -436,9 +485,24 @@ void OperatorWithKernel::Run(const Scope& scope, "CPU and other devices. For example, multi-GPU model " "parallelism will failed."); } else { + // find the best key candidate + const DataTransformFnMap& trans_map = DataTransformFnMap::Instance(); + for (auto& candidate : kKernelPriority) { + auto candidate_key = + OpKernelType(actual_kernel_key.data_type_, std::get<0>(candidate), + actual_kernel_key.data_layout_, std::get<1>(candidate)); + + auto candidate_pair = std::make_pair(actual_kernel_key, candidate_key); + if ((actual_kernel_key == candidate_key) || + (kernels.count(candidate_key) && + trans_map.GetNullable(candidate_pair))) { + expected_kernel_key = candidate_key; + break; + } + } + auto kernel_pair = std::make_pair(actual_kernel_key, expected_kernel_key); - const DataTransformFn* trans_fun = - DataTransformFnMap::Instance().GetNullable(kernel_pair); + const DataTransformFn* trans_fun = trans_map.GetNullable(kernel_pair); if (trans_fun) { auto input_vars = this->InputVars(); // TODO(qijun) filter the input vars that do not need to be transformed @@ -471,7 +535,20 @@ void OperatorWithKernel::Run(const Scope& scope, } } - kernel_iter->second->Compute(ctx); + VLOG(10) << "Actual kernel: " << actual_kernel_key + << "Expected kernel: " << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); + + if (kernel_iter == kernels.end()) { + PADDLE_THROW("The operator %s does not support %s", type_, + expected_kernel_key); + } + + auto* expected_dev_ctx = GetDeviceContext(expected_kernel_key); + ExecutionContext expected_ctx(*this, scope, *expected_dev_ctx); + + kernel_iter->second->Compute(expected_ctx); } OpKernelType OperatorWithKernel::GetActualKernelType( diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index d0a9b643d565d6651fd7ec0b515f088362852ba3..1f5a4af58c5a9ad2fa8f4ac08ece67084b8f741a 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include @@ -52,10 +53,33 @@ constexpr char kGradVarSuffix[] = "@GRAD"; /// Variables with this suffix are supposed to be filled up with zeros. constexpr char kZeroVarSuffix[] = "@ZERO"; -// define some kernel hint -const std::string kUseCPU = "use_cpu"; -const std::string kUseCUDNN = "use_cudnn"; -const std::string kUseMKLDNN = "use_mkldnn"; +// define some kernel priority +extern std::vector> kKernelPriority; + +/** + * @brief Use cpu kernel only + */ +void UseCPU(); + +/** + * @brief Perfer MKLDNN kernel than Plain CPU kernel + */ +void UseMKLDNN(); + +/** + * @brief Perfer CUDA kernel than Plain CPU kernel + */ +void UseCUDA(); + +/** + * @brief Perfer cudnn kernel than Plain CUDA kernel + */ +void UseCUDNN(); + +/** + * @brief Use all available kernels + */ +void UseALL(); inline std::string GradVarName(const std::string& var_name) { return var_name + kGradVarSuffix; diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc index 0aa7dd48cafc3e2387ac902882d84ce9029cfcd0..0c5ed3e4e80304c6fd174975166804347feb18b1 100644 --- a/paddle/operators/conv_cudnn_op.cu.cc +++ b/paddle/operators/conv_cudnn_op.cu.cc @@ -315,10 +315,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_KERNEL(conv2d, CUDNN, paddle::platform::CUDAPlace, - paddle::operators::CudnnConvOpKernel, - paddle::operators::CudnnConvOpKernel); - +// TODO(dzhwinter) : below register should be removed REGISTER_OP_CUDA_KERNEL(conv2d_cudnn, paddle::operators::CudnnConvOpKernel, paddle::operators::CudnnConvOpKernel); diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h index 83786e2329e7ae3c2908fdfdaeb1f79d19a53f47..fe3c0bc9302257d444c7431c40c8ab7e4c1fe0e7 100644 --- a/paddle/operators/conv_op.h +++ b/paddle/operators/conv_op.h @@ -62,12 +62,25 @@ class ConvOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override; + framework::OpKernelType GetExpectedKernelType( + const framework::OpKernelType& kernel) const override { + return framework::OpKernelType(kernel.data_type_, platform::CUDAPlace(0), + kernel.data_layout_, + framework::LibraryType::kCUDNN); + } }; class ConvOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override; + + framework::OpKernelType GetExpectedKernelType( + const framework::OpKernelType& kernel) const override { + return framework::OpKernelType(kernel.data_type_, platform::CUDAPlace(0), + kernel.data_layout_, + framework::LibraryType::kCUDNN); + } }; template diff --git a/paddle/pybind/const_value.cc b/paddle/pybind/const_value.cc index 761635aa5e5eac445c2ec8331b0dc37ffd11248c..b13ad42ea29453354798d88bff8ef47339d1a614 100644 --- a/paddle/pybind/const_value.cc +++ b/paddle/pybind/const_value.cc @@ -23,11 +23,6 @@ void BindConstValue(pybind11::module& m) { m.def("kTempVarName", [] { return framework::kTempVarName; }); m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; }); m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; }); - - // for kernel_hint key - m.def("kUseCPU", [] { return framework::kUseCPU; }); - m.def("kUseCUDNN", [] { return framework::kUseCUDNN; }); - m.def("kUseMKLDNN", [] { return framework::kUseMKLDNN; }); } } // namespace pybind diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 364db62cba6acd7ac380b5017d09f22eefa24813..5d170c66e97f56440968ba568167e6845631e1cc 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -430,6 +430,12 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_glog", framework::InitGLOG); m.def("init_devices", &framework::InitDevices); + m.def("use_cpu", framework::UseCPU); + m.def("use_mkldnn", framework::UseMKLDNN); + m.def("use_cuda", framework::UseCUDA); + m.def("use_cudnn", framework::UseCUDNN); + m.def("use_all", framework::UseALL); + m.def("is_compile_gpu", IsCompileGPU); m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index b66a8bce5f4f15539007876c113afd3f878b00bc..7340dd23d160d9074b84fa7581dd84c2638e11f3 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -17,10 +17,6 @@ TEMP_VAR_NAME = core.kTempVarName() GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() -USE_CPU = core.kUseCPU() -USE_CUDNN = core.kUseMKLDNN() -USE_MKLDNN = core.kUseMKLDNN() - def grad_var_name(var_name): """ diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py index e82e3ab0c9c0bc75a13a8948fda925bc4f0b6512..958300e655e012b91598360105ca2734c3bd2c37 100644 --- a/python/paddle/v2/fluid/tests/test_conv2d_op.py +++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py @@ -1,5 +1,7 @@ import unittest import numpy as np + +import paddle.v2.fluid.core as core from op_test import OpTest @@ -47,6 +49,7 @@ def conv2d_forward_naive(input, filter, group, conv_param): class TestConv2dOp(OpTest): def setUp(self): + core.use_cuda() self.init_op_type() self.init_group() self.init_dilation() @@ -167,26 +170,31 @@ class TestWithDilation(TestConv2dOp): #----------------Conv2dCudnn---------------- class TestCudnn(TestConv2dOp): def init_op_type(self): + core.use_cudnn() self.op_type = "conv2d_cudnn" class TestCudnnWithPad(TestWithPad): def init_op_type(self): + core.use_cudnn() self.op_type = "conv2d_cudnn" class TestCudnnWithStride(TestWithStride): def init_op_type(self): + core.use_cudnn() self.op_type = "conv2d_cudnn" class TestCudnnWithGroup(TestWithGroup): def init_op_type(self): + core.use_cudnn() self.op_type = "conv2d_cudnn" class TestCudnnWith1x1(TestWith1x1): def init_op_type(self): + core.use_cudnn() self.op_type = "conv2d_cudnn"