Commit 8c71adaa authored by pzelazko-intel, committed by Tao Luo

MKLDNN conv2d kernel added (#8451)

* MKLDNN conv2d OP kernel added

* TODOs added

* mkldnn conv2d OP refactor

* CanCUDNNBeUsed and CanMKLDNNBeUsed moved
Parent 049383c6
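In short, this commit wires MKLDNN conv2d sources into the operator build, adds conv2d/conv2d_grad MKLDNN CPU kernels, routes kernel selection through CanCUDNNBeUsed/CanMKLDNNBeUsed, and exposes a use_mkldnn attribute up to the Python layers. A minimal usage sketch follows (the paddle.fluid import path and the surrounding layer arguments are assumptions based on the Fluid API of this period; only the use_mkldnn flag itself comes from this diff):

# Minimal sketch, assuming a CPU build configured with WITH_MKLDNN=ON;
# use_mkldnn=True lets ConvOp::GetExpectedKernelType pick the MKLDNN kernel.
import paddle.fluid as fluid

data = fluid.layers.data(name='data', shape=[3, 224, 224], dtype='float32')
conv = fluid.layers.conv2d(
    input=data,
    num_filters=64,
    filter_size=3,
    use_cudnn=False,
    use_mkldnn=True)  # new attribute added by this commit; defaults to False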
file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
list(REMOVE_DUPLICATES GENERAL_OPS)
set(DEPS_OPS "")
set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
@@ -13,6 +15,8 @@ function(op_library TARGET)
set(cu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(CUDNN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function)
set(options "")
set(oneValueArgs "")
@@ -36,12 +40,20 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif()
if(WITH_MKLDNN)
string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
@@ -62,11 +74,11 @@ function(op_library TARGET)
set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
endif()
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
# Define operators that don't need pybind here.
@@ -101,7 +113,8 @@ function(op_library TARGET)
# pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
@@ -112,6 +125,11 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
# pybind USE_OP
if (${pybind_flag} EQUAL 0)
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::MKLDNNMemDesc;
using mkldnn::memory;  // Note: Paddle also has a "memory" namespace
using mkldnn::primitive;
using mkldnn::convolution_forward;
using mkldnn::convolution_backward_weights;
using mkldnn::convolution_backward_data;
using mkldnn::convolution_direct;
using mkldnn::prop_kind;
using mkldnn::padding_kind;
using mkldnn::stream;
namespace {
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine);
convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc(
const memory::desc& src, const memory::desc& diff_weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine);
convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
const memory::desc& diff_src, const memory::desc& weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine);
} // anonymous namespace
template <typename T>
class ConvOpMkldnnKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output");
// Get a unique name from the "argument" name of the "Output" variable.
// This name will be used as the key when saving info into the device context.
const std::string key = ctx.op().Output("Output");
const std::string key_conv_pd = key + "@conv_pd";
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
// TODO(pzelazko-intel) add support for group convolution and dilation
PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet");
PADDLE_ENFORCE(
dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
// allocate memory for output
T* output_data = output->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE(input->dims().size() == 4,
"Input must be with 4 dimensions, i.e. NCHW");
PADDLE_ENFORCE(filter->dims().size() == 4,
"Filter must be with 4 dimensions, i.e. OIHW");
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
// memory descriptors for convolution src/weight/dst
auto conv_src_md =
MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
auto conv_weights_md =
MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
auto conv_dst_md =
MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw);
// create memory primitives
auto conv_src_memory =
memory({conv_src_md, mkldnn_engine}, (void*)input_data);
auto conv_weights_memory =
memory({conv_weights_md, mkldnn_engine}, (void*)filter_data);
auto conv_dst_memory = memory({conv_dst_md, mkldnn_engine}, output_data);
std::unique_ptr<convolution_forward::primitive_desc> conv_pd =
ConvFwdPrimitiveDesc(conv_src_md, conv_weights_md, conv_dst_md, strides,
paddings, mkldnn_engine);
// save p_conv_pd into dev_ctx to be referred in backward path
auto p_conv_pd = conv_pd.get();
std::shared_ptr<void> conv_pd_value = std::move(conv_pd);
dev_ctx.SetBlob(key_conv_pd, conv_pd_value);
// create convolution op primitive
auto conv_prim = convolution_forward(*p_conv_pd, conv_src_memory,
conv_weights_memory, conv_dst_memory);
// push op to stream and wait until MKLDNN executes it
std::vector<primitive> pipeline{conv_prim};
stream(stream::kind::eager).submit(pipeline).wait();
}
};
template <typename T>
class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
const Tensor* input = ctx.Input<Tensor>("Input");
const Tensor* filter = ctx.Input<Tensor>("Filter");
const Tensor* output = ctx.Input<Tensor>("Output");
const Tensor* output_grad =
ctx.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
if (!input_grad && !filter_grad) return;
// Get a unique name from the "argument" name of the "Output" variable.
// This name will be used as the key when saving info into the device context.
const std::string key = ctx.op().Input("Output");
const std::string key_conv_pd = key + "@conv_pd";
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
const T* output_grad_data = output_grad->data<T>();
T* input_grad_data = nullptr;
T* filter_grad_data = nullptr;
// allocate memory for gradient of input/filter
if (input_grad) {
input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
}
if (filter_grad) {
filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
}
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
auto conv_src_md =
MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
auto conv_diff_src_md =
MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
auto conv_weights_md =
MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
auto conv_diff_weights_md =
MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
auto conv_diff_dst_md =
MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw);
// create memory
auto conv_diff_dst_memory =
memory({conv_diff_dst_md, mkldnn_engine}, (void*)output_grad_data);
// Retrieve conv_pd from device context
std::shared_ptr<void> conv_pd;
convolution_forward::primitive_desc* p_conv_pd;
conv_pd = dev_ctx.GetBlob(key_conv_pd);
PADDLE_ENFORCE(conv_pd != nullptr,
"Fail to find conv_pd in device context");
p_conv_pd =
static_cast<convolution_forward::primitive_desc*>(conv_pd.get());
// create backward conv primitive for weights
if (filter_grad) {
// create primitive descriptor
convolution_backward_weights::primitive_desc conv_bwd_weights_pd =
ConvBwdWeightsPrimitiveDesc(conv_src_md, conv_diff_weights_md,
conv_diff_dst_md, strides, paddings,
*p_conv_pd, mkldnn_engine);
// create memory
auto conv_diff_weights_memory = memory(
{conv_diff_weights_md, mkldnn_engine}, (void*)filter_grad_data);
auto conv_src_memory =
memory({conv_src_md, mkldnn_engine}, (void*)input_data);
// create backward conv primitive for weights
auto conv_bwd_weights_prim = convolution_backward_weights(
conv_bwd_weights_pd, conv_src_memory, conv_diff_dst_memory,
conv_diff_weights_memory);
// push primitive and execute it
std::vector<primitive> pipeline{conv_bwd_weights_prim};
stream(stream::kind::eager).submit(pipeline).wait();
}
if (input_grad) {
// create primitive descriptor
convolution_backward_data::primitive_desc conv_bwd_data_pd =
ConvBwdDataPrimitiveDesc(conv_diff_src_md, conv_weights_md,
conv_diff_dst_md, strides, paddings,
*p_conv_pd, mkldnn_engine);
// create memory
auto conv_diff_src_memory =
memory({conv_diff_src_md, mkldnn_engine}, (void*)input_grad_data);
auto conv_weights_memory =
memory({conv_weights_md, mkldnn_engine}, (void*)filter_data);
// create backward conv primitive for data
auto conv_bwd_data_prim =
convolution_backward_data(conv_bwd_data_pd, conv_diff_dst_memory,
conv_weights_memory, conv_diff_src_memory);
// push primitive and execute it
std::vector<primitive> pipeline{conv_bwd_data_prim};
stream(stream::kind::eager).submit(pipeline).wait();
}
} // Compute()
};
namespace {
std::unique_ptr<convolution_forward::primitive_desc> ConvFwdPrimitiveDesc(
const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings, const mkldnn::engine& engine) {
mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
auto conv_desc = mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, dst,
stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
auto p_conv_pd = new convolution_forward::primitive_desc(conv_desc, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
}
convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc(
const memory::desc& src, const memory::desc& diff_weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine) {
auto conv_bwd_weights_desc = convolution_backward_weights::desc(
convolution_direct, src, diff_weights, diff_dst, strides, paddings,
paddings, padding_kind::zero);
return convolution_backward_weights::primitive_desc(conv_bwd_weights_desc,
engine, conv_pd);
}
convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
const memory::desc& diff_src, const memory::desc& weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine) {
auto conv_bwd_data_desc = convolution_backward_data::desc(
convolution_direct, diff_src, weights, diff_dst, strides, paddings,
paddings, padding_kind::zero);
return convolution_backward_data::primitive_desc(conv_bwd_data_desc, engine,
conv_pd);
}
} // anonymous namespace
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace,
ops::ConvOpMkldnnKernel<float>);
REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
ops::ConvGradOpMkldnnKernel<float>);
@@ -13,6 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/conv_op.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
@@ -64,22 +70,21 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
@@ -131,6 +136,9 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
@@ -224,6 +232,9 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
@@ -284,23 +295,21 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
......
@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
@@ -282,5 +284,17 @@ class ScopedPoolingDescriptor {
DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
};
inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
#ifdef PADDLE_WITH_CUDA
if (use_cudnn) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
}
#endif
return use_cudnn;
}
} // namespace platform
} // namespace paddle
@@ -33,9 +33,15 @@ DeviceContextPool::DeviceContextPool(
PADDLE_ENFORCE_GT(places.size(), 0);
for (size_t i = 0; i < places.size(); i++) {
if (platform::is_cpu_place(places[i])) {
#ifdef PADDLE_WITH_MKLDNN
device_contexts_.emplace(places[i],
new platform::MKLDNNDeviceContext(
boost::get<platform::CPUPlace>(places[i])));
#else
device_contexts_.emplace(places[i],
new platform::CPUDeviceContext(
boost::get<platform::CPUPlace>(places[i])));
#endif
} else if (platform::is_gpu_place(places[i])) {
#ifdef PADDLE_WITH_CUDA
device_contexts_.emplace(places[i],
@@ -170,64 +176,38 @@ cudaStream_t CUDADeviceContext::stream() const { return stream_; }
#ifdef PADDLE_WITH_MKLDNN
MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
: CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() {
p_blobs_.reset(new std::unordered_map<std::string, std::shared_ptr<void>>());
}
void MKLDNNDeviceContext::SetBlob(const std::string& name,
std::shared_ptr<void> data) const {
std::unordered_map<std::string, std::shared_ptr<void>>* p;
p = p_blobs_.get();
auto it = p->find(name);
if (it == p->end()) {
(*p)[name] = data;  // create new blob
} else {
it->second = data;  // set data to existing blob
}
return;
}
std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
const std::string& name) const {
std::unordered_map<std::string, std::shared_ptr<void>>* p;
p = p_blobs_.get();
auto it = p->find(name);
if (it != p->end()) {
return it->second;
}
return nullptr;
}
#endif
......
@@ -22,7 +22,7 @@ limitations under the License. */
#endif
#ifdef PADDLE_WITH_MKLDNN
#include <mkldnn.hpp>
#endif
#include "paddle/fluid/platform/enforce.h"
@@ -114,46 +114,19 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
public:
explicit MKLDNNDeviceContext(CPUPlace place);
/* \brief Get the active engine */
const mkldnn::engine& GetEngine() const { return engine_; }
// Set data to blob (i.e. name/data pair). Create blob if not existing
void SetBlob(const std::string& name, std::shared_ptr<void> data) const;
// Find a saved blob. Return nullptr if not found
std::shared_ptr<void> GetBlob(const std::string& name) const;
private:
mkldnn::engine engine_;
std::shared_ptr<std::unordered_map<std::string, std::shared_ptr<void>>>
p_blobs_;
};
#endif
......
@@ -16,12 +16,15 @@ limitations under the License. */
#include <mkldnn.hpp>
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace platform {
using MKLDNNStream = mkldnn::stream;
using MKLDNNEngine = mkldnn::engine;
using MKLDNNMemory = mkldnn::memory;
using MKLDNNMemoryDescriptor = mkldnn::memory::desc;
using MKLDNNPrimitive = mkldnn::primitive;
using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>;
@@ -31,5 +34,17 @@ typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr;
typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr;
typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr;
inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector<int>& dims,
mkldnn::memory::data_type data_type,
mkldnn::memory::format format) {
mkldnn::memory::dims tz = dims;
return mkldnn::memory::desc({tz}, data_type, format);
}
inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) {
bool use_mkldnn = ctx.Attr<bool>("use_mkldnn");
return use_mkldnn && platform::is_cpu_place(ctx.GetPlace());
}
} // namespace platform
} // namespace paddle
@@ -1111,6 +1111,7 @@ def conv2d(input,
param_attr=None,
bias_attr=None,
use_cudnn=True,
use_mkldnn=False,
act=None):
"""
**Convolution2D Layer**
@@ -1252,7 +1253,8 @@ def conv2d(input,
'strides': stride,
'paddings': padding,
'groups': groups,
'use_cudnn': use_cudnn,
'use_mkldnn': use_mkldnn
})
pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
......
@@ -29,14 +29,16 @@ def simple_img_conv_pool(input,
act,
param_attr=None,
pool_type='max',
use_cudnn=True,
use_mkldnn=False):
conv_out = layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
param_attr=param_attr,
act=act,
use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
pool_out = layers.pool2d(
input=conv_out,
@@ -58,7 +60,8 @@ def img_conv_group(input,
conv_batchnorm_drop_rate=0.0,
pool_stride=1,
pool_type=None,
use_cudnn=True,
use_mkldnn=False):
""" """
Image Convolution Group, Used for vgg net. Image Convolution Group, Used for vgg net.
""" """
...@@ -90,7 +93,8 @@ def img_conv_group(input, ...@@ -90,7 +93,8 @@ def img_conv_group(input,
padding=conv_padding[i], padding=conv_padding[i],
param_attr=param_attr[i], param_attr=param_attr[i],
act=local_conv_act, act=local_conv_act,
use_cudnn=use_cudnn) use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
if conv_with_batchnorm[i]:
tmp = layers.batch_norm(input=tmp, act=conv_act)
......
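The image-network helpers above only forward the new flag down to layers.conv2d. A hedged sketch of enabling MKLDNN through simple_img_conv_pool (the paddle.fluid import path and the data-layer shape are assumptions; only the use_cudnn/use_mkldnn keywords come from this diff):

# Sketch only: simple_img_conv_pool forwards use_mkldnn unchanged to conv2d,
# so MKLDNN can be toggled without touching the underlying operator code.
import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
conv_pool = fluid.nets.simple_img_conv_pool(
    input=img,
    num_filters=20,
    filter_size=5,
    pool_size=2,
    pool_stride=2,
    act="relu",
    use_cudnn=False,
    use_mkldnn=True)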
@@ -64,6 +64,7 @@ def conv2d_forward_naive(input, filter, group, conv_param):
class TestConv2dOp(OpTest):
def setUp(self):
self.use_cudnn = False
self.use_mkldnn = False
self.init_op_type()
self.init_group()
self.init_dilation()
@@ -85,7 +86,8 @@ class TestConv2dOp(OpTest):
'paddings': self.pad,
'groups': self.groups,
'dilations': self.dilations,
'use_cudnn': self.use_cudnn,
'use_mkldnn': self.use_mkldnn
}
self.outputs = {'Output': output}
@@ -290,5 +292,25 @@ class TestDepthwiseConv2(TestConv2dOp):
# def init_op_type(self):
# self.op_type = "conv_cudnn"
#----------------Conv2dMKLDNN----------------
class TestMKLDNN(TestConv2dOp):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "conv2d"
class TestMKLDNNWithPad(TestWithPad):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "conv2d"
class TestMKLDNNWithStride(TestWithStride):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "conv2d"
if __name__ == '__main__':
unittest.main()
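Since the new MKLDNN cases only override init_op_type, they can also be collected and run on their own with the stock unittest machinery; a hedged sketch (assumes test_conv2d_op.py is importable from the working directory):

# Run only the MKLDNN conv2d tests added by this commit (sketch).
import unittest
from test_conv2d_op import TestMKLDNN, TestMKLDNNWithPad, TestMKLDNNWithStride

suite = unittest.TestSuite()
for case in (TestMKLDNN, TestMKLDNNWithPad, TestMKLDNNWithStride):
    suite.addTests(unittest.defaultTestLoader.loadTestsFromTestCase(case))
unittest.TextTestRunner(verbosity=2).run(suite)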