提交 8c71adaa 编写于 作者: P pzelazko-intel 提交者: Tao Luo

MKLDNN conv2d kernel added (#8451)

* MKLDNN conv2 OP kernel added

* TODOs added

* mkldnn conv2d OP refactor

* CanCUDNNBeUsed and CanMKLDNNBeUsed moved
上级 049383c6
file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
list(REMOVE_DUPLICATES GENERAL_OPS)
set(DEPS_OPS "")
set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
......@@ -13,6 +15,8 @@ function(op_library TARGET)
set(cu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(CUDNN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function)
set(options "")
set(oneValueArgs "")
......@@ -36,12 +40,20 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif()
if(WITH_MKLDNN)
string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
......@@ -62,10 +74,10 @@ function(op_library TARGET)
set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
endif()
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
......@@ -101,7 +113,8 @@ function(op_library TARGET)
# pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
......@@ -112,6 +125,11 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
# pybind USE_OP
if (${pybind_flag} EQUAL 0)
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::MKLDNNMemDesc;
using mkldnn::memory; // Note: paddle has also "memory" namespace
using mkldnn::primitive;
using mkldnn::convolution_forward;
using mkldnn::convolution_backward_weights;
using mkldnn::convolution_backward_data;
using mkldnn::convolution_direct;
using mkldnn::prop_kind;
using mkldnn::padding_kind;
using mkldnn::stream;
namespace {
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine);
convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc(
const memory::desc& src, const memory::desc& diff_weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine);
convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
const memory::desc& diff_src, const memory::desc& weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine);
} // anonymous namespace
template <typename T>
class ConvOpMkldnnKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output");
// Get an unique name from "argument" name of "Output" variable
// This name will be used as key when saving info into device context
const std::string key = ctx.op().Output("Output");
const std::string key_conv_pd = key + "@conv_pd";
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
// TODO(pzelazko-intel) add support for group convolution and dilation
PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet");
PADDLE_ENFORCE(
dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
// allocate memory for output
T* output_data = output->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE(input->dims().size() == 4,
"Input must be with 4 dimensions, i.e. NCHW");
PADDLE_ENFORCE(filter->dims().size() == 4,
"Filter must be with 4 dimensions, i.e. OIHW");
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
// memory descriptors for convolution src/weight/dst
auto conv_src_md =
MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
auto conv_weights_md =
MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
auto conv_dst_md =
MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw);
// create memory primitives
auto conv_src_memory =
memory({conv_src_md, mkldnn_engine}, (void*)input_data);
auto conv_weights_memory =
memory({conv_weights_md, mkldnn_engine}, (void*)filter_data);
auto conv_dst_memory = memory({conv_dst_md, mkldnn_engine}, output_data);
std::unique_ptr<convolution_forward::primitive_desc> conv_pd =
ConvFwdPrimitiveDesc(conv_src_md, conv_weights_md, conv_dst_md, strides,
paddings, mkldnn_engine);
// save p_conv_pd into dev_ctx to be referred in backward path
auto p_conv_pd = conv_pd.get();
std::shared_ptr<void> conv_pd_value = std::move(conv_pd);
dev_ctx.SetBlob(key_conv_pd, conv_pd_value);
// create convolution op primitive
auto conv_prim = convolution_forward(*p_conv_pd, conv_src_memory,
conv_weights_memory, conv_dst_memory);
// push op to stream and wait MKLDNN until it's executed
std::vector<primitive> pipeline{conv_prim};
stream(stream::kind::eager).submit(pipeline).wait();
}
};
template <typename T>
class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
const Tensor* input = ctx.Input<Tensor>("Input");
const Tensor* filter = ctx.Input<Tensor>("Filter");
const Tensor* output = ctx.Input<Tensor>("Output");
const Tensor* output_grad =
ctx.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
if (!input_grad && !filter_grad) return;
// Get an unique name from "argument" name of "Output" variable
// This name will be used as key when saving info into device context
const std::string key = ctx.op().Input("Output");
const std::string key_conv_pd = key + "@conv_pd";
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
const T* output_grad_data = output_grad->data<T>();
T* input_grad_data = nullptr;
T* filter_grad_data = nullptr;
// allocate memory for gradient of input/filter
if (input_grad) {
input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
}
if (filter_grad) {
filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
}
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
auto conv_src_md =
MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
auto conv_diff_src_md =
MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
auto conv_weights_md =
MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
auto conv_diff_weights_md =
MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
auto conv_diff_dst_md =
MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw);
// create memory
auto conv_diff_dst_memory =
memory({conv_diff_weights_md, mkldnn_engine}, (void*)output_grad_data);
// Retrieve conv_pd from device context
std::shared_ptr<void> conv_pd;
convolution_forward::primitive_desc* p_conv_pd;
conv_pd = dev_ctx.GetBlob(key_conv_pd);
PADDLE_ENFORCE(conv_pd != nullptr,
"Fail to find conv_pd in device context");
p_conv_pd =
static_cast<convolution_forward::primitive_desc*>(conv_pd.get());
// create backward conv primitive for weights
if (filter_grad) {
// create primitive descriptor
convolution_backward_weights::primitive_desc conv_bwd_weights_pd =
ConvBwdWeightsPrimitiveDesc(conv_src_md, conv_diff_weights_md,
conv_diff_dst_md, strides, paddings,
*p_conv_pd, mkldnn_engine);
// create memory
auto conv_diff_weights_memory = memory(
{conv_diff_weights_md, mkldnn_engine}, (void*)filter_grad_data);
auto conv_src_memory =
memory({conv_src_md, mkldnn_engine}, (void*)input_data);
// create backward conv primitive for weights
auto conv_bwd_weights_prim = convolution_backward_weights(
conv_bwd_weights_pd, conv_src_memory, conv_diff_dst_memory,
conv_diff_weights_memory);
// push primitive and execute it
std::vector<primitive> pipeline{conv_bwd_weights_prim};
stream(stream::kind::eager).submit(pipeline).wait();
}
if (input_grad) {
// create primitive descriptor
convolution_backward_data::primitive_desc conv_bwd_data_pd =
ConvBwdDataPrimitiveDesc(conv_diff_src_md, conv_weights_md,
conv_diff_dst_md, strides, paddings,
*p_conv_pd, mkldnn_engine);
// create memory
auto conv_diff_src_memory =
memory({conv_diff_src_md, mkldnn_engine}, (void*)input_grad_data);
auto conv_weights_memory =
memory({conv_weights_md, mkldnn_engine}, (void*)filter_data);
// create backward conv primitive for data
auto conv_bwd_data_prim =
convolution_backward_data(conv_bwd_data_pd, conv_diff_dst_memory,
conv_weights_memory, conv_diff_src_memory);
// push primitive and execute it
std::vector<primitive> pipeline{conv_bwd_data_prim};
stream(stream::kind::eager).submit(pipeline).wait();
}
} // Compute()
};
namespace {
std::unique_ptr<convolution_forward::primitive_desc> ConvFwdPrimitiveDesc(
const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings, const mkldnn::engine& engine) {
mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
auto conv_desc = mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, dst,
stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
auto p_conv_pd = new convolution_forward::primitive_desc(conv_desc, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
}
convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc(
const memory::desc& src, const memory::desc& diff_weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine) {
auto conv_bwd_weights_desc = convolution_backward_weights::desc(
convolution_direct, src, diff_weights, diff_dst, strides, paddings,
paddings, padding_kind::zero);
return convolution_backward_weights::primitive_desc(conv_bwd_weights_desc,
engine, conv_pd);
}
convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
const memory::desc& diff_src, const memory::desc& weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine) {
auto conv_bwd_data_desc = convolution_backward_data::desc(
convolution_direct, diff_src, weights, diff_dst, strides, paddings,
paddings, padding_kind::zero);
return convolution_backward_data::primitive_desc(conv_bwd_data_desc, engine,
conv_pd);
}
} // anonymous namespace
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace,
ops::ConvOpMkldnnKernel<float>);
REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
ops::ConvGradOpMkldnnKernel<float>);
......@@ -13,6 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/conv_op.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -64,22 +70,21 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
framework::LibraryType library_;
if (use_cudnn) {
library_ = framework::LibraryType::kCUDNN;
} else {
library_ = framework::LibraryType::kPlain;
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
......@@ -131,6 +136,9 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
......@@ -224,6 +232,9 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
......@@ -284,23 +295,21 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
framework::LibraryType library_;
if (use_cudnn) {
library_ = framework::LibraryType::kCUDNN;
} else {
library_ = framework::LibraryType::kPlain;
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
......@@ -282,5 +284,17 @@ class ScopedPoolingDescriptor {
DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
};
inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
#ifdef PADDLE_WITH_CUDA
if (use_cudnn) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
}
#endif
return use_cudnn;
}
} // namespace platform
} // namespace paddle
......@@ -33,9 +33,15 @@ DeviceContextPool::DeviceContextPool(
PADDLE_ENFORCE_GT(places.size(), 0);
for (size_t i = 0; i < places.size(); i++) {
if (platform::is_cpu_place(places[i])) {
#ifdef PADDLE_WITH_MKLDNN
device_contexts_.emplace(places[i],
new platform::MKLDNNDeviceContext(
boost::get<platform::CPUPlace>(places[i])));
#else
device_contexts_.emplace(places[i],
new platform::CPUDeviceContext(
boost::get<platform::CPUPlace>(places[i])));
#endif
} else if (platform::is_gpu_place(places[i])) {
#ifdef PADDLE_WITH_CUDA
device_contexts_.emplace(places[i],
......@@ -170,64 +176,38 @@ cudaStream_t CUDADeviceContext::stream() const { return stream_; }
#ifdef PADDLE_WITH_MKLDNN
MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
: CPUDeviceContext(place), ready_(false) {
stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
: CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() {
p_blobs_.reset(new std::unordered_map<std::string, std::shared_ptr<void>>());
}
template <typename T>
void MKLDNNDeviceContext::AddElement(const std::string& op_key,
const T& value) {
if (GetElement<T>(op_key)) {
return;
}
GetElementPool<T>().emplace(op_key, std::move(value));
}
void MKLDNNDeviceContext::SetBlob(const std::string& name,
std::shared_ptr<void> data) const {
std::unordered_map<std::string, std::shared_ptr<void>>* p;
p = p_blobs_.get();
template <typename T>
const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const {
auto it = GetElementPool<T>().find(op_key);
return it == GetElementPool<T>().end() ? nullptr : it->second;
}
auto it = p->find(name);
template <>
const std::unordered_map<const std::string, const MKLDNNMemoryPtr,
std::hash<std::string>>&
MKLDNNDeviceContext::GetElementPool<MKLDNNMemoryPtr>() const {
return memory_pool_;
}
if (it == p->end()) {
(*p)[name] = data; // create new blob
} else {
it->second = data; // set data to existing blob
}
template <>
const std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
std::hash<std::string>>&
MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitivePtr>() const {
return primitive_pool_;
return;
}
template <>
const std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
std::hash<std::string>>&
MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitiveDescPtr>() const {
return primitive_desc_pool_;
}
std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
const std::string& name) const {
std::unordered_map<std::string, std::shared_ptr<void>>* p;
p = p_blobs_.get();
void MKLDNNDeviceContext::Execute(bool block) {
if (pipeline_.empty()) {
return;
}
ResetStream();
stream_->submit(pipeline_).wait(block);
ready_ = false;
pipeline_.clear();
}
auto it = p->find(name);
void MKLDNNDeviceContext::ResetStream() {
if (ready_) {
return;
if (it != p->end()) {
return it->second;
}
// TODO(TJ): change me when mkldnn have specific method to reset this state
stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
ready_ = true;
return nullptr;
}
#endif
......
......@@ -22,7 +22,7 @@ limitations under the License. */
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#include <mkldnn.hpp>
#endif
#include "paddle/fluid/platform/enforce.h"
......@@ -114,46 +114,19 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
public:
explicit MKLDNNDeviceContext(CPUPlace place);
/* \brief Add new element: memory, primitive or primitive desc */
template <typename T>
void AddElement(const std::string& op_key, const T& value);
/* \brief Get existed element: memory, primitive or primitive desc */
template <typename T>
const T& GetElement(const std::string& op_key) const;
/* \brief Get element pool: memory, primitive or primitive desc pool */
template <typename T>
const std::unordered_map<const std::string, const T, std::hash<std::string>>&
GetElementPool() const;
/* \brief Get the active engine */
const MKLDNNEngine& engine() const { return *engine_; }
/* \brief Submit primitive to pipeline */
void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); }
const mkldnn::engine& GetEngine() const { return engine_; }
/*! \brief Execute all submitted primitives in pipeline */
void Execute(bool block = true);
// Set data to blob (i.e. name/data pair). Create blob if not existing
void SetBlob(const std::string& name, std::shared_ptr<void> data) const;
protected:
/*! \brief Reset the stream to prepare next exectue */
void ResetStream();
// Find a saved blob. Return nullptr if not found
std::shared_ptr<void> GetBlob(const std::string& name) const;
private:
std::unordered_map<const std::string, const MKLDNNMemoryPtr,
std::hash<std::string>>
memory_pool_;
std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
std::hash<std::string>>
primitive_pool_;
std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
std::hash<std::string>>
primitive_desc_pool_;
std::vector<MKLDNNPrimitive> pipeline_;
MKLDNNStreamPtr stream_;
MKLDNNEnginePtr engine_;
bool ready_;
mkldnn::engine engine_;
std::shared_ptr<std::unordered_map<std::string, std::shared_ptr<void>>>
p_blobs_;
};
#endif
......
......@@ -16,12 +16,15 @@ limitations under the License. */
#include <mkldnn.hpp>
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace platform {
using MKLDNNStream = mkldnn::stream;
using MKLDNNEngine = mkldnn::engine;
using MKLDNNMemory = mkldnn::memory;
using MKLDNNMemoryDescriptor = mkldnn::memory::desc;
using MKLDNNPrimitive = mkldnn::primitive;
using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>;
......@@ -31,5 +34,17 @@ typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr;
typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr;
typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr;
inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector<int>& dims,
mkldnn::memory::data_type data_type,
mkldnn::memory::format format) {
mkldnn::memory::dims tz = dims;
return mkldnn::memory::desc({tz}, data_type, format);
}
inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) {
bool use_mkldnn = ctx.Attr<bool>("use_mkldnn");
return use_mkldnn && platform::is_cpu_place(ctx.GetPlace());
}
} // namespace platform
} // namespace paddle
......@@ -1111,6 +1111,7 @@ def conv2d(input,
param_attr=None,
bias_attr=None,
use_cudnn=True,
use_mkldnn=False,
act=None):
"""
**Convlution2D Layer**
......@@ -1252,7 +1253,8 @@ def conv2d(input,
'strides': stride,
'paddings': padding,
'groups': groups,
'use_cudnn': use_cudnn
'use_cudnn': use_cudnn,
'use_mkldnn': use_mkldnn
})
pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
......
......@@ -29,14 +29,16 @@ def simple_img_conv_pool(input,
act,
param_attr=None,
pool_type='max',
use_cudnn=True):
use_cudnn=True,
use_mkldnn=False):
conv_out = layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
param_attr=param_attr,
act=act,
use_cudnn=use_cudnn)
use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
pool_out = layers.pool2d(
input=conv_out,
......@@ -58,7 +60,8 @@ def img_conv_group(input,
conv_batchnorm_drop_rate=0.0,
pool_stride=1,
pool_type=None,
use_cudnn=True):
use_cudnn=True,
use_mkldnn=False):
"""
Image Convolution Group, Used for vgg net.
"""
......@@ -90,7 +93,8 @@ def img_conv_group(input,
padding=conv_padding[i],
param_attr=param_attr[i],
act=local_conv_act,
use_cudnn=use_cudnn)
use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
if conv_with_batchnorm[i]:
tmp = layers.batch_norm(input=tmp, act=conv_act)
......
......@@ -64,6 +64,7 @@ def conv2d_forward_naive(input, filter, group, conv_param):
class TestConv2dOp(OpTest):
def setUp(self):
self.use_cudnn = False
self.use_mkldnn = False
self.init_op_type()
self.init_group()
self.init_dilation()
......@@ -85,7 +86,8 @@ class TestConv2dOp(OpTest):
'paddings': self.pad,
'groups': self.groups,
'dilations': self.dilations,
'use_cudnn': self.use_cudnn
'use_cudnn': self.use_cudnn,
'use_mkldnn': self.use_mkldnn
}
self.outputs = {'Output': output}
......@@ -290,5 +292,25 @@ class TestDepthwiseConv2(TestConv2dOp):
# def init_op_type(self):
# self.op_type = "conv_cudnn"
#----------------Conv2dMKLDNN----------------
class TestMKLDNN(TestConv2dOp):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "conv2d"
class TestMKLDNNWithPad(TestWithPad):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "conv2d"
class TestMKLDNNWithStride(TestWithStride):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "conv2d"
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册