Commit 35e820dc authored by tensor-tang

Merge remote-tracking branch 'ups/develop' into infer_api

@@ -92,6 +92,9 @@ if(WITH_GPU)
if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
endif()
if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
endif()
include_directories(${TENSORRT_INCLUDE_DIR})
endif()
elseif(WITH_AMD_GPU)
...
@@ -27,6 +27,7 @@ enum class DataLayout {
kNHWC = 0,
kNCHW = 1,
kAnyLayout = 2,
kMKLDNN = 3, // all layouts supported by MKLDNN internally
};
inline DataLayout StringToDataLayout(const std::string& str) {
@@ -41,6 +42,8 @@ inline DataLayout StringToDataLayout(const std::string& str) {
return DataLayout::kNCHW;
} else if (s == "ANYLAYOUT") {
return DataLayout::kAnyLayout;
} else if (s == "MKLDNNLAYOUT") {
return DataLayout::kMKLDNN;
} else {
PADDLE_THROW("Unknown storage order string: %s", s);
}
@@ -54,8 +57,10 @@ inline std::string DataLayoutToString(const DataLayout& data_layout) {
return "NCHW";
case DataLayout::kAnyLayout:
return "ANY_LAYOUT";
case DataLayout::kMKLDNN:
return "MKLDNNLAYOUT";
default:
PADDLE_THROW("unknown DataLayout %d", data_layout);
}
}
...
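For orientation (not part of the commit): with the new enum value, the string helpers round-trip the MKLDNN layout like any other. A minimal sketch, assuming the header above is on the include path:

// Illustrative only: exercises the new MKLDNNLAYOUT entries in
// StringToDataLayout()/DataLayoutToString() shown above.
#include <cassert>
#include "paddle/fluid/framework/data_layout.h"

void CheckMKLDNNLayoutRoundTrip() {
  using paddle::framework::DataLayout;
  DataLayout l = paddle::framework::StringToDataLayout("MKLDNNLAYOUT");
  assert(l == DataLayout::kMKLDNN);
  assert(paddle::framework::DataLayoutToString(l) == "MKLDNNLAYOUT");
}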
@@ -16,6 +16,9 @@
#include <vector>
#include "paddle/fluid/operators/math/math_function.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace framework {
@@ -88,5 +91,85 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var,
out->set_layout(expected_kernel_type.data_layout_);
}
#ifdef PADDLE_WITH_MKLDNN
using mkldnn::memory;
using mkldnn::primitive;
using mkldnn::reorder;
void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) {
switch (type) {
case mkldnn::memory::data_type::f32:
return platform::to_void_cast(tensor.data<float>());
case mkldnn::memory::data_type::s8:
return platform::to_void_cast(tensor.data<char>());
case mkldnn::memory::data_type::u8:
return platform::to_void_cast(tensor.data<unsigned char>());
case mkldnn::memory::data_type::s16:
return platform::to_void_cast(tensor.data<int16_t>());
case mkldnn::memory::data_type::s32:
return platform::to_void_cast(tensor.data<int32_t>());
default:
PADDLE_THROW("wrong mkldnn type provided");
}
}
#endif
void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_type,
const Tensor& in, Tensor* out) {
auto in_layout = kernel_type_for_var.data_layout_;
auto out_layout = expected_kernel_type.data_layout_;
PADDLE_ENFORCE(
in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN,
"TransDataLayoutFromMKLDNN only supports transform from MKLDNN to "
"non-MKLDNN");
#ifdef PADDLE_WITH_MKLDNN
PADDLE_ENFORCE(in.format() != memory::format::format_undef &&
in.format() != memory::format::any,
"Input tensor should have specified memory format");
// Set default as NCHW in case not specified
out_layout =
out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
auto& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
pool.Get(expected_kernel_type.place_));
auto& cpu_engine = dev_ctx->GetEngine();
std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
std::vector<int> out_tz = in_tz;
memory::data_type in_type = ToMKLDNNDataType(in.type());
PADDLE_ENFORCE(in_type != memory::data_type::data_undef,
"Input tensor type is not supported: ", in.type().name());
memory::data_type out_type = in_type;
memory::format in_format =
in_tz.size() == 2 ? memory::format::nc : in.format();
memory::format out_format =
out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout);
void* in_data = GetDataFromTensor(in, in_type);
// output tensor has the same dims as input. Reorder doesn't change dims
out->Resize(in.dims());
auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
auto out_memory =
memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
platform::Reorder(in_memory, out_memory);
out->set_layout(out_layout);
// reset format since the out tensor will be fed to a non-MKLDNN OPkernel
out->set_format(memory::format::format_undef);
#endif
}
}  // namespace framework
}  // namespace paddle
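As a rough usage sketch (assumed, not from the commit): a caller holding a tensor produced by an MKLDNN kernel converts it back to a plain layout by passing the two kernel types; everything else happens inside the function above.

// Hypothetical caller of TransDataLayoutFromMKLDNN(). Assumes `in` was
// written by an MKLDNN kernel (layout kMKLDNN, format set) and that
// mkldnn_type/plain_type carry the layouts kMKLDNN and e.g. kNCHW.
paddle::framework::Tensor ToPlainLayout(
    const paddle::framework::Tensor& in,
    const paddle::framework::OpKernelType& mkldnn_type,
    const paddle::framework::OpKernelType& plain_type) {
  paddle::framework::Tensor out;
  paddle::framework::TransDataLayoutFromMKLDNN(mkldnn_type, plain_type, in,
                                               &out);
  // out.layout() is now plain; out.format() was reset to format_undef.
  return out;
}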
@@ -14,6 +14,7 @@
#pragma once
#include <map>
#include <vector>
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/tensor.h"
@@ -22,6 +23,50 @@
namespace paddle {
namespace framework {
#ifdef PADDLE_WITH_MKLDNN
using MKLDNNFormat = mkldnn::memory::format;
using MKLDNNDataType = mkldnn::memory::data_type;
inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) {
switch (layout) {
case DataLayout::kNHWC:
return MKLDNNFormat::nhwc;
case DataLayout::kNCHW:
return MKLDNNFormat::nchw;
default:
PADDLE_THROW("Fail to convert layout %s to MKLDNN format",
DataLayoutToString(layout));
}
}
inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) {
switch (format) {
case MKLDNNFormat::nhwc:
return DataLayout::kNHWC;
case MKLDNNFormat::nchw:
return DataLayout::kNCHW;
default:
PADDLE_THROW("Fail to convert MKLDNN format to paddle layout");
}
}
inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
static const std::map<std::type_index, MKLDNNDataType> dict{
{std::type_index(typeid(float)), MKLDNNDataType::f32}, // NOLINT
{std::type_index(typeid(char)), MKLDNNDataType::s8}, // NOLINT
{std::type_index(typeid(unsigned char)), MKLDNNDataType::u8},
{std::type_index(typeid(int16_t)), MKLDNNDataType::s16},
{std::type_index(typeid(int32_t)), MKLDNNDataType::s32}};
auto iter = dict.find(type);
if (iter != dict.end()) return iter->second;
return MKLDNNDataType::data_undef;
}
#endif
void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_type,
const Tensor& in, Tensor* out);
std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);
void TransDataLayout(const OpKernelType& kernel_type_for_var,
...
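A small illustrative check of the type mapping declared above (assumes a build with PADDLE_WITH_MKLDNN; the function is a sketch, not commit code):

// Unmapped types fall through to data_undef, which callers must check,
// as TransDataLayoutFromMKLDNN() does with PADDLE_ENFORCE.
#include <cassert>
#include <typeindex>

void CheckToMKLDNNDataType() {
  using paddle::framework::ToMKLDNNDataType;
  using MKLDNNDataType = mkldnn::memory::data_type;
  assert(ToMKLDNNDataType(std::type_index(typeid(float))) ==
         MKLDNNDataType::f32);
  assert(ToMKLDNNDataType(std::type_index(typeid(double))) ==
         MKLDNNDataType::data_undef);  // double has no mapping above
}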
@@ -33,11 +33,34 @@ void DataTransform(const OpKernelType& expected_kernel_type,
Tensor in;
in.ShareDataWith(input_tensor);
Tensor out;
DataLayout lin = kernel_type_for_var.data_layout_;
DataLayout lout = expected_kernel_type.data_layout_;
// do layout transform
if (NeedTransformLayout(lout, lin)) {
if (lin == DataLayout::kMKLDNN || lout == DataLayout::kMKLDNN) {
PADDLE_ENFORCE(
!(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN),
"No layout transform needed between two MKLDNN OPKernels");
if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN) {
#ifdef PADDLE_WITH_MKLDNN
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occur
out.ShareDataWith(input_tensor);
out.set_layout(DataLayout::kMKLDNN);
out.set_format(ToMKLDNNFormat(lin));
#endif
} else {
// Case2 - transform from MKLDNN OPKernel to Non-MKLDNN OPKernel
// Do transform via MKLDNN lib
TransDataLayoutFromMKLDNN(kernel_type_for_var, expected_kernel_type, in,
&out);
}
} else {
// Case3 - transform between Non-MKLDNN OPKernels
TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
}
transformed = true;
PassTensorData(&out, &in);
}
...
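For readability (an illustrative restatement, not commit code), the branch above picks one of three actions; compressed into a standalone function it reads:

// Case 1 only tags metadata (data is shared, no copy); Case 2 runs a real
// MKLDNN reorder; Case 3 is the pre-existing axis permutation.
using paddle::framework::DataLayout;

enum class LayoutAction { kNone, kTagAsMKLDNN, kReorderFromMKLDNN, kPlainTranspose };

LayoutAction PickLayoutAction(DataLayout lin, DataLayout lout) {
  if (!paddle::framework::NeedTransformLayout(lout, lin))
    return LayoutAction::kNone;
  if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN)
    return LayoutAction::kTagAsMKLDNN;        // Case 1
  if (lin == DataLayout::kMKLDNN && lout != DataLayout::kMKLDNN)
    return LayoutAction::kReorderFromMKLDNN;  // Case 2
  return LayoutAction::kPlainTranspose;       // Case 3
}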
@@ -13,7 +13,7 @@ cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_ro
if(WITH_GPU)
nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor)
set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
@@ -25,6 +25,7 @@ else()
endif()
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
...
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fuse_vars_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
void FuseVarsOpHandle::RunImpl() {
WaitInputVarGenerated(place_);
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(in_var_handles.size(), 0);
PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto out_var_handle = out_var_handles[0];
auto out_var = scope->Var(out_var_handle->name_);
auto out_tensor = out_var->GetMutable<LoDTensor>();
out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);
int64_t s = 0;
for (size_t i = 1; i < out_var_handles.size(); ++i) {
auto out_name = out_var_handles[i]->name_;
auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
auto numel = this->inputs_numel_.at(out_name);
out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
s += numel;
}
this->RunAndRecordEvent([this] {});
}
std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
} // namespace details
} // namespace framework
} // namespace paddle
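For intuition (example values, not from the commit): with inputs_numel_ = {a: 3, b: 5}, the fused tensor holds 8 elements and the loop above hands out the slices [0, 3) and [3, 8). The offset bookkeeping in isolation:

// Illustrative slice computation matching the loop in RunImpl(): each
// output variable aliases a contiguous [begin, end) range of the fused tensor.
#include <cstdint>
#include <utility>
#include <vector>

std::vector<std::pair<int64_t, int64_t>> FusedSlices(
    const std::vector<int64_t>& numels) {
  std::vector<std::pair<int64_t, int64_t>> slices;
  int64_t s = 0;
  for (int64_t n : numels) {
    slices.emplace_back(s, s + n);
    s += n;  // s ends up as total_numel_
  }
  return slices;
}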
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <typeindex>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct FuseVarsOpHandle : public OpHandleBase {
public:
FuseVarsOpHandle(Scope *local_scope, const platform::Place &place,
const std::unordered_map<std::string, int64_t> &inputs_numel,
const std::type_index &var_type)
: local_scope_(local_scope),
place_(place),
inputs_numel_(inputs_numel),
type_(var_type) {
total_numel_ = 0;
for (auto in_numel : inputs_numel) {
PADDLE_ENFORCE_GT(in_numel.second, 0);
total_numel_ += in_numel.second;
}
}
std::string Name() const override;
bool IsMultiDeviceTransfer() override { return false; }
protected:
void RunImpl() override;
private:
Scope *local_scope_;
const platform::Place place_;
const std::unordered_map<std::string, int64_t> inputs_numel_;
const std::type_index type_;
int64_t total_numel_;
};
} // namespace details
} // namespace framework
} // namespace paddle
@@ -11,10 +11,12 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
namespace paddle {
namespace framework {
@@ -30,27 +32,34 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
}
void NCCLAllReduceOpHandle::RunImpl() {
if (NoDummyInputSize() == 1) {
return;  // No need to all reduce when GPU count = 1;
} else {
// Wait input done
WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The NoDummyInputSize should be equal to the number of places.");
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
std::vector<const LoDTensor *> lod_tensors;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *s = local_scopes_[i];
auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &lod_tensor =
local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();
lod_tensors.emplace_back(&lod_tensor);
PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
"The name of input and output should be equal.");
}
if (platform::is_gpu_place(lod_tensors[0]->place())) {
int dtype = -1;
size_t numel = 0;
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
@@ -96,7 +105,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
auto &scope =
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i];
auto *var = scope.FindVar(in_var_handles[i]->name_);
auto *dev_ctx = dev_ctxes_[p];
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
...
@@ -104,6 +104,16 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
}
}
size_t OpHandleBase::NoDummyInputSize() const {
size_t cnt = 0;
for (auto *in : inputs_) {
if (dynamic_cast<DummyVarHandle *>(in) == nullptr) {
++cnt;
}
}
return cnt;
}
bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
return in_var && in_var->generated_op_;
}
...
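To restate the helper (a sketch with stub types, not commit code): dummy variables exist only to encode scheduling dependencies, so NoDummyInputSize() counts the inputs that remain after filtering them out.

#include <cstddef>
#include <vector>

struct VarHandleBase { virtual ~VarHandleBase() = default; };  // stub
struct DummyVarHandle : VarHandleBase {};                      // stub

size_t CountNoDummy(const std::vector<VarHandleBase*>& inputs) {
  size_t cnt = 0;
  for (auto* in : inputs) {
    // dynamic_cast yields nullptr for anything that is not a dummy handle
    if (dynamic_cast<DummyVarHandle*>(in) == nullptr) ++cnt;
  }
  return cnt;  // e.g. {real, dummy, real} -> 2
}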
@@ -80,6 +80,8 @@ class OpHandleBase {
const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }
size_t NoDummyInputSize() const;
protected:
void RunAndRecordEvent(const std::function<void()> &callback);
...
@@ -20,7 +20,7 @@
namespace paddle {
namespace framework {
namespace details {
struct SSAGraph;
class SSAGraphPrinter {
public:
virtual ~SSAGraphPrinter() {}
...
@@ -87,7 +87,14 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
}
inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
bool ret =
(l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r);
#ifdef PADDLE_WITH_MKLDNN
// Layout transform needed for either non-MKLDNN to MKLDNN or vice versa
ret |= (l != DataLayout::kMKLDNN && r == DataLayout::kMKLDNN);
ret |= (l == DataLayout::kMKLDNN && r != DataLayout::kMKLDNN);
#endif
return ret;
}
inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) {
...
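A quick reading of the new predicate (a hedged sketch; the asserts encode the behavior visible in the code above, assuming PADDLE_WITH_MKLDNN is defined):

#include <cassert>

void CheckNeedTransformLayout() {
  using paddle::framework::DataLayout;
  using paddle::framework::NeedTransformLayout;
  assert(NeedTransformLayout(DataLayout::kNHWC, DataLayout::kNCHW));
  // kAnyLayout is compatible with any plain layout...
  assert(!NeedTransformLayout(DataLayout::kAnyLayout, DataLayout::kNCHW));
#ifdef PADDLE_WITH_MKLDNN
  // ...but the MKLDNN clauses fire even against kAnyLayout.
  assert(NeedTransformLayout(DataLayout::kMKLDNN, DataLayout::kAnyLayout));
  assert(NeedTransformLayout(DataLayout::kAnyLayout, DataLayout::kMKLDNN));
#endif
}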
@@ -83,8 +83,14 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
void operator()(const char* op_type, const char* library_type) const {
using T = typename KERNEL_TYPE::ELEMENT_TYPE;
std::string library(library_type);
std::string data_layout = "ANYLAYOUT";
if (library == "MKLDNN") {
data_layout = "MKLDNNLAYOUT";
}
OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
StringToDataLayout(data_layout),
StringToLibraryType(library_type));
OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
@@ -99,7 +105,8 @@ struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
void operator()(const char* op_type, const char* library_type) const {}
};
// User can register many kernels in one place. The data type could be
// different.
template <typename PlaceType, typename... KernelType>
class OpKernelRegistrar : public Registrar {
public:
...
@@ -444,10 +444,25 @@ class RuntimeInferShapeContext : public InferShapeContext {
auto* out_tensor = out_var->GetMutable<LoDTensor>();
out_tensor->set_lod(in_tensor.lod());
// TODO(dzhwinter) : reuse ShareLoD in most operators.
// Need to call ShareLayout explicitly in sequence related ops.
// Shall we have a better method to share info between in/out Tensor?
#ifdef PADDLE_WITH_MKLDNN
// Fix me: ugly workaround below
// Correct solution:
// set_layout() should NOT be called here (i.e. ShareLoD). Instead,
// layout of output tensor should be set "manually" in Compute()
// of each OPKernel. The reason layout should NOT be shared between
// input and output "automatically" (now by InferShape()->ShareLoD())
// is that layout transform may occur after InferShape().
// Workaround:
// Skip set_layout() when input layout is kMKLDNN
// This is to avoid kMKLDNN being wrongly populated into a non-MKLDNN
// OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
// in Compute()
if (in_tensor.layout() != DataLayout::kMKLDNN)
#endif
out_tensor->set_layout(in_tensor.layout());
}
void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
@@ -665,7 +680,8 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType(
OpKernelType OperatorWithKernel::GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const OpKernelType& expected_kernel_type) const {
return OpKernelType(expected_kernel_type.data_type_, tensor.place(),
tensor.layout());
}
}  // namespace framework
...
@@ -34,6 +34,28 @@ namespace framework {
class LoDTensor;
class Tensor {
#ifdef PADDLE_WITH_MKLDNN
public:
inline mkldnn::memory::format format() const { return format_; }
inline void set_format(const mkldnn::memory::format format) {
format_ = format;
}
protected:
/**
* @brief the detailed format of the memory block whose layout is kMKLDNN
*
* @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
* nChw16c, etc. For a MKLDNN memory block, layout will be set as
* DataLayout::kMKLDNN meanwhile detail memory format will be kept in
* this field.
*/
mkldnn::memory::format format_ = mkldnn::memory::format::format_undef;
#endif
public:
template <typename T, size_t D, int MajorType, typename IndexType>
friend struct EigenTensor;
@@ -195,8 +217,10 @@ class Tensor {
* N,C,H,W for respectively the batch size, the number of
* feature maps, the height.
*/
// Fix me: here just change the default layout to kNCHW
// it doesn't fix the real issue, i.e. feeder should set up tensor layout
// according to actual input data
DataLayout layout_ = DataLayout::kNCHW;
/**
* @brief A PlaceHolder may be shared by more than one tensor.
...
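A sketch of the intended usage (assumed from the comments above, not commit code): an MKLDNN kernel publishes both the coarse layout and the concrete memory format on its output.

// Hypothetical helper: tag a tensor the way an MKLDNN OPKernel's Compute()
// is expected to, per the workaround notes in operator.cc above.
void TagAsMKLDNN(paddle::framework::Tensor* out, mkldnn::memory::format fmt) {
  out->set_layout(paddle::framework::DataLayout::kMKLDNN);
  out->set_format(fmt);  // e.g. mkldnn::memory::format::nChw8c
}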
@@ -209,7 +209,7 @@ TEST(Tensor, ReshapeToMatrix) {
TEST(Tensor, Layout) {
framework::Tensor src;
ASSERT_EQ(src.layout(), framework::DataLayout::kNCHW);
src.set_layout(framework::DataLayout::kAnyLayout);
ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
}
# Add TRT tests
nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc
DEPS tensorrt_engine mul_op)
@@ -16,3 +11,5 @@ nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
@@ -22,7 +22,8 @@ namespace tensorrt {
class ReluOpConverter : public OpConverter {
public:
ReluOpConverter() {}
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
// Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange.
framework::OpDesc op_desc(op, nullptr);
@@ -33,7 +34,12 @@ class ReluOpConverter : public OpConverter {
nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
nvinfer1::ActivationType::kRELU);
auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
}
};
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"

namespace paddle {
namespace inference {
namespace tensorrt {

TEST(ReluOpConverter, main) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6));
validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6));

// Prepare Op description
framework::OpDesc desc;
desc.SetType("relu");
desc.SetInput("X", {"relu-X"});
desc.SetOutput("Out", {"relu-Out"});

LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";

validator.Execute(10);
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

USE_OP(relu);
@@ -58,14 +58,16 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
const framework::OperatorWithKernel& oper,
const std::string& name) {
framework::LibraryType library{framework::LibraryType::kPlain};
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN
auto it = oper.Attrs().find("use_mkldnn");
if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>(name)->type()),
ctx.GetPlace(), layout, library);
...
@@ -111,14 +111,16 @@ class BatchNormOp : public framework::OperatorWithKernel {
"Variance input should be of float type");
framework::LibraryType library_{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library_);
}
@@ -367,17 +369,18 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
}
framework::LibraryType library_{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
layout_, library_);
}
};
...
@@ -75,6 +75,11 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library{framework::LibraryType::kPlain};
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library = framework::LibraryType::kCUDNN;
@@ -84,6 +89,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
#endif
@@ -99,9 +105,6 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
"float16 can only be used when CUDNN is used");
}
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library);
}
@@ -309,6 +312,10 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
@@ -318,12 +325,10 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
layout_, library_);
...
@@ -43,7 +43,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType FCOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library{framework::LibraryType::kMKLDNN};
framework::DataLayout layout{framework::DataLayout::kMKLDNN};
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
@@ -65,7 +65,7 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType FCOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library{framework::LibraryType::kMKLDNN};
framework::DataLayout layout{framework::DataLayout::kMKLDNN};
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
...
@@ -124,16 +124,17 @@ namespace {
framework::OpKernelType GetExpectedLRNKernel(
const framework::ExecutionContext& ctx) {
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
layout_, library_);
...
@@ -24,10 +24,13 @@ using mkldnn::pooling_backward;
// Generate keys for storing/retrieving primitives for this operator
// TODO(jczaja): Make hashing function more optimal
static std::string gethash(const memory::dims& input_dims,
const std::string& pooling_type,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& suffix) {
auto dims2str = [](const memory::dims& operand_dims) {
std::string dstr = "";
for (size_t i = 0; i < operand_dims.size(); ++i) {
dstr += std::to_string(operand_dims[i]) + "-";
...
@@ -83,6 +83,9 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
framework::OpKernelType PoolOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
@@ -92,11 +95,10 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
layout_, library_);
@@ -112,6 +114,9 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
@@ -121,6 +126,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
}
#endif
@@ -129,8 +135,6 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
"float16 can only be used when CUDNN is used");
}
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
library_);
}
...
@@ -49,6 +49,9 @@ class SoftmaxOp : public framework::OperatorWithKernel {
const framework::ExecutionContext& ctx) const override {
// choose cudnn kernel if the runtime supported.
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
@@ -58,6 +61,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
}
#endif
@@ -68,9 +72,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
"float16 can only be used on GPU place");
}
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
library_);
}
};
@@ -142,6 +144,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
const framework::ExecutionContext& ctx) const override {
// choose cudnn kernel if the runtime supported.
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include <mkldnn.h>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace platform {
@@ -86,5 +87,17 @@ inline mkldnn::memory::data_type MKLDNNGetDataType<float>() {
return mkldnn::memory::f32;
}
inline void Reorder(const mkldnn::memory& src, const mkldnn::memory& dst) {
auto reorder_prim = mkldnn::reorder(src, dst);
std::vector<mkldnn::primitive> pipeline;
pipeline.push_back(reorder_prim);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) {
return static_cast<mkldnn::memory::format>(
memory.get_primitive_desc().desc().data.format);
}
}  // namespace platform
}  // namespace paddle
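To make the new helpers concrete, a sketch of reordering a 4-D fp32 block between two formats (buffer pointers and dims are made up for the example; uses the mkldnn 0.x memory API seen in the diff above):

// Illustrative use of platform::Reorder(): copy an nchw block into an
// nhwc-formatted destination, then read the format back.
#include <cassert>
using mkldnn::engine;
using mkldnn::memory;

void ReorderNchwToNhwc(const engine& eng, void* src_data, void* dst_data) {
  memory::dims dims = {2, 3, 4, 5};  // N, C, H, W (example values)
  memory src({{{dims}, memory::data_type::f32, memory::format::nchw}, eng},
             src_data);
  memory dst({{{dims}, memory::data_type::f32, memory::format::nhwc}, eng},
             dst_data);
  paddle::platform::Reorder(src, dst);  // eager submit + wait
  assert(paddle::platform::GetMKLDNNFormat(dst) == memory::format::nhwc);
}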