diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 682614742cf1bd3130c638020a2545e16226d4d6..4158d0528a1aea52c2a3f0880fe1000183a9df53 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -92,6 +92,9 @@ if(WITH_GPU) if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile") endif() + if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4) + message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile") + endif() include_directories(${TENSORRT_INCLUDE_DIR}) endif() elseif(WITH_AMD_GPU) diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h index 9c5e2cf7ccdcea2822da42210ff1fdb915a9a4ec..b611bb77b4e1ec05b8bd029ac37cefba346c6eb0 100644 --- a/paddle/fluid/framework/data_layout.h +++ b/paddle/fluid/framework/data_layout.h @@ -27,6 +27,7 @@ enum class DataLayout { kNHWC = 0, kNCHW = 1, kAnyLayout = 2, + kMKLDNN = 3, // all layouts supported by MKLDNN internally }; inline DataLayout StringToDataLayout(const std::string& str) { @@ -41,6 +42,8 @@ inline DataLayout StringToDataLayout(const std::string& str) { return DataLayout::kNCHW; } else if (s == "ANYLAYOUT") { return DataLayout::kAnyLayout; + } else if (s == "MKLDNNLAYOUT") { + return DataLayout::kMKLDNN; } else { PADDLE_THROW("Unknown storage order string: %s", s); } @@ -54,8 +57,10 @@ inline std::string DataLayoutToString(const DataLayout& data_layout) { return "NCHW"; case DataLayout::kAnyLayout: return "ANY_LAYOUT"; + case DataLayout::kMKLDNN: + return "MKLDNNLAYOUT"; default: - PADDLE_THROW("unknown DataLayou %d", data_layout); + PADDLE_THROW("unknown DataLayout %d", data_layout); } } diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 60ec60a427ba9046ce690eb75c27cd322fdd726d..5b8dfc57ba020cea259041f55a66472ea26b4eec 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -16,6 +16,9 @@ #include #include "paddle/fluid/operators/math/math_function.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace framework { @@ -88,5 +91,85 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var, out->set_layout(expected_kernel_type.data_layout_); } +#ifdef PADDLE_WITH_MKLDNN +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::reorder; + +void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) { + switch (type) { + case mkldnn::memory::data_type::f32: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::s8: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::u8: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::s16: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::s32: + return platform::to_void_cast(tensor.data()); + default: + PADDLE_THROW("wrong mkldnn type provided"); + } +} +#endif + +void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, + const Tensor& in, Tensor* out) { + auto in_layout = kernel_type_for_var.data_layout_; + auto out_layout = expected_kernel_type.data_layout_; + + PADDLE_ENFORCE( + in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN, + "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to " + "non-MKLDNN"); + +#ifdef PADDLE_WITH_MKLDNN + PADDLE_ENFORCE(in.format() != memory::format::format_undef && + in.format() != memory::format::any, + "Input tensor should have specified memory format"); + + // Set default as NCHW in case not specified + out_layout = + out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; + + auto& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = dynamic_cast( + pool.Get(expected_kernel_type.place_)); + auto& cpu_engine = dev_ctx->GetEngine(); + + std::vector in_tz = paddle::framework::vectorize2int(in.dims()); + std::vector out_tz = in_tz; + + memory::data_type in_type = ToMKLDNNDataType(in.type()); + PADDLE_ENFORCE(in_type != memory::data_type::data_undef, + "Input tensor type is not supported: ", in.type().name()); + memory::data_type out_type = in_type; + + memory::format in_format = + in_tz.size() == 2 ? memory::format::nc : in.format(); + memory::format out_format = + out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout); + + void* in_data = GetDataFromTensor(in, in_type); + + // output tensor has the same dims as input. Reorder don't change dims + out->Resize(in.dims()); + + auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); + + auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); + auto out_memory = + memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); + + platform::Reorder(in_memory, out_memory); + + out->set_layout(out_layout); + // reset format since the out tensor will be feed to non-MKLDNN OPkernel + out->set_format(memory::format::format_undef); +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 06b638663dd334837a3bcb7737e507fcbc871c7a..2ba84ce57fd8aa3d9aa651bdaa2930e459c74e88 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/tensor.h" @@ -22,6 +23,50 @@ namespace paddle { namespace framework { +#ifdef PADDLE_WITH_MKLDNN +using MKLDNNFormat = mkldnn::memory::format; +using MKLDNNDataType = mkldnn::memory::data_type; + +inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) { + switch (layout) { + case DataLayout::kNHWC: + return MKLDNNFormat::nhwc; + case DataLayout::kNCHW: + return MKLDNNFormat::nchw; + default: + PADDLE_THROW("Fail to convert layout %s to MKLDNN format", + DataLayoutToString(layout)); + } +} + +inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) { + switch (format) { + case MKLDNNFormat::nhwc: + return DataLayout::kNHWC; + case MKLDNNFormat::nchw: + return DataLayout::kNCHW; + default: + PADDLE_THROW("Fail to convert MKLDNN format to paddle layout"); + } +} + +inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) { + static const std::map dict{ + {std::type_index(typeid(float)), MKLDNNDataType::f32}, // NOLINT + {std::type_index(typeid(char)), MKLDNNDataType::s8}, // NOLINT + {std::type_index(typeid(unsigned char)), MKLDNNDataType::u8}, + {std::type_index(typeid(int16_t)), MKLDNNDataType::s16}, + {std::type_index(typeid(int32_t)), MKLDNNDataType::s32}}; + auto iter = dict.find(type); + if (iter != dict.end()) return iter->second; + return MKLDNNDataType::data_undef; +} +#endif + +void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, + const Tensor& in, Tensor* out); + std::vector GetAxis(const DataLayout& from, const DataLayout& to); void TransDataLayout(const OpKernelType& kernel_type_for_var, diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 9c277a27da5af34fc9fb18ca073e369c05ecdf22..b8fcc92697ca1bf1d971f8fef020f31d405605a9 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -33,11 +33,34 @@ void DataTransform(const OpKernelType& expected_kernel_type, Tensor in; in.ShareDataWith(input_tensor); Tensor out; + DataLayout lin = kernel_type_for_var.data_layout_; + DataLayout lout = expected_kernel_type.data_layout_; // do layout transform - if (NeedTransformLayout(expected_kernel_type.data_layout_, - kernel_type_for_var.data_layout_)) { - TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out); + if (NeedTransformLayout(lout, lin)) { + if (lin == DataLayout::kMKLDNN || lout == DataLayout::kMKLDNN) { + PADDLE_ENFORCE( + !(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN), + "No layout transform needed between two MKLDNN OPKernels"); + + if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN) { +#ifdef PADDLE_WITH_MKLDNN + // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel + // Just set layout/format. No real transform occur + out.ShareDataWith(input_tensor); + out.set_layout(DataLayout::kMKLDNN); + out.set_format(ToMKLDNNFormat(lin)); +#endif + } else { + // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel + // Do transform via MKLDNN lib + TransDataLayoutFromMKLDNN(kernel_type_for_var, expected_kernel_type, in, + &out); + } + } else { + // Case3 - transfrom between Non-MKLDNN OPKernels + TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out); + } transformed = true; PassTensorData(&out, &in); } diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index c106761f72e689ff53867ecad8e36b6038173d0e..c43826b64cc5140c539df17fdd13d9bee7fefdcd 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -13,7 +13,7 @@ cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_ro if(WITH_GPU) nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory - dynload_cuda) + dynload_cuda variable_visitor) set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle) nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda) nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) @@ -25,6 +25,7 @@ else() endif() cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) +cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle) diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc new file mode 100644 index 0000000000000000000000000000000000000000..32415c192f0be51bf0850fe533c212c635779a30 --- /dev/null +++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fuse_vars_op_handle.h" + +namespace paddle { +namespace framework { +namespace details { + +void FuseVarsOpHandle::RunImpl() { + WaitInputVarGenerated(place_); + + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ(in_var_handles.size(), 0); + PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), ""); + + auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get(); + + auto out_var_handle = out_var_handles[0]; + auto out_var = scope->Var(out_var_handle->name_); + + auto out_tensor = out_var->GetMutable(); + out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_); + + int64_t s = 0; + for (size_t i = 1; i < out_var_handles.size(); ++i) { + auto out_name = out_var_handles[i]->name_; + auto out_t = scope->Var(out_name)->GetMutable(); + auto numel = this->inputs_numel_.at(out_name); + out_t->ShareDataWith(out_tensor->Slice(s, s + numel)); + s += numel; + } + this->RunAndRecordEvent([this] {}); +} + +std::string FuseVarsOpHandle::Name() const { return "fuse vars"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..140fb5bb49a33146de974b6d79559b4cf15bdd7b --- /dev/null +++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h @@ -0,0 +1,63 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { + +struct FuseVarsOpHandle : public OpHandleBase { + public: + FuseVarsOpHandle(Scope *local_scope, const platform::Place &place, + const std::unordered_map &inputs_numel, + const std::type_index &var_type) + : local_scope_(local_scope), + place_(place), + inputs_numel_(inputs_numel), + type_(var_type) { + total_numel_ = 0; + for (auto in_numel : inputs_numel) { + PADDLE_ENFORCE_GT(in_numel.second, 0); + total_numel_ += in_numel.second; + } + } + + std::string Name() const override; + + bool IsMultiDeviceTransfer() override { return false; }; + + protected: + void RunImpl() override; + + private: + Scope *local_scope_; + const platform::Place place_; + const std::unordered_map inputs_numel_; + const std::type_index type_; + int64_t total_numel_; +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc index 95aa599cd3e403e9cc66b2b5ad35d0d214d1ab5b..5bba089ade801a06e0364835efe5249105dcfcac 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc @@ -11,10 +11,12 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include +#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" -#include #include "paddle/fluid/framework/details/reduce_and_gather.h" +#include "paddle/fluid/framework/details/variable_visitor.h" namespace paddle { namespace framework { @@ -30,27 +32,34 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle( } void NCCLAllReduceOpHandle::RunImpl() { - if (inputs_.size() == 1) { + if (NoDummyInputSize() == 1) { return; // No need to all reduce when GPU count = 1; } else { // Wait input done WaitInputVarGenerated(); - - auto &var_name = static_cast(this->inputs_[0])->name_; - int dtype = -1; - size_t numel = 0; + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); std::vector lod_tensors; - for (size_t i = 0; i < local_scopes_.size(); ++i) { auto *s = local_scopes_[i]; auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get(); - - auto &lod_tensor = local_scope.FindVar(var_name)->Get(); + auto &lod_tensor = + local_scope.FindVar(in_var_handles[i]->name_)->Get(); lod_tensors.emplace_back(&lod_tensor); + PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_, + "The name of input and output should be equal."); } if (platform::is_gpu_place(lod_tensors[0]->place())) { + int dtype = -1; + size_t numel = 0; std::vector> all_reduce_calls; for (size_t i = 0; i < local_scopes_.size(); ++i) { auto &p = places_[i]; @@ -96,7 +105,7 @@ void NCCLAllReduceOpHandle::RunImpl() { auto &scope = *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); auto &p = places_[i]; - auto *var = scope.FindVar(var_name); + auto *var = scope.FindVar(in_var_handles[i]->name_); auto *dev_ctx = dev_ctxes_[p]; RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 6b064650b4f09737836bda4a43fa421720077929..3849cca59a3347137b769f97261cfbf97da8d6ff 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -104,6 +104,16 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { } } +size_t OpHandleBase::NoDummyInputSize() const { + size_t cnt = 0; + for (auto *in : inputs_) { + if (dynamic_cast(in) == nullptr) { + ++cnt; + } + } + return cnt; +} + bool OpHandleBase::NeedWait(VarHandleBase *in_var) { return in_var && in_var->generated_op_; } diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 8f94206a87dbae8a81727ca48718886bbabbe25c..dc92b0fe9f760d95d4869fdd56c0400b6710437f 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -80,6 +80,8 @@ class OpHandleBase { const std::vector &Outputs() const { return outputs_; } + size_t NoDummyInputSize() const; + protected: void RunAndRecordEvent(const std::function &callback); diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h index 5287be3b6a05ec7067ca433ba976b0314d05fe02..b4c90013789759d17646d95efdc81fc6a0a4f3e7 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.h +++ b/paddle/fluid/framework/details/ssa_graph_printer.h @@ -20,7 +20,7 @@ namespace paddle { namespace framework { namespace details { -class SSAGraph; +struct SSAGraph; class SSAGraphPrinter { public: virtual ~SSAGraphPrinter() {} diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index fab20d75f5a45257f243333c1998d7b2549a25f9..f51a184e7bae2283f335fe9462a77b9c5fb831a5 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -87,7 +87,14 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) { } inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { - return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r; + bool ret = + (l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r); +#ifdef PADDLE_WITH_MKLDNN + // Layout transform needed for either non-MKLDNN to MKLDNN or vice versa + ret |= (l != DataLayout::kMKLDNN && r == DataLayout::kMKLDNN); + ret |= (l == DataLayout::kMKLDNN && r != DataLayout::kMKLDNN); +#endif + return ret; } inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) { diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 748317438b44bc4af84f13b25f8e4f88386388fb..e57c2ff3d0006f12d77f107182946ad6e4eb40bf 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -83,8 +83,14 @@ struct OpKernelRegistrarFunctor { void operator()(const char* op_type, const char* library_type) const { using T = typename KERNEL_TYPE::ELEMENT_TYPE; + std::string library(library_type); + std::string data_layout = "ANYLAYOUT"; + if (library == "MKLDNN") { + data_layout = "MKLDNNLAYOUT"; + } OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), - DataLayout::kAnyLayout, StringToLibraryType(library_type)); + StringToDataLayout(data_layout), + StringToLibraryType(library_type)); OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE); constexpr auto size = std::tuple_size>::value; @@ -99,7 +105,8 @@ struct OpKernelRegistrarFunctor { void operator()(const char* op_type, const char* library_type) const {} }; -// User can register many kernel in one place. The data type could be different. +// User can register many kernel in one place. The data type could be +// different. template class OpKernelRegistrar : public Registrar { public: diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f87d5521492418d2daf5b7fba1500c4bb31e10f5..4d1e8d0ebaaf4bdae085c08d834f4dc288fcd41c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -444,10 +444,25 @@ class RuntimeInferShapeContext : public InferShapeContext { auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); - // TODO(dzhwinter) : reuse ShareLoD in most operators. - // Need to call ShareLayout explicitly in sequence related ops. - // Shall we have a better method to shared info between in/out Tensor? - out_tensor->set_layout(in_tensor.layout()); +// TODO(dzhwinter) : reuse ShareLoD in most operators. +// Need to call ShareLayout explicitly in sequence related ops. +// Shall we have a better method to shared info between in/out Tensor? +#ifdef PADDLE_WITH_MKLDNN + // Fix me: ugly workaround below + // Correct solution: + // set_layout() should NOT be called here (i.e. ShareLoD). Instead, + // layout of output tensor should be set "manually" in Compute() + // of each OPKernel. The reason layout should NOT be shared between + // input and output "automatically" (now by InferShape()->ShareLoD()) + // is that layout transform may occur after InferShape(). + // Workaround: + // Skip set_layout() when input layout is kMKLDNN + // This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN + // OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called + // in Compute() + if (in_tensor.layout() != DataLayout::kMKLDNN) +#endif + out_tensor->set_layout(in_tensor.layout()); } void ShareLayout(const std::string& in, const std::string& out, size_t i = 0, @@ -665,7 +680,8 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType( OpKernelType OperatorWithKernel::GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const OpKernelType& expected_kernel_type) const { - return OpKernelType(expected_kernel_type.data_type_, tensor.place()); + return OpKernelType(expected_kernel_type.data_type_, tensor.place(), + tensor.layout()); } } // namespace framework diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 29566aaa53370b1fffc9ff9a90ae9b740b24f69e..ef224d68f1fc561f45e9d7a81425e62655457648 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -34,6 +34,28 @@ namespace framework { class LoDTensor; class Tensor { +#ifdef PADDLE_WITH_MKLDNN + + public: + inline mkldnn::memory::format format() const { return format_; } + + inline void set_format(const mkldnn::memory::format format) { + format_ = format; + } + + protected: + /** + * @brief the detail format of memory block which have layout as kMKLDNN + * + * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, + * nChw16c, etc. For a MKLDNN memory block, layout will be set as + * DataLayout::kMKLDNN meanwhile detail memory format will be kept in + * this field. + */ + + mkldnn::memory::format format_ = mkldnn::memory::format::format_undef; +#endif + public: template friend struct EigenTensor; @@ -195,8 +217,10 @@ class Tensor { * N,C,H,W for respectively the batch size, the number of * feature maps, the height. */ - - DataLayout layout_ = DataLayout::kNHWC; + // Fix me: here just change the default layout to kNCHW + // it doesn't fix the real issue, i.e. feeder should set up tensor layout + // according to actual input data + DataLayout layout_ = DataLayout::kNCHW; /** * @brief A PlaceHolder may be shared by more than one tensor. diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index e1012de2ec36eb4a858202d56a678b6a204c2f0a..0a1cb6d5703dace5e6be73285655ecd9d2ad89fb 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -209,7 +209,7 @@ TEST(Tensor, ReshapeToMatrix) { TEST(Tensor, Layout) { framework::Tensor src; - ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC); + ASSERT_EQ(src.layout(), framework::DataLayout::kNCHW); src.set_layout(framework::DataLayout::kAnyLayout); ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout); } diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 0dd0e5c9a2b08e406bf500f40e2fc8926012ac0e..748f5a084e8c880df215a60fe51c835ba5cd3110 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,9 +1,4 @@ # Add TRT tests -# This test is not stable -# See https://paddleci.ngrok.io/viewLog.html?tab=buildLog&buildTypeId=Paddle_PrCi2&buildId=36834&_focus=8828 -#nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc -# DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine -# SERIAL) nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc DEPS tensorrt_engine mul_op) @@ -16,3 +11,5 @@ nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) +nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index 7facf30d781a26c2c6eb0a8966ef1b87e5dfdf0b..e1cace9cc1b06f036f52e82b7b86c99a02d50f50 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -22,7 +22,8 @@ namespace tensorrt { class ReluOpConverter : public OpConverter { public: ReluOpConverter() {} - void operator()(const framework::proto::OpDesc& op) override { + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. framework::OpDesc op_desc(op, nullptr); @@ -33,7 +34,12 @@ class ReluOpConverter : public OpConverter { nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER( engine_, Activation, *const_cast(input_tensor), nvinfer1::ActivationType::kRELU); - engine_->SetITensor(op_desc.Output("Out")[0], layer->getOutput(0)); + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 86ca2ca08eb14265e1bfe7abd5eb6af5c83b8a5c..0a02a7bebf9efbd0555707e6cfa701ef1e7d9659 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -1,106 +1,47 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ #include -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/tensorrt/convert/io_converter.h" -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/place.h" - -USE_OP(relu); +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" namespace paddle { namespace inference { namespace tensorrt { -void Compare(const std::string op_type, float input, float expect) { +TEST(ReluOpConverter, main) { framework::Scope scope; - platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); - - // init fluid op and variable - auto x_var = scope.Var("X"); - auto x_tensor = x_var->GetMutable(); - x_tensor->Resize({1, 1}); - x_tensor->mutable_data(place); - std::vector init; - init.push_back(input); - framework::TensorFromVector(init, ctx, x_tensor); - - auto out_var = scope.Var("Out"); - auto out_tensor = out_var->GetMutable(); - out_tensor->Resize({1, 1}); - out_tensor->mutable_data(place); - - framework::OpDesc op_desc; - op_desc.SetType(op_type); - op_desc.SetInput("X", {"X"}); - op_desc.SetOutput("Out", {"Out"}); - - auto op = framework::OpRegistry::CreateOp(*op_desc.Proto()); - - // run fluid op - op->Run(scope, place); - // get fluid output - std::vector out1; - framework::TensorToVector(*out_tensor, ctx, &out1); - - // init tensorrt op - cudaStream_t stream; - ASSERT_EQ(0, cudaStreamCreate(&stream)); - TensorRTEngine* engine = new TensorRTEngine(1, 1 << 10, &stream); - engine->InitNetwork(); - engine->DeclareInput("X", nvinfer1::DataType::kFLOAT, - nvinfer1::DimsCHW{1, 1, 1}); - // convert op - OpConverter op_converter; - op_converter.ConvertOp(*op_desc.Proto(), engine); - - engine->DeclareOutput("Out"); - engine->FreezeNetwork(); - - // convert LoDTensor to ITensor - size_t size = x_tensor->memory_size(); - EngineIOConverter::ConvertInput(op_type, *x_tensor, - engine->buffer("X").buffer, size, &stream); - // run tensorrt Outp - engine->Execute(1); - // convert ITensor to LoDTensor - EngineIOConverter::ConvertOutput(op_type, engine->buffer("Out").buffer, - out_tensor, size, &stream); - // get tensorrt output - std::vector out2; - framework::TensorToVector(*out_tensor, ctx, &out2); - - // compare - ASSERT_EQ(out1[0], out2[0]); - ASSERT_EQ(out1[0], expect); - - delete engine; - cudaStreamDestroy(stream); -} - -TEST(OpConverter, ConvertRelu) { - Compare("relu", 1, 1); // relu(1) = 1 - Compare("relu", -5, 0); // relu(-5) = 0 + std::unordered_set parameters; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6)); + validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("relu"); + desc.SetInput("X", {"relu-X"}); + desc.SetOutput("Out", {"relu-Out"}); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(10); } } // namespace tensorrt } // namespace inference } // namespace paddle -USE_OP(activation); +USE_OP(relu); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index dd71c66a75a039429f6e4b1771bb31508bb6b56d..8478ae20a59250f45daf9e8e4e18fddfe61b945e 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -58,14 +58,16 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, const framework::OperatorWithKernel& oper, const std::string& name) { framework::LibraryType library{framework::LibraryType::kPlain}; + + framework::DataLayout layout = framework::DataLayout::kAnyLayout; #ifdef PADDLE_WITH_MKLDNN auto it = oper.Attrs().find("use_mkldnn"); if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() && platform::CanMKLDNNBeUsed(ctx)) { library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } #endif - framework::DataLayout layout = framework::DataLayout::kAnyLayout; return framework::OpKernelType( framework::ToDataType(ctx.Input(name)->type()), ctx.GetPlace(), layout, library); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 6ec8c9d18b466142acdb46b0f46826a2aca7a47e..d7e0af28c1bfa6a9073b25b0a301234cc5d194f5 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -111,14 +111,16 @@ class BatchNormOp : public framework::OperatorWithKernel { "Variance input should be of float type"); framework::LibraryType library_{framework::LibraryType::kPlain}; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + #ifdef PADDLE_WITH_MKLDNN if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } #endif - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout = framework::DataLayout::kAnyLayout; return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, library_); } @@ -367,17 +369,18 @@ class BatchNormGradOp : public framework::OperatorWithKernel { } framework::LibraryType library_{framework::LibraryType::kPlain}; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; #ifdef PADDLE_WITH_MKLDNN if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout = framework::DataLayout::kAnyLayout; return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout, library_); + layout_, library_); } }; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 697d91484257984b104a13b0572cf19b16f8d37e..850297a2327f33a4a765f64f201e217fce5db89b 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -75,6 +75,11 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library{framework::LibraryType::kPlain}; + + std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library = framework::LibraryType::kCUDNN; @@ -84,6 +89,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( if (library == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } #endif @@ -99,9 +105,6 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( "float16 can only be used when CUDNN is used"); } - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout = framework::StringToDataLayout(data_format); return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, library); } @@ -309,6 +312,10 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; @@ -318,12 +325,10 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), layout_, library_); diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 8843a1c44b7004ba5d7935f75d3c99d9c30fc6c0..a9ae1396db8d7dab0364779e506d5c0a3e2ff6ed 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -43,7 +43,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FCOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library{framework::LibraryType::kMKLDNN}; - framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + framework::DataLayout layout{framework::DataLayout::kMKLDNN}; return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), @@ -65,7 +65,7 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FCOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library{framework::LibraryType::kMKLDNN}; - framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + framework::DataLayout layout{framework::DataLayout::kMKLDNN}; return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 52b9cd7fb7019b738098a8649f23277afd40e938..52b459a6a2e56b7c256efdb535b4652c64bae23c 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -124,16 +124,17 @@ namespace { framework::OpKernelType GetExpectedLRNKernel( const framework::ExecutionContext& ctx) { framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_MKLDNN if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), layout_, library_); diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index 60e936298defe7c6ce8a33bdc7de05b52eb950e7..a045f9e98dd7348973c3c4506f44d3e261599a14 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -24,10 +24,13 @@ using mkldnn::pooling_backward; // Generate keys for storing/retriving primitives for this operator // TODO(jczaja): Make hashing function more optimial -static std::string gethash(memory::dims& input_dims, std::string& pooling_type, - std::vector& ksize, std::vector& strides, - std::vector& paddings, std::string suffix) { - auto dims2str = [](memory::dims& operand_dims) { +static std::string gethash(const memory::dims& input_dims, + const std::string& pooling_type, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string& suffix) { + auto dims2str = [](const memory::dims& operand_dims) { std::string dstr = ""; for (size_t i = 0; i < operand_dims.size(); ++i) { dstr += std::to_string(operand_dims[i]) + "-"; diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index f4fb2b132fe8d59cb50f5a1f7359240ac50445fe..18aa2bd352c5d184b5748e57b4af17c1ae0d7a82 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -83,6 +83,9 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { framework::OpKernelType PoolOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; @@ -92,11 +95,10 @@ framework::OpKernelType PoolOp::GetExpectedKernelType( if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif - std::string data_format = ctx.Attr("data_format"); - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), layout_, library_); @@ -112,6 +114,9 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { framework::OpKernelType PoolOpGrad::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; @@ -121,6 +126,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif @@ -129,8 +135,6 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN, "float16 can only be used when CUDNN is used"); } - std::string data_format = ctx.Attr("data_format"); - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, library_); } diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index cc256aa627bdda0609f496cab93a2dec7d95f348..c90a3be964a3a309a182d3620abec619c366dd84 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -49,6 +49,9 @@ class SoftmaxOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { // choose cudnn kernel if the runtime supported. framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; @@ -58,6 +61,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { if (library_ == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } #endif @@ -68,9 +72,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { "float16 can only be used on GPU place"); } - std::string data_format = ctx.Attr("data_format"); - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::StringToDataLayout(data_format), + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, library_); } }; @@ -142,6 +144,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { // choose cudnn kernel if the runtime supported. framework::LibraryType library_{framework::LibraryType::kPlain}; + #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index f1187620d81ff3bc1deef2106edb54d6199fa927..de711b7d23ef01d57a62087c552ea090f01f0386 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace platform { @@ -86,5 +87,17 @@ inline mkldnn::memory::data_type MKLDNNGetDataType() { return mkldnn::memory::f32; } +inline void Reorder(const mkldnn::memory& src, const mkldnn::memory& dst) { + auto reorder_prim = mkldnn::reorder(src, dst); + std::vector pipeline; + pipeline.push_back(reorder_prim); + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); +} + +inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) { + return static_cast( + memory.get_primitive_desc().desc().data.format); +} + } // namespace platform } // namespace paddle