diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 4a146288765f69b238eae15c11b147efa5f7c409..d58ac991ab39d6634da89e51411009b6ff9e2bd4 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -19,7 +19,7 @@ SET(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) SET(MKLDNN_REPOSITORY https://github.com/intel/mkl-dnn.git) -SET(MKLDNN_TAG aef88b7c233f48f8b945da310f1b973da31ad033) +SET(MKLDNN_TAG 518a316a8cd6deb82dc7866bc04bd0355a25c3a4) # Introduce variables: # * CMAKE_INSTALL_LIBDIR @@ -35,13 +35,6 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/${LIBDIR INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers. -IF(${CBLAS_PROVIDER} STREQUAL "MKLML") - SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) - MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}") -ELSE() - MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN") -ENDIF() - IF(NOT WIN32) SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds") SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") @@ -63,7 +56,8 @@ ExternalProject_Add( DEPENDS ${MKLDNN_DEPENDS} PREFIX ${MKLDNN_PREFIX_DIR} SOURCE_DIR ${MKLDNN_SOURCE_DIR} - UPDATE_COMMAND "" + BUILD_ALWAYS 1 + # UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} @@ -77,9 +71,8 @@ ExternalProject_Add( -DMKLROOT=${MKLML_ROOT} -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} - -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF + -DMKLDNN_BUILD_TESTS=OFF -DMKLDNN_BUILD_EXAMPLES=OFF CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} - -DMKLROOT:PATH=${MKLML_ROOT} ) if(WIN32) SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE) @@ -98,7 +91,7 @@ add_definitions(-DPADDLE_WITH_MKLDNN) SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mkldnn_dummy.c) FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") ADD_LIBRARY(mkldnn STATIC ${dummyfile}) -TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_LIB} ${MKLML_IOMP_LIB}) +TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_IOMP_LIB}) ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) # copy the real so.0 lib to install dir @@ -107,6 +100,9 @@ if(WIN32) SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll) else(WIN32) SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) + SET(MKLDNN_SHARED_LIB_1 ${MKLDNN_INSTALL_DIR}/libmkldnn.so.1) ADD_CUSTOM_COMMAND(TARGET ${MKLDNN_PROJECT} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}) + ADD_CUSTOM_COMMAND(TARGET ${MKLDNN_PROJECT} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_1}) endif(WIN32) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 7b2459c9b4dac5bdf061b2879d78a1f95f4734bd..1e7d5114a4528a58f7114300daa63656fce7963a 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -84,8 +84,8 @@ function(copy_part_of_thrid_party TARGET DST) DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib) else() copy(${TARGET} - SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} - DSTS ${dst_dir} ${dst_dir}/lib) + SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} ${MKLDNN_SHARED_LIB_1} + DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib) endif() endif() diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 394930baffece7da1a96d19e1abe9a2fb58e32f0..0643c81ce602702505e1854b4b4fcf5bb7e6aa51 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -105,8 +105,6 @@ void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) { return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::u8: return platform::to_void_cast(tensor.data()); - case mkldnn::memory::data_type::s16: - return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::s32: return platform::to_void_cast(tensor.data()); default: @@ -134,7 +132,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, const Tensor& in, Tensor* out, platform::Place place) { - PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Input tensor format is invalid. Input tensor should " "have specified memory format.")); @@ -151,12 +149,12 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, auto* dev_ctx = dynamic_cast(pool.Get(place)); auto& cpu_engine = dev_ctx->GetEngine(); - auto in_tz = paddle::framework::vectorize(in.dims()); + auto in_tz = paddle::framework::vectorize(in.dims()); auto out_tz = in_tz; memory::data_type in_type = ToMKLDNNDataType(in.type()); - PADDLE_ENFORCE(in_type != memory::data_type::data_undef, - "Input tensor type is not supported: %s", in.type()); + PADDLE_ENFORCE_NE(in_type, memory::data_type::undef, + "Input tensor type is not supported: %s", in.type()); auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); auto out_format = @@ -167,8 +165,8 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, if (in_format != out_format) { void* in_data = GetDataFromTensor(in, in_type); - const std::string key = platform::CreateKey(in_tz, in_format, out_format, - std::to_string(in_type)); + const std::string key = + platform::CreateKey(in_tz, in_format, out_format, in_type); platform::ReorderMKLDNNHandler handler(in_tz, in.type(), in_type, *dev_ctx, cpu_engine, key); @@ -179,9 +177,9 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, auto reorder_p = handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - std::vector pipeline; - pipeline.push_back(*reorder_p); - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + mkldnn::stream astream(cpu_engine); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + astream.wait(); } else { out->ShareDataWith(in); } @@ -193,7 +191,7 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, } out->set_layout(out_layout); // reset format since the out tensor will be feed to non-MKLDNN OPkernel - out->set_format(MKLDNNMemoryFormat::format_undef); + out->set_format(MKLDNNMemoryFormat::undef); } #endif diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 8a8342992181bfd00bf10d67cb825c956101d0b8..711146efd267b80260c17dc89bb35932e534c9c6 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -59,11 +59,10 @@ inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) { {DataTypeTrait::DataType(), MKLDNNDataType::f32}, {DataTypeTrait::DataType(), MKLDNNDataType::s8}, {DataTypeTrait::DataType(), MKLDNNDataType::u8}, - {DataTypeTrait::DataType(), MKLDNNDataType::s16}, {DataTypeTrait::DataType(), MKLDNNDataType::s32}}; auto iter = dict.find(static_cast(type)); if (iter != dict.end()) return iter->second; - return MKLDNNDataType::data_undef; + return MKLDNNDataType::undef; } void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 2c89afcf88732f9f10af4311ad7a18b230ed1081..f4d3457003253ca9dab2715bdfd156f0b71910b2 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -38,9 +38,9 @@ class Tensor { #ifdef PADDLE_WITH_MKLDNN public: - inline mkldnn::memory::format format() const { return format_; } + inline mkldnn::memory::format_tag format() const { return format_; } - inline void set_format(const mkldnn::memory::format format) { + inline void set_format(const mkldnn::memory::format_tag format) { format_ = format; } @@ -54,7 +54,7 @@ class Tensor { * this field. */ - mkldnn::memory::format format_ = mkldnn::memory::format::format_undef; + mkldnn::memory::format_tag format_ = mkldnn::memory::format_tag::undef; #endif public: diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index adace7821cd497f750c4023587251a45b434ce73..cbc3c2281db4bc5d2ff9f92c4848f6fbcc5f3417 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -248,19 +248,22 @@ if(WITH_MKLDNN) inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenetv2 ${INT8_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH}) # resnet101 int8 - set(INT8_RESNET101_MODEL_DIR "${INT8_DATA_DIR}/resnet101") - download_int8_data(${INT8_RESNET101_MODEL_DIR} "Res101_int8_model.tar.gz" ) - inference_analysis_api_int8_test_run(test_analyzer_int8_resnet101 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET101_MODEL_DIR} ${IMAGENET_DATA_PATH}) + # TODO(grygielski) Enable after MKL-DNN 1.0 merge +# set(INT8_RESNET101_MODEL_DIR "${INT8_DATA_DIR}/resnet101") +# download_int8_data(${INT8_RESNET101_MODEL_DIR} "Res101_int8_model.tar.gz" ) +# inference_analysis_api_int8_test_run(test_analyzer_int8_resnet101 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET101_MODEL_DIR} ${IMAGENET_DATA_PATH}) # vgg16 int8 - set(INT8_VGG16_MODEL_DIR "${INT8_DATA_DIR}/vgg16") - download_int8_data(${INT8_VGG16_MODEL_DIR} "VGG16_int8_model.tar.gz" ) - inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH}) + # TODO(grygielski) Enable after MKL-DNN 1.0 merge +# set(INT8_VGG16_MODEL_DIR "${INT8_DATA_DIR}/vgg16") +# download_int8_data(${INT8_VGG16_MODEL_DIR} "VGG16_int8_model.tar.gz" ) +# inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH}) # vgg19 int8 - set(INT8_VGG19_MODEL_DIR "${INT8_DATA_DIR}/vgg19") - download_int8_data(${INT8_VGG19_MODEL_DIR} "VGG19_int8_model.tar.gz" ) - inference_analysis_api_int8_test_run(test_analyzer_int8_vgg19 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG19_MODEL_DIR} ${IMAGENET_DATA_PATH}) + # TODO(grygielski) Enable after MKL-DNN 1.0 merge +# set(INT8_VGG19_MODEL_DIR "${INT8_DATA_DIR}/vgg19") +# download_int8_data(${INT8_VGG19_MODEL_DIR} "VGG19_int8_model.tar.gz" ) +# inference_analysis_api_int8_test_run(test_analyzer_int8_vgg19 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG19_MODEL_DIR} ${IMAGENET_DATA_PATH}) # googlenet int8 set(INT8_GOOGLENET_MODEL_DIR "${INT8_DATA_DIR}/googlenet") diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 586a8c1d87aa342ce9c6f318c420c4166ff2c7d6..73ea39362e3c470c172649097b346bc99b056b75 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -31,7 +31,7 @@ class ElementwiseMulOp : public ElementwiseOp { #ifdef PADDLE_WITH_MKLDNN static bool AreDimsAndFormatCorrect(const framework::ExecutionContext& ctx, int simd_width, - mkldnn::memory::format x_format) { + mkldnn::memory::format_tag x_format) { using Tensor = framework::Tensor; using paddle::framework::vectorize; using mkldnn::memory; @@ -54,7 +54,7 @@ class ElementwiseMulOp : public ElementwiseOp { if (platform::CanMKLDNNBeUsed(ctx)) { bool can_use_avx512_kernel = platform::MayIUse(platform::avx512f) && - AreDimsAndFormatCorrect(ctx, 16, memory::format::nChw16c); + AreDimsAndFormatCorrect(ctx, 16, memory::format_tag::nChw16c); if (can_use_avx512_kernel) { return framework::OpKernelType(input_data_type, ctx.GetPlace(), framework::DataLayout::kMKLDNN, diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 2fd38e7c95fb1ca3049be6acbaac7ff4a153a9c0..3490353c7558d6b793548773cac39048379155f0 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -50,12 +50,14 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { auto y_dims_untrimed = y->dims(); auto z_dims = z->dims(); + mkldnn::stream astream(mkldnn_engine); + // Execute default elementwise_add operator when // broadcast operations need to performed. if (x_dims != y_dims_untrimed) { Tensor _x; MKLDNNMemoryFormat format; - std::vector src_x_tz = framework::vectorize(x_dims); + auto src_x_tz = framework::vectorize(x_dims); if ((src_x_tz.size() == 3 && x->format() != (format = MKLDNNMemoryFormat::ncw)) || @@ -69,8 +71,8 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { auto out_format = platform::MKLDNNFormatForSize( x_dims.size(), MKLDNNMemoryFormat::nchw); - const std::string key = platform::CreateKey( - src_x_tz, x->format(), out_format, std::to_string(in_type)); + const std::string key = + platform::CreateKey(src_x_tz, x->format(), out_format, in_type); platform::ReorderMKLDNNHandler handler(src_x_tz, x->type(), in_type, dev_ctx, mkldnn_engine, key); @@ -83,9 +85,8 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { auto x_reorder = handler.AcquireReorder(x_memory_p, user_x_memory_p); - std::vector pipeline; - pipeline.push_back(*x_reorder); - stream(stream::kind::eager).submit(pipeline).wait(); + x_reorder->execute(astream, *user_x_memory_p, *x_memory_p); + astream.wait(); } else { format = x->format(); _x.ShareDataWith(*x); @@ -122,19 +123,18 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { } else { PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN, "Wrong layout set for X tensor"); - PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef, "Wrong format set for X tensor"); PADDLE_ENFORCE_EQ(y->layout(), DataLayout::kMKLDNN, "Wrong layout set for Y tensor"); - PADDLE_ENFORCE_NE(y->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(y->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Y tensor"); - std::vector src_x_tz = framework::vectorize(x_dims); - std::vector src_y_tz = framework::vectorize(y_dims_untrimed); - std::vector dst_tz = framework::vectorize(z_dims); + auto src_x_tz = framework::vectorize(x_dims); + auto src_y_tz = framework::vectorize(y_dims_untrimed); + auto dst_tz = framework::vectorize(z_dims); - std::vector srcs_pd; std::vector scales = {1.0f, 1.0f}; const std::string key = @@ -156,18 +156,17 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { auto sum_pd = handler.AcquireSumPrimitiveDescriptor( {src_x_memory, src_y_memory}, scales, dst_md); - T* z_data = z->mutable_data(ctx.GetPlace(), - sum_pd->dst_primitive_desc().get_size()); + T* z_data = + z->mutable_data(ctx.GetPlace(), sum_pd->dst_desc().get_size()); auto dst_memory = handler.AcquireDstMemoryFromPrimitive(z_data); - std::vector inputs({*src_x_memory, *src_y_memory}); - - auto sum_prim = handler.AcquireSum(dst_memory, &inputs); + auto sum_prim = handler.AcquireSum(); - std::vector pipeline; - pipeline.push_back(*sum_prim); - stream(stream::kind::eager).submit(pipeline).wait(); + sum_prim->execute(astream, {{MKLDNN_ARG_MULTIPLE_SRC, *src_x_memory}, + {MKLDNN_ARG_MULTIPLE_SRC + 1, *src_y_memory}, + {MKLDNN_ARG_DST, *dst_memory}}); + astream.wait(); z->set_layout(DataLayout::kMKLDNN); z->set_format(platform::GetMKLDNNFormat(*dst_memory)); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index bc74a67f3e526e54120ddbd641815d564123b753..3f91dde73f11557d20fe440372fe8e4063f7597a 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -70,7 +70,7 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { auto x_dims = x->dims(); auto y_dims_untrimmed = y->dims(); - auto x_int_dims = paddle::framework::vectorize(x_dims); + auto x_int_dims = paddle::framework::vectorize(x_dims); int pre, num, post, is_run_common_broadcast; get_mid_dims(x_dims, y_dims_untrimmed, axis, &pre, &num, &post, diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index b9a0a7915dab9965e51e687983616cdf38f249b9..3b367c9a5bcd48a25b82869affb4ddd6ff699ca4 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -35,7 +35,7 @@ class MKLDNNActivationKernel const auto *x = ctx.Input("X"); PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN, "Wrong layout set for X tensor"); - PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef, "Wrong format set for X tensor"); Functor functor; @@ -51,7 +51,7 @@ class MKLDNNActivationGradKernel const auto *diff_y = ctx.Input(framework::GradVarName("Out")); PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN, "Wrong layout set for Input OutGrad tensor"); - PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Input OutGrad tensor"); PADDLE_ENFORCE_EQ( @@ -80,7 +80,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx, x->dims().size() == 2 || x->dims().size() == 3 || x->dims().size() == 4, "Input dim must be with 2, 3 or 4"); - auto src_tz = framework::vectorize(x->dims()); + auto src_tz = framework::vectorize(x->dims()); auto src_format = src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : x->format(); @@ -92,13 +92,12 @@ void eltwise_forward(const framework::ExecutionContext &ctx, auto src_memory_p = handler.AcquireSrcMemory(x); auto dst_memory_p = handler.AcquireDstMemory(y); - auto activation_p = - handler.AcquireForwardPrimitive(*src_memory_p, *dst_memory_p); + auto activation_p = handler.AcquireForwardPrimitive(); - // push primitive to stream and wait until it's executed - std::vector pipeline; - pipeline.push_back(*activation_p); - stream(stream::kind::eager).submit(pipeline).wait(); + mkldnn::stream astream(dev_ctx.GetEngine()); + activation_p->execute(astream, {{MKLDNN_ARG_FROM, *src_memory_p}, + {MKLDNN_ARG_TO, *dst_memory_p}}); + astream.wait(); y->set_layout(DataLayout::kMKLDNN); y->set_format(GetMKLDNNFormat(*dst_memory_p)); @@ -116,7 +115,7 @@ void eltwise_grad(const framework::ExecutionContext &ctx, const T alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; const T beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; - auto diff_dst_tz = framework::vectorize(diff_y->dims()); + auto diff_dst_tz = framework::vectorize(diff_y->dims()); // diff_dst and src dims should be the same auto src_format = @@ -132,13 +131,14 @@ void eltwise_grad(const framework::ExecutionContext &ctx, auto src_memory_p = handler.AcquireBackwardSrcMemory(x); auto diff_dst_memory_p = handler.AcquireDiffDstMemory(diff_y); auto diff_src_memory_p = handler.AcquireDiffSrcMemory(diff_x); - auto activation_backward_p = handler.AcquireBackwardPrimitive( - *src_memory_p, *diff_dst_memory_p, *diff_src_memory_p); - - // push primitive to stream and wait until it's executed - std::vector pipeline; - pipeline.push_back(*activation_backward_p); - stream(stream::kind::eager).submit(pipeline).wait(); + auto activation_backward_p = handler.AcquireBackwardPrimitive(); + + mkldnn::stream astream(dev_ctx.GetEngine()); + activation_backward_p->execute(astream, + {{MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_p}, + {MKLDNN_ARG_DIFF_SRC, *diff_src_memory_p}}); + astream.wait(); diff_x->set_layout(DataLayout::kMKLDNN); diff_x->set_format(GetMKLDNNFormat(*diff_src_memory_p)); diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index ad51de386ed30f369c8d90895db8e90b2f3f8b13..919bdabe1b3abf4e75b19662d65c98d453504181 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -31,9 +31,9 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandlerT { public: - BatchNormMKLDNNHandler(const std::vector &dims, const float &epsilon, - const unsigned &flags, const bool &global_stats, - const MKLDNNMemoryFormat fmt, + BatchNormMKLDNNHandler(const std::vector &dims, const float &epsilon, + const mkldnn::normalization_flags &flags, + const bool &global_stats, const MKLDNNMemoryFormat fmt, const platform::MKLDNNDeviceContext &dev_ctx, platform::Place cpu_place, const std::string &uniq_name) @@ -48,8 +48,8 @@ class BatchNormMKLDNNHandler : mkldnn::prop_kind::forward_training, md, epsilon, flags); } - BatchNormMKLDNNHandler(const std::vector &dims, const float &epsilon, - const unsigned &flags, + BatchNormMKLDNNHandler(const std::vector &dims, const float &epsilon, + const mkldnn::normalization_flags &flags, const MKLDNNMemoryFormat diff_fmt, const MKLDNNMemoryFormat src_fmt, const platform::MKLDNNDeviceContext &dev_ctx, @@ -70,47 +70,44 @@ class BatchNormMKLDNNHandler std::shared_ptr AcquireScaleShiftMemory(T *scaleshift_data) { return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->weights_primitive_desc(), scaleshift_data, - "@scaleshift_mem_p"); + this->fwd_pd_->weights_desc(), scaleshift_data, "@scaleshift_mem_p"); } std::shared_ptr AcquireDiffScaleShiftMemory( T *diff_scaleshift_data) { - return this->AcquireMemoryFromPrimitive( - this->bwd_pd_->diff_weights_primitive_desc(), diff_scaleshift_data, - "@diff_scaleshift_mem_p"); + return this->AcquireMemoryFromPrimitive(this->bwd_pd_->diff_weights_desc(), + diff_scaleshift_data, + "@diff_scaleshift_mem_p"); } std::shared_ptr AcquireMeanMemory( const framework::Tensor *mean) { const T *mean_data = mean->data(); return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->mean_primitive_desc(), to_void_cast(mean_data), - "@mean_mem_p"); + this->fwd_pd_->mean_desc(), to_void_cast(mean_data), "@mean_mem_p"); } std::shared_ptr AcquireMeanMemory(framework::Tensor *mean) { - T *mean_data = mean->mutable_data( - this->place_, this->fwd_pd_->mean_primitive_desc().get_size()); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->mean_primitive_desc(), mean_data, "@mean_mem_p"); + T *mean_data = mean->mutable_data(this->place_, + this->fwd_pd_->mean_desc().get_size()); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(), + mean_data, "@mean_mem_p"); } std::shared_ptr AcquireVarianceMemory( const framework::Tensor *variance) { const T *variance_data = variance->data(); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->variance_primitive_desc(), - to_void_cast(variance_data), "@variance_mem_p"); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(), + to_void_cast(variance_data), + "@variance_mem_p"); } std::shared_ptr AcquireVarianceMemory( framework::Tensor *variance) { T *variance_data = variance->mutable_data( - this->place_, this->fwd_pd_->variance_primitive_desc().get_size()); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->variance_primitive_desc(), variance_data, - "@variance_mem_p"); + this->place_, this->fwd_pd_->variance_desc().get_size()); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(), + variance_data, "@variance_mem_p"); } }; @@ -140,11 +137,11 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN, "Wrong layout set for X tensor"); - PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef, "Wrong format set for X tensor"); - auto src_tz = paddle::framework::vectorize(x->dims()); - auto scale_tz = paddle::framework::vectorize(scale->dims()); + auto src_tz = paddle::framework::vectorize(x->dims()); + auto scale_tz = paddle::framework::vectorize(scale->dims()); PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); const unsigned int C = scale_tz[0]; @@ -156,9 +153,11 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { shift->data() + C); // Flags are added by bitwise OR operation - unsigned flags = mkldnn::use_scale_shift; // 001 - if (global_stats) flags |= mkldnn::use_global_stats; // 010 - if (fuse_with_relu && is_test) flags |= mkldnn::fuse_bn_relu; // 100 + auto flags = mkldnn::normalization_flags::use_scale_shift; // 001 + if (global_stats) + flags |= mkldnn::normalization_flags::use_global_stats; // 010 + if (fuse_with_relu && is_test) + flags |= mkldnn::normalization_flags::fuse_norm_relu; // 100 BatchNormMKLDNNHandler handler( src_tz, epsilon, flags, global_stats, @@ -170,38 +169,35 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { handler.AcquireScaleShiftMemory(scaleshift_data.data()); auto dst_memory = handler.AcquireDstMemory(y); - std::shared_ptr batch_norm_p; + auto batch_norm_p = handler.AcquireForwardPrimitive(); + + std::shared_ptr mean_memory; + std::shared_ptr variance_memory; + if (global_stats) { // mean and variance are taken from input Tensor const auto *mean = ctx.Input("Mean"); const auto *variance = ctx.Input("Variance"); - std::shared_ptr mean_memory = handler.AcquireMeanMemory(mean); - std::shared_ptr variance_memory = - handler.AcquireVarianceMemory(variance); - - batch_norm_p = handler.AcquireForwardPrimitive( - *src_memory, (const mkldnn::primitive::at &)*mean_memory, - (const mkldnn::primitive::at &)*variance_memory, *scaleshift_memory, - *dst_memory); + mean_memory = handler.AcquireMeanMemory(mean); + variance_memory = handler.AcquireVarianceMemory(variance); } else { // mean and variance are calculated and saved in output Tensor - std::shared_ptr mean_memory = - handler.AcquireMeanMemory(batch_mean); - std::shared_ptr variance_memory = - handler.AcquireVarianceMemory(batch_variance); - - batch_norm_p = handler.AcquireForwardPrimitive( - *src_memory, *scaleshift_memory, *dst_memory, *mean_memory, - *variance_memory); + mean_memory = handler.AcquireMeanMemory(batch_mean); + variance_memory = handler.AcquireVarianceMemory(batch_variance); } y->set_layout(DataLayout::kMKLDNN); y->set_format(platform::GetMKLDNNFormat(*dst_memory)); - std::vector pipeline; - pipeline.push_back(*batch_norm_p); - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + mkldnn::stream astream(dev_ctx.GetEngine()); + batch_norm_p->execute(astream, + {{MKLDNN_ARG_SRC, *src_memory}, + {MKLDNN_ARG_SCALE_SHIFT, *scaleshift_memory}, + {MKLDNN_ARG_MEAN, *mean_memory}, + {MKLDNN_ARG_VARIANCE, *variance_memory}, + {MKLDNN_ARG_DST, *dst_memory}}); + astream.wait(); if (!global_stats) { // mkldnn only compute stats for current batch @@ -245,11 +241,11 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN, "Wrong layout set for Input diff_y tensor"); - PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Input diff_y tensor"); - auto src_tz = paddle::framework::vectorize(x->dims()); - auto scale_tz = paddle::framework::vectorize(scale->dims()); + auto src_tz = paddle::framework::vectorize(x->dims()); + auto scale_tz = paddle::framework::vectorize(scale->dims()); PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); const unsigned int C = scale_tz[0]; @@ -261,8 +257,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { platform::MKLDNNFormatForSize(src_tz.size(), x->format()); BatchNormMKLDNNHandler handler( - src_tz, epsilon, mkldnn::use_scale_shift, dst_format, input_format, - dev_ctx, ctx.GetPlace(), ctx.InputName("SavedMean")); + src_tz, epsilon, mkldnn::normalization_flags::use_scale_shift, + dst_format, input_format, dev_ctx, ctx.GetPlace(), + ctx.InputName("SavedMean")); // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * C; @@ -285,13 +282,18 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { handler.AcquireDiffScaleShiftMemory(diff_scaleshift_data.data()); // finally create batch_norm backward primitive - auto batch_norm_bwd_p = handler.AcquireBackwardPrimitive( - *src_memory, *mean_memory, *variance_memory, *diff_dst_memory, - *scaleshift_memory, *diff_src_memory, *diff_scaleshift_memory); - - std::vector pipeline; - pipeline.push_back(*batch_norm_bwd_p); - stream(stream::kind::eager).submit(pipeline).wait(); + auto batch_norm_bwd_p = handler.AcquireBackwardPrimitive(); + + mkldnn::stream astream(dev_ctx.GetEngine()); + batch_norm_bwd_p->execute( + astream, {{MKLDNN_ARG_SRC, *src_memory}, + {MKLDNN_ARG_MEAN, *mean_memory}, + {MKLDNN_ARG_VARIANCE, *variance_memory}, + {MKLDNN_ARG_DIFF_DST, *diff_dst_memory}, + {MKLDNN_ARG_SCALE_SHIFT, *scaleshift_memory}, + {MKLDNN_ARG_DIFF_SRC, *diff_src_memory}, + {MKLDNN_ARG_DIFF_SCALE_SHIFT, *diff_scaleshift_memory}}); + astream.wait(); T *diff_scale_data = diff_scale->mutable_data(ctx.GetPlace()); T *diff_shift_data = diff_shift->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index e51db0208ef3edc9ad48ec8dcfe7900b0a88a504..adad812b838cd822545b45d908e7ee51213311b2 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -32,19 +32,17 @@ static void EnforceLayouts(const std::vector inputs) { for (auto* input : inputs) { PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, "Wrong layout set for Input tensor"); - PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Input tensor"); } } -static memory::primitive_desc CreateMemPrimDesc(const Tensor& input, - const mkldnn::engine& engine, - const memory::data_type& dt) { - const auto dims = paddle::framework::vectorize(input.dims()); +static memory::desc CreateMemDesc(const Tensor& input, + const memory::data_type& dt) { + const auto dims = paddle::framework::vectorize(input.dims()); const auto format = input.format(); - auto description = memory::desc(dims, dt, format); - auto mem_prim_desc = memory::primitive_desc(description, engine); - return mem_prim_desc; + auto mem_desc = memory::desc(dims, dt, format); + return mem_desc; } static platform::CPUPlace GetCpuPlace( @@ -70,14 +68,15 @@ class ConcatPrimitiveFactory { const memory::data_type& dt = memory::data_type::f32) { CreateSourcesDescriptors(multi_input, mkldnn_engine, dt); auto dst_desc = CreateDstMemDescriptor(output, dt); - return concat::primitive_desc(dst_desc, concat_axis, srcs_pd); + return concat::primitive_desc(dst_desc, concat_axis, srcs_d, mkldnn_engine); } concat CreateConcatPrimitive(const concat::primitive_desc& concat_pd, - Tensor* output, platform::CPUPlace place) { - CreateSourcePrimitiveAts(); - dst_mem = CreateDstMemory(concat_pd, output, place); - return concat(concat_pd, inputs, dst_mem.get()); + Tensor* output, platform::CPUPlace place, + const mkldnn::engine& mkldnn_engine) { + dst_mem = mkldnn::memory(concat_pd.dst_desc(), mkldnn_engine, + output->mutable_data(place)); + return concat(concat_pd); } void SetSrcDataHandleByIndex(const std::vector& srcs, const size_t& i, @@ -96,41 +95,25 @@ class ConcatPrimitiveFactory { private: memory::desc CreateDstMemDescriptor(Tensor* output, const memory::data_type& dt) { - auto dst_dims = paddle::framework::vectorize(output->dims()); + auto dst_dims = paddle::framework::vectorize(output->dims()); return memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any); } - mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd, - Tensor* output, - const platform::CPUPlace& place) { - return memory(concat_pd.dst_primitive_desc(), - output->mutable_data(place)); - } - void CreateSourcesDescriptors(const std::vector multi_input, const mkldnn::engine& mkldnn_engine, const memory::data_type& dt) { for (size_t i = 0; i < multi_input.size(); i++) { - auto mem_prim_desc = - CreateMemPrimDesc(*multi_input[i], mkldnn_engine, dt); - srcs_pd.push_back(mem_prim_desc); - srcs.push_back( - memory(mem_prim_desc, to_void_cast(multi_input[i]->data()))); - } - } - - void CreateSourcePrimitiveAts() { - inputs.reserve(srcs.size()); - for (size_t i = 0; i < srcs.size(); i++) { - inputs.push_back(srcs[i]); + auto mem_desc = CreateMemDesc(*multi_input[i], dt); + srcs_d.push_back(mem_desc); + srcs.push_back(memory(mem_desc, mkldnn_engine, + to_void_cast(multi_input[i]->data()))); } } private: - std::vector srcs_pd; - std::vector srcs; - std::vector inputs; - boost::optional dst_mem; + std::vector srcs_d; + std::vector srcs; + boost::optional dst_mem; }; template @@ -140,7 +123,7 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { auto multi_input = ctx.MultiInput("X"); EnforceLayouts(multi_input); Tensor* output = ctx.Output("Out"); - int64_t concat_axis = static_cast(ctx.Attr("axis")); + int concat_axis = ctx.Attr("axis"); auto& dev_ctx = ctx.template device_context(); auto place = GetCpuPlace(ctx); @@ -152,6 +135,7 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { std::string key = platform::CreateKey( paddle::framework::vectorize(multi_input[0]->dims()), ctx.OutputName("Out"), dt, platform::ThreadIDasStr()); + const std::string key_prim = key + "@concat_p"; const std::string key_concat_pd = key + "@concat_pd"; const std::string key_srcs = key + "@concat_srcs"; @@ -162,14 +146,13 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { std::shared_ptr dst_mem; auto concat_p = std::static_pointer_cast(dev_ctx.GetBlob(key_prim)); + const auto& mkldnn_engine = dev_ctx.GetEngine(); if (concat_p == nullptr) { - const auto& mkldnn_engine = dev_ctx.GetEngine(); concat_pd = std::make_shared( - prim_creator.CreateConcatPrimDescriptor(multi_input, output, - static_cast(concat_axis), - mkldnn_engine, dt)); - concat_p = std::make_shared( - prim_creator.CreateConcatPrimitive(*concat_pd, output, place)); + prim_creator.CreateConcatPrimDescriptor( + multi_input, output, concat_axis, mkldnn_engine, dt)); + concat_p = std::make_shared(prim_creator.CreateConcatPrimitive( + *concat_pd, output, place, mkldnn_engine)); srcs = std::make_shared>(prim_creator.GetSrcs()); dst_mem = std::make_shared(prim_creator.GetDst()); dev_ctx.SetBlob(key_prim, concat_p); @@ -189,7 +172,15 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { prim_creator.SetDstDataHandle(*dst_mem, output->mutable_data(place)); } - stream(stream::kind::eager).submit({*concat_p}).wait(); + mkldnn::stream astream(mkldnn_engine); + std::unordered_map args; + for (size_t i = 0; i < multi_input.size(); ++i) { + args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, (*srcs).at(i)}); + } + args.insert({MKLDNN_ARG_DST, *dst_mem}); + + concat_p->execute(astream, args); + astream.wait(); output->set_layout(DataLayout::kMKLDNN); output->set_format(platform::GetMKLDNNFormat(*dst_mem)); diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 52391077eee54c0004f663eb3f7ba664e5796a0e..7faf66a6e832acd17ef3b9788d4f8fedacd66d88 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -29,8 +29,8 @@ using mkldnn::stream; using platform::to_void_cast; using platform::GetMKLDNNFormat; -inline void GetWeightsTz(std::vector& weights_tz, int groups, // NOLINT - bool is_conv3d) { +inline void GetWeightsTz(std::vector& weights_tz, // NOLINT + int groups, bool is_conv3d) { if (groups > 1) { if (is_conv3d) { int output = weights_tz[0]; @@ -131,12 +131,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, "Wrong layout set for Input tensor"); - PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Input tensor"); PADDLE_ENFORCE_EQ(filter->layout(), DataLayout::kMKLDNN, "Wrong layout set for Filter tensor"); - PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Filter tensor"); PADDLE_ENFORCE_GE( @@ -156,16 +156,22 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { if (bias) { PADDLE_ENFORCE_EQ(bias->layout(), DataLayout::kMKLDNN, "Wrong layout set for Bias tensor"); - PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Bias tensor"); PADDLE_ENFORCE_EQ(bias->dims().size(), 1, "Bias must only have 1 dimension, i.e. X"); } - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); + std::vector strides_temp = ctx.Attr>("strides"); + std::vector strides(begin(strides_temp), end(strides_temp)); + + std::vector paddings_temp = ctx.Attr>("paddings"); + std::vector paddings(begin(paddings_temp), end(paddings_temp)); + + std::vector dilations_temp = ctx.Attr>("dilations"); + std::vector dilations(begin(dilations_temp), end(dilations_temp)); + std::string fuse_activation = ctx.Attr("fuse_activation"); float fuse_alpha = ctx.Attr("fuse_alpha"); float fuse_beta = ctx.Attr("fuse_beta"); @@ -180,11 +186,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); - auto ksize = framework::vectorize(filter_data_dims); + auto ksize = framework::vectorize(filter_data_dims); UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, data_dims, strides, ksize); + std::vector pipeline; + PADDLE_ENFORCE( is_conv3d ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 && @@ -195,18 +203,18 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const T* input_data = input->data(); const T* filter_data = filter->data(); - auto src_tz = paddle::framework::vectorize(input->dims()); - auto weights_tz = paddle::framework::vectorize(filter->dims()); + auto src_tz = paddle::framework::vectorize(input->dims()); + auto weights_tz = paddle::framework::vectorize(filter->dims()); int g = std::max(groups, 1); + GetWeightsTz(weights_tz, g, is_conv3d); - auto dst_tz = paddle::framework::vectorize(output->dims()); + + auto dst_tz = paddle::framework::vectorize(output->dims()); // Get unique name for storing MKLDNN primitives const std::string key = platform::CreateKey( src_tz, ctx.InputName("Input") + ctx.InputName("Filter")); - std::vector pipeline; - auto src_format = input->format(); MKLDNNMemoryFormat weights_format = GetWeightsFormat(filter->format(), g, is_conv3d); @@ -242,7 +250,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( weights_tz, platform::MKLDNNGetDataType(), weights_format); - std::vector bias_tz; + std::vector bias_tz; auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); @@ -253,7 +261,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training; if (bias) { - bias_tz = paddle::framework::vectorize(bias->dims()); + bias_tz = paddle::framework::vectorize(bias->dims()); auto bias_md = platform::MKLDNNMemDesc( bias_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); conv_pd = handler.AcquireConvolutionPrimitiveDescriptor( @@ -296,7 +304,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto output_data = output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); auto residual_data_tz = - paddle::framework::vectorize(residual_param->dims()); + paddle::framework::vectorize(residual_param->dims()); auto residual_data_type = paddle::framework::ToMKLDNNDataType(residual_param->type()); @@ -320,28 +328,30 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); } - // create convolution op primitive - std::shared_ptr conv_p; - std::shared_ptr user_bias_memory_p, bias_memory_p; + auto conv_p = handler.AcquireConvolution(); + + mkldnn::stream astream(mkldnn_engine); if (bias) { const T* bias_data = bias->data(); auto user_bias_md = platform::MKLDNNMemDesc( {bias_tz}, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); - user_bias_memory_p = + auto user_bias_memory_p = handler.AcquireBiasMemory(user_bias_md, to_void_cast(bias_data)); - bias_memory_p = + auto bias_memory_p = handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline); - conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, - bias_memory_p, dst_memory_p); + + conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, + {MKLDNN_ARG_BIAS, *bias_memory_p}, + {MKLDNN_ARG_DST, *dst_memory_p}}); + } else { - conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, - dst_memory_p); + conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, + {MKLDNN_ARG_DST, *dst_memory_p}}); } - - // push primitive to stream and wait until it's executed - pipeline.push_back(*conv_p); - stream(stream::kind::eager).submit(pipeline).wait(); + astream.wait(); output->set_layout(DataLayout::kMKLDNN); output->set_format(GetMKLDNNFormat(*dst_memory_p)); @@ -359,7 +369,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, "Wrong layout set for Input tensor"); - PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Input tensor"); PADDLE_ENFORCE_GE( @@ -376,7 +386,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const T* input_data = input->data(); - auto src_tz = paddle::framework::vectorize(input->dims()); + auto src_tz = paddle::framework::vectorize(input->dims()); mkldnn::memory::data_type src_dt = paddle::framework::ToMKLDNNDataType(input->type()); @@ -385,7 +395,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { src_tz, src_dt, ctx.InputName("Input") + ctx.InputName("Filter")); const std::string key_conv_pd = key + "@conv_pd"; - bool need_s8_to_u8 = false; std::shared_ptr conv_p; std::shared_ptr src_memory_p; @@ -407,13 +416,18 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto prim_key = key + key_tid + "@conv_p"; auto dst_key = key + key_tid + "@dst_mem_p"; auto src_key = key + key_tid + "@src_mem_p"; + auto weights_key = key + key_tid + "@weights_mem_p"; + auto bias_key = key + key_tid + "@bias_mem_p"; auto user_src_key = key + key_tid + "@user_src_mem_p"; + auto user_residual_key = key + key_tid + "@user_residual_data_mem_p"; auto src_reorder_key = key + key_tid + "@src_mem_preorder_p"; auto residual_reorder_key = key + key_tid + "@residual_data_mem_preorder_p"; conv_p = std::static_pointer_cast( dev_ctx.GetBlob(prim_key)); + mkldnn::stream astream(mkldnn_engine); + if (conv_p == nullptr || !is_test) { float fuse_alpha = ctx.Attr("fuse_alpha"); float fuse_beta = ctx.Attr("fuse_beta"); @@ -423,7 +437,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ(filter->layout(), DataLayout::kMKLDNN, "Wrong layout set for Filter tensor"); - PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Filter tensor"); PADDLE_ENFORCE_GE( @@ -442,16 +456,23 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { if (bias) { PADDLE_ENFORCE_EQ(bias->layout(), DataLayout::kMKLDNN, "Wrong layout set for Bias tensor"); - PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Bias tensor"); PADDLE_ENFORCE_EQ(bias->dims().size(), 1, "Bias must only have 1 dimension, i.e. X"); } - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - std::vector strides = ctx.Attr>("strides"); + std::vector strides_temp = ctx.Attr>("strides"); + std::vector strides(begin(strides_temp), end(strides_temp)); + + std::vector paddings_temp = ctx.Attr>("paddings"); + std::vector paddings(begin(paddings_temp), end(paddings_temp)); + + std::vector dilations_temp = ctx.Attr>("dilations"); + std::vector dilations(begin(dilations_temp), + end(dilations_temp)); + std::string padding_algorithm = ctx.Attr("padding_algorithm"); @@ -466,17 +487,17 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); - auto ksize = framework::vectorize(filter_data_dims); + auto ksize = framework::vectorize(filter_data_dims); UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, data_dims, strides, ksize); int groups = ctx.Attr("groups"); - auto weights_tz = paddle::framework::vectorize(filter->dims()); + auto weights_tz = paddle::framework::vectorize(filter->dims()); int g = std::max(groups, 1); GetWeightsTz(weights_tz, g, is_conv3d); - auto dst_tz = paddle::framework::vectorize(output->dims()); + auto dst_tz = paddle::framework::vectorize(output->dims()); PADDLE_ENFORCE_EQ( is_conv3d @@ -526,7 +547,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { */ auto chosen_memory_format = MKLDNNMemoryFormat::any; - std::vector bias_tz; + std::vector bias_tz; auto src_md = platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format); @@ -542,7 +563,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { : mkldnn::prop_kind::forward_training; if (bias) { - bias_tz = paddle::framework::vectorize(bias->dims()); + bias_tz = paddle::framework::vectorize(bias->dims()); auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32, MKLDNNMemoryFormat::x); conv_pd = handler->AcquireConvolutionPrimitiveDescriptor( @@ -582,7 +603,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { paddle::framework::ToMKLDNNDataType(residual_param->type()); if (residual_param->format() != handler->GetDstFormat()) { auto residual_data_tz = - paddle::framework::vectorize(residual_param->dims()); + paddle::framework::vectorize(residual_param->dims()); auto user_residual_md = platform::MKLDNNMemDesc( residual_data_tz, residual_dt, residual_param->format()); dst_memory_p = platform::SetDstMemory( @@ -601,6 +622,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { // create convolution op primitive auto scale_bias_key = key + "@scale_bias"; + conv_p = handler->AcquireConvolution(); if (bias) { const K* bias_data = bias->data(); auto user_bias_md = platform::MKLDNNMemDesc( @@ -621,16 +643,17 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bias_memory_p = handler->AcquireBiasMemoryFromPrimitive( user_bias_memory_p, pipeline, is_test, true, scale_bias_data, mask_reorder); - conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, - bias_memory_p, dst_memory_p); + conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, + {MKLDNN_ARG_BIAS, *bias_memory_p}, + {MKLDNN_ARG_DST, *dst_memory_p}}); } else { - conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, - dst_memory_p); + conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, + {MKLDNN_ARG_DST, *dst_memory_p}}); } - // push primitive to stream and wait until it's executed - pipeline.push_back(*conv_p); } else { - auto src_memory_reorder_p = std::static_pointer_cast( + auto src_memory_reorder_p = std::static_pointer_cast( dev_ctx.GetBlob(src_reorder_key)); src_memory_p = std::static_pointer_cast(dev_ctx.GetBlob(src_key)); @@ -638,10 +661,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { user_src_memory_p = std::static_pointer_cast( dev_ctx.GetBlob(user_src_key)); user_src_memory_p->set_data_handle(to_void_cast(input_data)); + src_memory_reorder_p->execute(astream, *user_src_memory_p, + *src_memory_p); + astream.wait(); } else if (src_memory_p) { src_memory_p->set_data_handle(to_void_cast(input_data)); } - + auto weights_memory_p = std::static_pointer_cast( + dev_ctx.GetBlob(weights_key)); dst_memory_p = std::static_pointer_cast(dev_ctx.GetBlob(dst_key)); conv_pd = @@ -661,19 +688,31 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } platform::SetDstMemoryHandler(ctx, output, handler, dst_memory_p); - if (src_memory_reorder_p) { - pipeline.push_back(*src_memory_reorder_p); - } - - auto residual_reorder_p = std::static_pointer_cast( + auto residual_reorder_p = std::static_pointer_cast( dev_ctx.GetBlob(residual_reorder_key)); if (residual_reorder_p) { - pipeline.push_back(*residual_reorder_p); + auto user_residual_data_p = std::static_pointer_cast( + dev_ctx.GetBlob(user_residual_key)); + residual_reorder_p->execute(astream, *user_residual_data_p, + *dst_memory_p); + astream.wait(); + } + + auto bias_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(bias_key)); + + if (bias_memory_p) { + conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, + {MKLDNN_ARG_BIAS, *bias_memory_p}, + {MKLDNN_ARG_DST, *dst_memory_p}}); + } else { + conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, + {MKLDNN_ARG_DST, *dst_memory_p}}); } - pipeline.push_back(*conv_p); } - // push primitive to stream and wait until it's executed - stream(stream::kind::eager).submit(pipeline).wait(); + astream.wait(); if (need_s8_to_u8) { output->mutable_data(ctx.GetPlace()); } @@ -702,17 +741,17 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, "Wrong layout set for Input tensor"); - PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Input tensor"); PADDLE_ENFORCE_EQ(filter->layout(), DataLayout::kMKLDNN, "Wrong layout set for Filter tensor"); - PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Filter tensor"); PADDLE_ENFORCE_EQ(output_grad->layout(), DataLayout::kMKLDNN, "Wrong layout set for output_grad tensor"); - PADDLE_ENFORCE_NE(output_grad->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(output_grad->format(), MKLDNNMemoryFormat::undef, "Wrong format set for output_grad tensor"); PADDLE_ENFORCE_EQ( @@ -721,10 +760,17 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { if (!input_grad && !filter_grad) return; - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); + std::vector strides_temp = ctx.Attr>("strides"); + std::vector strides(begin(strides_temp), end(strides_temp)); + + std::vector paddings_temp = ctx.Attr>("paddings"); + std::vector paddings(begin(paddings_temp), end(paddings_temp)); + + std::vector dilations_temp = ctx.Attr>("dilations"); + std::vector dilations(begin(dilations_temp), end(dilations_temp)); + std::string padding_algorithm = ctx.Attr("padding_algorithm"); + int groups = ctx.Attr("groups"); bool is_conv3d = strides.size() == 3U; @@ -740,16 +786,18 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); - auto ksize = framework::vectorize(filter_data_dims); + auto ksize = framework::vectorize(filter_data_dims); UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, data_dims, strides, ksize); - auto src_tz = paddle::framework::vectorize(input->dims()); - auto weights_tz = paddle::framework::vectorize(filter->dims()); + auto src_tz = paddle::framework::vectorize(input->dims()); + auto weights_tz = paddle::framework::vectorize(filter->dims()); + int g = std::max(groups, 1); GetWeightsTz(weights_tz, g, is_conv3d); - auto dst_tz = paddle::framework::vectorize(output_grad->dims()); + auto dst_tz = paddle::framework::vectorize(output_grad->dims()); + auto src_format = input->format(); MKLDNNMemoryFormat weights_format = GetWeightsFormat(filter->format(), g, is_conv3d); @@ -803,7 +851,6 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { weights_tz, platform::MKLDNNGetDataType(), weights_format); auto diff_dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - // Retrieve conv_pd from device context auto conv_pd = std::static_pointer_cast( @@ -815,18 +862,18 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { // create backward convolution weights primitive descriptor auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc( - mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md, - strides, mkldnn_paddings[0], mkldnn_paddings[1], - mkldnn::padding_kind::zero); + mkldnn::algorithm::convolution_direct, src_md, diff_weights_md, + diff_dst_md, strides, mkldnn_paddings[0], mkldnn_paddings[1]); + auto conv_bwd_weights_pd = std::make_shared( conv_bwd_weights_desc, mkldnn_engine, *conv_pd); // create backward convolution data primitive descriptor auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc( - mkldnn::convolution_direct, diff_src_md, weights_md, diff_dst_md, - strides, mkldnn_paddings[0], mkldnn_paddings[1], - mkldnn::padding_kind::zero); + mkldnn::algorithm::convolution_direct, diff_src_md, weights_md, + diff_dst_md, strides, mkldnn_paddings[0], mkldnn_paddings[1]); + auto conv_bwd_data_pd = std::make_shared( conv_bwd_data_desc, mkldnn_engine, *conv_pd); @@ -842,8 +889,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { user_weights_md, to_void_cast(filter_data)); auto user_diff_dst_memory_p = handler.AcquireDiffDstMemory( user_diff_dst_md, to_void_cast(output_grad_data)); - - // create backward conv primitive for weights + mkldnn::stream astream(mkldnn_engine); if (filter_grad) { auto src_memory_p = handler.AcquireSrcMemoryFromWeightsPrimitive( user_src_memory_p, pipeline); @@ -859,16 +905,18 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { handler.AcquireDiffWeightsMemoryFromWeightsPrimitive( reinterpret_cast(filter_grad_data)); - auto conv_bwd_weights_p = handler.AcquireConvolutionBackwardWeights( - src_memory_p, diff_dst_memory_4filter_p, diff_weights_memory_p); + auto conv_bwd_weights_p = handler.AcquireConvolutionBackwardWeights(); - // push primitive to stream and wait until it's executed - pipeline.push_back(*conv_bwd_weights_p); + // TODO(grygielski) why no bias_diff? + conv_bwd_weights_p->execute( + astream, {{MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_4filter_p}, + {MKLDNN_ARG_DIFF_WEIGHTS, *diff_weights_memory_p}}); + astream.wait(); filter_grad->set_layout(DataLayout::kMKLDNN); filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p)); } - if (input_grad) { auto weights_memory_p = handler.AcquireWeightsMemoryFromDataPrimitive( user_weights_memory_p, pipeline); @@ -883,15 +931,17 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive( reinterpret_cast(input_grad_data)); - auto conv_bwd_data_p = handler.AcquireConvolutionBackwardData( - diff_dst_memory_4data_p, weights_memory_p, diff_src_memory_p); + auto conv_bwd_data_p = handler.AcquireConvolutionBackwardData(); - pipeline.push_back(*conv_bwd_data_p); + conv_bwd_data_p->execute(astream, + {{MKLDNN_ARG_WEIGHTS, *weights_memory_p}, + {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_4data_p}, + {MKLDNN_ARG_DIFF_SRC, *diff_src_memory_p}}); + astream.wait(); input_grad->set_layout(DataLayout::kMKLDNN); input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p)); } - stream(stream::kind::eager).submit(pipeline).wait(); } }; diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 958b8906415e752e69e09a438580ca53e210509f..d1ef629ee24b2d007a46c33b1c812e57d23907ca 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -48,12 +48,12 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, "Wrong layout set for Input tensor"); - PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Input tensor"); PADDLE_ENFORCE_EQ(filter->layout(), DataLayout::kMKLDNN, "Wrong layout set for Filter tensor"); - PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Filter tensor"); PADDLE_ENFORCE_EQ(input->dims().size(), 4, @@ -64,16 +64,22 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { if (bias) { PADDLE_ENFORCE_EQ(bias->layout(), DataLayout::kMKLDNN, "Wrong layout set for Bias tensor"); - PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Bias tensor"); PADDLE_ENFORCE_EQ(bias->dims().size(), 1, "Bias must only have 1 dimension, i.e. X"); } - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); + std::vector strides_temp = ctx.Attr>("strides"); + std::vector strides(begin(strides_temp), end(strides_temp)); + + std::vector paddings_temp = ctx.Attr>("paddings"); + std::vector paddings(begin(paddings_temp), end(paddings_temp)); + + std::vector dilations_temp = ctx.Attr>("dilations"); + std::vector dilations(begin(dilations_temp), end(dilations_temp)); + int groups = ctx.Attr("groups"); std::string padding_algorithm = ctx.Attr("padding_algorithm"); @@ -83,7 +89,7 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { auto filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); - auto ksize = framework::vectorize(filter_data_dims); + auto ksize = framework::vectorize(filter_data_dims); UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, data_dims, strides, ksize); @@ -95,8 +101,9 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { const T* input_data = input->data(); const T* filter_data = filter->data(); - auto src_tz = paddle::framework::vectorize(input->dims()); - auto iohw_weights_tz = paddle::framework::vectorize(filter->dims()); + auto src_tz = paddle::framework::vectorize(input->dims()); + auto iohw_weights_tz = + paddle::framework::vectorize(filter->dims()); auto weights_tz = iohw_weights_tz; // IOHW -> OIHW @@ -137,7 +144,7 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { weights_tz[3] = h; weights_tz[4] = w; } - auto dst_tz = paddle::framework::vectorize(output->dims()); + auto dst_tz = paddle::framework::vectorize(output->dims()); // Get unique name for storing MKLDNN primitives @@ -165,7 +172,7 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - std::vector bias_tz; + std::vector bias_tz; auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); @@ -177,7 +184,7 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training; if (bias) { - bias_tz = paddle::framework::vectorize(bias->dims()); + bias_tz = paddle::framework::vectorize(bias->dims()); auto bias_md = platform::MKLDNNMemDesc( bias_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor( @@ -203,15 +210,14 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive( user_weights_memory_p, pipeline, is_test); - std::shared_ptr dst_memory_p; - auto output_data = output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); - dst_memory_p = handler.AcquireDstMemoryFromPrimitive( + auto dst_memory_p = handler.AcquireDstMemoryFromPrimitive( platform::to_void_cast(output_data)); - // create convolution op primitive - std::shared_ptr conv_p; + auto conv_p = handler.AcquireConvolution(); + + mkldnn::stream astream(mkldnn_engine); if (bias) { const T* bias_data = bias->data(); auto user_bias_md = platform::MKLDNNMemDesc( @@ -221,16 +227,17 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { auto bias_memory_p = handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline); - conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, - bias_memory_p, dst_memory_p); + + conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, + {MKLDNN_ARG_BIAS, *bias_memory_p}, + {MKLDNN_ARG_DST, *dst_memory_p}}); } else { - conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, - dst_memory_p); + conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, + {MKLDNN_ARG_DST, *dst_memory_p}}); } - - // push primitive to stream and wait until it's executed - pipeline.push_back(*conv_p); - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + astream.wait(); output->set_layout(DataLayout::kMKLDNN); output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index 41a641659cf74c9e244b581f4f93b5e96592e214..86c1c3232644a1fed236563a65a16bc2f6466d49 100644 --- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -46,9 +46,8 @@ class DeQuantOpKernel : public framework::OpKernel { float* output_data = output->mutable_data(ctx.GetPlace()); std::vector reorder_scale = {1.0f / scale_data}; - std::vector pipeline; - auto src_tz = paddle::framework::vectorize(input->dims()); - auto dst_tz = paddle::framework::vectorize(output->dims()); + auto src_tz = paddle::framework::vectorize(input->dims()); + auto dst_tz = paddle::framework::vectorize(output->dims()); mkldnn::memory::data_type src_dt = paddle::framework::ToMKLDNNDataType(input->type()); MKLDNNMemoryFormat src_fmt = input->format(); @@ -69,23 +68,20 @@ class DeQuantOpKernel : public framework::OpKernel { attri.set_output_scales(mask, reorder_scale); auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt); - auto src_pd = mkldnn::memory::primitive_desc(src_md, engine); - src_memory = - std::make_shared(src_pd, to_void_cast(input_data)); - std::shared_ptr src_memory_p = - std::shared_ptr(new primitive::at(*src_memory)); - - auto dst_md = platform::MKLDNNMemDesc( - {dst_tz}, memory::data_type::f32, - platform::MKLDNNFormatForSize(dst_tz.size(), memory::format::nchw)); - auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine); + src_memory = std::make_shared( + src_md, engine, to_void_cast(input_data)); + + auto dst_md = + platform::MKLDNNMemDesc({dst_tz}, memory::data_type::f32, + platform::MKLDNNFormatForSize( + dst_tz.size(), MKLDNNMemoryFormat::nchw)); + dst_memory = std::make_shared( - dst_pd, to_void_cast(output_data)); + dst_md, engine, to_void_cast(output_data)); auto reorder_pd = std::shared_ptr( - new reorder::primitive_desc(src_pd, dst_pd, attri)); - reorder_p = std::shared_ptr( - new reorder(*reorder_pd, *src_memory_p, *dst_memory)); + new reorder::primitive_desc(*src_memory, *dst_memory, attri)); + reorder_p = std::shared_ptr(new reorder(*reorder_pd)); dev_ctx.SetBlob(key_prim, reorder_p); dev_ctx.SetBlob(key_src_mem, src_memory); dev_ctx.SetBlob(key_dst_mem, dst_memory); @@ -99,8 +95,9 @@ class DeQuantOpKernel : public framework::OpKernel { dst_memory->set_data_handle(output->mutable_data(ctx.GetPlace())); } - pipeline.push_back(*reorder_p); - stream(stream::kind::eager).submit(pipeline).wait(); + mkldnn::stream astream(engine); + reorder_p->execute(astream, *src_memory, *dst_memory); + astream.wait(); output->set_layout(DataLayout::kMKLDNN); output->set_format(GetMKLDNNFormat(*dst_memory)); diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index dbf954213cb45b4a0b99bb6a55a515a2bc7b566c..dfe9639b6cc1334d185cbc91088d63a6a75e9da9 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -42,16 +42,16 @@ class FCPrimitiveFactory { public: explicit FCPrimitiveFactory(const mkldnn::engine& engine) : engine_(engine) {} - inner_product_forward CreateFcPrimitive(const LoDTensor* input, - const Tensor* weights, - const Tensor* bias, LoDTensor* output, - const ExecutionContext& ctx) { + void ExecuteFcPrimitive(const LoDTensor* input, const Tensor* weights, + const Tensor* bias, LoDTensor* output, + const ExecutionContext& ctx) { RecomputeOutputDims(ctx, input, weights, output); // If primitive has already been created and cached, don't create new one, // but update input and output data pointers and return it. if (fc_) { UpdateDataPointers(ctx, output, input); - return *fc_; + this->Execute(); + return; } auto src_desc = CreateMemDescriptor(input, input->format()); input_ = CreateMemory(src_desc, input); @@ -72,7 +72,22 @@ class FCPrimitiveFactory { auto dst_desc = CreateMemDescriptor(output, MKLDNNMemoryFormat::any); fc_ = CreateFcPrimitive(*input_, *weights_, dst_desc, bias, output, ctx); - return *fc_; + this->Execute(); + } + + void Execute() { + mkldnn::stream astream(engine_); + if (bias_) { + fc_->execute(astream, {{MKLDNN_ARG_SRC, *input_}, + {MKLDNN_ARG_WEIGHTS, *weights_}, + {MKLDNN_ARG_BIAS, *bias_}, + {MKLDNN_ARG_DST, *output_}}); + } else { + fc_->execute(astream, {{MKLDNN_ARG_SRC, *input_}, + {MKLDNN_ARG_WEIGHTS, *weights_}, + {MKLDNN_ARG_DST, *output_}}); + } + astream.wait(); } private: @@ -83,7 +98,7 @@ class FCPrimitiveFactory { // If the primitive exists, but the output tensor has changed its // variable, update its format to what has been determined in first // call to CreateFcPrimitive method. - if (out->format() == MKLDNNMemoryFormat::format_undef) { + if (out->format() == MKLDNNMemoryFormat::undef) { auto output_format = platform::GetMKLDNNFormat(*output_); out->set_format((MKLDNNMemoryFormat)output_format); } @@ -94,36 +109,37 @@ class FCPrimitiveFactory { using format = MKLDNNMemoryFormat; switch (fmt) { case format::nChw16c: - return format::oIhw16i; + return format::aBcd16b; case format::nChw8c: - return format::oIhw8i; + return format::aBcd8b; case format::nchw: return format::oihw; case format::nhwc: return format::hwio; default: - return format::format_undef; + return format::undef; } } // Convert data from one data format to another mkldnn::memory Reorder(const memory::desc& src_desc, - const memory::desc& dst_desc, const void* src_data) { - auto src_mem = memory({src_desc, engine_}, const_cast(src_data)); - auto dst_mem = memory({dst_desc, engine_}); + const memory::desc& dst_desc, void* src_data) { + auto src_mem = memory(src_desc, engine_, src_data); + auto dst_mem = memory(dst_desc, engine_); auto reorder = mkldnn::reorder(src_mem, dst_mem); - stream(stream::kind::eager).submit({reorder}).wait(); + mkldnn::stream astream(engine_); + reorder.execute(astream, src_mem, dst_mem); + astream.wait(); return dst_mem; } // Convert data from one data format to another and rescale it. // If the desired data type is (un)signed int8, quantization occurs here. - mkldnn::memory Reorder(const memory& src_mem, - const memory::primitive_desc& dst_pd, + mkldnn::memory Reorder(const memory& src_mem, const memory::desc& dst_md, const std::vector& scale_data) { - mkldnn::memory dst_mem = mkldnn::memory(dst_pd); + mkldnn::memory dst_mem = mkldnn::memory(dst_md, engine_); mkldnn::primitive_attr attributes; // According to MKL-DNN's documentation mask determines along which // dimensions should the scale be applied. @@ -133,19 +149,19 @@ class FCPrimitiveFactory { // becuase we perform per-output-channel quantization int mask = CreateMask(0, scale_data.size() > 1); attributes.set_output_scales(mask, scale_data); - auto reorder = - mkldnn::reorder(mkldnn::reorder::primitive_desc( - src_mem.get_primitive_desc(), dst_pd, attributes), - src_mem, dst_mem); + auto reorder = mkldnn::reorder({src_mem, dst_mem, attributes}); - stream(stream::kind::eager).submit({reorder}).wait(); + mkldnn::stream astream(engine_); + reorder.execute(astream, + {{MKLDNN_ARG_FROM, src_mem}, {MKLDNN_ARG_TO, dst_mem}}); + astream.wait(); return dst_mem; } template - static mkldnn::memory::desc CreateMemDescriptor(const std::vector& dims, - MKLDNNMemoryFormat format) { + static mkldnn::memory::desc CreateMemDescriptor( + const std::vector& dims, MKLDNNMemoryFormat format) { return platform::MKLDNNMemDesc(dims, platform::MKLDNNGetDataType(), format); } @@ -153,28 +169,28 @@ class FCPrimitiveFactory { template static mkldnn::memory::desc CreateMemDescriptor(const Tensor* tensor, MKLDNNMemoryFormat format) { - auto dims = framework::vectorize(tensor->dims()); + auto dims = framework::vectorize(tensor->dims()); return CreateMemDescriptor(dims, format); } template mkldnn::memory CreateMemory(const mkldnn::memory::desc& desc, const Tensor* tensor) { - return CreateMemory(desc, tensor->data()); + return CreateMemory(desc, platform::to_void_cast(tensor->data())); } - mkldnn::memory CreateMemory(const mkldnn::memory::desc& desc, - const void* data) { - return memory({desc, engine_}, const_cast(data)); + mkldnn::memory CreateMemory(const mkldnn::memory::desc& desc, void* data) { + return memory(desc, engine_, data); } // Transpose weights through MKL-DNN's reorder from io to oi format. mkldnn::memory TransposeWeights(const Tensor* weights) { - auto dims = framework::vectorize(weights->dims()); + auto dims = framework::vectorize(weights->dims()); std::swap(dims[0], dims[1]); // Correct output dimensions auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::io); auto dst_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oi); - return Reorder(src_desc, dst_desc, weights->data()); + return Reorder(src_desc, dst_desc, + platform::to_void_cast(weights->data())); } // Compute the bias scales so that its values correspond to the @@ -232,17 +248,17 @@ class FCPrimitiveFactory { } void QuantizeWeights(const ExecutionContext& ctx) { - auto quantized_desc = weights_->get_primitive_desc().desc(); + auto quantized_desc = weights_->get_desc(); quantized_desc.data.data_type = (mkldnn_data_type_t)platform::MKLDNNGetDataType(); - weights_ = Reorder(*weights_, {quantized_desc, engine_}, + weights_ = Reorder(*weights_, quantized_desc, ctx.Attr>("Scale_weights")); } void QuantizeBias(const inner_product_forward::primitive_desc& fc_prim_desc, const ExecutionContext& ctx) { auto bias_scales = ComputeBiasScales(ctx); - bias_ = Reorder(*bias_, fc_prim_desc.bias_primitive_desc(), bias_scales); + bias_ = Reorder(*bias_, fc_prim_desc.bias_desc(), bias_scales); } // Fuse relu into FC with activation type attribute has been set to 'relu' @@ -273,8 +289,8 @@ class FCPrimitiveFactory { const ExecutionContext& ctx) { // Acquire descriptors needed for creation of inner_product primitive // descriptor - const auto weights_desc = weights_memory.get_primitive_desc().desc(); - const auto src_desc = src_memory.get_primitive_desc().desc(); + const auto weights_desc = weights_memory.get_desc(); + const auto src_desc = src_memory.get_desc(); // Based on provided attributes, create attributes used by MKL-DNN to // enable fused post-op activations such as 'relu' const auto attrs = CreatePostOps(ctx); @@ -294,15 +310,12 @@ class FCPrimitiveFactory { output_ = CreateDstMemory(fc_prim_desc, ctx, output); // Return MKL-DNN primitive ready to be fed into pipeline and executed - return inner_product_forward(fc_prim_desc, src_memory, weights_memory, - *bias_, *output_); + return inner_product_forward(fc_prim_desc); } else { auto fc_prim_desc = CreateFcPrimDesc(src_desc, weights_desc, dst_desc, attrs); output_ = CreateDstMemory(fc_prim_desc, ctx, output); - - return inner_product_forward(fc_prim_desc, src_memory, weights_memory, - *output_); + return inner_product_forward(fc_prim_desc); } } @@ -345,8 +358,8 @@ class FCPrimitiveFactory { // perform a converion. mkldnn::memory CreateFourDimWeightsMemory(const Tensor* input, const Tensor* weights) { - auto input_dims = framework::vectorize(input->dims()); - auto weight_dims = framework::vectorize(weights->dims()); + auto input_dims = framework::vectorize(input->dims()); + auto weight_dims = framework::vectorize(weights->dims()); auto dims = {weight_dims[1], input_dims[1], input_dims[2], input_dims[3]}; auto dst_format = MatchWeightFormat(input->format()); @@ -361,11 +374,11 @@ class FCPrimitiveFactory { mkldnn::memory CreateDstMemory( const mkldnn::inner_product_forward::primitive_desc& fc_prim_desc, const ExecutionContext& ctx, Tensor* output) { - auto dst_prim_desc = fc_prim_desc.dst_primitive_desc(); - auto buffer_size = dst_prim_desc.get_size(); + auto dst_desc = fc_prim_desc.dst_desc(); + auto buffer_size = dst_desc.get_size(); T_out* output_data = output->mutable_data(ctx.GetPlace(), buffer_size); - memory dst_mem(dst_prim_desc, to_void_cast(output_data)); + memory dst_mem(dst_desc, engine_, to_void_cast(output_data)); output->set_format(platform::GetMKLDNNFormat(dst_mem)); return dst_mem; } @@ -421,25 +434,24 @@ GetPrimitiveFactory(const MKLDNNDeviceContext& dev_ctx, // Choose appropriate primitive factory implementation based on inferred // output type (uint8, int8 or float). template -static inner_product_forward GetFcPrimitive( - const MKLDNNDeviceContext& dev_ctx, const ExecutionContext& ctx, - const LoDTensor* input, const Tensor* w, const Tensor* bias, - LoDTensor* output, const mkldnn::engine& mkldnn_engine, bool fuse_relu, - bool force_fp32_output) { +static void ExecuteFc(const MKLDNNDeviceContext& dev_ctx, + const ExecutionContext& ctx, const LoDTensor* input, + const Tensor* w, const Tensor* bias, LoDTensor* output, + const mkldnn::engine& mkldnn_engine, bool fuse_relu, + bool force_fp32_output) { constexpr bool is_int8 = std::is_same::value || std::is_same::value; if (!is_int8 || force_fp32_output) { - return GetPrimitiveFactory(dev_ctx, ctx, input, w, - mkldnn_engine) - ->CreateFcPrimitive(input, w, bias, output, ctx); + GetPrimitiveFactory(dev_ctx, ctx, input, w, mkldnn_engine) + ->ExecuteFcPrimitive(input, w, bias, output, ctx); } else if (fuse_relu) { - return GetPrimitiveFactory(dev_ctx, ctx, input, w, - mkldnn_engine) - ->CreateFcPrimitive(input, w, bias, output, ctx); + GetPrimitiveFactory(dev_ctx, ctx, input, w, + mkldnn_engine) + ->ExecuteFcPrimitive(input, w, bias, output, ctx); } else { - return GetPrimitiveFactory(dev_ctx, ctx, input, w, - mkldnn_engine) - ->CreateFcPrimitive(input, w, bias, output, ctx); + GetPrimitiveFactory(dev_ctx, ctx, input, w, + mkldnn_engine) + ->ExecuteFcPrimitive(input, w, bias, output, ctx); } } @@ -461,10 +473,8 @@ class FCMKLDNNOpKernel : public framework::OpKernel { bool fuse_relu = ctx.Attr("activation_type") == "relu"; bool force_fp32_output = ctx.Attr("force_fp32_output"); - auto fc = - GetFcPrimitive(dev_ctx, ctx, input, w, bias, output, - mkldnn_engine, fuse_relu, force_fp32_output); - stream(stream::kind::eager).submit({fc}).wait(); + ExecuteFc(dev_ctx, ctx, input, w, bias, output, mkldnn_engine, + fuse_relu, force_fp32_output); output->set_layout(DataLayout::kMKLDNN); } diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc index d992765ce91b128984a5544d61f5b600ae38ef69..7f5337a14a245441d083d842f460d7719b79ec8f 100644 --- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc @@ -41,7 +41,7 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel { } tensor->set_layout(DataLayout::kMKLDNN); - tensor->set_format(mkldnn::memory::format::oihw); + tensor->set_format(mkldnn::memory::format_tag::oihw); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index c1e215ad7401251a239ddc49f439eb103607b688..5b025fa11e3f306597fc0888dd3b7ff798606b41 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -49,7 +49,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { const float k = ctx.Attr("k"); bool is_test = ctx.Attr("is_test"); - auto dims = paddle::framework::vectorize(x->dims()); + auto dims = paddle::framework::vectorize(x->dims()); platform::LRNMKLDNNHandler handler(dims, n, alpha, beta, k, x->format(), is_test, dev_ctx, ctx.GetPlace(), @@ -58,14 +58,17 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto src_memory = handler.AcquireSrcMemory(x); auto dst_memory = handler.AcquireDstMemory(out); - std::shared_ptr workspace_memory; - std::shared_ptr lrn_p; - if (is_test == false) { - workspace_memory = handler.AcquireWorkspaceMemory(mid); - mid->set_layout(framework::DataLayout::kMKLDNN); + auto lrn_p = handler.AcquireForwardPrimitive(); + + auto workspace_memory = handler.AcquireWorkspaceMemory(mid); + mid->set_layout(framework::DataLayout::kMKLDNN); + + mkldnn::stream astream(dev_ctx.GetEngine()); + if (!workspace_memory->get_desc().is_zero()) { mid->set_format(platform::GetMKLDNNFormat(*workspace_memory)); - lrn_p = handler.AcquireForwardPrimitive(*src_memory, *workspace_memory, - *dst_memory); + lrn_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory}, + {MKLDNN_ARG_DST, *dst_memory}, + {MKLDNN_ARG_WORKSPACE, *workspace_memory}}); } else { // mid has to be allocated and filled // k to pass LRN unit tests @@ -73,11 +76,12 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { mid->mutable_data(ctx.GetPlace()); auto e_mid = framework::EigenTensor::From(*mid); e_mid = e_mid.constant(k); - lrn_p = handler.AcquireForwardPrimitive(*src_memory, *dst_memory); - } + mid->set_format(platform::GetMKLDNNFormat(*dst_memory)); - std::vector pipeline = {*lrn_p}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + lrn_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory}, + {MKLDNN_ARG_DST, *dst_memory}}); + } + astream.wait(); out->set_layout(framework::DataLayout::kMKLDNN); out->set_format(platform::GetMKLDNNFormat(*dst_memory)); @@ -109,7 +113,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto& dev_ctx = ctx.template device_context(); - auto dims = paddle::framework::vectorize(x->dims()); + auto dims = paddle::framework::vectorize(x->dims()); platform::LRNMKLDNNHandler handler(dims, n, alpha, beta, k, x->format(), out_grad->format(), dev_ctx, @@ -120,11 +124,14 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto diff_dst_memory = handler.AcquireDiffDstMemory(out_grad); auto diff_src_memory = handler.AcquireDiffSrcMemory(x_grad); - auto lrn_bwd = handler.AcquireBackwardPrimitive( - *src_memory, *diff_dst_memory, *workspace, *diff_src_memory); + auto lrn_bwd = handler.AcquireBackwardPrimitive(); - std::vector pipeline = {*lrn_bwd}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + mkldnn::stream astream(dev_ctx.GetEngine()); + lrn_bwd->execute(astream, {{MKLDNN_ARG_SRC, *src_memory}, + {MKLDNN_ARG_DIFF_DST, *diff_dst_memory}, + {MKLDNN_ARG_DIFF_SRC, *diff_src_memory}, + {MKLDNN_ARG_WORKSPACE, *workspace}}); + astream.wait(); x_grad->set_layout(framework::DataLayout::kMKLDNN); x_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory)); diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index b9547e24a1672d1ba33977696a439d5879f8e56d..96f4fe9e6d034af6ed0ef147ece9408c7ba49b8f 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -59,6 +59,7 @@ class MulPrimitiveFactory { if (mul_) { UpdateDataPointers(ctx, output, &x_matrix); + Execute(); return *mul_; } @@ -68,9 +69,18 @@ class MulPrimitiveFactory { auto dst_desc = CreateMemDescriptor(output, MKLDNNMemoryFormat::any); mul_ = CreateMulPrimitive(*x_input_, *y_input_, dst_desc, output, ctx); + Execute(); return *mul_; } + void Execute() { + mkldnn::stream astream(engine_); + (*mul_).execute(astream, {{MKLDNN_ARG_SRC, *x_input_}, + {MKLDNN_ARG_WEIGHTS, *y_input_}, + {MKLDNN_ARG_DST, *output_}}); + astream.wait(); + } + protected: template Tensor UpdateDataFormat(const Tensor *data, int num_col_dims, @@ -92,7 +102,7 @@ class MulPrimitiveFactory { to_void_cast(x_tmp.data())); x_tmp.Resize(data->dims()); - x_tmp.set_format((MKLDNNMemoryFormat)dst_mdesc.data.format); + x_tmp.set_format(platform::GetMKLDNNFormat(dst_mdesc)); data_matrix = framework::ReshapeToMatrix(x_tmp, num_col_dims); } else { data_matrix = framework::ReshapeToMatrix(*data, num_col_dims); @@ -106,7 +116,7 @@ class MulPrimitiveFactory { x_input_->set_data_handle(to_void_cast(in->data())); output_->set_data_handle(out->mutable_data(ctx.GetPlace())); - if (out->format() == MKLDNNMemoryFormat::format_undef) { + if (out->format() == MKLDNNMemoryFormat::undef) { auto output_format = platform::GetMKLDNNFormat(*output_); out->set_format((MKLDNNMemoryFormat)output_format); } @@ -116,48 +126,50 @@ class MulPrimitiveFactory { memory::desc CreateMemDescriptor( const Tensor *tensor, MKLDNNMemoryFormat format, memory::data_type type = platform::MKLDNNGetDataType()) { - auto dims = framework::vectorize(tensor->dims()); + auto dims = framework::vectorize(tensor->dims()); return platform::MKLDNNMemDesc(dims, type, format); } template memory::desc CreateMemDescriptor( - const std::vector &dims, MKLDNNMemoryFormat format, + const std::vector &dims, MKLDNNMemoryFormat format, memory::data_type type = platform::MKLDNNGetDataType()) { return platform::MKLDNNMemDesc(dims, type, format); } template memory CreateMemory(const memory::desc &desc, const Tensor *tensor) { - return memory({desc, engine_}, to_void_cast(tensor->data())); + return memory(desc, engine_, to_void_cast(tensor->data())); } memory CreateDstMemory( const inner_product_forward::primitive_desc &mul_prim_desc, const ExecutionContext &ctx, Tensor *output) { - auto dst_prim_desc = mul_prim_desc.dst_primitive_desc(); - auto buffer_size = dst_prim_desc.get_size(); + auto dst_desc = mul_prim_desc.dst_desc(); + auto buffer_size = dst_desc.get_size(); OT *output_data = output->mutable_data(ctx.GetPlace(), buffer_size); - memory dst_mem(dst_prim_desc, to_void_cast(output_data)); - output->set_format(platform::GetMKLDNNFormat(dst_mem)); - return dst_mem; + output->set_format(paddle::platform::GetMKLDNNFormat(dst_desc)); + return memory(dst_desc, engine_, to_void_cast(output_data)); } memory Reorder(const memory::desc &src_desc, const memory::desc &dst_desc, void *src_data, void *dst_data = NULL) { - auto src_mem = memory({src_desc, engine_}, src_data); - auto dst_mem = dst_data ? memory({dst_desc, engine_}, dst_data) - : memory({dst_desc, engine_}); + auto src_mem = memory(src_desc, engine_, src_data); + auto dst_mem = dst_data ? memory(dst_desc, engine_, dst_data) + : memory(dst_desc, engine_); auto reorder = mkldnn::reorder(src_mem, dst_mem); - stream(stream::kind::eager).submit({reorder}).wait(); + + mkldnn::stream astream(engine_); + reorder.execute(astream, src_mem, dst_mem); + astream.wait(); return dst_mem; } memory TransposeInputY(const Tensor *input_y) { - auto dims = framework::vectorize(input_y->dims()); + auto dims = framework::vectorize(input_y->dims()); std::swap(dims[0], dims[1]); // Correct output dimensions auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::io); auto dst_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oi); @@ -169,13 +181,13 @@ class MulPrimitiveFactory { const memory::desc &dst_desc, Tensor *output, const ExecutionContext &ctx) { - const auto y_desc = y_memory.get_primitive_desc().desc(); - const auto x_desc = x_memory.get_primitive_desc().desc(); + const auto y_desc = y_memory.get_desc(); + const auto x_desc = x_memory.get_desc(); auto mul_prim_desc = CreateMulPrimDesc(x_desc, y_desc, dst_desc); output_ = CreateDstMemory(mul_prim_desc, ctx, output); - return inner_product_forward(mul_prim_desc, x_memory, y_memory, *output_); + return inner_product_forward(mul_prim_desc); } inner_product_forward::primitive_desc CreateMulPrimDesc( @@ -228,6 +240,7 @@ class QuantMulPrimitiveFactory : public MulPrimitiveFactory { if (this->mul_) { this->UpdateDataPointers(ctx, output, &x_matrix); + this->Execute(); return *(this->mul_); } @@ -243,6 +256,7 @@ class QuantMulPrimitiveFactory : public MulPrimitiveFactory { this->mul_ = CreateMulPrimitive(*(this->x_input_), *(this->y_input_), dst_desc, output, ctx); + this->Execute(); return *(this->mul_); } @@ -253,22 +267,24 @@ class QuantMulPrimitiveFactory : public MulPrimitiveFactory { mkldnn::primitive_attr attr; attr.set_output_scales(mask, scale); - auto src_mem = memory({src_desc, this->engine_}, src_data); - auto dst_mem = memory({dst_desc, this->engine_}); + auto src_mem = memory(src_desc, this->engine_, src_data); + auto dst_mem = memory(dst_desc, this->engine_); + + auto reorder_pd = mkldnn::reorder::primitive_desc(src_mem, dst_mem, attr); - auto reorder_pd = mkldnn::reorder::primitive_desc( - src_mem.get_primitive_desc(), dst_mem.get_primitive_desc(), attr); + auto reorder = mkldnn::reorder(reorder_pd); - auto reorder = mkldnn::reorder(reorder_pd, src_mem, dst_mem); - stream(stream::kind::eager).submit({reorder}).wait(); + mkldnn::stream astream(this->engine_); + reorder.execute(astream, src_mem, dst_mem); + astream.wait(); return dst_mem; } memory QuantInputY(memory input_y, const std::vector &scale_y) { - const auto &dims = input_y.get_primitive_desc().desc().data.dims; - auto ndims = input_y.get_primitive_desc().desc().data.ndims; - auto y_dims = std::vector(dims, dims + ndims); + const auto &dims = input_y.get_desc().data.dims; + auto ndims = input_y.get_desc().data.ndims; + auto y_dims = std::vector(dims, dims + ndims); auto user_y_desc = this->template CreateMemDescriptor(y_dims, MKLDNNMemoryFormat::oi); @@ -309,8 +325,8 @@ class QuantMulPrimitiveFactory : public MulPrimitiveFactory { const memory::desc &dst_desc, Tensor *output, const ExecutionContext &ctx) { - const auto x_desc = x_memory.get_primitive_desc().desc(); - const auto y_desc = y_memory.get_primitive_desc().desc(); + const auto x_desc = x_memory.get_desc(); + const auto y_desc = y_memory.get_desc(); bool force_fp32_output = ctx.Attr("force_fp32_output"); mkldnn::primitive_attr mul_attr = CreateMulAttr(ctx, force_fp32_output); @@ -318,8 +334,7 @@ class QuantMulPrimitiveFactory : public MulPrimitiveFactory { this->output_ = this->CreateDstMemory(mul_prim_desc, ctx, output); - return inner_product_forward(mul_prim_desc, x_memory, y_memory, - *(this->output_)); + return inner_product_forward(mul_prim_desc); } inner_product_forward::primitive_desc CreateMulPrimDesc( @@ -340,9 +355,8 @@ std::shared_ptr> GetPrimitiveFactory( const Tensor *input_x, const Tensor *input_y, const mkldnn::engine &mkldnn_engine, bool enable_quant) { const std::string key = platform::CreateKey( - input_x->type(), framework::vectorize(input_x->dims()), - input_y->type(), framework::vectorize(input_y->dims()), - ctx.OutputName("Out")); + input_x->type(), framework::vectorize(input_x->dims()), input_y->type(), + framework::vectorize(input_y->dims()), ctx.OutputName("Out")); auto prim_creator = std::static_pointer_cast>( dev_ctx.GetBlob(key)); @@ -399,14 +413,12 @@ class MulMKLDNNKernel : public framework::OpKernel { auto mul = GetMulPrimitive(dev_ctx, ctx, x, y, out, mkldnn_engine); - stream(stream::kind::eager).submit({mul}).wait(); - if (out_dims.size() != 2) { out->Resize(out_dims); } out->set_layout(DataLayout::kMKLDNN); - out->set_format(platform::MKLDNNFormatForSize( - out_dims.size(), mkldnn::memory::format::nchw)); + out->set_format(platform::MKLDNNFormatForSize(out_dims.size(), + MKLDNNMemoryFormat::nchw)); } }; diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index 866a4319dcffd57fb7bba2f803788fd134317ef3..4164e067e5dfffbd3e4166ac642d8ae1e20fb186 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -43,13 +43,20 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, "Wrong layout set for Input tensor"); - PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Input tensor"); std::string pooling_type = ctx.Attr("pooling_type"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); + + std::vector ksize_temp = ctx.Attr>("ksize"); + std::vector ksize(begin(ksize_temp), end(ksize_temp)); + + std::vector strides_temp = ctx.Attr>("strides"); + std::vector strides(begin(strides_temp), end(strides_temp)); + + std::vector paddings_temp = ctx.Attr>("paddings"); + std::vector paddings(begin(paddings_temp), end(paddings_temp)); + bool global_pooling = ctx.Attr("global_pooling"); std::string padding_algorithm = ctx.Attr("padding_algorithm"); @@ -71,8 +78,8 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, data_dims, strides, ksize); - auto src_tz = paddle::framework::vectorize(input->dims()); - auto dst_tz = paddle::framework::vectorize(output->dims()); + auto src_tz = paddle::framework::vectorize(input->dims()); + auto dst_tz = paddle::framework::vectorize(output->dims()); auto is_test = ctx.Attr("is_test"); @@ -85,22 +92,21 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { auto src_memory = handler.AcquireSrcMemory(input); auto dst_memory = handler.AcquireDstMemory(output); - std::shared_ptr pool_p; - std::shared_ptr workspace_memory; + auto pool_p = handler.AcquireForwardPrimitive(); + mkldnn::stream astream(dev_ctx.GetEngine()); if ((is_test == false) && (pooling_type == "max")) { // Training - workspace_memory = handler.AcquireWorkspaceMemory(); - pool_p = handler.AcquireForwardPrimitive(*src_memory, *dst_memory, - *workspace_memory); + auto workspace_memory = handler.AcquireWorkspaceMemory(); + pool_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory}, + {MKLDNN_ARG_DST, *dst_memory}, + {MKLDNN_ARG_WORKSPACE, *workspace_memory}}); } else { // Inference - pool_p = handler.AcquireForwardPrimitive(*src_memory, *dst_memory); + pool_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory}, + {MKLDNN_ARG_DST, *dst_memory}}); } - - // push primitive to stream and wait until it's executed - std::vector pipeline{*pool_p}; - stream(stream::kind::eager).submit(pipeline).wait(); + astream.wait(); output->set_layout(DataLayout::kMKLDNN); output->set_format(platform::GetMKLDNNFormat(*dst_memory)); @@ -120,12 +126,12 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ(in_x->layout(), DataLayout::kMKLDNN, "Wrong layout set for Input tensor"); - PADDLE_ENFORCE_NE(in_x->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(in_x->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Input tensor"); PADDLE_ENFORCE_EQ(out_grad->layout(), DataLayout::kMKLDNN, "Wrong layout set for Input output_grad tensor"); - PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef, "Wrong format set for Input output_grad tensor"); PADDLE_ENFORCE_EQ( @@ -133,9 +139,16 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { "is_test attribute should be set to False in training phase."); std::string pooling_type = ctx.Attr("pooling_type"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); + + std::vector ksize_temp = ctx.Attr>("ksize"); + std::vector ksize(begin(ksize_temp), end(ksize_temp)); + + std::vector strides_temp = ctx.Attr>("strides"); + std::vector strides(begin(strides_temp), end(strides_temp)); + + std::vector paddings_temp = ctx.Attr>("paddings"); + std::vector paddings(begin(paddings_temp), end(paddings_temp)); + bool global_pooling = ctx.Attr("global_pooling"); std::string padding_algorithm = ctx.Attr("padding_algorithm"); @@ -155,8 +168,8 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::vector pipeline; - auto diff_src_tz = paddle::framework::vectorize(in_x_grad->dims()); - auto diff_dst_tz = paddle::framework::vectorize(out_grad->dims()); + auto diff_src_tz = paddle::framework::vectorize(in_x_grad->dims()); + auto diff_dst_tz = paddle::framework::vectorize(out_grad->dims()); // Get an unique name from "argument" name of "Out" variable // This name will be used as key when referring info from device context @@ -173,22 +186,21 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto diff_dst_memory = handler.AcquireDiffDstMemory(out_grad); auto diff_src_memory = handler.AcquireDiffSrcMemory(in_x_grad); - std::shared_ptr pool_bwd_p; - std::shared_ptr workspace_memory; + auto pool_bwd_p = handler.AcquireBackwardPrimitive(); + mkldnn::stream astream(dev_ctx.GetEngine()); if (pooling_type == "max") { // Max - pooling needs Workspace - workspace_memory = handler.AcquireWorkspaceMemory(); - pool_bwd_p = handler.AcquireBackwardPrimitive( - *diff_dst_memory, *workspace_memory, *diff_src_memory); + auto workspace_memory = handler.AcquireWorkspaceMemory(); + pool_bwd_p->execute(astream, {{MKLDNN_ARG_DIFF_SRC, *diff_src_memory}, + {MKLDNN_ARG_DIFF_DST, *diff_dst_memory}, + {MKLDNN_ARG_WORKSPACE, *workspace_memory}}); } else { // Average Pooling - pool_bwd_p = - handler.AcquireBackwardPrimitive(*diff_dst_memory, *diff_src_memory); + pool_bwd_p->execute(astream, {{MKLDNN_ARG_DIFF_SRC, *diff_src_memory}, + {MKLDNN_ARG_DIFF_DST, *diff_dst_memory}}); } - - pipeline.push_back(*pool_bwd_p); - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + astream.wait(); in_x_grad->set_layout(DataLayout::kMKLDNN); in_x_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory)); diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index f4c68181b6c3bec014e9889ac64b4b1f58d49bea..be5c639829b05b2640c4d9a6612de403e40bd69a 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -42,8 +42,8 @@ class QuantOpKernel : public framework::OpKernel { const auto& engine = dev_ctx.GetEngine(); std::vector pipeline; - auto src_tz = paddle::framework::vectorize(input->dims()); - auto dst_tz = paddle::framework::vectorize(output->dims()); + auto src_tz = paddle::framework::vectorize(input->dims()); + auto dst_tz = paddle::framework::vectorize(output->dims()); const T* input_data = input->data(); @@ -66,24 +66,20 @@ class QuantOpKernel : public framework::OpKernel { auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32, input->format()); - auto src_pd = mkldnn::memory::primitive_desc(src_md, engine); - src_memory = - std::make_shared(src_pd, to_void_cast(input_data)); - std::shared_ptr src_memory_p = - std::shared_ptr(new primitive::at(*src_memory)); + src_memory = std::make_shared( + src_md, engine, to_void_cast(input_data)); - std::shared_ptr dst_pd; + std::shared_ptr dst_md; if (is_negative) { platform::SetDstMemoryQuantized(ctx, output, dst_tz, engine, - dst_pd, dst_memory); + dst_md, dst_memory); } else { platform::SetDstMemoryQuantized(ctx, output, dst_tz, engine, - dst_pd, dst_memory); + dst_md, dst_memory); } auto reorder_pd = std::shared_ptr( - new reorder::primitive_desc(src_pd, *dst_pd, attri)); - reorder_p = std::shared_ptr( - new reorder(*reorder_pd, *src_memory_p, *dst_memory)); + new reorder::primitive_desc(*src_memory, *dst_memory, attri)); + reorder_p = std::shared_ptr(new reorder(*reorder_pd)); dev_ctx.SetBlob(key_prim, reorder_p); dev_ctx.SetBlob(key_src_mem, src_memory); @@ -103,8 +99,10 @@ class QuantOpKernel : public framework::OpKernel { } } - pipeline.push_back(*reorder_p); - stream(stream::kind::eager).submit(pipeline).wait(); + mkldnn::stream astream(engine); + reorder_p->execute(astream, *src_memory, *dst_memory); + astream.wait(); + output->set_layout(DataLayout::kMKLDNN); output->set_format(GetMKLDNNFormat(*dst_memory)); } diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc index a5e1e5041fb1e4503cd1aa36b8785be3148a7e75..21a49a153d46e1a14bd2ba24c43a7228be8d118d 100644 --- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -43,8 +43,8 @@ class ReQuantOpKernel : public framework::OpKernel { const auto& engine = dev_ctx.GetEngine(); std::vector pipeline; - auto src_tz = paddle::framework::vectorize(input->dims()); - auto dst_tz = paddle::framework::vectorize(output->dims()); + auto src_tz = paddle::framework::vectorize(input->dims()); + auto dst_tz = paddle::framework::vectorize(output->dims()); mkldnn::memory::data_type src_dt = paddle::framework::ToMKLDNNDataType(input->type()); mkldnn::memory::data_type dst_dt = src_dt; @@ -60,23 +60,21 @@ class ReQuantOpKernel : public framework::OpKernel { attri.set_output_scales(mask, {scale_shift}); auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt); - auto src_pd = mkldnn::memory::primitive_desc(src_md, engine); - auto src_memory = - std::make_shared(src_pd, to_void_cast(input_data)); - std::shared_ptr src_memory_p = - std::shared_ptr(new primitive::at(*src_memory)); + auto src_memory = std::make_shared( + src_md, engine, to_void_cast(input_data)); auto dst_md = platform::MKLDNNMemDesc({dst_tz}, dst_dt, dst_fmt); - auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine); - auto dst_memory = mkldnn::memory(dst_pd, to_void_cast(output_data)); + auto dst_memory = + mkldnn::memory(dst_md, engine, to_void_cast(output_data)); auto reorder_pd = std::shared_ptr( - new reorder::primitive_desc(src_pd, dst_pd, attri)); + new reorder::primitive_desc(*src_memory, dst_memory, attri)); - auto reorder_p = std::shared_ptr( - new reorder(*reorder_pd, *src_memory_p, dst_memory)); - pipeline.push_back(*reorder_p); - stream(stream::kind::eager).submit(pipeline).wait(); + auto reorder_p = std::shared_ptr(new reorder(*reorder_pd)); + + mkldnn::stream astream(engine); + reorder_p->execute(astream, *src_memory, dst_memory); + astream.wait(); output->set_layout(DataLayout::kMKLDNN); output->set_format(GetMKLDNNFormat(dst_memory)); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 08ead5f0c615fdaacdee52313dd5565f000c19dd..cf0839aa02e7f94259e84326d0a0bdd8a9f966fc 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -38,7 +38,7 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandlerT { public: - SoftmaxMKLDNNHandler(const std::vector& dims, + SoftmaxMKLDNNHandler(const std::vector& dims, const MKLDNNMemoryFormat fmt, const int& axis, const platform::MKLDNNDeviceContext& dev_ctx, platform::Place cpu_place, const std::string& uniq_name) @@ -52,7 +52,7 @@ class SoftmaxMKLDNNHandler axis); } - SoftmaxMKLDNNHandler(const std::vector& dims, + SoftmaxMKLDNNHandler(const std::vector& dims, const MKLDNNMemoryFormat fmt, const MKLDNNMemoryFormat diff_fmt, const int& axis, const platform::MKLDNNDeviceContext& dev_ctx, @@ -87,25 +87,24 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { auto dims = input->dims(); // input and output share the same shape const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); - auto softmax_tz = paddle::framework::vectorize(dims); + auto softmax_tz = paddle::framework::vectorize(dims); SoftmaxMKLDNNHandler handler(softmax_tz, input->format(), axis, dev_ctx, ctx.GetPlace(), ctx.OutputName("Out")); auto softmax_src_memory_p = handler.AcquireSrcMemory(input); auto softmax_dst_memory_p = handler.AcquireDstMemory(output); - auto softmax_p = handler.AcquireForwardPrimitive(*softmax_src_memory_p, - *softmax_dst_memory_p); + auto softmax_p = handler.AcquireForwardPrimitive(); - std::vector pipeline{*softmax_p}; - stream(stream::kind::eager).submit(pipeline).wait(); + mkldnn::stream astream(dev_ctx.GetEngine()); + softmax_p->execute(astream, {{MKLDNN_ARG_SRC, *softmax_src_memory_p}, + {MKLDNN_ARG_DST, *softmax_dst_memory_p}}); + astream.wait(); const bool is_test = ctx.Attr("is_test"); if (!is_test) { T* output_data = output->mutable_data(ctx.GetPlace()); - int size = std::accumulate(begin(softmax_tz), end(softmax_tz), 1, - std::multiplies()); - std::for_each(output_data, &output_data[size], [](T& val) { + std::for_each(output_data, &output_data[output->numel()], [](T& val) { val = std::max(val, static_cast(exp(-64))); }); } @@ -136,7 +135,7 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { auto dims = dout->dims(); // input and output share the same shape const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); - std::vector softmax_tz = paddle::framework::vectorize(dims); + auto softmax_tz = paddle::framework::vectorize(dims); SoftmaxMKLDNNHandler handler(softmax_tz, output->format(), dout->format(), axis, dev_ctx, @@ -146,11 +145,14 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { auto diff_dst_memory_p = handler.AcquireDiffDstMemory(dout); auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); - auto softmax_bwd_p = handler.AcquireBackwardPrimitive( - *dst_memory_p, *diff_dst_memory_p, *diff_src_memory_p); + auto softmax_bwd_p = handler.AcquireBackwardPrimitive(); - std::vector pipeline{*softmax_bwd_p}; - stream(stream::kind::eager).submit(pipeline).wait(); + mkldnn::stream astream(dev_ctx.GetEngine()); + softmax_bwd_p->execute(astream, + {{MKLDNN_ARG_DST, *dst_memory_p}, + {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_p}, + {MKLDNN_ARG_DIFF_SRC, *diff_src_memory_p}}); + astream.wait(); dx->set_layout(framework::DataLayout::kMKLDNN); dx->set_format(dout->format()); diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 1a8e9d6911dc4756f8c7d3338d58c6893caa47bd..0b699a52803c93c73dcfc16bde7546ca3ce19eb2 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -63,11 +63,11 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { LoDTensor* output = ctx.Output("Out"); T* output_data = output->mutable_data(ctx.GetPlace()); - auto dst_tz = framework::vectorize(output->dims()); + auto dst_tz = framework::vectorize(output->dims()); auto src_tz = dst_tz; - MKLDNNMemoryFormat output_format{MKLDNNMemoryFormat::format_undef}; + MKLDNNMemoryFormat output_format{MKLDNNMemoryFormat::undef}; std::vector scales; - std::vector srcs_mpd; + std::vector srcs_md; std::vector srcs_mem; PADDLE_ENFORCE_EQ(in_vars[0]->IsType(), true, @@ -75,7 +75,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { auto& input0 = in_vars[0]->Get(); PADDLE_ENFORCE_EQ(input0.layout(), DataLayout::kMKLDNN, "Wrong layout set for inputs[0] tensor"); - PADDLE_ENFORCE_NE(input0.format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(input0.format(), MKLDNNMemoryFormat::undef, "Wrong format set for inputs[0] tensor"); MKLDNNMemoryFormat input_format = input0.format(); @@ -86,7 +86,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { auto& input = in_vars[i]->Get(); PADDLE_ENFORCE_EQ(input.layout(), DataLayout::kMKLDNN, "Wrong layout set for inputs"); - PADDLE_ENFORCE_NE(input.format(), MKLDNNMemoryFormat::format_undef, + PADDLE_ENFORCE_NE(input.format(), MKLDNNMemoryFormat::undef, "Wrong format set for inputs"); if (input.numel() == 0) { @@ -97,9 +97,8 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { auto src_md = memory::desc(src_tz, memory::data_type::f32, input_format); - auto src_mpd = memory::primitive_desc(src_md, mkldnn_engine); - auto src_mem = memory(src_mpd, to_void_cast(input_data)); - srcs_mpd.push_back(src_mpd); + auto src_mem = memory(src_md, mkldnn_engine, to_void_cast(input_data)); + srcs_md.push_back(src_md); srcs_mem.push_back(src_mem); scales.push_back(1.0); } @@ -107,36 +106,43 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { auto dst_md = memory::desc(dst_tz, memory::data_type::f32, MKLDNNMemoryFormat::any); - auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd); + auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_md, mkldnn_engine); std::shared_ptr dst_mem; if (in_place) { - dst_mem.reset(new memory(sum_pd.dst_primitive_desc())); + dst_mem.reset(new memory(sum_pd.dst_desc(), mkldnn_engine)); } else { - dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data)); - } - std::vector inputs; - for (size_t i = 0; i < srcs_mem.size(); ++i) { - inputs.push_back(srcs_mem[i]); + dst_mem.reset( + new memory(sum_pd.dst_desc(), mkldnn_engine, output_data)); } - auto sum_prim = mkldnn::sum(sum_pd, inputs, *dst_mem); - output_format = (MKLDNNMemoryFormat)platform::GetMKLDNNFormat(sum_pd); + auto sum_prim = mkldnn::sum(sum_pd); + output_format = platform::GetMKLDNNFormat(sum_pd.dst_desc()); - primitive reorder_prim; + std::shared_ptr reorder_p; std::shared_ptr target_mem; if (in_place) { output_format = input_format; - target_mem.reset(new memory( - {{{src_tz}, memory::data_type::f32, output_format}, mkldnn_engine}, - output_data)); - reorder_prim = reorder(*dst_mem, *target_mem); + target_mem.reset( + new memory({{src_tz}, memory::data_type::f32, output_format}, + mkldnn_engine, output_data)); + reorder_p = std::make_shared(*dst_mem, *target_mem); + } + + mkldnn::stream astream(mkldnn_engine); + std::unordered_map args; + for (size_t i = 0; i < srcs_mem.size(); ++i) { + args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, srcs_mem.at(i)}); } + args.insert({MKLDNN_ARG_DST, *dst_mem}); + + sum_prim.execute(astream, args); + astream.wait(); - std::vector pipeline; - pipeline.push_back(sum_prim); - if (in_place) pipeline.push_back(reorder_prim); - stream(stream::kind::eager).submit(pipeline).wait(); + if (in_place) { + reorder_p->execute(astream, *dst_mem, *target_mem); + astream.wait(); + } output->set_layout(DataLayout::kMKLDNN); output->set_format(output_format); diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index 065707cac28ab86d1ddcc21b58771cc21c01b088..9808bd831d1598d03f40cf659e18c7fc6de1fd56 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -44,7 +44,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { return; } - auto nchw_tz = paddle::framework::vectorize(input->dims()); + auto nchw_tz = paddle::framework::vectorize(input->dims()); const std::string key = platform::CreateKey(nchw_tz, ctx.OutputName("Out")); @@ -58,12 +58,13 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, transpose_src_memory_p); - std::vector pipeline; - pipeline.push_back(*transpose_p); - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + mkldnn::stream astream(mkldnn_engine); + transpose_p->execute(astream, *transpose_src_memory_p, + *transpose_dst_memory_p); + astream.wait(); output->set_layout(DataLayout::kNCHW); - output->set_format(MKLDNNMemoryFormat::format_undef); + output->set_format(MKLDNNMemoryFormat::undef); } }; @@ -95,7 +96,7 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { const T* out_grad_data = out_grad->data(); x_grad->mutable_data(ctx.GetPlace()); - auto nchw_tz = paddle::framework::vectorize(out_grad->dims()); + auto nchw_tz = paddle::framework::vectorize(out_grad->dims()); const std::string key = platform::CreateKey( nchw_tz, ctx.OutputName(framework::GradVarName("X"))); @@ -110,9 +111,10 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, transpose_src_memory_p); - std::vector pipeline; - pipeline.push_back(*transpose_p); - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + mkldnn::stream astream(mkldnn_engine); + transpose_p->execute(astream, *transpose_src_memory_p, + *transpose_dst_memory_p); + astream.wait(); } }; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 54abd967f48d9e58c01420e0c5dc3b5072854fac..b038f68738aaba6d92478cdbe110c71ce65f53b4 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -376,7 +376,9 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } #ifdef PADDLE_WITH_MKLDNN MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) - : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobmap_() { + : CPUDeviceContext(place), + engine_(mkldnn::engine::kind::cpu, 0), + p_blobmap_() { p_blobmap_.reset(new BlobMap()); p_mutex_.reset(new std::mutex()); } diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 3f2904fca68defffd1e9f5e4e709ca2742370168..8c9a1a9e832a3e9dc8f720c527c771b5c06c67d9 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" namespace paddle { #ifdef PADDLE_WITH_MKLDNN -using MKLDNNMemoryFormat = mkldnn::memory::format; +using MKLDNNMemoryFormat = mkldnn::memory::format_tag; #endif namespace platform { @@ -71,11 +71,10 @@ tf_pd MKLDNNBwdPrimitiveDesc(const Engine& e, const Primitive& p, return tf_pd(desc, e, p); } -inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector& dims, +inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector& dims, mkldnn::memory::data_type data_type, MKLDNNMemoryFormat format) { - mkldnn::memory::dims tz = dims; - return mkldnn::memory::desc({tz}, data_type, format); + return mkldnn::memory::desc({dims}, data_type, format); } inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) { @@ -85,7 +84,7 @@ inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) { template mkldnn::memory::data_type MKLDNNGetDataType() { - return mkldnn::memory::data_type::data_undef; + return mkldnn::memory::data_type::undef; } template <> @@ -105,22 +104,136 @@ inline mkldnn::memory::data_type MKLDNNGetDataType() { return mkldnn::memory::data_type::u8; } -inline void Reorder(const mkldnn::memory& src, const mkldnn::memory& dst) { +inline void Reorder(mkldnn::memory src, mkldnn::memory dst, + const mkldnn::engine& engine) { auto reorder_prim = mkldnn::reorder(src, dst); - std::vector pipeline; - pipeline.push_back(reorder_prim); - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + mkldnn::stream astream(engine); + reorder_prim.execute(astream, src, dst); + astream.wait(); } -inline MKLDNNMemoryFormat GetMKLDNNFormat(const mkldnn::memory memory) { - return static_cast( - memory.get_primitive_desc().desc().data.format); +inline mkldnn::memory::format_tag GetMKLDNNFormat( + mkldnn::memory::desc mem_desc) { + auto ndims = mem_desc.data.ndims; + auto strides = mem_desc.data.format_desc.blocking.strides; + auto inner_nblks = mem_desc.data.format_desc.blocking.inner_nblks; + auto inner_blks = mem_desc.data.format_desc.blocking.inner_blks; + auto inner_idxs = mem_desc.data.format_desc.blocking.inner_idxs; + + if (ndims == 1) { + return mkldnn::memory::format_tag::x; + } else if (ndims == 2) { + if (inner_nblks == 0) { + if (strides[0] >= strides[1]) { + return mkldnn::memory::format_tag::nc; + } else { + return mkldnn::memory::format_tag::cn; + } + } + } else if (ndims == 3) { + if (inner_nblks == 0) { + if (strides[0] >= strides[1] && strides[1] >= strides[2]) { + return mkldnn::memory::format_tag::ncw; + } else { + return mkldnn::memory::format_tag::nwc; + } + } + } else if (ndims == 4) { + if (inner_nblks == 0) { + if (strides[0] >= strides[1] && strides[1] >= strides[2] && + strides[2] >= strides[3]) { + return mkldnn::memory::format_tag::nchw; + } else { + return mkldnn::memory::format_tag::nhwc; + } + } else if (inner_nblks == 1) { + if (inner_blks[0] == 16 && inner_idxs[0] == 1) { + return mkldnn::memory::format_tag::nChw16c; + } else if (inner_blks[0] == 8 && inner_idxs[0] == 1) { + return mkldnn::memory::format_tag::nChw8c; + } else if (inner_blks[0] == 8 && inner_idxs[0] == 0) { + if (strides[0] >= strides[2] && strides[2] >= strides[3] && + strides[3] >= strides[1]) { + return mkldnn::memory::format_tag::Acdb8a; + } + } else if (inner_blks[0] == 4 && inner_idxs[0] == 1) { + return mkldnn::memory::format_tag::nChw4c; + } else if (inner_blks[0] == 16 && inner_idxs[0] == 0) { + if (strides[0] >= strides[2] && strides[2] >= strides[3] && + strides[3] >= strides[1]) { + return mkldnn::memory::format_tag::Acdb16a; + } + } + } else if (inner_nblks == 2) { + if (inner_blks[0] == 16 && inner_blks[1] == 16) { + if (inner_idxs[0] == 1 && inner_idxs[1] == 0) { + return mkldnn::memory::format_tag::OIhw16i16o; + } + } else if (inner_blks[0] == 8 && inner_blks[1] == 8) { + if (inner_idxs[0] == 1 && inner_idxs[1] == 0) { + return mkldnn::memory::format_tag::OIhw8i8o; + } + } + } + } else if (ndims == 5) { + if (inner_nblks == 0) { + if (strides[0] >= strides[1] && strides[1] >= strides[2] && + strides[2] >= strides[3] && strides[3] >= strides[4]) { + return mkldnn::memory::format_tag::ncdhw; + } else { + return mkldnn::memory::format_tag::ndhwc; + } + } else if (inner_nblks == 1) { + if (inner_blks[0] == 8 && inner_idxs[0] == 0) { + if (strides[0] >= strides[2] && strides[2] >= strides[3] && + strides[3] >= strides[4] && strides[4] >= strides[1]) { + return mkldnn::memory::format_tag::Acdeb8a; + } + } else if (inner_blks[0] == 8 && inner_idxs[0] == 1) { + if (strides[0] >= strides[1] && strides[1] >= strides[2] && + strides[2] >= strides[3] && strides[3] >= strides[4]) { + return mkldnn::memory::format_tag::aBcde8b; + } + } else if (inner_blks[0] == 16 && inner_idxs[0] == 0) { + if (strides[0] >= strides[2] && strides[2] >= strides[3] && + strides[3] >= strides[4] && strides[4] >= strides[1]) { + return mkldnn::memory::format_tag::Acdeb16a; + } + } else if (inner_blks[0] == 16 && inner_idxs[0] == 1) { + if (strides[0] >= strides[1] && strides[1] >= strides[2] && + strides[2] >= strides[3] && strides[3] >= strides[4]) { + return mkldnn::memory::format_tag::aBcde16b; + } + } + } + } else if (ndims == 6) { + if (inner_nblks == 0) { + if (strides[0] >= strides[1] && strides[1] >= strides[2] && + strides[2] >= strides[3] && strides[3] >= strides[4] && + strides[4] >= strides[5]) { + return mkldnn::memory::format_tag::abcdef; + } + } + } + // DEBUG CODE - KEEP UNTILL TENSOR.MEMORY_DESC IMPLEMENTED + // std::cout<<"@@@@@@@@@@ UNDEFINED FORMAT @@@@@@@@@@@@@@@@@@@"<( - memory.dst_primitive_desc().desc().data.format); +inline mkldnn::memory::format_tag GetMKLDNNFormat(const mkldnn::memory memory) { + auto mem_desc = memory.get_desc(); + return GetMKLDNNFormat(mem_desc); } inline MKLDNNMemoryFormat MKLDNNFormatForSize(size_t dims_size, @@ -190,13 +303,37 @@ inline void AppendKey(std::string* key, const T& num) { key->append(std::to_string(num)); } +template <> +inline void AppendKey(std::string* key, + const mkldnn::memory::format_tag& format) { + key->append(std::to_string(static_cast(format))); +} + +template <> +inline void AppendKey(std::string* key, + const mkldnn::memory::data_type& data_type) { + key->append(std::to_string(static_cast(data_type))); +} + +template <> +inline void AppendKey(std::string* key, const mkldnn::algorithm& algorithm) { + key->append(std::to_string(static_cast(algorithm))); +} + +template <> +inline void AppendKey(std::string* key, + const mkldnn::normalization_flags& flags) { + key->append(std::to_string(static_cast(flags))); +} + inline void AppendKey(std::string* key, const std::string& str) { key->append(str); } inline void AppendKey(std::string* key, const char* str) { key->append(str); } -inline void AppendKey(std::string* key, const std::vector& dims) { +template +inline void AppendKey(std::string* key, const std::vector& dims) { for (size_t i = 0; i < dims.size(); i++) { AppendKey(key, std::to_string(dims[i])); } @@ -211,8 +348,8 @@ inline std::string CreateKey(ArgTypes&&... args) { return key; } -inline std::vector> ToMkldnnPadding( - const std::vector& paddings) { +inline std::vector> ToMkldnnPadding( + const std::vector& paddings) { if (paddings.size() == 6) { int padding_front = paddings[0]; int padding_back = paddings[1]; diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 2b641504cd65f51e05b0483b9718c6b14f80febb..27756ed0117af1ff82aeb4f7e7cb7dab1ed179ce 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -49,27 +49,23 @@ class MKLDNNHandlerT { } } - template - std::shared_ptr AcquireForwardPrimitive(Args&&... args) { + std::shared_ptr AcquireForwardPrimitive() { const std::string key_p = key_ + "@forward_p"; auto forward_p = std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); if (forward_p == nullptr) { - forward_p = - std::make_shared(*fwd_pd_, std::forward(args)...); + forward_p = std::make_shared(*fwd_pd_); dev_ctx_.SetBlob(key_p, forward_p); } return forward_p; } - template - std::shared_ptr AcquireBackwardPrimitive(Args&&... args) { + std::shared_ptr AcquireBackwardPrimitive() { const std::string key_p = key_ + "@backward_p"; auto backward_p = std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); if (backward_p == nullptr) { - backward_p = - std::make_shared(*bwd_pd_, std::forward(args)...); + backward_p = std::make_shared(*bwd_pd_); dev_ctx_.SetBlob(key_p, backward_p); } return backward_p; @@ -78,40 +74,36 @@ class MKLDNNHandlerT { std::shared_ptr AcquireSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive(fwd_pd_->src_primitive_desc(), - to_void_cast(input_data), - "@src_mem_p"); + return this->AcquireMemoryFromPrimitive( + fwd_pd_->src_desc(), to_void_cast(input_data), "@src_mem_p"); } std::shared_ptr AcquireDstMemory(framework::Tensor* output) { - T* ptr = output->mutable_data(place_, - fwd_pd_->dst_primitive_desc().get_size()); - return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_primitive_desc(), ptr, + T* ptr = output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr, "@dst_mem_p"); } std::shared_ptr AcquireDstMemory( const framework::Tensor* output) { const T* output_data = output->data(); - return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_primitive_desc(), - to_void_cast(output_data), - "@bwd-dst_mem_p"); + return this->AcquireMemoryFromPrimitive( + bwd_pd_->dst_desc(), to_void_cast(output_data), "@bwd-dst_mem_p"); } std::shared_ptr AcquireDiffDstMemory( const framework::Tensor* diffdst) { const T* ptr = diffdst->data(); - return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_dst_primitive_desc(), - to_void_cast(ptr), - "@diff_dst_mem_p"); + return this->AcquireMemoryFromPrimitive( + bwd_pd_->diff_dst_desc(), to_void_cast(ptr), "@diff_dst_mem_p"); } std::shared_ptr AcquireDiffSrcMemory( framework::Tensor* diffsrc) { - T* ptr = diffsrc->mutable_data( - place_, bwd_pd_->diff_src_primitive_desc().get_size()); - return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_primitive_desc(), - ptr, "@diff_src_mem_p"); + T* ptr = + diffsrc->mutable_data(place_, bwd_pd_->diff_src_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_desc(), ptr, + "@diff_src_mem_p"); } protected: @@ -156,13 +148,12 @@ class MKLDNNHandlerT { } std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::primitive_desc mdp, void* ptr, - const std::string& suffix) { + mkldnn::memory::desc md, void* ptr, const std::string& suffix) { auto local_key = key_ + suffix; auto mem_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); if (mem_p == nullptr) { - mem_p = std::make_shared(mdp, ptr); + mem_p = std::make_shared(md, engine_, ptr); dev_ctx_.SetBlob(local_key, mem_p); } else { mem_p->set_data_handle(ptr); @@ -214,13 +205,12 @@ class MKLDNNHandler { } std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::primitive_desc mdp, void* ptr, - const std::string& suffix) { + mkldnn::memory::desc md, void* ptr, const std::string& suffix) { auto local_key = key_ + suffix; auto mem_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); if (mem_p == nullptr) { - mem_p = std::make_shared(mdp, ptr); + mem_p = std::make_shared(md, engine_, ptr); dev_ctx_.SetBlob(local_key, mem_p); } else { mem_p->set_data_handle(ptr); @@ -245,8 +235,7 @@ class MKLDNNHandler { ptr = reinterpret_cast(reordered_data.get()); } - mem_p = std::make_shared( - mkldnn::memory::primitive_desc{md, engine_}, ptr); + mem_p = std::make_shared(md, engine_, ptr); dev_ctx_.SetBlob(local_key, mem_p); } else { mem_p->set_data_handle(ptr); @@ -255,7 +244,7 @@ class MKLDNNHandler { } std::shared_ptr AcquireMemory( - const std::vector& dims, const mkldnn::memory::data_type dtype, + const std::vector& dims, const mkldnn::memory::data_type dtype, const MKLDNNMemoryFormat& fmt, void* ptr, const std::string& suffix) { /*Generate key*/ auto local_key = key_ + suffix; @@ -264,8 +253,7 @@ class MKLDNNHandler { if (mem_p == nullptr) { auto md = mkldnn::memory::desc(dims, dtype, fmt); - mem_p = std::make_shared( - mkldnn::memory::primitive_desc{md, engine_}, ptr); + mem_p = std::make_shared(md, engine_, ptr); dev_ctx_.SetBlob(local_key, mem_p); } else { mem_p->set_data_handle(ptr); @@ -290,15 +278,18 @@ class MKLDNNHandler { auto reorder_p = std::make_shared(*user_memory_p, *target_memory_p); dev_ctx_.SetBlob(key_reorder_p, reorder_p); - pipeline.push_back(*reorder_p); + mkldnn::stream astream(engine_); + reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, + {MKLDNN_ARG_TO, *target_memory_p}}); + astream.wait(); } return target_memory_p; } std::shared_ptr AcquireMemory( - mkldnn::memory::primitive_desc& mpd, // NOLINT - mkldnn::memory::primitive_desc& user_mpd, // NOLINT + mkldnn::memory::desc& md, // NOLINT + mkldnn::memory::desc& user_md, // NOLINT const std::shared_ptr user_memory_p, const std::string& suffix, std::vector& pipeline, // NOLINT @@ -310,27 +301,34 @@ class MKLDNNHandler { auto target_memory_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + + mkldnn::stream astream(engine_); + if (target_memory_p == nullptr) { target_memory_p = user_memory_p; - std::shared_ptr reorder_p; - if (mpd != user_mpd) { - target_memory_p = std::make_shared(mpd); - std::shared_ptr reorder_p; + if (md != user_md) { + target_memory_p = std::make_shared(md, engine_); + std::shared_ptr reorder_pd; if (is_INT8) { mkldnn::primitive_attr attri; // attribute for int8 weights and bias data reorder. attri.set_output_scales(mask, scale_data); - auto reorder_pd = std::shared_ptr( - new mkldnn::reorder::primitive_desc(user_mpd, mpd, attri)); - reorder_p = std::shared_ptr(new mkldnn::reorder( - *reorder_pd, *user_memory_p, *target_memory_p)); + reorder_pd = std::shared_ptr( + new mkldnn::reorder::primitive_desc(*user_memory_p, + *target_memory_p, attri)); } else { - reorder_p = std::make_shared(*user_memory_p, - *target_memory_p); + reorder_pd = std::shared_ptr( + new mkldnn::reorder::primitive_desc(*user_memory_p, + *target_memory_p)); } + auto reorder_p = + std::shared_ptr(new mkldnn::reorder(*reorder_pd)); dev_ctx_.SetBlob(key_reorder_p, reorder_p); - pipeline.push_back(*reorder_p); + + reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, + {MKLDNN_ARG_TO, *target_memory_p}}); + astream.wait(); } dev_ctx_.SetBlob(local_key, target_memory_p); } else if (!is_persistent) { @@ -338,7 +336,9 @@ class MKLDNNHandler { auto reorder_p = std::static_pointer_cast( dev_ctx_.GetBlob(key_reorder_p)); if (reorder_p != nullptr) { - pipeline.push_back(*reorder_p); + reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, + {MKLDNN_ARG_TO, *target_memory_p}}); + astream.wait(); } } return target_memory_p; @@ -366,12 +366,13 @@ class SumMKLDNNHandler : public MKLDNNHandler { dev_ctx_.GetBlob(key_sum_pd)); if (sum_pd_ == nullptr) { // Get vector of inputs primitive descriptors - std::vector src_pds; + std::vector src_ds; for (auto& input_mem : src_mems) { - src_pds.push_back(input_mem->get_primitive_desc()); + src_ds.push_back(input_mem->get_desc()); } - sum_pd_.reset(new mkldnn::sum::primitive_desc(dst_md, scales, src_pds)); + sum_pd_.reset( + new mkldnn::sum::primitive_desc(dst_md, scales, src_ds, engine_)); dev_ctx_.SetBlob(key_sum_pd, sum_pd_); } @@ -379,7 +380,7 @@ class SumMKLDNNHandler : public MKLDNNHandler { } std::shared_ptr AcquireDstMemoryFromPrimitive(void* ptr) { - return this->AcquireMemoryFromPrimitive(sum_pd_->dst_primitive_desc(), ptr, + return this->AcquireMemoryFromPrimitive(sum_pd_->dst_desc(), ptr, "@dst_mem_p"); } @@ -388,14 +389,12 @@ class SumMKLDNNHandler : public MKLDNNHandler { return this->AcquireMemory(md, ptr, "@user_src2_mem_p"); } - std::shared_ptr AcquireSum( - std::shared_ptr dst_memory, - std::vector* inputs) { + std::shared_ptr AcquireSum() { auto prim_key = key_ + "@sum_p"; auto sum_p = std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); if (sum_p == nullptr) { - sum_p = std::make_shared(*(sum_pd_), *inputs, *(dst_memory)); + sum_p = std::make_shared(*sum_pd_); dev_ctx_.SetBlob(prim_key, sum_p); } return sum_p; @@ -410,7 +409,7 @@ class ActivationMKLDNNHandler : public MKLDNNHandlerT { public: - ActivationMKLDNNHandler(const std::vector& dims, + ActivationMKLDNNHandler(const std::vector& dims, mkldnn::algorithm algorithm, float alpha, float beta, const MKLDNNMemoryFormat fmt, bool is_test, const platform::MKLDNNDeviceContext& dev_ctx, @@ -429,7 +428,7 @@ class ActivationMKLDNNHandler algorithm, md, alpha, beta); } - ActivationMKLDNNHandler(const std::vector& dims, + ActivationMKLDNNHandler(const std::vector& dims, mkldnn::algorithm algorithm, float alpha, float beta, const MKLDNNMemoryFormat fmt, const MKLDNNMemoryFormat diff_fmt, @@ -453,7 +452,7 @@ class ActivationMKLDNNHandler std::shared_ptr AcquireBackwardSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_primitive_desc(), + return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_desc(), to_void_cast(input_data), "@bwd-src_mem_p"); } @@ -463,8 +462,8 @@ template class LRNMKLDNNHandler : public MKLDNNHandlerT { public: - LRNMKLDNNHandler(const std::vector& dims, const int n, const float alpha, - const float beta, const float k, + LRNMKLDNNHandler(const std::vector& dims, const int n, + const float alpha, const float beta, const float k, const MKLDNNMemoryFormat fmt, bool is_test, const platform::MKLDNNDeviceContext& dev_ctx, platform::Place cpu_place, const std::string& unique_name) @@ -477,11 +476,11 @@ class LRNMKLDNNHandler this->AcquireForwardPrimitiveDescriptor( is_test ? mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training, - mkldnn::lrn_across_channels, src_md, n, alpha, beta, k); + mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); } - LRNMKLDNNHandler(const std::vector& dims, const int n, const float alpha, - const float beta, const float k, + LRNMKLDNNHandler(const std::vector& dims, const int n, + const float alpha, const float beta, const float k, const MKLDNNMemoryFormat fmt, const MKLDNNMemoryFormat diff_fmt, const platform::MKLDNNDeviceContext& dev_ctx, @@ -496,23 +495,24 @@ class LRNMKLDNNHandler mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), diff_fmt); this->AcquireBackwardPrimitiveDescriptor( - mkldnn::lrn_across_channels, src_md, diff_md, n, alpha, beta, k); + mkldnn::algorithm::lrn_across_channels, src_md, diff_md, n, alpha, beta, + k); } std::shared_ptr AcquireWorkspaceMemory( framework::Tensor* workspace) { T* ptr = workspace->mutable_data( - this->place_, this->fwd_pd_->workspace_primitive_desc().get_size()); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->workspace_primitive_desc(), ptr, "@wrk_mem_p"); + this->place_, this->fwd_pd_->workspace_desc().get_size()); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), + ptr, "@wrk_mem_p"); } std::shared_ptr AcquireBackwardWorkspaceMemory( const framework::Tensor* workspace) { const T* workspace_data = workspace->data(); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->workspace_primitive_desc(), - to_void_cast(workspace_data), "@bwd-wrk_mem_p"); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), + to_void_cast(workspace_data), + "@bwd-wrk_mem_p"); } }; @@ -521,11 +521,11 @@ class PoolingMKLDNNHandler : public MKLDNNHandlerT { public: PoolingMKLDNNHandler( - const std::vector& src_dims, const std::vector& dst_dims, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string& pooling_type, - bool ceil_mode, const MKLDNNMemoryFormat fmt, - mkldnn::memory::data_type dt, bool is_test, + const std::vector& src_dims, + const std::vector& dst_dims, const std::vector& ksize, + const std::vector& strides, const std::vector& paddings, + const std::string& pooling_type, bool ceil_mode, + const MKLDNNMemoryFormat fmt, mkldnn::memory::data_type dt, bool is_test, const platform::MKLDNNDeviceContext& dev_ctx, platform::Place cpu_place, const std::string& unique_name, bool exclude_padding) : platform::MKLDNNHandlerT& diff_dst_dims, - const std::vector& diff_src_dims, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - const std::string& pooling_type, bool ceil_mode, - const MKLDNNMemoryFormat fmt, const MKLDNNMemoryFormat diff_dst_fmt, - mkldnn::memory::data_type dt, + const std::vector& diff_dst_dims, + const std::vector& diff_src_dims, + const std::vector& ksize, const std::vector& strides, + const std::vector& paddings, const std::string& pooling_type, + bool ceil_mode, const MKLDNNMemoryFormat fmt, + const MKLDNNMemoryFormat diff_dst_fmt, mkldnn::memory::data_type dt, const platform::MKLDNNDeviceContext& dev_ctx, platform::Place cpu_place, const std::string& unique_name, bool exclude_padding) : platform::MKLDNNHandlerT AcquireWorkspaceMemory(void) { - mkldnn::memory::primitive_desc workspace_mpd = - this->fwd_pd_->workspace_primitive_desc(); + mkldnn::memory::desc workspace_md = this->fwd_pd_->workspace_desc(); // Pooling PD has to be passed to Grad op that // may be executed by diffrent thread, hence // for that one we use key that does not contain TID @@ -605,7 +603,7 @@ class PoolingMKLDNNHandler : public MKLDNNHandlerT( this->dev_ctx_.GetBlob(local_key)); if (mem_p == nullptr) { - mem_p = std::make_shared(workspace_mpd); + mem_p = std::make_shared(workspace_md, this->engine_); this->dev_ctx_.SetBlob(local_key, mem_p); } } @@ -619,10 +617,10 @@ class PoolingMKLDNNHandler : public MKLDNNHandlerT& src_tz, const std::vector& dst_tz, - const std::vector& kernel_size, const std::vector& paddings, - const std::vector& strides, - std::vector& right_bot_padding) { // NOLINT + const std::vector& src_tz, const std::vector& dst_tz, + const std::vector& kernel_size, + const std::vector& paddings, const std::vector& strides, + std::vector& right_bot_padding) { // NOLINT for (size_t i = 0; i < right_bot_padding.size(); i++) { int desired_size = ComputeCeiledOutput(src_tz[i + 2], kernel_size[i], paddings[i], strides[i]); @@ -636,8 +634,8 @@ class PoolingMKLDNNHandler : public MKLDNNHandlerT class TransposeMKLDNNHandler : public MKLDNNHandler { public: - TransposeMKLDNNHandler(std::vector& dims, // NOLINT - std::vector& axis, // NOLINT + TransposeMKLDNNHandler(std::vector& dims, // NOLINT + std::vector& axis, // NOLINT const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, const std::string& base_key) : platform::MKLDNNHandler(dev_ctx, engine, base_key), @@ -657,12 +655,11 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { logical_axis_[i] = i; } - auto src_md = fmt != mkldnn::memory::format::nchw + auto src_md = fmt != MKLDNNMemoryFormat::nchw ? platform::MKLDNNMemDesc( dims_, platform::MKLDNNGetDataType(), fmt) : Axis2MemoryDesc(dims_, logical_axis_); - mem_p = std::make_shared( - mkldnn::memory::primitive_desc{src_md, engine_}, ptr); + mem_p = std::make_shared(src_md, engine_, ptr); dev_ctx_.SetBlob(local_key, mem_p); } else { mem_p->set_data_handle(ptr); @@ -676,12 +673,11 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { auto mem_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); if (mem_p == nullptr) { - auto dst_mdp = mkldnn::memory::primitive_desc{ - Axis2MemoryDesc(dims_, axis_), engine_}; + auto dst_md = Axis2MemoryDesc(dims_, axis_); - auto dst_data = output->mutable_data(place, dst_mdp.get_size()); + auto dst_data = output->mutable_data(place, dst_md.get_size()); - mem_p = std::make_shared(dst_mdp, dst_data); + mem_p = std::make_shared(dst_md, engine_, dst_data); dev_ctx_.SetBlob(local_key, mem_p); } else { auto dst_data = output->mutable_data(place); @@ -705,49 +701,32 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { } protected: - mkldnn_memory_desc_t Axis2MemoryDesc( - const std::vector& nchw_tz, // NOLINT - const std::vector& axis) { - mkldnn_memory_desc_t mem_fmt; - - mem_fmt.primitive_kind = mkldnn_memory; - mem_fmt.ndims = axis.size(); - for (unsigned int i = 0; i < nchw_tz.size(); ++i) { - mem_fmt.dims[i] = nchw_tz[i]; // logical dimensions (nchw format, - // regardless physical layout) - } - if (platform::MKLDNNGetDataType() == mkldnn::memory::data_type::s8) - mem_fmt.data_type = mkldnn_s8; - else if (platform::MKLDNNGetDataType() == mkldnn::memory::data_type::u8) - mem_fmt.data_type = mkldnn_u8; - else - mem_fmt.data_type = mkldnn_f32; - mem_fmt.format = mkldnn_blocked; + mkldnn::memory::desc Axis2MemoryDesc(std::vector& nchw_tz, // NOLINT + std::vector& axis // NOLINT + ) { + size_t ndims = axis.size(); + std::vector strides(ndims); unsigned int total_stride = 1; - for (int i = nchw_tz.size() - 1; i >= 0; --i) { - mem_fmt.layout_desc.blocking.padding_dims[i] = - nchw_tz[i]; // logical dimensions (nchw format, regardless physical - // layout) - mem_fmt.layout_desc.blocking.block_dims[i] = 1; - mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset - mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride; - mem_fmt.layout_desc.blocking.strides[1][axis[i]] = 1; + for (int i = ndims - 1; i >= 0; --i) { + strides[axis[i]] = total_stride; total_stride *= nchw_tz[axis[i]]; } - mem_fmt.layout_desc.blocking.offset_padding = 0; // no initial offset - return mem_fmt; + mkldnn::memory::desc mem_d(nchw_tz, platform::MKLDNNGetDataType(), + strides); + + return mem_d; } private: - std::vector dims_; + std::vector dims_; std::vector axis_; std::vector logical_axis_; }; class ReorderMKLDNNHandler : public MKLDNNHandler { public: - ReorderMKLDNNHandler(std::vector& dims, // NOLINT + ReorderMKLDNNHandler(std::vector& dims, // NOLINT framework::proto::VarType::Type vtype, mkldnn::memory::data_type dtype, const platform::MKLDNNDeviceContext& dev_ctx, @@ -770,11 +749,10 @@ class ReorderMKLDNNHandler : public MKLDNNHandler { std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); if (mem_p == nullptr) { auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_, fmt); - auto dst_mdp = mkldnn::memory::primitive_desc{dst_md, engine_}; auto dst_data = output->mutable_data(place, vtype_); - mem_p = std::make_shared(dst_mdp, dst_data); + mem_p = std::make_shared(dst_md, engine_, dst_data); dev_ctx_.SetBlob(local_key, mem_p); } else { auto dst_data = output->mutable_data(place, vtype_); @@ -798,7 +776,7 @@ class ReorderMKLDNNHandler : public MKLDNNHandler { } private: - std::vector dims_; + std::vector dims_; framework::proto::VarType::Type vtype_; mkldnn::memory::data_type dtype_; }; @@ -850,28 +828,25 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { key_ += "-BWD"; } - size_t GetDstMemorySize() const { - return conv_pd_->dst_primitive_desc().get_size(); - } + size_t GetDstMemorySize() const { return conv_pd_->dst_desc().get_size(); } MKLDNNMemoryFormat GetDstFormat() const { - return static_cast( - conv_pd_->dst_primitive_desc().desc().data.format); + return paddle::platform::GetMKLDNNFormat(conv_pd_->dst_desc()); } size_t GetDiffWeightsMemorySize() const { - return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size(); + return conv_bwd_weights_pd_->diff_weights_desc().get_size(); } size_t GetDiffSourceMemorySize() const { - return conv_bwd_data_pd_->diff_src_primitive_desc().get_size(); + return conv_bwd_data_pd_->diff_src_desc().get_size(); } std::shared_ptr AcquireSrcMemoryFromWeightsPrimitive( const std::shared_ptr user_memory_p, std::vector& pipeline) { // NOLINT - auto src_pd = conv_bwd_weights_pd_->src_primitive_desc(); - auto user_pd = user_memory_p->get_primitive_desc(); + auto src_pd = conv_bwd_weights_pd_->src_desc(); + auto user_pd = user_memory_p->get_desc(); return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@weights-src_mem_p", pipeline); } @@ -879,8 +854,8 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { std::shared_ptr AcquireDiffDstMemoryFromWeightsPrimitive( const std::shared_ptr user_memory_p, std::vector& pipeline) { // NOLINT - auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc(); - auto user_pd = user_memory_p->get_primitive_desc(); + auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_desc(); + auto user_pd = user_memory_p->get_desc(); return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, "@weights-diff_dst_mem_p", pipeline); } @@ -888,15 +863,14 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { std::shared_ptr AcquireDiffWeightsMemoryFromWeightsPrimitive( void* ptr) { return this->AcquireMemoryFromPrimitive( - conv_bwd_weights_pd_->diff_weights_primitive_desc(), ptr, - "@diff_weights_mem_p"); + conv_bwd_weights_pd_->diff_weights_desc(), ptr, "@diff_weights_mem_p"); } std::shared_ptr AcquireDiffDstMemoryFromDataPrimitive( const std::shared_ptr user_memory_p, std::vector& pipeline) { // NOLINT - auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc(); - auto user_pd = user_memory_p->get_primitive_desc(); + auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_desc(); + auto user_pd = user_memory_p->get_desc(); return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, "@data-diff_dst_mem_p", pipeline); } @@ -904,8 +878,8 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { std::shared_ptr AcquireWeightsMemoryFromDataPrimitive( const std::shared_ptr user_weights_memory_p, std::vector& pipeline) { // NOLINT - auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc(); - auto user_pd = user_weights_memory_p->get_primitive_desc(); + auto weights_pd = conv_bwd_data_pd_->weights_desc(); + auto user_pd = user_weights_memory_p->get_desc(); return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p, "@data-weights_mem_p", pipeline); } @@ -926,20 +900,20 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { std::shared_ptr AcquireDiffSrcMemoryFromDataPrimitive( void* ptr) { - return this->AcquireMemoryFromPrimitive( - conv_bwd_data_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p"); + return this->AcquireMemoryFromPrimitive(conv_bwd_data_pd_->diff_src_desc(), + ptr, "@diff_src_mem_p"); } std::shared_ptr AcquireDstMemoryFromPrimitive(void* ptr) { - return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr, + return this->AcquireMemoryFromPrimitive(conv_pd_->dst_desc(), ptr, "@dst_mem_p"); } std::shared_ptr AcquireSrcMemoryFromPrimitive( const std::shared_ptr user_memory_p, std::vector& pipeline) { // NOLINT - auto src_pd = conv_pd_->src_primitive_desc(); - auto user_pd = user_memory_p->get_primitive_desc(); + auto src_pd = conv_pd_->src_desc(); + auto user_pd = user_memory_p->get_desc(); return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p", pipeline); } @@ -960,8 +934,8 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { std::vector& pipeline, // NOLINT bool is_persistent = false, bool is_INT8 = false, std::vector scale_data = {1.0f}, int mask = 0) { - auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); - auto weights_pd = conv_pd_->weights_primitive_desc(); + auto user_weights_pd = user_weights_memory_p->get_desc(); + auto weights_pd = conv_pd_->weights_desc(); return this->AcquireMemory( weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p", pipeline, is_persistent, is_INT8, scale_data, mask); @@ -973,8 +947,8 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { bool is_persistent = false, bool is_INT8 = false, std::vector scale_data = {1.0f}, int mask = 0) { // NOLINT - auto user_bias_pd = user_bias_memory_p->get_primitive_desc(); - auto bias_pd = conv_pd_->bias_primitive_desc(); + auto user_bias_pd = user_bias_memory_p->get_desc(); + auto bias_pd = conv_pd_->bias_desc(); return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, "@bias_mem_p", pipeline, is_persistent, is_INT8, scale_data, mask); @@ -1020,8 +994,8 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { AcquireConvolutionPrimitiveDescriptor( const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights, boost::optional bias, - const mkldnn::memory::desc& dst, const std::vector& strides, - const std::vector& paddings, const mkldnn::engine& engine, + const mkldnn::memory::desc& dst, const std::vector& strides, + const std::vector& paddings, const mkldnn::engine& engine, const std::string& fuse_activation, float fuse_alpha, float fuse_beta, const bool fuse_residual_conn, mkldnn::prop_kind fwd_prop_kind, const std::vector output_shift_scale = {}, @@ -1047,15 +1021,14 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { auto mkldnn_paddings = ToMkldnnPadding(paddings); auto conv_desc = - bias - ? typename forward_t::desc( - fwd_prop_kind, convolutional_algorithm::T, src, - weights, *bias, dst, stride_dims, mkldnn_paddings[0], - mkldnn_paddings[1], mkldnn::padding_kind::zero) - : typename forward_t::desc( - fwd_prop_kind, convolutional_algorithm::T, src, - weights, dst, stride_dims, mkldnn_paddings[0], - mkldnn_paddings[1], mkldnn::padding_kind::zero); + bias ? typename forward_t::desc( + fwd_prop_kind, convolutional_algorithm::T, + src, weights, *bias, dst, stride_dims, + mkldnn_paddings[0], mkldnn_paddings[1]) + : typename forward_t::desc( + fwd_prop_kind, convolutional_algorithm::T, + src, weights, dst, stride_dims, mkldnn_paddings[0], + mkldnn_paddings[1]); mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_activation, fuse_alpha, fuse_beta, @@ -1071,68 +1044,37 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { return conv_pd_; } - std::shared_ptr AcquireConvolution( - std::shared_ptr src_memory_p, - std::shared_ptr weights_memory_p, - std::shared_ptr dst_memory_p) { - auto prim_key = key_ + "@conv_p"; - auto conv_p = - std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); - if (conv_p == nullptr) { - conv_p = std::make_shared(*conv_pd_, *src_memory_p, - *weights_memory_p, *dst_memory_p); - - dev_ctx_.SetBlob(prim_key, conv_p); - } - return conv_p; - } - - std::shared_ptr AcquireConvolution( - std::shared_ptr src_memory_p, - std::shared_ptr weights_memory_p, - std::shared_ptr bias_memory_p, - std::shared_ptr dst_memory_p) { + std::shared_ptr AcquireConvolution() { auto prim_key = key_ + "@conv_p"; auto conv_p = std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); if (conv_p == nullptr) { - conv_p = std::make_shared(*conv_pd_, *src_memory_p, - *weights_memory_p, *bias_memory_p, - *dst_memory_p); + conv_p = std::make_shared(*conv_pd_); dev_ctx_.SetBlob(prim_key, conv_p); } return conv_p; } - std::shared_ptr AcquireConvolutionBackwardWeights( - std::shared_ptr src_memory_p, - std::shared_ptr diff_dst_memory_p, - std::shared_ptr diff_weights_memory_p) { + std::shared_ptr AcquireConvolutionBackwardWeights() { auto prim_key = key_ + "@conv_bwd_weights_p"; auto conv_bwd_weights_p = std::static_pointer_cast( dev_ctx_.GetBlob(prim_key)); if (conv_bwd_weights_p == nullptr) { // create backward conv primitive for weights - conv_bwd_weights_p = std::make_shared( - *conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p, - *diff_weights_memory_p); + conv_bwd_weights_p = + std::make_shared(*conv_bwd_weights_pd_); dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p); } return conv_bwd_weights_p; } - std::shared_ptr AcquireConvolutionBackwardData( - std::shared_ptr diff_dst_memory_p, - std::shared_ptr weights_memory_p, - std::shared_ptr diff_src_memory_p) { + std::shared_ptr AcquireConvolutionBackwardData() { auto prim_key = key_ + "@conv_bwd_data_p"; auto conv_bwd_data_p = std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); if (conv_bwd_data_p == nullptr) { - conv_bwd_data_p = std::make_shared( - *conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p, - *diff_src_memory_p); + conv_bwd_data_p = std::make_shared(*conv_bwd_data_pd_); dev_ctx_.SetBlob(prim_key, conv_bwd_data_p); } return conv_bwd_data_p; @@ -1199,9 +1141,9 @@ static void SetDstMemoryHandler( template static void SetDstMemoryQuantized( const framework::ExecutionContext& ctx, framework::Tensor* output, - std::vector dst_tz, const mkldnn::engine& engine, - std::shared_ptr& dst_pd, // NOLINT - std::shared_ptr& dst_memory) { // NOLINT + std::vector dst_tz, const mkldnn::engine& engine, + std::shared_ptr& dst_md, // NOLINT + std::shared_ptr& dst_memory) { // NOLINT T* output_data = output->mutable_data(ctx.GetPlace()); const size_t dst_dims = dst_tz.size(); MKLDNNMemoryFormat dst_fmt; @@ -1209,12 +1151,13 @@ static void SetDstMemoryQuantized( "Dst memory for quantization can not have dims > 5"); dst_fmt = platform::MKLDNNFormatForSize(dst_dims, MKLDNNMemoryFormat::nhwc); - auto dst_md = platform::MKLDNNMemDesc( + auto tmp_dst_md = platform::MKLDNNMemDesc( {dst_tz}, paddle::framework::ToMKLDNNDataType( framework::DataTypeTrait::DataType()), dst_fmt); - dst_pd.reset(new mkldnn::memory::primitive_desc(dst_md, engine)); - dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast(output_data))); + dst_md.reset(new mkldnn::memory::desc(tmp_dst_md)); + dst_memory.reset( + new mkldnn::memory(*dst_md, engine, to_void_cast(output_data))); } } // namespace platform diff --git a/paddle/fluid/string/to_string.h b/paddle/fluid/string/to_string.h index 8caf149420393ec81131389d7787bee925f4a27d..f5693ee7f1ebf6654de7d8ff20649fac858a27cb 100644 --- a/paddle/fluid/string/to_string.h +++ b/paddle/fluid/string/to_string.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include +#include #include namespace paddle { @@ -24,13 +25,20 @@ inline std::ostream& operator<<(std::ostream& s, const std::type_index& t) { return s; } -template +template ::value, int>::type = 0> inline std::string to_string(T v) { std::ostringstream sout; sout << v; return sout.str(); } +template ::value, int>::type = 0> +inline std::string to_string(T v) { + return std::to_string(static_cast(v)); +} + template <> inline std::string to_string(std::type_index t) { return t.name(); diff --git a/python/setup.py.in b/python/setup.py.in index 4d3fd6a16b48b991a361a534ff50c86e7272dc18..cebb460bd77b00b52f40848f0f78415f1bad9691 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -203,14 +203,16 @@ if '${WITH_MKLDNN}' == 'ON': # TODO(typhoonzero): use install_name_tool to patch mkl libs once # we can support mkl on mac. # - # change rpath of libmkldnn.so.0, add $ORIGIN/ to it. + # change rpath of libmkldnn.so.1, add $ORIGIN/ to it. # The reason is that all thirdparty libraries in the same directory, - # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so. + # thus, libmkldnn.so.1 will find libmklml_intel.so and libiomp5.so. command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" if os.system(command) != 0: raise Exception("patch libmkldnn.so failed, command: %s" % command) - package_data['paddle.libs']+=['libmkldnn.so.0' if os.name != 'nt' else ('mkldnn' + ext_name)] + package_data['paddle.libs']+=['libmkldnn.so.0','libmkldnn.so.1' if os.name != 'nt' else ('mkldnn' + ext_name)] shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) + if os.name != 'nt': + shutil.copy('${MKLDNN_SHARED_LIB_1}', libs_path) if '${WITH_NGRAPH}' == 'ON': # only change rpath in Release mode, # since in Debug mode, nGraph lib may be too large to be changed?