diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index 98c2f68a6c39ed12795bad4a905558917c0275a4..87173fc42a46c8218fbf0beb4ebf7760f69b7c24 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -45,6 +45,10 @@ endfunction(inference_api_test)
 cc_library(paddle_inference_api
    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+if(NOT APPLE)
+  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_api.sym")
+  set_target_properties(paddle_inference_api PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+endif()
 
 # Here the shared library doesn't depend on other fluid libraries, or double free will occur.
 cc_library(paddle_inference_api_shared SHARED
@@ -53,8 +57,19 @@ add_dependencies(paddle_inference_api_shared ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api)
 
 if(NOT APPLE)
-  set(LINK_FLAGS "-fPIC -fvisibility=hidden")
+  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_api.map")
   set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+  FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
+    "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
+    " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_inference_api.so\" RESULT_VARIABLE symbol_res)\n"
+    "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
+    "  message(FATAL_ERROR \"Check symbol failed.\")\n"
+    "endif()\n")
+  add_custom_command(
+    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
+    COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake"
+    DEPENDS paddle_inference_api_shared)
+  add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
 endif()
 
 cc_test(test_paddle_inference_api
diff --git a/paddle/contrib/inference/check_symbol.sh b/paddle/contrib/inference/check_symbol.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6547ca1413649968e8a0be146915e07192a99898
--- /dev/null
+++ b/paddle/contrib/inference/check_symbol.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+lib=$1
+if [ $# -ne 1 ]; then echo "No input library"; exit 1; fi
+
+num_paddle_syms=$(nm -D --defined-only ${lib} | grep -c paddle)
+num_google_syms=$(nm -D --defined-only ${lib} | grep -c google)
+
+if [ $num_paddle_syms -le 0 ]; then echo "No paddle symbols found"; exit 1; fi
+if [ $num_google_syms -ge 1 ]; then echo "Unexpected google symbols found"; exit 1; fi
+
+exit 0
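A note on the two linker mechanisms used above: `--retain-symbols-file` filters the symbol table of the static archive's link, while `--version-script` (the paddle_inference_api.map added later in this patch) exports everything matching `*paddle*` and hides the rest, which is exactly what check_symbol.sh verifies with `nm -D`. A minimal sketch of that behavior with hypothetical names (toy_lib.cc, paddle_foo, google_bar are illustrations, not Paddle code):

    // toy_lib.cc -- hypothetical library used to illustrate the version script.
    // Build:
    //   g++ -shared -fPIC toy_lib.cc -o libtoy.so \
    //       -Wl,--version-script=paddle_inference_api.map
    extern "C" int paddle_foo() { return 42; }  // matches "*paddle*": exported
    extern "C" int google_bar() { return 7; }   // falls into "local: *;": hidden

    // toy_main.cc -- dlsym can resolve only the exported symbol.
    // Build: g++ toy_main.cc -ldl -o toy_main && ./toy_main
    #include <dlfcn.h>
    #include <cassert>
    #include <cstdio>

    int main() {
      void* h = dlopen("./libtoy.so", RTLD_NOW | RTLD_LOCAL);
      assert(h != nullptr);
      assert(dlsym(h, "paddle_foo") != nullptr);  // kept by "global: *paddle*;"
      assert(dlsym(h, "google_bar") == nullptr);  // hidden by "local: *;"
      std::printf("symbol visibility behaves as check_symbol.sh expects\n");
      dlclose(h);
      return 0;
    }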
diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt
index ecece6fe3471ad7b89c84c3e2b67af4ae9eb3c36..2d501bf0085b1bd4c39ee1a6dfaaa9622fd72ce1 100644
--- a/paddle/contrib/inference/demo/CMakeLists.txt
+++ b/paddle/contrib/inference/demo/CMakeLists.txt
@@ -13,8 +13,6 @@
 # limitations under the License.
 #
-inference_api_test(simple_on_word2vec ARGS test_word2vec)
-
 option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF)
 if(NOT WITH_INFERENCE_DEMO)
   return()
diff --git a/paddle/contrib/inference/demo_ci/CMakeLists.txt b/paddle/contrib/inference/demo_ci/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..789bff7f23cd89bfaeba180efa95972cef6fc58c
--- /dev/null
+++ b/paddle/contrib/inference/demo_ci/CMakeLists.txt
@@ -0,0 +1,77 @@
+cmake_minimum_required(VERSION 3.0)
+
+project(cpp_inference_demo CXX C)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+
+if(NOT DEFINED PADDLE_LIB)
+  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
+endif()
+if(NOT DEFINED DEMO_NAME)
+  message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
+endif()
+
+option(WITH_MKL        "Compile demo with MKL/OpenBlas support, default use MKL."     ON)
+option(WITH_GPU        "Compile demo with GPU/CPU, default use CPU."                  OFF)
+option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON)
+
+if(WITH_GPU)
+  set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
+endif()
+
+include_directories("${PADDLE_LIB}")
+include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
+include_directories("${PADDLE_LIB}/third_party/install/glog/include")
+include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
+include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
+
+include_directories("${PADDLE_LIB}/third_party/boost")
+include_directories("${PADDLE_LIB}/third_party/eigen3")
+
+link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
+link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
+link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
+link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
+link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
+link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
+
+add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
+
+if(WITH_MKL)
+  include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so
+               ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5.so)
+  set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
+  if(EXISTS ${MKLDNN_PATH})
+    include_directories("${MKLDNN_PATH}/include")
+    set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
+  endif()
+else()
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
+endif()
+
+if(WITH_STATIC_LIB)
+  set(DEPS
+      "-Wl,--whole-archive"
+      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a
+      "-Wl,--no-whole-archive"
+      ${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.a)
+else()
+  # Note: libpaddle_inference_api.so must be put before libpaddle_fluid.so
+  set(DEPS
+      ${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.so
+      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so)
+endif()
+set(EXTERNAL_LIB "-lrt -ldl -lpthread")
+
+set(DEPS ${DEPS}
+    ${MATH_LIB} ${MKLDNN_LIB}
+    glog gflags protobuf snappystream snappy z
+    ${EXTERNAL_LIB})
+if(WITH_GPU)
+  set(DEPS ${DEPS} ${CUDA_LIB}/libcudart.so)
+endif()
+
+target_link_libraries(${DEMO_NAME} ${DEPS})
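Why `-Wl,--whole-archive` wraps libpaddle_fluid.a in the static branch above: fluid registers its operators through global constructors, and without the flag the linker drops archive members whose symbols are never referenced directly, so those registrations silently disappear. A hypothetical sketch of the pattern (not fluid's real registry API):

    // Minimal illustration of constructor-based registration. Names are
    // invented; fluid's actual registry lives behind REGISTER_OP macros.
    #include <functional>
    #include <map>
    #include <string>

    std::map<std::string, std::function<void()>>& Registry() {
      static std::map<std::string, std::function<void()>> r;
      return r;
    }

    struct Registrar {
      Registrar(const std::string& name, std::function<void()> fn) {
        Registry()[name] = std::move(fn);
      }
    };

    // Imagine this sits in some conv_op.cc inside libpaddle_fluid.a. Nothing
    // references the symbol, so only --whole-archive guarantees the object
    // file (and therefore the registration) is pulled into the executable.
    static Registrar conv_registrar("conv2d", [] { /* build the kernel */ });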
diff --git a/paddle/contrib/inference/demo_ci/run.sh b/paddle/contrib/inference/demo_ci/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e3a7269af795b05c296423cb2dc92b753397c6b3
--- /dev/null
+++ b/paddle/contrib/inference/demo_ci/run.sh
@@ -0,0 +1,32 @@
+set -x
+PADDLE_ROOT=$1
+WITH_MKL=$2
+WITH_GPU=$3
+if [ "$WITH_GPU" == "ON" ]; then
+  use_gpu_list='true false'
+else
+  use_gpu_list='false'
+fi
+
+mkdir -p build
+cd build
+
+for WITH_STATIC_LIB in false; do
+  rm -rf *
+  cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+    -DWITH_MKL=$WITH_MKL \
+    -DDEMO_NAME=simple_on_word2vec \
+    -DWITH_GPU=$WITH_GPU \
+    -DWITH_STATIC_LIB=$WITH_STATIC_LIB
+  make
+  for use_gpu in $use_gpu_list; do
+    ./simple_on_word2vec \
+      --dirname=${PADDLE_ROOT}/build/python/paddle/fluid/tests/book/word2vec.inference.model \
+      --use_gpu=$use_gpu
+    if [ $? -ne 0 ]; then
+      echo "inference demo run failed."
+      exit 1
+    fi
+  done
+done
+set +x
diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo_ci/simple_on_word2vec.cc
similarity index 68%
rename from paddle/contrib/inference/demo/simple_on_word2vec.cc
rename to paddle/contrib/inference/demo_ci/simple_on_word2vec.cc
index c253014642f39a042430992548a285cc7078a959..9713837f86d40383da946af1681e1945c84336b0 100644
--- a/paddle/contrib/inference/demo/simple_on_word2vec.cc
+++ b/paddle/contrib/inference/demo_ci/simple_on_word2vec.cc
@@ -16,21 +16,27 @@ limitations under the License. */
  * This file contains a simple demo for how to take a model for inference.
  */
 
+#include <gflags/gflags.h>
 #include <glog/logging.h>
-#include <gtest/gtest.h>
 #include <memory>
 #include <thread>
-#include "paddle/contrib/inference/paddle_inference_api.h"
+#include "contrib/inference/paddle_inference_api.h"
+#include "paddle/fluid/platform/enforce.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_bool(use_gpu, false, "Whether use gpu.");
 
 namespace paddle {
 namespace demo {
 
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
 void Main(bool use_gpu) {
   //# 1. Create PaddlePredictor with a config.
   NativeConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  if (FLAGS_dirname.empty()) {
+    LOG(INFO) << "Usage: ./simple_on_word2vec --dirname=path/to/your/model";
+    exit(1);
+  }
+  config.model_dir = FLAGS_dirname;
   config.use_gpu = use_gpu;
   config.fraction_of_gpu_memory = 0.15;
   config.device = 0;
@@ -54,12 +60,16 @@ void Main(bool use_gpu) {
     CHECK(predictor->Run(slots, &outputs));
 
     //# 4. Get output.
-    ASSERT_EQ(outputs.size(), 1UL);
-    LOG(INFO) << "output buffer size: " << outputs.front().data.length();
+    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
+    // Check the output buffer size and result.
+    PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL);
+    float result[5] = {
+        0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706};
     const size_t num_elements = outputs.front().data.length() / sizeof(float);
 
     // The outputs' buffers are in CPU memory.
     for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-      LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+      PADDLE_ENFORCE_EQ(static_cast<float*>(outputs.front().data.data())[i],
+                        result[i]);
     }
   }
 }
@@ -68,7 +78,7 @@ void MainThreads(int num_threads, bool use_gpu) {
   // Multi-threads only support on CPU
   // 0. Create PaddlePredictor with a config.
   NativeConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.model_dir = FLAGS_dirname;
   config.use_gpu = use_gpu;
   config.fraction_of_gpu_memory = 0.15;
   config.device = 0;
@@ -94,14 +104,17 @@ void MainThreads(int num_threads, bool use_gpu) {
       CHECK(predictor->Run(inputs, &outputs));
 
       // 4. Get output.
-      ASSERT_EQ(outputs.size(), 1UL);
-      LOG(INFO) << "TID: " << tid << ", "
-                << "output buffer size: " << outputs.front().data.length();
+      PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
+      // Check the output buffer size and result of each tid.
+      PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL);
+      float result[5] = {
+          0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706};
       const size_t num_elements =
           outputs.front().data.length() / sizeof(float);
 
       // The outputs' buffers are in CPU memory.
       for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-        LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+        PADDLE_ENFORCE_EQ(static_cast<float*>(outputs.front().data.data())[i],
+                          result[i]);
       }
     }
   });
@@ -111,15 +124,18 @@ void MainThreads(int num_threads, bool use_gpu) {
   }
 }
 
-TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); }
-
-#ifdef PADDLE_WITH_CUDA
-TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); }
-#endif
-
 }  // namespace demo
 }  // namespace paddle
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  paddle::demo::Main(false /* use_gpu */);
+  paddle::demo::MainThreads(1, false /* use_gpu */);
+  paddle::demo::MainThreads(4, false /* use_gpu */);
+  if (FLAGS_use_gpu) {
+    paddle::demo::Main(true /* use_gpu */);
+    paddle::demo::MainThreads(1, true /* use_gpu */);
+    paddle::demo::MainThreads(4, true /* use_gpu */);
+  }
+  return 0;
+}
diff --git a/paddle/contrib/inference/paddle_inference_api.map b/paddle/contrib/inference/paddle_inference_api.map
new file mode 100644
index 0000000000000000000000000000000000000000..5203784dc1fcb672eb6a26d9dfd3ffbe02e08038
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api.map
@@ -0,0 +1,6 @@
+{
+  global:
+    *paddle*;
+  local:
+    *;
+};
diff --git a/paddle/contrib/inference/paddle_inference_api.sym b/paddle/contrib/inference/paddle_inference_api.sym
new file mode 100644
index 0000000000000000000000000000000000000000..ef2a04d788aa86b7f6a61c4af479d70d1137f374
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api.sym
@@ -0,0 +1 @@
+*paddle*
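The demo above pins exact float outputs. If exact equality ever proves brittle across BLAS/MKL builds, a relative-tolerance comparison is the usual fallback; a minimal sketch (AlmostEqual is a hypothetical helper, not a Paddle API):

    #include <algorithm>
    #include <cmath>

    // Hypothetical helper, not part of Paddle: compare two floats with a
    // relative tolerance instead of exact equality.
    static bool AlmostEqual(float a, float b, float rtol = 1e-5f) {
      return std::fabs(a - b) <= rtol * std::max(std::fabs(a), std::fabs(b));
    }

    // Usage inside the demo's check loop:
    //   CHECK(AlmostEqual(static_cast<float*>(outputs.front().data.data())[i],
    //                     result[i]));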
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 1895aea7f98cb1ad12b2ce16545339252349ea37..b1c33c3415f49f9b1160655034350087432d0cb0 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -13,6 +13,12 @@ endif()
 # Create static library
 cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
+if(NOT APPLE)
+  # TODO(liuyiqun): Temporarily disable the link flag because it is not supported on Mac.
+  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
+  set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+endif()
+
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
   SRCS io.cc
diff --git a/paddle/fluid/inference/paddle_fluid.sym b/paddle/fluid/inference/paddle_fluid.sym
new file mode 100644
index 0000000000000000000000000000000000000000..ef2a04d788aa86b7f6a61c4af479d70d1137f374
--- /dev/null
+++ b/paddle/fluid/inference/paddle_fluid.sym
@@ -0,0 +1 @@
+*paddle*
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 6b06913d1c83f4534238ac3dd22ac4035c0f0fbf..5bfa1aaa696d5cbe8bdcb94d708746259952740f 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -29,6 +29,79 @@ using mkldnn::stream;
 using platform::to_void_cast;
 using platform::GetMKLDNNFormat;
 
+class ConvMKLDNNHandler : public platform::MKLDNNHandler {
+ public:
+  ConvMKLDNNHandler(
+      std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {
+    conv_pd_ = conv_pd;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
+    return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr,
+                                            "@dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {
+    auto src_pd = conv_pd_->src_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
+                               pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {
+    auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
+    auto weights_pd = conv_pd_->weights_primitive_desc();
+    return this->AcquireMemory(weights_pd, user_weights_pd,
+                               user_weights_memory_p, "@weights_mem_p",
+                               pipeline);
+  }
+
+  std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
+      std::shared_ptr<mkldnn::memory> src_memory_p,
+      std::shared_ptr<mkldnn::memory> weights_memory_p,
+      std::shared_ptr<mkldnn::memory> dst_memory_p) {
+    auto prim_key = key_ + "@conv_p";
+    auto prim_desc_key = key_ + "@conv_pd";
+    auto conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
+        dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find convolution primitive in device context");
+    if (conv_p == nullptr) {
+      conv_p = std::make_shared<mkldnn::convolution_forward>(
+          *conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
+          *(dst_memory_p.get()));
+
+      dev_ctx_.SetBlob(prim_key, conv_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_p;
+  }
+
+  // Generate keys for storing/retrieving primitives for this operator
+  // TODO(jczaja): Make hashing function more optimal
+  static std::string GetHash(memory::dims& input_dims,
+                             memory::dims& weights_dims,
+                             std::vector<int>& strides,
+                             std::vector<int>& paddings,
+                             std::vector<int>& dilations, int groups,
+                             const std::string& suffix) {
+    return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
+           dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
+           suffix;
+  }
+
+ private:
+  std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd_;
+};
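To make the caching scheme concrete, here is what a key built by ConvMKLDNNHandler::GetHash above looks like, computed standalone with assumed shapes and a hypothetical output variable name:

    // Sketch of the cache key format. The shapes and the "conv2d_0.tmp_0"
    // suffix are assumptions for illustration; the real key uses the actual
    // tensor dims and ctx.op().Output("Output").
    #include <iostream>
    #include <string>
    #include <vector>

    static std::string dims2str(const std::vector<int>& dims) {
      std::string s;
      for (int d : dims) s += std::to_string(d) + "-";
      return s;
    }

    int main() {
      std::vector<int> input{1, 3, 224, 224}, weights{64, 3, 7, 7};
      std::vector<int> strides{2, 2}, paddings{3, 3}, dilations{1, 1};
      int groups = 1;
      std::string key = dims2str(input) + dims2str(weights) + dims2str(strides) +
                        dims2str(paddings) + dims2str(dilations) +
                        std::to_string(groups) + "conv2d_0.tmp_0";
      std::cout << key << "\n";
      // Prints: 1-3-224-224-64-3-7-7-2-2-3-3-1-1-1conv2d_0.tmp_0
      return 0;
    }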
 
 template <typename T>
 class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
@@ -36,10 +109,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
 
-    // Get unique name for index
-    const std::string key = ctx.op().Output("Output");
-    const std::string key_conv_pd = key + "@conv_pd";
-
     auto& dev_ctx =
         ctx.template device_context<platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
@@ -80,68 +149,62 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::vectorize2int(filter->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
-    // create mkldnn memory from input tensors (data/weights)
-    auto user_src_memory = memory(
-        {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
-        to_void_cast(input_data));
-    auto user_weights_memory =
-        memory({{{weights_tz}, memory::data_type::f32, filter->format()},
-                mkldnn_engine},
-               to_void_cast(filter_data));
+    // Get unique name for storing MKLDNN primitives
+    const std::string key = ConvMKLDNNHandler::GetHash(
+        src_tz, weights_tz, strides, paddings, dilations, groups,
+        ctx.op().Output("Output"));
+    const std::string key_conv_pd = key + "@conv_pd";
+
+    std::vector<mkldnn::primitive> pipeline;
+
+    auto user_src_md = platform::MKLDNNMemDesc(
+        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
+    auto user_weights_md = platform::MKLDNNMemDesc(
+        {weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
 
     /* create memory descriptor for convolution without specified format
      * ('any') which lets a primitive (convolution in this case) choose
      * the memory format preferred for best performance
      */
-    auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
-                                          memory::format::any);
+    auto src_md = platform::MKLDNNMemDesc(
+        src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, memory::data_type::f32, memory::format::any);
-    auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32,
-                                          memory::format::any);
+        weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
+    auto dst_md = platform::MKLDNNMemDesc(
+        dst_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
 
     // create a conv primitive descriptor and save it for usage in backward
     std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
         ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
                              mkldnn_engine);
+    // Save conv_pd/src_memory/weights_memory for backward pass
+    dev_ctx.SetBlob(key_conv_pd, conv_pd);
 
-    // create reorder primitive if the input format is not the preferred one
-    auto src_memory = user_src_memory;
-    primitive reorder_src;
-    bool is_src_reordered = false;
-    if (memory::primitive_desc(conv_pd->src_primitive_desc()) !=
-        user_src_memory.get_primitive_desc()) {
-      src_memory = memory(conv_pd->src_primitive_desc());
-      reorder_src = reorder(user_src_memory, src_memory);
-      is_src_reordered = true;
-    }
-    auto weights_memory = user_weights_memory;
-    primitive reorder_weights;
-    bool is_weights_reordered = false;
-    if (memory::primitive_desc(conv_pd->weights_primitive_desc()) !=
-        user_weights_memory.get_primitive_desc()) {
-      weights_memory = memory(conv_pd->weights_primitive_desc());
-      reorder_weights = reorder(user_weights_memory, weights_memory);
-      is_weights_reordered = true;
-    }
+    ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
 
-    // create memory primitive for conv dst
-    auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data);
+    // create mkldnn memory from input tensors (data/weights)
+    auto user_src_memory_p =
+        handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
+    auto user_weights_memory_p = handler.AcquireWeightsMemory(
+        user_weights_md, to_void_cast<T>(filter_data));
+
+    // create reorder primitive if the input format is not the preferred one
+    auto src_memory_p =
+        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
+    auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
+        user_weights_memory_p, pipeline);
+    auto dst_memory_p =
+        handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
 
     // create convolution op primitive
-    auto conv_prim = conv_fwd(*conv_pd, src_memory, weights_memory, dst_memory);
+    auto conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
+                                             dst_memory_p);
 
     // push primitive to stream and wait until it's executed
-    std::vector<primitive> pipeline;
-    if (is_src_reordered) pipeline.push_back(reorder_src);
-    if (is_weights_reordered) pipeline.push_back(reorder_weights);
-    pipeline.push_back(conv_prim);
+    pipeline.push_back(*conv_p);
     stream(stream::kind::eager).submit(pipeline).wait();
 
-    // Save conv_pd/src_memory/weights_memory for backward pass
-    dev_ctx.SetBlob(key_conv_pd, conv_pd);
-
     output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(GetMKLDNNFormat(dst_memory));
+    output->set_format(GetMKLDNNFormat(*dst_memory_p));
   }
 
  private:
@@ -197,13 +260,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     if (!input_grad && !filter_grad) return;
 
-    // Get a unique name from "argument" name of "Output" variable
-    // This name will be used as key when saving info into device context
-    const std::string key = ctx.op().Input("Output");
-    const std::string key_conv_pd = key + "@conv_pd";
-
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
 
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
@@ -223,6 +283,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::vectorize2int(filter->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
+    // Get a unique name from "argument" name of "Output" variable
+    // This name will be used as key when saving info into device context
+    const std::string key =
+        ConvMKLDNNHandler::GetHash(src_tz, weights_tz, strides, paddings,
+                                   dilations, groups, ctx.op().Input("Output"));
+
+    const std::string key_conv_pd = key + "@conv_pd";
+
     // create mkldnn memory from input tensors (input/weights/output_grad)
     auto user_src_memory = memory(
         {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
diff --git a/paddle/fluid/operators/detection/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc
index 4e35c38e4e03d4d0f00601812fdc4803519b89ae..b5cb6a724c095eb849f3a184f13843e1a0cca92f 100644
--- a/paddle/fluid/operators/detection/prior_box_op.cc
+++ b/paddle/fluid/operators/detection/prior_box_op.cc
@@ -149,6 +149,13 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
               "(float) "
               "Prior boxes center offset.")
         .SetDefault(0.5);
+    AddAttr<bool>(
+        "min_max_aspect_ratios_order",
+        "(bool) If set True, the output prior box is in order of "
+        "[min, max, aspect_ratios], which is consistent with Caffe. "
+        "Please note, this order affects the weight order of the convolution "
+        "layers that follow, and does not affect the final detection results.")
+        .SetDefault(false);
     AddComment(R"DOC(
 Prior box operator
 Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
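The ordering switch is easiest to see with numbers. Assuming min_size=4, max_size=9 and expanded aspect ratios {1, 2, 1/2} (assumed values, not from the patch), the sketch below prints the prior half-widths in both orders; the set of boxes is identical, only their order changes, which is why the attribute docs warn about the weight order of downstream convolutions:

    // Standalone sketch of the two prior orderings, mirroring the branch
    // structure of the kernels in this patch.
    #include <cmath>
    #include <cstdio>

    int main() {
      const double min_size = 4., max_size = 9.;
      const double aspect_ratios[] = {1.0, 2.0, 0.5};  // after ExpandAspectRatios
      const int as_num = 3;

      std::printf("min_max_aspect_ratios_order=false:\n");
      for (int s = 0; s < as_num + 1; ++s) {
        double bw = (s < as_num)
                        ? min_size * std::sqrt(aspect_ratios[s]) / 2.
                        : std::sqrt(min_size * max_size) / 2.;  // max prior last
        std::printf("  prior %d: bw=%g\n", s, bw);
      }

      std::printf("min_max_aspect_ratios_order=true (Caffe order):\n");
      for (int s = 0; s < as_num + 1; ++s) {
        double bw;
        if (s == 0) {
          bw = min_size / 2.;                         // min prior first
        } else if (s == 1) {
          bw = std::sqrt(min_size * max_size) / 2.;   // then max prior
        } else {
          bw = min_size * std::sqrt(aspect_ratios[s - 1]) / 2.;  // ratios != 1
        }
        std::printf("  prior %d: bw=%g\n", s, bw);
      }
      return 0;
    }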
diff --git a/paddle/fluid/operators/detection/prior_box_op.cu b/paddle/fluid/operators/detection/prior_box_op.cu
index f67e6ca91c0852b5a3be35d23246884d1157caa4..1ea8cfc1d2af8cc6c332768a467cdcd4c0166319 100644
--- a/paddle/fluid/operators/detection/prior_box_op.cu
+++ b/paddle/fluid/operators/detection/prior_box_op.cu
@@ -28,8 +28,8 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
                             const int im_width, const int as_num,
                             const T offset, const T step_width,
                             const T step_height, const T* min_sizes,
-                            const T* max_sizes, const int min_num,
-                            bool is_clip) {
+                            const T* max_sizes, const int min_num, bool is_clip,
+                            bool min_max_aspect_ratios_order) {
   int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num;
   int box_num = height * width * num_priors;
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
@@ -44,14 +44,28 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
     T min_size = min_sizes[m];
     if (max_sizes) {
       int s = p % (as_num + 1);
-      if (s < as_num) {
-        T ar = aspect_ratios[s];
-        bw = min_size * sqrt(ar) / 2.;
-        bh = min_size / sqrt(ar) / 2.;
+      if (!min_max_aspect_ratios_order) {
+        if (s < as_num) {
+          T ar = aspect_ratios[s];
+          bw = min_size * sqrt(ar) / 2.;
+          bh = min_size / sqrt(ar) / 2.;
+        } else {
+          T max_size = max_sizes[m];
+          bw = sqrt(min_size * max_size) / 2.;
+          bh = bw;
+        }
       } else {
-        T max_size = max_sizes[m];
-        bw = sqrt(min_size * max_size) / 2.;
-        bh = bw;
+        if (s == 0) {
+          bw = bh = min_size / 2.;
+        } else if (s == 1) {
+          T max_size = max_sizes[m];
+          bw = sqrt(min_size * max_size) / 2.;
+          bh = bw;
+        } else {
+          T ar = aspect_ratios[s - 1];
+          bw = min_size * sqrt(ar) / 2.;
+          bh = min_size / sqrt(ar) / 2.;
+        }
       }
     } else {
       int s = p % as_num;
@@ -94,6 +108,8 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
     auto variances = ctx.Attr<std::vector<float>>("variances");
     auto flip = ctx.Attr<bool>("flip");
     auto clip = ctx.Attr<bool>("clip");
+    auto min_max_aspect_ratios_order =
+        ctx.Attr<bool>("min_max_aspect_ratios_order");
 
     std::vector<float> aspect_ratios;
     ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
@@ -149,7 +165,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
     GenPriorBox<T><<<grid, block, 0, stream>>>(
         boxes->data<T>(), r.data<T>(), height, width, im_height, im_width,
         aspect_ratios.size(), offset, step_width, step_height, min.data<T>(),
-        max_data, min_num, clip);
+        max_data, min_num, clip, min_max_aspect_ratios_order);
 
     framework::Tensor v;
     framework::TensorFromVector(variances, ctx.device_context(), &v);
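For orientation, the CUDA kernel walks one flat index over height x width x num_priors boxes. The decomposition below is inferred from the visible `p % (as_num + 1)` and `min_sizes[m]` context, so treat the exact formulas as an assumption rather than a quote of the kernel:

    // Sketch: mapping a flat index i onto (h, w, prior) for a H*W*num_priors
    // grid, runnable on the host for inspection.
    #include <cstdio>

    int main() {
      const int height = 2, width = 3, num_priors = 4;
      const int box_num = height * width * num_priors;
      for (int i = 0; i < box_num; ++i) {
        int h = i / (num_priors * width);  // row of the feature map
        int w = (i / num_priors) % width;  // column of the feature map
        int p = i % num_priors;            // prior slot at this location
        // With max_sizes present, m = p / (as_num + 1) would pick the
        // min_size, and s = p % (as_num + 1) the prior kind, as in the kernel.
        if (i < 6) std::printf("i=%d -> h=%d w=%d prior=%d\n", i, h, w, p);
      }
      return 0;
    }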
diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h
index 1c62fd8d2c4d4e4deba4ca6442efbaff83e36c35..4e226abbb51c271502f0ca5419d488643b5a1a82 100644
--- a/paddle/fluid/operators/detection/prior_box_op.h
+++ b/paddle/fluid/operators/detection/prior_box_op.h
@@ -68,6 +68,8 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     auto variances = ctx.Attr<std::vector<float>>("variances");
     auto flip = ctx.Attr<bool>("flip");
     auto clip = ctx.Attr<bool>("clip");
+    auto min_max_aspect_ratios_order =
+        ctx.Attr<bool>("min_max_aspect_ratios_order");
 
     std::vector<float> aspect_ratios;
     ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
@@ -108,26 +110,59 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
         int idx = 0;
         for (size_t s = 0; s < min_sizes.size(); ++s) {
           auto min_size = min_sizes[s];
-          // priors with different aspect ratios
-          for (size_t r = 0; r < aspect_ratios.size(); ++r) {
-            float ar = aspect_ratios[r];
-            box_width = min_size * sqrt(ar) / 2.;
-            box_height = min_size / sqrt(ar) / 2.;
-            e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-            e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-            e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-            e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-            idx++;
-          }
-          if (max_sizes.size() > 0) {
-            auto max_size = max_sizes[s];
-            // square prior with size sqrt(minSize * maxSize)
-            box_width = box_height = sqrt(min_size * max_size) / 2.;
+          if (min_max_aspect_ratios_order) {
+            box_width = box_height = min_size / 2.;
             e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
             e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
             e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
             e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
             idx++;
+            if (max_sizes.size() > 0) {
+              auto max_size = max_sizes[s];
+              // square prior with size sqrt(minSize * maxSize)
+              box_width = box_height = sqrt(min_size * max_size) / 2.;
+              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
+              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
+              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
+              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
+              idx++;
+            }
+            // priors with different aspect ratios
+            for (size_t r = 0; r < aspect_ratios.size(); ++r) {
+              float ar = aspect_ratios[r];
+              if (fabs(ar - 1.) < 1e-6) {
+                continue;
+              }
+              box_width = min_size * sqrt(ar) / 2.;
+              box_height = min_size / sqrt(ar) / 2.;
+              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
+              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
+              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
+              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
+              idx++;
+            }
+          } else {
+            // priors with different aspect ratios
+            for (size_t r = 0; r < aspect_ratios.size(); ++r) {
+              float ar = aspect_ratios[r];
+              box_width = min_size * sqrt(ar) / 2.;
+              box_height = min_size / sqrt(ar) / 2.;
+              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
+              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
+              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
+              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
+              idx++;
+            }
+            if (max_sizes.size() > 0) {
+              auto max_size = max_sizes[s];
+              // square prior with size sqrt(minSize * maxSize)
+              box_width = box_height = sqrt(min_size * max_size) / 2.;
+              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
+              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
+              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
+              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
+              idx++;
+            }
+          }
         }
       }
     }
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 33fec2c1073819d88d85a8872227adcb9df3e8f4..a8f93e6848a1db1f5aa0ee266a076af2b5d0c964 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -222,15 +222,16 @@ class MKLDNNHandler {
 
   static std::string GetHash(mkldnn::memory::dims& operand_dims,  // NOLINT
                              const std::string& suffix) {
-    auto dims2str = [](const mkldnn::memory::dims& operand_dims) {
-      std::string dstr = "";
-      for (size_t i = 0; i < operand_dims.size(); ++i) {
-        dstr += std::to_string(operand_dims[i]) + "-";
-      }
-      return dstr;
-    };
-
     return dims2str(operand_dims) + suffix;
+  };
+
+ protected:
+  static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
+    std::string dstr = "";
+    for (size_t i = 0; i < operand_dims.size(); ++i) {
+      dstr += std::to_string(operand_dims[i]) + "-";
+    }
+    return dstr;
   }
 
  protected:
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index d173b41e86f61954954b6a5ea9957d2e172deca0..bf45c11a9de53a109c72ff7a89b807bc80feb7c8 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -510,11 +510,23 @@ function gen_fluid_inference_lib() {
 EOF
         make -j `nproc` inference_lib_dist
         cd ${PADDLE_ROOT}/build
-        mv fluid_install_dir fluid
+        cp -r fluid_install_dir fluid
         tar -cf fluid.tgz fluid
     fi
 }
 
+function test_fluid_inference_lib() {
+    if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
+        cat <<EOF
+    ========================================
+    Testing fluid library for inference ...
+    ========================================
+EOF
+        cd ${PADDLE_ROOT}/paddle/contrib/inference/demo_ci
+        ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF}
+    fi
+}
+
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ ... @@ def prior_box(input,
               clip=False,
               steps=[0.0, 0.0],
               offset=0.5,
-              name=None):
+              name=None,
+              min_max_aspect_ratios_order=False):
@@ ... @@ def prior_box(input,
         'step_w': steps[0],
         'step_h': steps[1],
-        'offset': offset
+        'offset': offset,
+        'min_max_aspect_ratios_order': min_max_aspect_ratios_order
     }
     if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0:
         if not _is_list_or_tuple_(max_sizes):
@@ -911,7 +918,8 @@ def multi_box_head(inputs,
                    kernel_size=1,
                    pad=0,
                    stride=1,
-                   name=None):
+                   name=None,
+                   min_max_aspect_ratios_order=False):
     """
     Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
     For details of this algorithm, please refer to the
@@ -954,6 +962,11 @@ def multi_box_head(inputs,
         pad(int|list|tuple): The padding of conv2d. Default: 0.
         stride(int|list|tuple): The stride of conv2d. Default: 1.
         name(str): Name of the prior box layer. Default: None.
+        min_max_aspect_ratios_order(bool): If set True, the output prior box is
+            in order of [min, max, aspect_ratios], which is consistent with
+            Caffe. Please note, this order affects the weight order of the
+            convolution layers that follow, and does not affect the final
+            detection results. Default: False.
 
     Returns:
         tuple: A tuple with four Variables. (mbox_loc, mbox_conf, boxes, variances)
@@ -1068,7 +1081,8 @@ def multi_box_head(inputs,
         step = [step_w[i] if step_w else 0.0, step_h[i] if step_h else 0.0]
 
         box, var = prior_box(input, image, min_size, max_size, aspect_ratio,
-                             variance, flip, clip, step, offset)
+                             variance, flip, clip, step, offset, None,
+                             min_max_aspect_ratios_order)
 
         box_results.append(box)
         var_results.append(var)
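A quick sanity check of the shapes the unit test below relies on: min_sizes={2, 4} and max_sizes={5, 10} with aspect_ratios={2, 3} and flip=True expand to real_aspect_ratios={1, 2, 1/2, 3, 1/3}, so each location emits 5*2 + 2 = 12 priors in either ordering:

    // Worked example of the prior count used by test_prior_box_op.py.
    #include <cstdio>

    int main() {
      const int num_real_aspect_ratios = 5;  // {1, 2, 1/2, 3, 1/3}
      const int num_min_sizes = 2;           // {2, 4}
      const int num_max_sizes = 2;           // {5, 10}
      // min_max_aspect_ratios_order only permutes these priors; it never
      // changes how many there are.
      int num_priors = num_real_aspect_ratios * num_min_sizes + num_max_sizes;
      std::printf("num_priors per location = %d\n", num_priors);  // 12
      return 0;
    }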
diff --git a/python/paddle/fluid/tests/unittests/test_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_prior_box_op.py
index bcbc02a2baa46b9ab583ecf3006bd3262e6038fd..e15554737b9f3fa36382dde15ded928271679538 100644
--- a/python/paddle/fluid/tests/unittests/test_prior_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prior_box_op.py
@@ -32,6 +32,7 @@ class TestPriorBoxOp(OpTest):
             'variances': self.variances,
             'flip': self.flip,
             'clip': self.clip,
+            'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order,
             'step_w': self.step_w,
             'step_h': self.step_h,
             'offset': self.offset
@@ -52,6 +53,9 @@ class TestPriorBoxOp(OpTest):
         max_sizes = [5, 10]
         self.max_sizes = np.array(max_sizes).astype('float32').tolist()
 
+    def set_min_max_aspect_ratios_order(self):
+        self.min_max_aspect_ratios_order = False
+
     def init_test_params(self):
         self.layer_w = 32
         self.layer_h = 32
@@ -71,6 +75,7 @@ class TestPriorBoxOp(OpTest):
         self.set_max_sizes()
         self.aspect_ratios = [2.0, 3.0]
         self.flip = True
+        self.set_min_max_aspect_ratios_order()
         self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
         self.aspect_ratios = np.array(
             self.aspect_ratios, dtype=np.float).flatten()
@@ -78,7 +83,6 @@ class TestPriorBoxOp(OpTest):
         self.variances = np.array(self.variances, dtype=np.float).flatten()
 
         self.clip = True
-
         self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes)
         if len(self.max_sizes) > 0:
             self.num_priors += len(self.max_sizes)
@@ -106,26 +110,60 @@ class TestPriorBoxOp(OpTest):
                 idx = 0
                 for s in range(len(self.min_sizes)):
                     min_size = self.min_sizes[s]
-                    # rest of priors
-                    for r in range(len(self.real_aspect_ratios)):
-                        ar = self.real_aspect_ratios[r]
-                        c_w = min_size * math.sqrt(ar) / 2
-                        c_h = (min_size / math.sqrt(ar)) / 2
-                        out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
-                                                   (c_y - c_h) / self.image_h,
-                                                   (c_x + c_w) / self.image_w,
-                                                   (c_y + c_h) / self.image_h]
-                        idx += 1
-
-                    if len(self.max_sizes) > 0:
-                        max_size = self.max_sizes[s]
-                        # second prior: aspect_ratio = 1,
-                        c_w = c_h = math.sqrt(min_size * max_size) / 2
+                    if not self.min_max_aspect_ratios_order:
+                        # rest of priors
+                        for r in range(len(self.real_aspect_ratios)):
+                            ar = self.real_aspect_ratios[r]
+                            c_w = min_size * math.sqrt(ar) / 2
+                            c_h = (min_size / math.sqrt(ar)) / 2
+                            out_boxes[h, w, idx, :] = [
+                                (c_x - c_w) / self.image_w, (c_y - c_h) /
+                                self.image_h, (c_x + c_w) / self.image_w,
+                                (c_y + c_h) / self.image_h
+                            ]
+                            idx += 1
+
+                        if len(self.max_sizes) > 0:
+                            max_size = self.max_sizes[s]
+                            # second prior: aspect_ratio = 1,
+                            c_w = c_h = math.sqrt(min_size * max_size) / 2
+                            out_boxes[h, w, idx, :] = [
+                                (c_x - c_w) / self.image_w, (c_y - c_h) /
+                                self.image_h, (c_x + c_w) / self.image_w,
+                                (c_y + c_h) / self.image_h
+                            ]
+                            idx += 1
+                    else:
+                        c_w = c_h = min_size / 2.
                         out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
                                                    (c_y - c_h) / self.image_h,
                                                    (c_x + c_w) / self.image_w,
                                                    (c_y + c_h) / self.image_h]
                         idx += 1
+                        if len(self.max_sizes) > 0:
+                            max_size = self.max_sizes[s]
+                            # second prior: aspect_ratio = 1,
+                            c_w = c_h = math.sqrt(min_size * max_size) / 2
+                            out_boxes[h, w, idx, :] = [
+                                (c_x - c_w) / self.image_w, (c_y - c_h) /
+                                self.image_h, (c_x + c_w) / self.image_w,
+                                (c_y + c_h) / self.image_h
+                            ]
+                            idx += 1
+
+                        # rest of priors
+                        for r in range(len(self.real_aspect_ratios)):
+                            ar = self.real_aspect_ratios[r]
+                            if abs(ar - 1.) < 1e-6:
+                                continue
+                            c_w = min_size * math.sqrt(ar) / 2
+                            c_h = (min_size / math.sqrt(ar)) / 2
+                            out_boxes[h, w, idx, :] = [
+                                (c_x - c_w) / self.image_w, (c_y - c_h) /
+                                self.image_h, (c_x + c_w) / self.image_w,
+                                (c_y + c_h) / self.image_h
+                            ]
+                            idx += 1
 
         # clip the prior's coordinate such that it is within [0, 1]
         if self.clip:
@@ -137,10 +175,15 @@ class TestPriorBoxOp(OpTest):
         self.out_var = out_var.astype('float32')
 
 
-class TestPriorBoxOpWithMaxSize(TestPriorBoxOp):
+class TestPriorBoxOpWithoutMaxSize(TestPriorBoxOp):
     def set_max_sizes(self):
         self.max_sizes = []
 
 
+class TestPriorBoxOpWithSpecifiedOutOrder(TestPriorBoxOp):
+    def set_min_max_aspect_ratios_order(self):
+        self.min_max_aspect_ratios_order = True
+
+
 if __name__ == '__main__':
     unittest.main()