diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index 98c2f68a6c39ed12795bad4a905558917c0275a4..87173fc42a46c8218fbf0beb4ebf7760f69b7c24 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -45,6 +45,10 @@ endfunction(inference_api_test)
 cc_library(paddle_inference_api
     SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
     DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+if(NOT APPLE)
+  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_api.sym")
+  set_target_properties(paddle_inference_api PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+endif()
 
 # Here the shared library doesn't depend on other fluid libraries, or double free will occur.
 cc_library(paddle_inference_api_shared SHARED
@@ -53,8 +57,19 @@ add_dependencies(paddle_inference_api_shared ${FLUID_CORE_MODULES} ${GLOB_OP_LIB
 set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api)
 
 if(NOT APPLE)
-  set(LINK_FLAGS "-fPIC -fvisibility=hidden")
+  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_api.map")
   set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+  FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
+    "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
+    " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_inference_api.so\" RESULT_VARIABLE symbol_res)\n"
+    "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
+    "  message(FATAL_ERROR \"Check symbol failed.\")\n"
+    "endif()\n")
+  add_custom_command(
+    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
+    COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake"
+    DEPENDS paddle_inference_api_shared)
+  add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
 endif()
 
 cc_test(test_paddle_inference_api
diff --git a/paddle/contrib/inference/check_symbol.sh b/paddle/contrib/inference/check_symbol.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6547ca1413649968e8a0be146915e07192a99898
--- /dev/null
+++ b/paddle/contrib/inference/check_symbol.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+lib=$1
+if [ $# -ne 1 ]; then echo "No input library given"; exit 1; fi
+
+num_paddle_syms=$(nm -D --defined-only ${lib} | grep -c paddle)
+num_google_syms=$(nm -D --defined-only ${lib} | grep -c google)
+
+if [ $num_paddle_syms -le 0 ]; then echo "No paddle symbols found"; exit 1; fi
+if [ $num_google_syms -ge 1 ]; then echo "Found exported google symbols; they should be hidden"; exit 1; fi
+
+exit 0
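The check above passes only when the retain-symbols file / version script leaves nothing but paddle-related entries in the dynamic symbol table. A minimal, self-contained sketch of the same idea using GCC/Clang visibility attributes (file name, macro name, and build command here are illustrative, not part of this patch):

// visibility_demo.cc -- build with: g++ -shared -fPIC -fvisibility=hidden visibility_demo.cc
// With -fvisibility=hidden everything defaults to hidden; only symbols marked
// "default" survive in the dynamic symbol table (inspect with: nm -D --defined-only).
#define PUBLIC_API __attribute__((visibility("default")))

namespace paddle {
PUBLIC_API int ExportedEntry() { return 42; }  // exported: shows up in nm -D
int HiddenHelper() { return 0; }               // hidden: absent from the dynamic table
}  // namespace paddle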
diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt
index ecece6fe3471ad7b89c84c3e2b67af4ae9eb3c36..2d501bf0085b1bd4c39ee1a6dfaaa9622fd72ce1 100644
--- a/paddle/contrib/inference/demo/CMakeLists.txt
+++ b/paddle/contrib/inference/demo/CMakeLists.txt
@@ -13,8 +13,6 @@
 # limitations under the License.
 #
-inference_api_test(simple_on_word2vec ARGS test_word2vec)
-
 option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF)
 if(NOT WITH_INFERENCE_DEMO)
   return()
diff --git a/paddle/contrib/inference/demo_ci/CMakeLists.txt b/paddle/contrib/inference/demo_ci/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..789bff7f23cd89bfaeba180efa95972cef6fc58c
--- /dev/null
+++ b/paddle/contrib/inference/demo_ci/CMakeLists.txt
@@ -0,0 +1,77 @@
+cmake_minimum_required(VERSION 3.0)
+
+project(cpp_inference_demo CXX C)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+
+if(NOT DEFINED PADDLE_LIB)
+  message(FATAL_ERROR "Please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
+endif()
+if(NOT DEFINED DEMO_NAME)
+  message(FATAL_ERROR "Please set DEMO_NAME with -DDEMO_NAME=demo_name")
+endif()
+
+option(WITH_MKL        "Compile demo with MKL/OpenBlas support; default: use MKL."       ON)
+option(WITH_GPU        "Compile demo with GPU/CPU; default: use CPU."                    OFF)
+option(WITH_STATIC_LIB "Compile demo with static/shared library; default: use static."   ON)
+
+if(WITH_GPU)
+  set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
+endif()
+
+include_directories("${PADDLE_LIB}")
+include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
+include_directories("${PADDLE_LIB}/third_party/install/glog/include")
+include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
+include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
+
+include_directories("${PADDLE_LIB}/third_party/boost")
+include_directories("${PADDLE_LIB}/third_party/eigen3")
+
+link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
+link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
+link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
+link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
+link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
+link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
+
+add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
+
+if(WITH_MKL)
+  include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so
+               ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5.so)
+  set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
+  if(EXISTS ${MKLDNN_PATH})
+    include_directories("${MKLDNN_PATH}/include")
+    set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
+  endif()
+else()
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
+endif()
+
+if(WITH_STATIC_LIB)
+  set(DEPS
+      "-Wl,--whole-archive"
+      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a
+      "-Wl,--no-whole-archive"
+      ${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.a)
+else()
+  # Note: libpaddle_inference_api.so must be put before libpaddle_fluid.so
+  set(DEPS
+      ${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.so
+      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so)
+endif()
+set(EXTERNAL_LIB "-lrt -ldl -lpthread")
+
+set(DEPS ${DEPS}
+    ${MATH_LIB} ${MKLDNN_LIB}
+    glog gflags protobuf snappystream snappy z
+    ${EXTERNAL_LIB})
+if(WITH_GPU)
+  set(DEPS ${DEPS} ${CUDA_LIB}/libcudart.so)
+endif()
+
+target_link_libraries(${DEMO_NAME} ${DEPS})
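The -Wl,--whole-archive wrapper around libpaddle_fluid.a matters because operators typically register themselves through static initializers, which the linker would otherwise drop from archive members that are never referenced directly. A self-contained sketch of that registration pattern (the registry and the "conv2d" name are hypothetical, not Paddle's actual macros):

#include <functional>
#include <iostream>
#include <map>
#include <string>

// Hypothetical operator registry.
std::map<std::string, std::function<void()>>& Registry() {
  static std::map<std::string, std::function<void()>> registry;
  return registry;
}

struct Registrar {
  Registrar(const std::string& name, std::function<void()> fn) {
    Registry()[name] = fn;  // side effect of a static initializer
  }
};

// This static object only runs (and registers "conv2d") if the object file
// containing it is actually pulled into the final binary.
static Registrar conv_registrar("conv2d", [] { std::cout << "run conv2d\n"; });

int main() {
  Registry()["conv2d"]();  // works only when the registration was linked in
  return 0;
}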
diff --git a/paddle/contrib/inference/demo_ci/run.sh b/paddle/contrib/inference/demo_ci/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e3a7269af795b05c296423cb2dc92b753397c6b3
--- /dev/null
+++ b/paddle/contrib/inference/demo_ci/run.sh
@@ -0,0 +1,34 @@
+set -x
+PADDLE_ROOT=$1
+WITH_MKL=$2
+WITH_GPU=$3
+if [ "$WITH_GPU" == "ON" ]; then
+  use_gpu_list='true false'
+else
+  use_gpu_list='false'
+fi
+
+mkdir -p build
+cd build
+
+for WITH_STATIC_LIB in false; do
+  rm -rf *
+  cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+    -DWITH_MKL=$WITH_MKL \
+    -DDEMO_NAME=simple_on_word2vec \
+    -DWITH_GPU=$WITH_GPU \
+    -DWITH_STATIC_LIB=$WITH_STATIC_LIB
+  make
+  for use_gpu in $use_gpu_list; do
+    ./simple_on_word2vec \
+      --dirname=${PADDLE_ROOT}/build/python/paddle/fluid/tests/book/word2vec.inference.model \
+      --use_gpu=$use_gpu
+    if [ $? -ne 0 ]; then
+      echo "inference demo run failed."
+      exit 1
+    fi
+  done
+done
+set +x
diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo_ci/simple_on_word2vec.cc
similarity index 68%
rename from paddle/contrib/inference/demo/simple_on_word2vec.cc
rename to paddle/contrib/inference/demo_ci/simple_on_word2vec.cc
index c253014642f39a042430992548a285cc7078a959..9713837f86d40383da946af1681e1945c84336b0 100644
--- a/paddle/contrib/inference/demo/simple_on_word2vec.cc
+++ b/paddle/contrib/inference/demo_ci/simple_on_word2vec.cc
@@ -16,21 +16,27 @@ limitations under the License. */
  * This file contains a simple demo for how to take a model for inference.
  */
 
+#include <gflags/gflags.h>
 #include <glog/logging.h>
-#include <gtest/gtest.h>
 #include <memory>
 #include <thread>
-#include "paddle/contrib/inference/paddle_inference_api.h"
+#include "contrib/inference/paddle_inference_api.h"
+#include "paddle/fluid/platform/enforce.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_bool(use_gpu, false, "Whether use gpu.");
 
 namespace paddle {
 namespace demo {
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
 void Main(bool use_gpu) {
   //# 1. Create PaddlePredictor with a config.
   NativeConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  if (FLAGS_dirname.empty()) {
+    LOG(INFO) << "Usage: ./simple_on_word2vec --dirname=path/to/your/model";
+    exit(1);
+  }
+  config.model_dir = FLAGS_dirname;
   config.use_gpu = use_gpu;
   config.fraction_of_gpu_memory = 0.15;
   config.device = 0;
@@ -54,12 +60,16 @@ void Main(bool use_gpu) {
   CHECK(predictor->Run(slots, &outputs));
 
   //# 4. Get output.
-  ASSERT_EQ(outputs.size(), 1UL);
-  LOG(INFO) << "output buffer size: " << outputs.front().data.length();
+  PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
+  // Check the output buffer size and result of each tid.
+  PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL);
+  float result[5] = {
+      0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706};
   const size_t num_elements = outputs.front().data.length() / sizeof(float);
   // The outputs' buffers are in CPU memory.
   for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-    LOG(INFO) << static_cast<float *>(outputs.front().data.data())[i];
+    PADDLE_ENFORCE_EQ(static_cast<float *>(outputs.front().data.data())[i],
+                      result[i]);
   }
 }
@@ -68,7 +78,7 @@ void MainThreads(int num_threads, bool use_gpu) {
   // Multi-threads only support on CPU
   // 0. Create PaddlePredictor with a config.
   NativeConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.model_dir = FLAGS_dirname;
   config.use_gpu = use_gpu;
   config.fraction_of_gpu_memory = 0.15;
   config.device = 0;
@@ -94,14 +104,17 @@ void MainThreads(int num_threads, bool use_gpu) {
       CHECK(predictor->Run(inputs, &outputs));
 
       // 4. Get output.
-      ASSERT_EQ(outputs.size(), 1UL);
-      LOG(INFO) << "TID: " << tid << ", "
-                << "output buffer size: " << outputs.front().data.length();
+      PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
+      // Check the output buffer size and result of each tid.
+      PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL);
+      float result[5] = {
+          0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706};
       const size_t num_elements = outputs.front().data.length() / sizeof(float);
       // The outputs' buffers are in CPU memory.
       for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-        LOG(INFO) << static_cast<float *>(outputs.front().data.data())[i];
+        PADDLE_ENFORCE_EQ(static_cast<float *>(outputs.front().data.data())[i],
+                          result[i]);
       }
     }
   });
@@ -111,15 +124,18 @@ void MainThreads(int num_threads, bool use_gpu) {
   }
 }
 
-TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); }
-
-#ifdef PADDLE_WITH_CUDA
-TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); }
-#endif
-
 }  // namespace demo
 }  // namespace paddle
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  paddle::demo::Main(false /*use_gpu*/);
+  paddle::demo::MainThreads(1, false /*use_gpu*/);
+  paddle::demo::MainThreads(4, false /*use_gpu*/);
+  if (FLAGS_use_gpu) {
+    paddle::demo::Main(true /*use_gpu*/);
+    paddle::demo::MainThreads(1, true /*use_gpu*/);
+    paddle::demo::MainThreads(4, true /*use_gpu*/);
+  }
+  return 0;
+}
diff --git a/paddle/contrib/inference/paddle_inference_api.map b/paddle/contrib/inference/paddle_inference_api.map
new file mode 100644
index 0000000000000000000000000000000000000000..5203784dc1fcb672eb6a26d9dfd3ffbe02e08038
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api.map
@@ -0,0 +1,6 @@
+{
+  global:
+    *paddle*;
+  local:
+    *;
+};
diff --git a/paddle/contrib/inference/paddle_inference_api.sym b/paddle/contrib/inference/paddle_inference_api.sym
new file mode 100644
index 0000000000000000000000000000000000000000..ef2a04d788aa86b7f6a61c4af479d70d1137f374
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api.sym
@@ -0,0 +1 @@
+*paddle*
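The *paddle* globs in the .sym and .map files above work because Itanium C++ name mangling embeds every namespace name into the symbol, so anything defined under namespace paddle contains the substring "paddle". A tiny illustration (the function name is hypothetical):

// mangling_demo.cc -- inspect with: nm mangling_demo.o | c++filt
namespace paddle {
void Demo() {}  // mangled as _ZN6paddle4DemoEv, which matches the "*paddle*" glob
}  // namespace paddle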
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 1895aea7f98cb1ad12b2ce16545339252349ea37..b1c33c3415f49f9b1160655034350087432d0cb0 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -13,6 +13,12 @@ endif()
 # Create static library
 cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
+if(NOT APPLE)
+  # TODO(liuyiqun): Temporarily disable the link flag because it is not supported on Mac.
+  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
+  set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+endif()
+
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
   SRCS io.cc
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc
index d09bf3ed161703b0cf273522921e157c7360a0bc..bd24e8a7d9c20b8cd9c4e41a76ffc33a004a9a69 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -90,6 +90,20 @@ std::string DataFlowGraph::DotString() const {
   return dot.Build();
 }
 
+std::string DataFlowGraph::HumanReadableInfo(bool show_values,
+                                             bool show_functions) const {
+  std::stringstream values, functions;
+  for (auto &n : nodes.nodes()) {
+    if (show_values && n->IsValue()) {
+      values << n->repr() << "\n";
+    }
+    if (show_functions && n->IsFunction()) {
+      functions << n->repr() << "\n";
+    }
+  }
+  return "Values:\n" + values.str() + "\n\n" + "Functions:\n" + functions.str();
+}
+
 //
 // NodesBFSIterator
 //
@@ -146,7 +160,7 @@ bool GraphTraits<DataFlowGraph>::NodesBFSIterator::operator==(
   if ((!queue_.empty()) && (!other.queue_.empty())) {
     return queue_.front() == other.queue_.front() &&
            visited_.size() == other.visited_.size();  // here need to check the
-                                                      // equality of queue and
+                                                      // equality of queue and
                                                       // visited. Just a light but weak implementation.
   }
   return false;
@@ -208,6 +222,76 @@ Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
   return stack_.top();
 }
 
+GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
+    const std::vector<Node *> &source) {
+  PADDLE_ENFORCE(!source.empty(),
+                 "Start points of topological sorting should not be empty!");
+  std::unordered_set<Node *> visited;
+  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
+
+  std::vector<Node *> inlink_visited;
+  while (!to_visit.empty()) {
+    std::vector<Node *> queue(to_visit.begin(), to_visit.end());
+    for (auto *p : queue) {
+      inlink_visited.clear();
+
+      std::copy_if(p->inlinks.begin(), p->inlinks.end(),
+                   std::back_inserter(inlink_visited),
+                   [&](Node *x) { return visited.count(x); });
+
+      if (inlink_visited.size() == p->inlinks.size()) {
+        sorted_.push_back(p);
+        for (auto *_ : p->outlinks) {
+          if (!visited.count(_)) {
+            to_visit.insert(_);
+          }
+        }
+
+        to_visit.erase(p);
+        visited.insert(p);
+      }
+    }
+  }
+}
+
+GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
+    const paddle::inference::analysis::GraphTraits<
+        DataFlowGraph>::NodesTSIterator &other)
+    : sorted_(other.sorted_), cursor_(other.cursor_) {}
+
+Node &GraphTraits<DataFlowGraph>::NodesTSIterator::operator*() {
+  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
+  return *sorted_[cursor_];
+}
+
+paddle::inference::analysis::GraphTraits<DataFlowGraph>::NodesTSIterator
+    &GraphTraits<DataFlowGraph>::NodesTSIterator::operator++() {
+  if (++cursor_ >= sorted_.size()) {
+    sorted_.clear();
+    cursor_ = 0;
+  }
+  return *this;
+}
+paddle::inference::analysis::GraphTraits<DataFlowGraph>::NodesTSIterator &
+GraphTraits<DataFlowGraph>::NodesTSIterator::operator=(
+    const paddle::inference::analysis::GraphTraits<
+        DataFlowGraph>::NodesTSIterator &other) {
+  cursor_ = other.cursor_;
+  sorted_ = other.sorted_;
+  return *this;
+}
+
+bool GraphTraits<DataFlowGraph>::NodesTSIterator::operator==(
+    const paddle::inference::analysis::GraphTraits<
+        DataFlowGraph>::NodesTSIterator &other) {
+  return sorted_ == other.sorted_ && cursor_ == other.cursor_;
+}
+
+Node *GraphTraits<DataFlowGraph>::NodesTSIterator::operator->() {
+  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
+  return sorted_[cursor_];
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
index a4fefc83e0c551d52bec87299bcbc966e7a2dbf7..5dd914d1971bfb5bcc0b1db41d73e2b67120bc06 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -48,6 +48,9 @@ struct DataFlowGraph {
   // Output a DOT graph file for debug.
   std::string DotString() const;
 
+  std::string HumanReadableInfo(bool show_values = true,
+                                bool show_functions = true) const;
+
  private:
   // Remove duplicate edges and so on.
   void Clean();
@@ -107,6 +110,32 @@ struct GraphTraits<DataFlowGraph> {
     std::unordered_set<Node *> visited_;
   };
 
+  // Topological sorting iterator on nodes.
+  struct NodesTSIterator
+      : public std::iterator<std::forward_iterator_tag, Node *> {
+    NodesTSIterator() = default;
+    explicit NodesTSIterator(const std::vector<Node *> &source);
+    NodesTSIterator(NodesTSIterator &&other)
+        : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
+      other.cursor_ = 0;
+    }
+    NodesTSIterator(const NodesTSIterator &other);
+
+    Node &operator*();
+    NodesTSIterator &operator++();
+    // TODO(Superjomn) current implementation just compare the first
+    // element, need to compare the graph and all the elements in the queue and
+    // set.
+    NodesTSIterator &operator=(const NodesTSIterator &other);
+    bool operator==(const NodesTSIterator &other);
+    bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
+    Node *operator->();
+
+   private:
+    std::vector<Node *> sorted_;
+    int cursor_{0};
+  };
+
   explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
 
   // default use BFS to visit the nodes.
@@ -119,17 +148,24 @@ struct GraphTraits<DataFlowGraph> {
   iterator_range<NodesDFSIterator> nodes_in_DFS() {
     return iterator_range<NodesDFSIterator>(nodes_dfs_begin(), nodes_dfs_end());
   }
+  iterator_range<NodesTSIterator> nodes_in_TS() {
+    return iterator_range<NodesTSIterator>(nodes_ts_begin(), nodes_ts_end());
+  }
 
  private:
   NodesBFSIterator nodes_bfs_begin() {
     return NodesBFSIterator(graph_->inputs);
   }
   NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); }
+
   NodesDFSIterator nodes_dfs_begin() {
     return NodesDFSIterator(graph_->inputs);
   }
   NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); }
 
+  NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_->inputs); }
+  NodesTSIterator nodes_ts_end() { return NodesTSIterator(); }
+
+ private:
   DataFlowGraph *graph_;
 };
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
index 9d7cceeb65888b8ba3fdf39e88fc2877abd82d11..7912f8d7f17ae3c79e8f73f36b7095fd52c9ac86 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -24,11 +24,11 @@ TEST(DataFlowGraph, BFS) {
   auto dfg = ProgramDescToDFG(desc);
   dfg.Build();
 
-  for (auto* in : dfg.inputs) {
+  for (auto *in : dfg.inputs) {
     LOG(INFO) << "inputs: " << in->name() << " "
               << static_cast<int>(in->type());
   }
-  for (auto* out : dfg.outputs) {
+  for (auto *out : dfg.outputs) {
     LOG(INFO) << "outputs: " << out->name() << " "
               << static_cast<int>(out->type());
   }
@@ -57,6 +57,71 @@ TEST(DataFlowGraph, DFS) {
   ASSERT_EQ(count, dfg.nodes.size());
 }
 
+// Topological sorting.
+/*
+ * Graph topology
+ * inputs: 0, 1, 2
+ * 0 -> 4
+ * 0 -> 5
+ * 1 -> 6
+ * 2 -> 7
+ * 4 -> 5
+ * 4 -> 7
+ * 4 -> 3
+ * 7 -> 3
+ */
+TEST(DataFlowGraph, TS) {
+  DataFlowGraph graph;
+
+  for (int i = 0; i < 8; i++) {
+    auto *node = graph.nodes.Create(Node::Type::kValue);
+    node->SetName("node-" + std::to_string(i));
+  }
+
+  auto add_link = [&](int i, int j) {
+    Node *source = graph.nodes.GetMutable(i);
+    Node *target = graph.nodes.GetMutable(j);
+    target->inlinks.push_back(source);
+    source->outlinks.push_back(target);
+  };
+
+  graph.inputs.push_back(graph.nodes.GetMutable(0));
+  graph.inputs.push_back(graph.nodes.GetMutable(1));
+  graph.inputs.push_back(graph.nodes.GetMutable(2));
+
+  add_link(0, 4);
+  add_link(0, 5);
+  add_link(1, 6);
+  add_link(2, 7);
+  add_link(4, 5);
+  add_link(4, 7);
+  add_link(4, 3);
+  add_link(7, 3);
+
+  auto its = GraphTraits<DataFlowGraph>(&graph).nodes_in_TS();
+  std::vector<int> sorted_ids;
+  for (auto it = its.begin(); it != its.end(); ++it) {
+    LOG(INFO) << it->name();
+    sorted_ids.push_back(it->id());
+  }
+
+  // Assert that a occurs before b in sorted_ids.
+  auto assert_positive_sequence_pair = [&](int a, int b) {
+    auto a_offset = std::find(sorted_ids.begin(), sorted_ids.end(), a);
+    auto b_offset = std::find(sorted_ids.begin(), sorted_ids.end(), b);
+    ASSERT_LT(a_offset, b_offset);
+  };
+
+  assert_positive_sequence_pair(2, 7);
+  assert_positive_sequence_pair(7, 3);
+  assert_positive_sequence_pair(4, 3);
+  assert_positive_sequence_pair(0, 4);
+  assert_positive_sequence_pair(0, 5);
+  assert_positive_sequence_pair(1, 6);
+  assert_positive_sequence_pair(4, 5);
+  assert_positive_sequence_pair(4, 7);
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
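NodesTSIterator's constructor is, in effect, a repeated-scan variant of Kahn's algorithm: a node is emitted once all of its inlinks have been visited. A self-contained sketch of the same strategy on the TS test graph above (plain adjacency lists instead of Node objects; this illustrates the algorithm, not the framework code):

#include <iostream>
#include <queue>
#include <vector>

int main() {
  // Edges from the test: 0->4, 0->5, 1->6, 2->7, 4->5, 4->7, 4->3, 7->3.
  std::vector<std::vector<int>> out = {{4, 5}, {6}, {7}, {}, {5, 7, 3}, {}, {}, {3}};
  std::vector<int> indegree(out.size(), 0);
  for (const auto &edges : out)
    for (int v : edges) ++indegree[v];

  std::queue<int> ready;  // starts with the inputs: 0, 1, 2
  for (int u = 0; u < static_cast<int>(out.size()); ++u)
    if (indegree[u] == 0) ready.push(u);

  while (!ready.empty()) {
    int u = ready.front();
    ready.pop();
    std::cout << "node-" << u << "\n";  // emitted in topological order
    for (int v : out[u])
      if (--indegree[v] == 0) ready.push(v);
  }
  return 0;
}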
diff --git a/paddle/fluid/inference/paddle_fluid.sym b/paddle/fluid/inference/paddle_fluid.sym
new file mode 100644
index 0000000000000000000000000000000000000000..ef2a04d788aa86b7f6a61c4af479d70d1137f374
--- /dev/null
+++ b/paddle/fluid/inference/paddle_fluid.sym
@@ -0,0 +1 @@
+*paddle*
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 6b06913d1c83f4534238ac3dd22ac4035c0f0fbf..5bfa1aaa696d5cbe8bdcb94d708746259952740f 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -29,6 +29,79 @@ using mkldnn::stream;
 using platform::to_void_cast;
 using platform::GetMKLDNNFormat;
 
+class ConvMKLDNNHandler : public platform::MKLDNNHandler {
+ public:
+  ConvMKLDNNHandler(
+      std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {
+    conv_pd_ = conv_pd;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
+    return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(),
+                                            ptr, "@dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {
+    auto src_pd = conv_pd_->src_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
+                               pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {
+    auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
+    auto weights_pd = conv_pd_->weights_primitive_desc();
+    return this->AcquireMemory(weights_pd, user_weights_pd,
+                               user_weights_memory_p, "@weights_mem_p",
+                               pipeline);
+  }
+
+  std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
+      std::shared_ptr<mkldnn::memory> src_memory_p,
+      std::shared_ptr<mkldnn::memory> weights_memory_p,
+      std::shared_ptr<mkldnn::memory> dst_memory_p) {
+    auto prim_key = key_ + "@conv_p";
+    auto prim_desc_key = key_ + "@conv_pd";
+    auto conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
+        dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find convolution primitive in device context");
+    if (conv_p == nullptr) {
+      conv_p = std::make_shared<mkldnn::convolution_forward>(
+          *conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
+          *(dst_memory_p.get()));
+
+      dev_ctx_.SetBlob(prim_key, conv_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_p;
+  }
+
+  // Generate keys for storing/retrieving primitives for this operator
+  // TODO(jczaja): Make hashing function more optimal
+  static std::string GetHash(memory::dims& input_dims,
+                             memory::dims& weights_dims,
+                             std::vector<int>& strides,
+                             std::vector<int>& paddings,
+                             std::vector<int>& dilations, int groups,
+                             const std::string& suffix) {
+    return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
+           dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
+           suffix;
+  }
+
+ private:
+  std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd_;
+};
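AcquireConvolution follows a get-or-create pattern keyed by GetHash(...): look the primitive up in the device context, build and store it on a miss, and flag reuse on a hit, so a repeated shape configuration reuses the cached object. A self-contained sketch of that pattern, with a hypothetical BlobCache standing in for MKLDNNDeviceContext:

#include <map>
#include <memory>
#include <string>

// Hypothetical stand-in for the device context's blob storage.
struct BlobCache {
  std::map<std::string, std::shared_ptr<void>> blobs;
  void SetBlob(const std::string& key, std::shared_ptr<void> v) { blobs[key] = v; }
  std::shared_ptr<void> GetBlob(const std::string& key) {
    auto it = blobs.find(key);
    return it == blobs.end() ? nullptr : it->second;
  }
};

// Get-or-create: a cache hit returns the stored object, a miss builds one
// and stores it under the key.
template <typename Prim, typename Create>
std::shared_ptr<Prim> Acquire(BlobCache* ctx, const std::string& key,
                              Create create) {
  auto p = std::static_pointer_cast<Prim>(ctx->GetBlob(key));
  if (p == nullptr) {
    p = create();
    ctx->SetBlob(key, p);
  }
  return p;
}

int main() {
  BlobCache ctx;
  auto make = [] { return std::make_shared<int>(42); };  // toy "primitive"
  auto a = Acquire<int>(&ctx, "3x3s1@conv_p", make);
  auto b = Acquire<int>(&ctx, "3x3s1@conv_p", make);  // same key -> same object
  return a == b ? 0 : 1;
}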
 template <typename T>
 class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
@@ -36,10 +109,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
 
-    // Get unique name for index
-    const std::string key = ctx.op().Output("Output");
-    const std::string key_conv_pd = key + "@conv_pd";
-
     auto& dev_ctx =
         ctx.template device_context<platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
@@ -80,68 +149,62 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::vectorize2int(filter->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
-    // create mkldnn memory from input tensors (data/weights)
-    auto user_src_memory = memory(
-        {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
-        to_void_cast<T>(input_data));
-    auto user_weights_memory =
-        memory({{{weights_tz}, memory::data_type::f32, filter->format()},
-                mkldnn_engine},
-               to_void_cast<T>(filter_data));
+    // Get unique name for storing MKLDNN primitives
+    const std::string key = ConvMKLDNNHandler::GetHash(
+        src_tz, weights_tz, strides, paddings, dilations, groups,
+        ctx.op().Output("Output"));
+    const std::string key_conv_pd = key + "@conv_pd";
+
+    std::vector<primitive> pipeline;
+
+    auto user_src_md = platform::MKLDNNMemDesc(
+        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
+    auto user_weights_md = platform::MKLDNNMemDesc(
+        {weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
 
     /* create memory descriptor for convolution without specified format
      * ('any') which lets a primitive (convolution in this case) choose
      * the memory format preferred for best performance
      */
-    auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
-                                          memory::format::any);
+    auto src_md = platform::MKLDNNMemDesc(
+        src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, memory::data_type::f32, memory::format::any);
-    auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32,
-                                          memory::format::any);
+        weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
+    auto dst_md = platform::MKLDNNMemDesc(
+        dst_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
 
     // create a conv primitive descriptor and save it for usage in backward
     std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
         ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
                              mkldnn_engine);
+    // Save conv_pd/src_memory/weights_memory for backward pass
+    dev_ctx.SetBlob(key_conv_pd, conv_pd);
 
-    // create reorder primitive if the input format is not the preferred one
-    auto src_memory = user_src_memory;
-    primitive reorder_src;
-    bool is_src_reordered = false;
-    if (memory::primitive_desc(conv_pd->src_primitive_desc()) !=
-        user_src_memory.get_primitive_desc()) {
-      src_memory = memory(conv_pd->src_primitive_desc());
-      reorder_src = reorder(user_src_memory, src_memory);
-      is_src_reordered = true;
-    }
-    auto weights_memory = user_weights_memory;
-    primitive reorder_weights;
-    bool is_weights_reordered = false;
-    if (memory::primitive_desc(conv_pd->weights_primitive_desc()) !=
-        user_weights_memory.get_primitive_desc()) {
-      weights_memory = memory(conv_pd->weights_primitive_desc());
-      reorder_weights = reorder(user_weights_memory, weights_memory);
-      is_weights_reordered = true;
-    }
+    ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
 
-    // create memory primitive for conv dst
-    auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data);
+    // create mkldnn memory from input tensors (data/weights)
+    auto user_src_memory_p =
+        handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
+    auto user_weights_memory_p = handler.AcquireWeightsMemory(
+        user_weights_md, to_void_cast<T>(filter_data));
+
+    // create reorder primitive if the input format is not the preferred one
+    auto src_memory_p =
+        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
+    auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
+        user_weights_memory_p, pipeline);
+    auto dst_memory_p =
+        handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
 
     // create convolution op primitive
-    auto conv_prim = conv_fwd(*conv_pd, src_memory, weights_memory, dst_memory);
+    auto conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
+                                             dst_memory_p);
 
     // push primitive to stream and wait until it's executed
-    std::vector<primitive> pipeline;
-    if (is_src_reordered) pipeline.push_back(reorder_src);
-    if (is_weights_reordered) pipeline.push_back(reorder_weights);
-    pipeline.push_back(conv_prim);
+    pipeline.push_back(*conv_p);
     stream(stream::kind::eager).submit(pipeline).wait();
 
-    // Save conv_pd/src_memory/weights_memory for backward pass
-    dev_ctx.SetBlob(key_conv_pd, conv_pd);
-
     output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(GetMKLDNNFormat(dst_memory));
+    output->set_format(GetMKLDNNFormat(*dst_memory_p));
   }
 
  private:
@@ -197,13 +260,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     if (!input_grad && !filter_grad) return;
 
-    // Get an unique name from "argument" name of "Output" variable
-    // This name will be used as key when saving info into device context
-    const std::string key = ctx.op().Input("Output");
-    const std::string key_conv_pd = key + "@conv_pd";
-
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
 
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
@@ -223,6 +283,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::vectorize2int(filter->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
+    // Get an unique name from "argument" name of "Output" variable
+    // This name will be used as key when saving info into device context
+    const std::string key =
+        ConvMKLDNNHandler::GetHash(src_tz, weights_tz, strides, paddings,
+                                   dilations, groups, ctx.op().Input("Output"));
+
+    const std::string key_conv_pd = key + "@conv_pd";
+
     // create mkldnn memory from input tensors (input/weights/output_grad)
     auto user_src_memory = memory(
         {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
index 3b0c9b2886504ee381b2b33e06a4552602725e57..9a1643d5b35c067ba9064286bab32019fb34fbe8 100644
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -86,8 +86,9 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
                          std::minstd_rand engine,
                          std::vector<int>* inds) const {
     std::uniform_real_distribution<float> uniform(0, 1);
-    if (inds->size() > num) {
-      for (int i = num; i < inds->size(); ++i) {
+    const int64_t size = static_cast<int64_t>(inds->size());
+    if (size > num) {
+      for (int64_t i = num; i < size; ++i) {
         int rng_ind = std::floor(uniform(engine) * i);
         if (rng_ind < num)
           std::iter_swap(inds->begin() + rng_ind + offset,
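The loop in this hunk is reservoir sampling: the first num indices form the reservoir, and each later index i replaces a random reservoir slot with probability num/i. A self-contained sketch (sizes and seed are arbitrary):

#include <algorithm>
#include <cmath>
#include <iostream>
#include <random>
#include <vector>

int main() {
  std::vector<int> inds(100);
  for (int i = 0; i < 100; ++i) inds[i] = i;

  const int num = 10;  // reservoir size
  std::minstd_rand engine(0);
  std::uniform_real_distribution<float> uniform(0, 1);
  // Element i (i >= num) replaces a random reservoir slot with probability num/i.
  for (size_t i = num; i < inds.size(); ++i) {
    int rng_ind = static_cast<int>(std::floor(uniform(engine) * i));
    if (rng_ind < num) std::iter_swap(inds.begin() + rng_ind, inds.begin() + i);
  }
  for (int i = 0; i < num; ++i) std::cout << inds[i] << " ";  // the sample
  std::cout << "\n";
  return 0;
}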
+ "2-D with shape [batchsize, 2]") + .AsDispensable(); AddOutput("Out", "(LodTensor) The output data of im2sequence op,"); AddAttr>("kernels", "(vector), the " @@ -73,6 +77,13 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker { "(vector default:{0, 0, 0, 0}), the " "paddings(up_pad, left_pad, down_pad, right_pad)") .SetDefault({0, 0, 0, 0}); + AddAttr>("out_stride", + "the attribute is valid only when input(Y)" + "is not NULL.this attribute represents the" + "scaling of the pic through the CNN" + "(vector dedault:{1,1}),the out_stride" + " (out_stride_height, out_stride_width)") + .SetDefault({1, 1}); AddComment(R"DOC( This op uses kernels to scan images and converts these images to sequences. After expanding, The number of time steps are output_height * output_width @@ -123,7 +134,7 @@ output.data = [[ 6. 2. 8. 3. 2. 4. 6. 3.] [ 7. 1. 7. 9. 2. 1. 3. 5.] [ 5. 7. 2. 4. 1. 3. 9. 0.] [ 7. 9. 4. 8. 3. 5. 0. 8.]] -output.dims = {8, 9} +output.dims = {8, 8} output.lod = [[0, 4, 8]] )DOC"); diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index d792c68f784d8ffec0eb303a6ab9b59c9f121fa7..5bfb91db1887909c65de5f2e5321a8e6be6cf5ac 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -13,6 +13,7 @@ limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" @@ -39,50 +40,106 @@ class Im2SequenceKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { const Tensor* in = ctx.Input("X"); LoDTensor* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - // TODO(wanghaoshuang): Add layout checker after 'set_layout' - // being available for python API - // PADDLE_ENFORCE_EQ(in->layout(), framework::DataLayout::kNCHW, - // "Input(X) layout must be NCHW"); auto in_dim = in->dims(); int batch_size = in_dim[0]; int img_channels = in_dim[1]; int img_height = in_dim[2]; int img_width = in_dim[3]; - auto kernels = ctx.Attr>("kernels"); auto strides = ctx.Attr>("strides"); auto paddings = ctx.Attr>("paddings"); - int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0], - paddings[2], strides[0]); - int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1], - paddings[3], strides[1]); - - const std::vector dilations({1, 1}); - - auto out_dims = out->dims(); - out->Resize({batch_size, out->numel() / batch_size}); - for (int i = 0; i < batch_size; i++) { - const Tensor src = - in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); - Tensor dst = out->Slice(i, i + 1).Resize( - {output_height, output_width, img_channels, kernels[0], kernels[1]}); - - math::Im2ColFunctor f; - auto& dev_ctx = ctx.template device_context(); - f(dev_ctx, src, dilations, strides, paddings, &dst); - } - out->Resize(out_dims); - - // set lod information - // TODO(wanghaoshuang): Move this to InferShape - framework::LoD lod(1); - lod[0].reserve(batch_size + 1); - for (int i = 0, offset = 0; i < batch_size + 1; ++i) { + if (ctx.HasInput("Y") && batch_size > 1) { + const Tensor* imgrealsize = ctx.Input("Y"); + auto out_stride = ctx.Attr>("out_stride"); + Tensor cpu_shape_tensor; + TensorCopySync(*imgrealsize, platform::CPUPlace(), &cpu_shape_tensor); + std::vector imgreal_h; + std::vector imgreal_w; + std::vector output_height; + std::vector output_width; + int result = 0; + for (int i = 0; i < batch_size; i++) { + int tmp_real_h = 
diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h
index d792c68f784d8ffec0eb303a6ab9b59c9f121fa7..5bfb91db1887909c65de5f2e5321a8e6be6cf5ac 100644
--- a/paddle/fluid/operators/im2sequence_op.h
+++ b/paddle/fluid/operators/im2sequence_op.h
@@ -13,6 +13,7 @@ limitations under the License. */
 
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
@@ -39,50 +40,106 @@ class Im2SequenceKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     const Tensor* in = ctx.Input<Tensor>("X");
     LoDTensor* out = ctx.Output<LoDTensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-    // TODO(wanghaoshuang): Add layout checker after 'set_layout'
-    // being available for python API
-    // PADDLE_ENFORCE_EQ(in->layout(), framework::DataLayout::kNCHW,
-    //                   "Input(X) layout must be NCHW");
     auto in_dim = in->dims();
     int batch_size = in_dim[0];
     int img_channels = in_dim[1];
     int img_height = in_dim[2];
     int img_width = in_dim[3];
-
     auto kernels = ctx.Attr<std::vector<int>>("kernels");
     auto strides = ctx.Attr<std::vector<int>>("strides");
     auto paddings = ctx.Attr<std::vector<int>>("paddings");
-    int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
-                                         paddings[2], strides[0]);
-    int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
-                                        paddings[3], strides[1]);
-
-    const std::vector<int> dilations({1, 1});
-
-    auto out_dims = out->dims();
-    out->Resize({batch_size, out->numel() / batch_size});
-    for (int i = 0; i < batch_size; i++) {
-      const Tensor src =
-          in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
-      Tensor dst = out->Slice(i, i + 1).Resize(
-          {output_height, output_width, img_channels, kernels[0], kernels[1]});
-
-      math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
-      f(dev_ctx, src, dilations, strides, paddings, &dst);
-    }
-    out->Resize(out_dims);
-
-    // set lod information
-    // TODO(wanghaoshuang): Move this to InferShape
-    framework::LoD lod(1);
-    lod[0].reserve(batch_size + 1);
-    for (int i = 0, offset = 0; i < batch_size + 1; ++i) {
+    if (ctx.HasInput("Y") && batch_size > 1) {
+      const Tensor* imgrealsize = ctx.Input<Tensor>("Y");
+      auto out_stride = ctx.Attr<std::vector<int>>("out_stride");
+      Tensor cpu_shape_tensor;
+      TensorCopySync(*imgrealsize, platform::CPUPlace(), &cpu_shape_tensor);
+      std::vector<int> imgreal_h;
+      std::vector<int> imgreal_w;
+      std::vector<int> output_height;
+      std::vector<int> output_width;
+      int result = 0;
+      for (int i = 0; i < batch_size; i++) {
+        int tmp_real_h = static_cast<int>((cpu_shape_tensor.data<T>())[2 * i]);
+        int tmp_real_w =
+            static_cast<int>((cpu_shape_tensor.data<T>())[2 * i + 1]);
+        if (tmp_real_h % out_stride[0] == 0) {
+          tmp_real_h = tmp_real_h / out_stride[0];
+        } else {
+          tmp_real_h = tmp_real_h / out_stride[0] + 1;
+        }
+        if (tmp_real_w % out_stride[1] == 0) {
+          tmp_real_w = tmp_real_w / out_stride[1];
+        } else {
+          tmp_real_w = tmp_real_w / out_stride[1] + 1;
+        }
+        imgreal_h.push_back(tmp_real_h);
+        imgreal_w.push_back(tmp_real_w);
+        output_height.push_back(Im2SeqOutputSize(
+            imgreal_h[i], kernels[0], paddings[0], paddings[2], strides[0]));
+        output_width.push_back(Im2SeqOutputSize(
+            imgreal_w[i], kernels[1], paddings[1], paddings[3], strides[1]));
+        result += output_height[i] * output_width[i];
+      }
+
+      out->mutable_data<T>({result, img_channels * kernels[0] * kernels[1]},
+                           ctx.GetPlace());
+
+      const std::vector<int> dilations({1, 1});
+      int offset_out = 0;
+      for (int i = 0; i < batch_size; i++) {
+        const Tensor src =
+            in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
+        Tensor dst = out->Slice(offset_out,
+                                offset_out + output_height[i] * output_width[i])
+                         .Resize({output_height[i], output_width[i],
+                                  img_channels, kernels[0], kernels[1]});
+        offset_out += output_height[i] * output_width[i];
+
+        math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
+        auto& dev_ctx = ctx.template device_context<DeviceContext>();
+        f(dev_ctx, src, dilations, strides, paddings, &dst);
+      }
+      framework::LoD lod(1);
+      lod[0].reserve(batch_size + 1);
+      int offset = 0;
+      lod[0].push_back(offset);
+      for (int i = 0; i < batch_size; ++i) {
+        offset += output_height[i] * output_width[i];
+        lod[0].push_back(offset);
+      }
+      out->set_lod(lod);
+    } else {
+      out->mutable_data<T>(ctx.GetPlace());
+      int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
+                                           paddings[2], strides[0]);
+      int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
+                                          paddings[3], strides[1]);
+
+      const std::vector<int> dilations({1, 1});
+      auto out_dims = out->dims();
+      out->Resize({batch_size, out->numel() / batch_size});
+      for (int i = 0; i < batch_size; i++) {
+        const Tensor src =
+            in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
+        Tensor dst =
+            out->Slice(i, i + 1).Resize({output_height, output_width,
+                                         img_channels, kernels[0], kernels[1]});
+
+        math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
+        auto& dev_ctx = ctx.template device_context<DeviceContext>();
+        f(dev_ctx, src, dilations, strides, paddings, &dst);
+      }
+      out->Resize(out_dims);
+      framework::LoD lod(1);
+      lod[0].reserve(batch_size + 1);
+      int offset = 0;
       lod[0].push_back(offset);
-      offset += output_height * output_width;
+      for (int i = 0; i < batch_size; ++i) {
+        offset += output_height * output_width;
+        lod[0].push_back(offset);
+      }
+      out->set_lod(lod);
     }
-    out->set_lod(lod);
   }
 };
diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index 336d6febc2ce3a55e82ed613bbc1081101f822f0..a50b9ace39249f4f899a46e171bbdced033b46bc 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -43,21 +43,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
   int col_height = col->dims()[3];
   int col_width = col->dims()[4];
 
-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       ((dilation[0] * (filter_height - 1) + 1))) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       ((dilation[1] * (filter_width - 1) + 1))) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-
   int channels_col = im_channels * filter_height * filter_width;
 
   const T* im_data = im.data<T>();
@@ -178,17 +163,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
   int col_height = col->dims()[0];
   int col_width = col->dims()[1];
 
-    PADDLE_ENFORCE_EQ(
-        (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
-        col_height,
-        "Output_height and padding(padding_up, padding_down) are "
-        "inconsistent.");
-    PADDLE_ENFORCE_EQ(
-        (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
-        col_width,
-        "col_width and padding(padding_left, padding_right) are "
-        "inconsistent.");
-
   const T* im_data = im.data<T>();
   T* col_data = col->data<T>();
diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu
index eecb233d22cea06da016b2671fd606b70eddf5a5..4897767f4d88d9e079f05c921153923c4eb354b0 100644
--- a/paddle/fluid/operators/math/im2col.cu
+++ b/paddle/fluid/operators/math/im2col.cu
@@ -77,21 +77,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
   int col_height = col->dims()[3];
   int col_width = col->dims()[4];
 
-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       (dilation[0] * (filter_height - 1) + 1)) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       (dilation[1] * (filter_width - 1) + 1)) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "col_width and padding(padding_left, padding_right) are "
-                      "inconsistent.");
-
   int num_outputs = im_channels * col_height * col_width;
   int blocks = (num_outputs + 1024 - 1) / 1024;
   int block_x = 512;
@@ -274,21 +259,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
   int col_height = col->dims()[0];
   int col_width = col->dims()[1];
 
-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       (dilation[0] * (filter_height - 1) + 1)) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       (dilation[1] * (filter_width - 1) + 1)) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "col_width and padding(padding_left, padding_right) are "
-                      "inconsistent.");
-
   int block_dim_x = 0;
   int block_dim_y = 0;
   if (filter_height <= 4 && filter_width <= 4) {
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 20037d0764056c2a093af801c9cc1eb788dd46d6..e0d7937ae2f3ce4bda12f3771727e2992d63cb9b 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -46,7 +46,7 @@ ENDIF()
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc init.cc DEPS malloc
-    place eigen3 stringpiece cpu_helper ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
+    place eigen3 stringpiece cpu_helper framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 
 cc_test(init_test SRCS init_test.cc DEPS device_context)
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 33fec2c1073819d88d85a8872227adcb9df3e8f4..a8f93e6848a1db1f5aa0ee266a076af2b5d0c964 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -222,15 +222,16 @@ class MKLDNNHandler {
 
   static std::string GetHash(mkldnn::memory::dims& operand_dims,  // NOLINT
                              const std::string& suffix) {
-    auto dims2str = [](const mkldnn::memory::dims& operand_dims) {
-      std::string dstr = "";
-      for (size_t i = 0; i < operand_dims.size(); ++i) {
-        dstr += std::to_string(operand_dims[i]) + "-";
-      }
-      return dstr;
-    };
-
     return dims2str(operand_dims) + suffix;
   };
+
+ protected:
+  static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
+    std::string dstr = "";
+    for (size_t i = 0; i < operand_dims.size(); ++i) {
+      dstr += std::to_string(operand_dims[i]) + "-";
+    }
+    return dstr;
+  }
 
  protected:
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index d173b41e86f61954954b6a5ea9957d2e172deca0..bf45c11a9de53a109c72ff7a89b807bc80feb7c8 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -510,11 +510,23 @@ function gen_fluid_inference_lib() {
 EOF
         make -j `nproc` inference_lib_dist
         cd ${PADDLE_ROOT}/build
-        mv fluid_install_dir fluid
+        cp -r fluid_install_dir fluid
         tar -cf fluid.tgz fluid
     fi
 }
 
+function test_fluid_inference_lib() {
+    if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
+        cat <<EOF
+    ========================================
+    Testing fluid library for inference ...
+    ========================================
+EOF
+        cd ${PADDLE_ROOT}/paddle/contrib/inference/demo_ci
+        ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF}
+    fi
+}
+
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index bcf520d5a4e3bbe1d949d08f42199dd8c5cdc947..07b806f544497ccabe4dde9a370e90da372e6cba 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3900,7 +3914,13 @@ def transpose(x, perm, name=None):
     return out
 
 
-def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
+def im2sequence(input,
+                filter_size=1,
+                stride=1,
+                padding=0,
+                input_image_size=None,
+                out_stride=1,
+                name=None):
     """
     Extracts image patches from the input tensor to form a tensor of shape
     {input.batch_size * output_height * output_width, filter_size_H *
@@ -3937,6 +3957,15 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
             padding_up = padding_down = padding_left = padding_right = padding
             Default: padding = 0.
 
+        input_image_size(Variable): the input that contains each image's real
+            size. Its dim is [batchsize, 2]. It is dispensable and is mainly
+            used for batch inference.
+
+        out_stride(int|tuple): The scaling of the image through the CNN. It is
+            dispensable. It is valid only when input_image_size is not null.
+            If out_stride is a tuple, it must contain two integers,
+            (out_stride_H, out_stride_W). Otherwise,
+            out_stride_H = out_stride_W = out_stride.
+
         name (str): The name of this layer. It is optional.
 
     Returns:
@@ -3987,7 +4016,7 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
             [ 5.  7.  2.  4.  1.  3.  9.  0.]
             [ 7.  9.  4.  8.  3.  5.  0.  8.]]
 
-            output.dims = {8, 9}
+            output.dims = {8, 8}
 
             output.lod = [[4, 4]]
 
@@ -4009,18 +4038,17 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
     if len(padding) == 2:
         padding.append(padding[0])
         padding.append(padding[1])
-
+    inputs = {"X": input}
+    attrs = {"kernels": filter_size, "strides": stride, "paddings": padding}
+    if input_image_size:
+        if isinstance(out_stride, int):
+            out_stride = [out_stride, out_stride]
+        inputs["Y"] = input_image_size
+        attrs["out_stride"] = out_stride
     helper = LayerHelper('im2sequence', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
     helper.append_op(
-        type='im2sequence',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'kernels': filter_size,
-            'strides': stride,
-            'paddings': padding,
-        })
+        type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs)
     return out
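As a concrete example of the out_stride rounding described above: with out_stride = [4, 4], a real input height of 48 maps to a feature-map height of 48 / 4 = 12, while 50 maps to 50 / 4 + 1 = 13; both the kernel and the Python reference below round up in the non-divisible case.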
diff --git a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
index 4946475f11a4fc0ccaffeec6821d3976ea7c6560..13bc5768740ece00bbe285a0b47d82bb8a42d2c7 100644
--- a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
+++ b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
@@ -16,23 +16,48 @@ import numpy as np
 from op_test import OpTest
 
 
-def get_output_shape(attrs, in_shape):
+def get_output_shape(attrs, in_shape, img_real_size):
+    batchsize = in_shape[0]
     img_height = in_shape[2]
     img_width = in_shape[3]
+    paddings = np.array(attrs['paddings']).astype("int32")
+    kernels = np.array(attrs['kernels']).astype("int32")
+    strides = np.array(attrs['strides']).astype("int32")
+    output_height = np.zeros((1, batchsize)).astype("int32")
+    output_width = np.zeros((1, batchsize)).astype("int32")
+    if len(img_real_size):
+        out_stride = np.array(attrs['out_stride']).astype("int32")
+        imgreal_h = 0
+        imgreal_w = 0
+        for index in range(batchsize):
+            if img_real_size[index, 0] % out_stride[0] == 0:
+                imgreal_h = img_real_size[index, 0] / out_stride[0]
+            else:
+                imgreal_h = img_real_size[index, 0] / out_stride[0] + 1
+            if img_real_size[index, 1] % out_stride[1] == 0:
+                imgreal_w = img_real_size[index, 1] / out_stride[1]
+            else:
+                imgreal_w = img_real_size[index, 1] / out_stride[1] + 1
+            output_height[0,index] = \
+                1 + \
+                (imgreal_h + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
+                strides[0]
 
-    paddings = attrs['paddings']
-    kernels = attrs['kernels']
-    strides = attrs['strides']
+            output_width[0,index] = \
+                1 + \
+                (imgreal_w + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
+                strides[1]
+    else:
+        for index in range(batchsize):
+            output_height[0,index] = \
+                1 + \
+                (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
+                strides[0]
 
-    output_height = \
-        1 + \
-        (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
-        strides[0]
-
-    output_width = \
-        1 + \
-        (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
-        strides[1]
+            output_width[0,index] = \
+                1 + \
+                (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
+                strides[1]
 
     return output_height, output_width
@@ -75,22 +100,25 @@ def im2col(attrs, im, col):
                             im_row_offset][im_col_offset]
 
 
-def Im2Sequence(inputs, attrs):
-    output_height, output_width = get_output_shape(attrs, inputs.shape)
+def Im2Sequence(inputs, img_real_size, attrs):
+    output_height, output_width = get_output_shape(attrs, inputs.shape,
+                                                   img_real_size)
     img_channels = inputs.shape[1]
     batch_size = inputs.shape[0]
-    out = np.zeros([
-        batch_size, output_height, output_width, img_channels,
-        attrs['kernels'][0], attrs['kernels'][1]
-    ]).astype("float32")
-
-    for i in range(len(inputs)):
-        im2col(attrs, inputs[i], out[i])
-
-    out = out.reshape([
-        batch_size * output_height * output_width,
-        img_channels * attrs['kernels'][0] * attrs['kernels'][1]
-    ])
+    out = []
+    for index in range(batch_size):
+        tmp = np.zeros([
+            output_height[0, index], output_width[0, index], img_channels,
+            attrs['kernels'][0], attrs['kernels'][1]
+        ]).astype("float32")
+        out.append(tmp)
+    for index in range(len(inputs)):
+        im2col(attrs, inputs[index], out[index])
+        out[index] = out[index].reshape([
+            output_height[0, index] * output_width[0, index],
+            img_channels * attrs['kernels'][0] * attrs['kernels'][1]
+        ])
+    out = np.concatenate(out, axis=0)
     return out
 
 
@@ -103,7 +131,7 @@ class TestBlockExpandOp(OpTest):
         self.attrs = {
             'kernels': [2, 2],
             'strides': [1, 1],
-            'paddings': [1, 1, 1, 1]
+            'paddings': [1, 1, 1, 1],
         }
 
     def setUp(self):
@@ -113,7 +141,8 @@ class TestBlockExpandOp(OpTest):
            self.batch_size, self.img_channels, self.img_height, self.img_width
        ]).astype("float32")
 
-        out = Im2Sequence(x, self.attrs)
+        real_size = np.array([]).astype("float32")
+        out = Im2Sequence(x, real_size, self.attrs)
        self.inputs = {'X': x}
        self.outputs = {'Out': out}
 
@@ -133,20 +162,20 @@ class TestBlockExpandOpCase2(TestBlockExpandOp):
         self.attrs = {
             'kernels': [2, 1],
             'strides': [2, 1],
-            'paddings': [2, 1, 2, 1]
+            'paddings': [2, 1, 2, 1],
         }
 
 
 class TestBlockExpandOpCase3(TestBlockExpandOp):
     def config(self):
-        self.batch_size = 3
+        self.batch_size = 2
         self.img_channels = 1
         self.img_height = 4
         self.img_width = 5
         self.attrs = {
             'kernels': [2, 1],
             'strides': [2, 1],
-            'paddings': [2, 0, 2, 0]
+            'paddings': [2, 0, 2, 0],
         }
 
 
@@ -159,9 +188,94 @@ class TestBlockExpandOpCase4(TestBlockExpandOp):
         self.attrs = {
             'kernels': [2, 2],
             'strides': [1, 1],
-            'paddings': [0, 0, 0, 0]
+            'paddings': [0, 0, 0, 0],
+        }
+
+
+class TestBlockExpandOpCase5(OpTest):
+    def config(self):
+        self.batch_size = 1
+        self.img_channels = 3
+        self.img_height = 4
+        self.img_width = 5
+        self.attrs = {
+            'kernels': [2, 1],
+            'strides': [2, 1],
+            'paddings': [2, 1, 2, 1],
+            'out_stride': [2, 2],
+        }
+
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[8, 10], [5, 8]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestBlockExpandOpCase6(OpTest):
+    def config(self):
+        self.batch_size = 3
+        self.img_channels = 1
+        self.img_height = 4
+        self.img_width = 5
+        self.attrs = {
+            'kernels': [2, 1],
+            'strides': [1, 1],
+            'paddings': [0, 0, 0, 0],
+            'out_stride': [1, 1],
+        }
+
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[8, 10], [5, 8], [5, 8]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestBlockExpandOpCase7(OpTest):
+    def config(self):
+        self.batch_size = 2
+        self.img_channels = 2
+        self.img_height = 3
+        self.img_width = 3
+        self.attrs = {
+            'kernels': [2, 2],
+            'strides': [1, 1],
+            'paddings': [1, 0, 1, 0],
+            'out_stride': [2, 2],
+        }
+
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[6, 6], [4, 4]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 842d34c07e94a79e3351347e2528ecc478cc56dc..31ae4e7d8c4107fce7022234d3aa47827647d012 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -251,12 +251,16 @@ class TestBook(unittest.TestCase):
         print(str(program))
 
     def test_im2sequence(self):
-        print("test_im2sequence")
         program = Program()
         with program_guard(program):
             x = layers.data(name='x', shape=[3, 128, 128], dtype='float32')
+            y = layers.data(name='y', shape=[], dtype='float32')
             output = layers.im2sequence(
-                input=x, stride=[1, 1], filter_size=[2, 2])
+                input=x,
+                input_image_size=y,
+                stride=[1, 1],
+                filter_size=[2, 2],
+                out_stride=[1, 1])
             self.assertIsNotNone(output)
         print(str(program))
 
diff --git a/python/setup.py.in b/python/setup.py.in
index ee4bbf6febc667f487a83631bbefb5ef11a2177f..a064f36cc19dbc626dd85d76290280a01fa57215 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -181,6 +181,14 @@ else:
     command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
     if os.system(command) != 0:
         raise Exception("patch core.so failed, command: %s" % command)
+if '${WITH_FLUID_ONLY}' == 'OFF':
+    # change rpath of _swig_paddle.so.
+    if "@APPLE@" == "1":
+        command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
+    else:
+        command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
+    if os.system(command) != 0:
+        raise Exception("patch _swig_paddle.so failed, command: %s" % command)
 
 setup(name='${PACKAGE_NAME}',
       version='${PADDLE_VERSION}',