提交 9130a884 编写于 作者: M minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_is_taged

......@@ -45,6 +45,10 @@ endfunction(inference_api_test)
cc_library(paddle_inference_api
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
if(NOT APPLE)
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_api.sym")
set_target_properties(paddle_inference_api PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
endif()
# Here the shared library doesn't depend on other fluid libraries, or double free will occur.
cc_library(paddle_inference_api_shared SHARED
......@@ -53,8 +57,19 @@ add_dependencies(paddle_inference_api_shared ${FLUID_CORE_MODULES} ${GLOB_OP_LIB
set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api)
if(NOT APPLE)
set(LINK_FLAGS "-fPIC -fvisibility=hidden")
set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_api.map")
set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
"execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
" ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_inference_api.so\" RESULT_VARIABLE symbol_res)\n"
"if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
" message(FATAL_ERROR \"Check symbol failed.\")\n"
"endif()\n")
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake"
DEPENDS paddle_inference_api_shared)
add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
endif()
cc_test(test_paddle_inference_api
......
#!/bin/bash
lib=$1
if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi
num_paddle_syms=$(nm -D --defined-only ${lib} | grep paddle | wc -l)
num_google_syms=$(nm -D --defined-only ${lib} | grep google | wc -l)
if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi
if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi
exit 0
......@@ -13,8 +13,6 @@
# limitations under the License.
#
inference_api_test(simple_on_word2vec ARGS test_word2vec)
option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF)
if(NOT WITH_INFERENCE_DEMO)
return()
......
cmake_minimum_required(VERSION 3.0)
project(cpp_inference_demo CXX C)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if(NOT DEFINED PADDLE_LIB)
message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
endif()
if(NOT DEFINED DEMO_NAME)
message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
endif()
option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON)
option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF)
option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON)
if(WITH_GPU)
set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
endif()
include_directories("${PADDLE_LIB}")
include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
include_directories("${PADDLE_LIB}/third_party/install/glog/include")
include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
include_directories("${PADDLE_LIB}/third_party/boost")
include_directories("${PADDLE_LIB}/third_party/eigen3")
link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
if(WITH_MKL)
include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so
${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5.so)
set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
if(EXISTS ${MKLDNN_PATH})
include_directories("${MKLDNN_PATH}/include")
set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
endif()
else()
set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
endif()
if(WITH_STATIC_LIB)
set(DEPS
"-Wl,--whole-archive"
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a
"-Wl,--no-whole-archive"
${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.a)
else()
# Note: libpaddle_inference_api.so must put before libpaddle_fluid.so
set(DEPS
${PADDLE_LIB}/contrib/inference/libpaddle_inference_api.so
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so)
endif()
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB}
glog gflags protobuf snappystream snappy z
${EXTERNAL_LIB})
if(WITH_GPU)
set(DEPS ${DEPS} ${CUDA_LIB}/libcudart.so)
endif()
target_link_libraries(${DEMO_NAME} ${DEPS})
set -x
PADDLE_ROOT=$1
WITH_MKL=$2
WITH_GPU=$3
if [ $3 == "ON" ]; then
use_gpu_list='true false'
else
use_gpu_list='false'
fi
mkdir -p build
cd build
for WITH_STATIC_LIB in false; do
rm -rf *
cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
-DWITH_MKL=$WITH_MKL \
-DDEMO_NAME=simple_on_word2vec \
-DWITH_GPU=$WITH_GPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB
make
for use_gpu in $use_gpu_list; do
./simple_on_word2vec \
--dirname=${PADDLE_ROOT}/build/python/paddle/fluid/tests/book/word2vec.inference.model \
--use_gpu=$use_gpu
done
done
if [ $? -eq 0 ]; then
exit 0
else
echo "inference demo runs fail."
exit 1
fi
set +x
......@@ -16,21 +16,27 @@ limitations under the License. */
* This file contains a simple demo for how to take a model for inference.
*/
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <memory>
#include <thread>
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "contrib/inference/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_bool(use_gpu, false, "Whether use gpu.");
namespace paddle {
namespace demo {
DEFINE_string(dirname, "", "Directory of the inference model.");
void Main(bool use_gpu) {
//# 1. Create PaddlePredictor with a config.
NativeConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
if (FLAGS_dirname.empty()) {
LOG(INFO) << "Usage: ./simple_on_word2vec --dirname=path/to/your/model";
exit(1);
}
config.model_dir = FLAGS_dirname;
config.use_gpu = use_gpu;
config.fraction_of_gpu_memory = 0.15;
config.device = 0;
......@@ -54,12 +60,16 @@ void Main(bool use_gpu) {
CHECK(predictor->Run(slots, &outputs));
//# 4. Get output.
ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "output buffer size: " << outputs.front().data.length();
PADDLE_ENFORCE(outputs.size(), 1UL);
// Check the output buffer size and result of each tid.
PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
float result[5] = {
0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706};
const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
result[i]);
}
}
}
......@@ -68,7 +78,7 @@ void MainThreads(int num_threads, bool use_gpu) {
// Multi-threads only support on CPU
// 0. Create PaddlePredictor with a config.
NativeConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
config.model_dir = FLAGS_dirname;
config.use_gpu = use_gpu;
config.fraction_of_gpu_memory = 0.15;
config.device = 0;
......@@ -94,14 +104,17 @@ void MainThreads(int num_threads, bool use_gpu) {
CHECK(predictor->Run(inputs, &outputs));
// 4. Get output.
ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "TID: " << tid << ", "
<< "output buffer size: " << outputs.front().data.length();
PADDLE_ENFORCE(outputs.size(), 1UL);
// Check the output buffer size and result of each tid.
PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
float result[5] = {
0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706};
const size_t num_elements =
outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
result[i]);
}
}
});
......@@ -111,15 +124,18 @@ void MainThreads(int num_threads, bool use_gpu) {
}
}
TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); }
#ifdef PADDLE_WITH_CUDA
TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); }
#endif
} // namespace demo
} // namespace paddle
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
paddle::demo::Main(false /* use_gpu*/);
paddle::demo::MainThreads(1, false /* use_gpu*/);
paddle::demo::MainThreads(4, false /* use_gpu*/);
if (FLAGS_use_gpu) {
paddle::demo::Main(true /*use_gpu*/);
paddle::demo::MainThreads(1, true /*use_gpu*/);
paddle::demo::MainThreads(4, true /*use_gpu*/);
}
return 0;
}
......@@ -13,6 +13,12 @@ endif()
# Create static library
cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
if(NOT APPLE)
# TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
endif()
# Create shared library
cc_library(paddle_fluid_shared SHARED
SRCS io.cc
......
......@@ -90,6 +90,20 @@ std::string DataFlowGraph::DotString() const {
return dot.Build();
}
std::string DataFlowGraph::HumanReadableInfo(bool show_values,
bool show_functions) const {
std::stringstream values, functions;
for (auto &n : nodes.nodes()) {
if (show_values && n->IsValue()) {
values << n->repr() << "\n";
}
if (show_functions && n->IsFunction()) {
functions << n->repr() << "\n";
}
}
return "Values:\n" + values.str() + "\n\n" + "Functions:\n" + functions.str();
}
//
// NodesBFSIterator
//
......@@ -146,7 +160,7 @@ bool GraphTraits<DataFlowGraph>::NodesBFSIterator::operator==(
if ((!queue_.empty()) && (!other.queue_.empty())) {
return queue_.front() == other.queue_.front() &&
visited_.size() == other.visited_.size(); // here need to check the
// equality of queue and
// equality of queue and
// visited. Just a light but week implementation.
}
return false;
......@@ -208,6 +222,76 @@ Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
return stack_.top();
}
GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
const std::vector<Node *> &source) {
PADDLE_ENFORCE(!source.empty(),
"Start points of topological sorting should not be empty!");
std::unordered_set<Node *> visited;
std::unordered_set<Node *> to_visit{source.begin(), source.end()};
std::vector<Node *> inlink_visited;
while (!to_visit.empty()) {
std::vector<Node *> queue(to_visit.begin(), to_visit.end());
for (auto *p : queue) {
inlink_visited.clear();
std::copy_if(p->inlinks.begin(), p->inlinks.end(),
std::back_inserter(inlink_visited),
[&](Node *x) { return visited.count(x); });
if (inlink_visited.size() == p->inlinks.size()) {
sorted_.push_back(p);
for (auto *_ : p->outlinks) {
if (!visited.count(_)) {
to_visit.insert(_);
}
}
to_visit.erase(p);
visited.insert(p);
}
}
}
}
GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
const paddle::inference::analysis::GraphTraits<
DataFlowGraph>::NodesTSIterator &other)
: sorted_(other.sorted_), cursor_(other.cursor_) {}
Node &GraphTraits<DataFlowGraph>::NodesTSIterator::operator*() {
PADDLE_ENFORCE_LT(cursor_, sorted_.size());
return *sorted_[cursor_];
}
paddle::inference::analysis::GraphTraits<DataFlowGraph>::NodesTSIterator
&GraphTraits<DataFlowGraph>::NodesTSIterator::operator++() {
if (++cursor_ >= sorted_.size()) {
sorted_.clear();
cursor_ = 0;
}
return *this;
}
paddle::inference::analysis::GraphTraits<DataFlowGraph>::NodesTSIterator &
GraphTraits<DataFlowGraph>::NodesTSIterator::operator=(
const paddle::inference::analysis::GraphTraits<
DataFlowGraph>::NodesTSIterator &other) {
cursor_ = other.cursor_;
sorted_ = other.sorted_;
return *this;
}
bool GraphTraits<DataFlowGraph>::NodesTSIterator::operator==(
const paddle::inference::analysis::GraphTraits<
DataFlowGraph>::NodesTSIterator &other) {
return sorted_ == other.sorted_ && cursor_ == other.cursor_;
}
Node *GraphTraits<DataFlowGraph>::NodesTSIterator::operator->() {
PADDLE_ENFORCE_LT(cursor_, sorted_.size());
return sorted_[cursor_];
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -48,6 +48,9 @@ struct DataFlowGraph {
// Output a DOT graph file for debug.
std::string DotString() const;
std::string HumanReadableInfo(bool show_values = true,
bool show_functions = true) const;
private:
// Remove duplicate edges and so on.
void Clean();
......@@ -107,6 +110,32 @@ struct GraphTraits<DataFlowGraph> {
std::unordered_set<Node *> visited_;
};
// Topological sorting iterator on nodes.
struct NodesTSIterator
: public std::iterator<std::forward_iterator_tag, Node *> {
NodesTSIterator() = default;
explicit NodesTSIterator(const std::vector<Node *> &source);
NodesTSIterator(NodesTSIterator &&other)
: sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
other.cursor_ = 0;
}
NodesTSIterator(const NodesTSIterator &other);
Node &operator*();
NodesTSIterator &operator++();
// TODO(Superjomn) current implementation just compare the first
// element, need to compare the graph and all the elements in the queue and
// set.
NodesTSIterator &operator=(const NodesTSIterator &other);
bool operator==(const NodesTSIterator &other);
bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
Node *operator->();
private:
std::vector<Node *> sorted_;
int cursor_{0};
};
explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
// default use BFS to visit the nodes.
......@@ -119,17 +148,24 @@ struct GraphTraits<DataFlowGraph> {
iterator_range<NodesDFSIterator> nodes_in_DFS() {
return iterator_range<NodesDFSIterator>(nodes_dfs_begin(), nodes_dfs_end());
}
iterator_range<NodesTSIterator> nodes_in_TS() {
return iterator_range<NodesTSIterator>(nodes_ts_begin(), nodes_ts_end());
}
private:
NodesBFSIterator nodes_bfs_begin() {
return NodesBFSIterator(graph_->inputs);
}
NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); }
NodesDFSIterator nodes_dfs_begin() {
return NodesDFSIterator(graph_->inputs);
}
NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); }
NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_->inputs); }
NodesTSIterator nodes_ts_end() { return NodesTSIterator(); }
private:
DataFlowGraph *graph_;
};
......
......@@ -24,11 +24,11 @@ TEST(DataFlowGraph, BFS) {
auto dfg = ProgramDescToDFG(desc);
dfg.Build();
for (auto* in : dfg.inputs) {
for (auto *in : dfg.inputs) {
LOG(INFO) << "inputs: " << in->name() << " "
<< static_cast<int>(in->type());
}
for (auto* out : dfg.outputs) {
for (auto *out : dfg.outputs) {
LOG(INFO) << "outputs: " << out->name() << " "
<< static_cast<int>(out->type());
}
......@@ -57,6 +57,71 @@ TEST(DataFlowGraph, DFS) {
ASSERT_EQ(count, dfg.nodes.size());
}
// Topological sorting.
/*
* Graph topology
* inputs: 0, 1, 2
* 0 -> 4
* 0 -> 5
* 1 -> 6
* 2 -> 7
* 4 -> 5
* 4 -> 7
* 4 -> 3
* 7 -> 3
*/
TEST(DataFlowGraph, TS) {
DataFlowGraph graph;
for (int i = 0; i < 8; i++) {
auto *node = graph.nodes.Create(Node::Type::kValue);
node->SetName("node-" + std::to_string(i));
}
auto add_link = [&](int i, int j) {
Node *source = graph.nodes.GetMutable(i);
Node *target = graph.nodes.GetMutable(j);
target->inlinks.push_back(source);
source->outlinks.push_back(target);
};
graph.inputs.push_back(graph.nodes.GetMutable(0));
graph.inputs.push_back(graph.nodes.GetMutable(1));
graph.inputs.push_back(graph.nodes.GetMutable(2));
add_link(0, 4);
add_link(0, 5);
add_link(1, 6);
add_link(2, 7);
add_link(4, 5);
add_link(4, 7);
add_link(4, 3);
add_link(7, 3);
auto its = GraphTraits<DataFlowGraph>(&graph).nodes_in_TS();
std::vector<int> sorted_ids;
for (auto it = its.begin(); it != its.end(); ++it) {
LOG(INFO) << it->name();
sorted_ids.push_back(it->id());
}
// Assert a occurs prior to b in the sorted_ids.
auto assert_positive_sequence_pair = [&](int a, int b) {
auto a_offset = std::find(sorted_ids.begin(), sorted_ids.end(), a);
auto b_offset = std::find(sorted_ids.begin(), sorted_ids.end(), b);
ASSERT_LT(a_offset, b_offset);
};
assert_positive_sequence_pair(2, 7);
assert_positive_sequence_pair(7, 3);
assert_positive_sequence_pair(4, 3);
assert_positive_sequence_pair(0, 4);
assert_positive_sequence_pair(0, 5);
assert_positive_sequence_pair(1, 6);
assert_positive_sequence_pair(4, 5);
assert_positive_sequence_pair(4, 7);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -29,6 +29,79 @@ using mkldnn::stream;
using platform::to_void_cast;
using platform::GetMKLDNNFormat;
class ConvMKLDNNHandler : public platform::MKLDNNHandler {
public:
ConvMKLDNNHandler(
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
const std::string& base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key) {
conv_pd_ = conv_pd;
}
std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr,
"@dst_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) {
auto src_pd = conv_pd_->src_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
std::vector<mkldnn::primitive>& pipeline) {
auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
auto weights_pd = conv_pd_->weights_primitive_desc();
return this->AcquireMemory(weights_pd, user_weights_pd,
user_weights_memory_p, "@weights_mem_p",
pipeline);
}
std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
std::shared_ptr<mkldnn::memory> src_memory_p,
std::shared_ptr<mkldnn::memory> weights_memory_p,
std::shared_ptr<mkldnn::memory> dst_memory_p) {
auto prim_key = key_ + "@conv_p";
auto prim_desc_key = key_ + "@conv_pd";
auto conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
dev_ctx_.GetBlob(prim_key));
PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
"Fail to find convolution primitive in device context");
if (conv_p == nullptr) {
conv_p = std::make_shared<mkldnn::convolution_forward>(
*conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
*(dst_memory_p.get()));
dev_ctx_.SetBlob(prim_key, conv_p);
} else {
is_reusing_ = true;
}
return conv_p;
}
// Generate keys for storing/retriving primitives for this operator
// TODO(jczaja): Make hashing function more optimial
static std::string GetHash(memory::dims& input_dims,
memory::dims& weights_dims,
std::vector<int>& strides,
std::vector<int>& paddings,
std::vector<int>& dilations, int groups,
const std::string& suffix) {
return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
suffix;
}
private:
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd_;
};
template <typename T>
class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
......@@ -36,10 +109,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
// Get unique name for index
const std::string key = ctx.op().Output("Output");
const std::string key_conv_pd = key + "@conv_pd";
auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
......@@ -80,68 +149,62 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
paddle::framework::vectorize2int(filter->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// create mkldnn memory from input tensors (data/weights)
auto user_src_memory = memory(
{{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
to_void_cast(input_data));
auto user_weights_memory =
memory({{{weights_tz}, memory::data_type::f32, filter->format()},
mkldnn_engine},
to_void_cast(filter_data));
// Get unique name for storing MKLDNN primitives
const std::string key = ConvMKLDNNHandler::GetHash(
src_tz, weights_tz, strides, paddings, dilations, groups,
ctx.op().Output("Output"));
const std::string key_conv_pd = key + "@conv_pd";
std::vector<primitive> pipeline;
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
* the memory format preferred for best performance
*/
auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
memory::format::any);
auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
auto weights_md = platform::MKLDNNMemDesc(
weights_tz, memory::data_type::f32, memory::format::any);
auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32,
memory::format::any);
weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
auto dst_md = platform::MKLDNNMemDesc(
dst_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
// create a conv primitive descriptor and save it for usage in backward
std::shared_ptr<conv_fwd::primitive_desc> conv_pd = ConvFwdPrimitiveDesc(
src_md, weights_md, dst_md, strides, paddings, mkldnn_engine);
// Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd);
// create reorder primitive if the input format is not the preferred one
auto src_memory = user_src_memory;
primitive reorder_src;
bool is_src_reordered = false;
if (memory::primitive_desc(conv_pd->src_primitive_desc()) !=
user_src_memory.get_primitive_desc()) {
src_memory = memory(conv_pd->src_primitive_desc());
reorder_src = reorder(user_src_memory, src_memory);
is_src_reordered = true;
}
auto weights_memory = user_weights_memory;
primitive reorder_weights;
bool is_weights_reordered = false;
if (memory::primitive_desc(conv_pd->weights_primitive_desc()) !=
user_weights_memory.get_primitive_desc()) {
weights_memory = memory(conv_pd->weights_primitive_desc());
reorder_weights = reorder(user_weights_memory, weights_memory);
is_weights_reordered = true;
}
ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
// create memory primitive for conv dst
auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data);
// create mkldnn memory from input tensors (data/weights)
auto user_src_memory_p =
handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
auto user_weights_memory_p = handler.AcquireWeightsMemory(
user_weights_md, to_void_cast<T>(filter_data));
// create reorder primitive if the input format is not the preferred one
auto src_memory_p =
handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
user_weights_memory_p, pipeline);
auto dst_memory_p =
handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
// create convolution op primitive
auto conv_prim = conv_fwd(*conv_pd, src_memory, weights_memory, dst_memory);
auto conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
dst_memory_p);
// push primitive to stream and wait until it's executed
std::vector<primitive> pipeline;
if (is_src_reordered) pipeline.push_back(reorder_src);
if (is_weights_reordered) pipeline.push_back(reorder_weights);
pipeline.push_back(conv_prim);
pipeline.push_back(*conv_p);
stream(stream::kind::eager).submit(pipeline).wait();
// Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd);
output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetMKLDNNFormat(dst_memory));
output->set_format(GetMKLDNNFormat(*dst_memory_p));
}
private:
......@@ -197,13 +260,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
if (!input_grad && !filter_grad) return;
// Get an unique name from "argument" name of "Output" variable
// This name will be used as key when saving info into device context
const std::string key = ctx.op().Input("Output");
const std::string key_conv_pd = key + "@conv_pd";
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
......@@ -223,6 +283,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
paddle::framework::vectorize2int(filter->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// Get an unique name from "argument" name of "Output" variable
// This name will be used as key when saving info into device context
const std::string key =
ConvMKLDNNHandler::GetHash(src_tz, weights_tz, strides, paddings,
dilations, groups, ctx.op().Input("Output"));
const std::string key_conv_pd = key + "@conv_pd";
// create mkldnn memory from input tensors (input/weights/output_grad)
auto user_src_memory = memory(
{{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
......
......@@ -86,8 +86,9 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
std::minstd_rand engine,
std::vector<int>* inds) const {
std::uniform_real_distribution<float> uniform(0, 1);
if (inds->size() > num) {
for (int i = num; i < inds->size(); ++i) {
const int64_t size = static_cast<int64_t>(inds->size());
if (size > num) {
for (int64_t i = num; i < size; ++i) {
int rng_ind = std::floor(uniform(engine) * i);
if (rng_ind < num)
std::iter_swap(inds->begin() + rng_ind + offset,
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/im2sequence_op.h"
#include <string>
#include <vector>
namespace paddle {
......@@ -28,20 +29,19 @@ class Im2SequenceOp : public framework::OperatorWithKernel {
"Input(X) of Im2SequenceOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of Im2SequenceOp op should not be null.");
auto in_dim = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(in_dim.size(), 4,
"Input(X) format must be 4D tensor, eg., NCHW.");
auto kernels = ctx->Attrs().Get<std::vector<int>>("kernels");
auto strides = ctx->Attrs().Get<std::vector<int>>("strides");
auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
int batch_size = in_dim[0];
int img_channels = in_dim[1];
int img_height = in_dim[2];
int img_width = in_dim[3];
auto kernels = ctx->Attrs().Get<std::vector<int>>("kernels");
auto strides = ctx->Attrs().Get<std::vector<int>>("strides");
auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
paddings[2], strides[0]);
int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
......@@ -61,6 +61,10 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
"C: channels"
"H: height"
"W: width");
AddInput("Y",
"(Tensor) The input tensor of image real size(H, W)."
"2-D with shape [batchsize, 2]")
.AsDispensable();
AddOutput("Out", "(LodTensor) The output data of im2sequence op,");
AddAttr<std::vector<int>>("kernels",
"(vector<int>), the "
......@@ -73,6 +77,13 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
"(vector<int> default:{0, 0, 0, 0}), the "
"paddings(up_pad, left_pad, down_pad, right_pad)")
.SetDefault({0, 0, 0, 0});
AddAttr<std::vector<int>>("out_stride",
"the attribute is valid only when input(Y)"
"is not NULL.this attribute represents the"
"scaling of the pic through the CNN"
"(vector<int> dedault:{1,1}),the out_stride"
" (out_stride_height, out_stride_width)")
.SetDefault({1, 1});
AddComment(R"DOC(
This op uses kernels to scan images and converts these images to sequences.
After expanding, The number of time steps are output_height * output_width
......@@ -123,7 +134,7 @@ output.data = [[ 6. 2. 8. 3. 2. 4. 6. 3.]
[ 7. 1. 7. 9. 2. 1. 3. 5.]
[ 5. 7. 2. 4. 1. 3. 9. 0.]
[ 7. 9. 4. 8. 3. 5. 0. 8.]]
output.dims = {8, 9}
output.dims = {8, 8}
output.lod = [[0, 4, 8]]
)DOC");
......
......@@ -13,6 +13,7 @@
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/eigen.h"
......@@ -39,50 +40,106 @@ class Im2SequenceKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* in = ctx.Input<Tensor>("X");
LoDTensor* out = ctx.Output<LoDTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
// TODO(wanghaoshuang): Add layout checker after 'set_layout'
// being available for python API
// PADDLE_ENFORCE_EQ(in->layout(), framework::DataLayout::kNCHW,
// "Input(X) layout must be NCHW");
auto in_dim = in->dims();
int batch_size = in_dim[0];
int img_channels = in_dim[1];
int img_height = in_dim[2];
int img_width = in_dim[3];
auto kernels = ctx.Attr<std::vector<int>>("kernels");
auto strides = ctx.Attr<std::vector<int>>("strides");
auto paddings = ctx.Attr<std::vector<int>>("paddings");
int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
paddings[2], strides[0]);
int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
paddings[3], strides[1]);
const std::vector<int> dilations({1, 1});
auto out_dims = out->dims();
out->Resize({batch_size, out->numel() / batch_size});
for (int i = 0; i < batch_size; i++) {
const Tensor src =
in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
Tensor dst = out->Slice(i, i + 1).Resize(
{output_height, output_width, img_channels, kernels[0], kernels[1]});
math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
f(dev_ctx, src, dilations, strides, paddings, &dst);
}
out->Resize(out_dims);
// set lod information
// TODO(wanghaoshuang): Move this to InferShape
framework::LoD lod(1);
lod[0].reserve(batch_size + 1);
for (int i = 0, offset = 0; i < batch_size + 1; ++i) {
if (ctx.HasInput("Y") && batch_size > 1) {
const Tensor* imgrealsize = ctx.Input<Tensor>("Y");
auto out_stride = ctx.Attr<std::vector<int>>("out_stride");
Tensor cpu_shape_tensor;
TensorCopySync(*imgrealsize, platform::CPUPlace(), &cpu_shape_tensor);
std::vector<int> imgreal_h;
std::vector<int> imgreal_w;
std::vector<int> output_height;
std::vector<int> output_width;
int result = 0;
for (int i = 0; i < batch_size; i++) {
int tmp_real_h = static_cast<int>((cpu_shape_tensor.data<T>())[2 * i]);
int tmp_real_w =
static_cast<int>((cpu_shape_tensor.data<T>())[2 * i + 1]);
if (tmp_real_h % out_stride[0] == 0) {
tmp_real_h = tmp_real_h / out_stride[0];
} else {
tmp_real_h = tmp_real_h / out_stride[0] + 1;
}
if (tmp_real_w % out_stride[1] == 0) {
tmp_real_w = tmp_real_w / out_stride[1];
} else {
tmp_real_w = tmp_real_w / out_stride[1] + 1;
}
imgreal_h.push_back(tmp_real_h);
imgreal_w.push_back(tmp_real_w);
output_height.push_back(Im2SeqOutputSize(
imgreal_h[i], kernels[0], paddings[0], paddings[2], strides[0]));
output_width.push_back(Im2SeqOutputSize(
imgreal_w[i], kernels[1], paddings[1], paddings[3], strides[1]));
result += output_height[i] * output_width[i];
}
out->mutable_data<T>({result, img_channels * kernels[0] * kernels[1]},
ctx.GetPlace());
const std::vector<int> dilations({1, 1});
int offset_out = 0;
for (int i = 0; i < batch_size; i++) {
const Tensor src =
in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
Tensor dst = out->Slice(offset_out,
offset_out + output_height[i] * output_width[i])
.Resize({output_height[i], output_width[i],
img_channels, kernels[0], kernels[1]});
offset_out += output_height[i] * output_width[i];
math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
f(dev_ctx, src, dilations, strides, paddings, &dst);
}
framework::LoD lod(1);
lod[0].reserve(batch_size + 1);
int offset = 0;
lod[0].push_back(offset);
for (int i = 0; i < batch_size; ++i) {
offset += output_height[i] * output_width[i];
lod[0].push_back(offset);
}
out->set_lod(lod);
} else {
out->mutable_data<T>(ctx.GetPlace());
int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
paddings[2], strides[0]);
int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
paddings[3], strides[1]);
const std::vector<int> dilations({1, 1});
auto out_dims = out->dims();
out->Resize({batch_size, out->numel() / batch_size});
for (int i = 0; i < batch_size; i++) {
const Tensor src =
in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
Tensor dst =
out->Slice(i, i + 1).Resize({output_height, output_width,
img_channels, kernels[0], kernels[1]});
math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
f(dev_ctx, src, dilations, strides, paddings, &dst);
}
out->Resize(out_dims);
framework::LoD lod(1);
lod[0].reserve(batch_size + 1);
int offset = 0;
lod[0].push_back(offset);
offset += output_height * output_width;
for (int i = 0; i < batch_size; ++i) {
offset += output_height * output_width;
lod[0].push_back(offset);
}
out->set_lod(lod);
}
out->set_lod(lod);
}
};
......
......@@ -43,21 +43,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
int col_height = col->dims()[3];
int col_width = col->dims()[4];
PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
((dilation[0] * (filter_height - 1) + 1))) /
stride[0] +
1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
((dilation[1] * (filter_width - 1) + 1))) /
stride[1] +
1,
col_width,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
int channels_col = im_channels * filter_height * filter_width;
const T* im_data = im.data<T>();
......@@ -178,17 +163,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
int col_height = col->dims()[0];
int col_width = col->dims()[1];
PADDLE_ENFORCE_EQ(
(im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ(
(im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
const T* im_data = im.data<T>();
T* col_data = col->data<T>();
......
......@@ -77,21 +77,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
int col_height = col->dims()[3];
int col_width = col->dims()[4];
PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
(dilation[0] * (filter_height - 1) + 1)) /
stride[0] +
1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
(dilation[1] * (filter_width - 1) + 1)) /
stride[1] +
1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
int num_outputs = im_channels * col_height * col_width;
int blocks = (num_outputs + 1024 - 1) / 1024;
int block_x = 512;
......@@ -274,21 +259,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
int col_height = col->dims()[0];
int col_width = col->dims()[1];
PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
(dilation[0] * (filter_height - 1) + 1)) /
stride[0] +
1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
(dilation[1] * (filter_width - 1) + 1)) /
stride[1] +
1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
int block_dim_x = 0;
int block_dim_y = 0;
if (filter_height <= 4 && filter_width <= 4) {
......
......@@ -46,7 +46,7 @@ ENDIF()
# memcpy depends on device_context, here add deps individually for
# avoiding cycle dependencies
cc_library(device_context SRCS device_context.cc init.cc DEPS malloc
place eigen3 stringpiece cpu_helper ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
place eigen3 stringpiece cpu_helper framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
cc_test(init_test SRCS init_test.cc DEPS device_context)
......
......@@ -222,15 +222,16 @@ class MKLDNNHandler {
static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT
const std::string& suffix) {
auto dims2str = [](const mkldnn::memory::dims& operand_dims) {
std::string dstr = "";
for (size_t i = 0; i < operand_dims.size(); ++i) {
dstr += std::to_string(operand_dims[i]) + "-";
}
return dstr;
};
return dims2str(operand_dims) + suffix;
};
protected:
static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
std::string dstr = "";
for (size_t i = 0; i < operand_dims.size(); ++i) {
dstr += std::to_string(operand_dims[i]) + "-";
}
return dstr;
}
protected:
......
......@@ -510,11 +510,23 @@ function gen_fluid_inference_lib() {
EOF
make -j `nproc` inference_lib_dist
cd ${PADDLE_ROOT}/build
mv fluid_install_dir fluid
cp -r fluid_install_dir fluid
tar -cf fluid.tgz fluid
fi
}
function test_fluid_inference_lib() {
if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
cat <<EOF
========================================
Testing fluid inference library ...
========================================
EOF
cd ${PADDLE_ROOT}/paddle/contrib/inference/demo_ci
sh run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF}
fi
}
function main() {
set -e
local CMD=$1
......@@ -568,6 +580,7 @@ function main() {
run_test
gen_capi_package
gen_fluid_inference_lib
test_fluid_inference_lib
;;
*)
print_usage
......
......@@ -123,7 +123,8 @@ def _append_grad_suffix_(name):
def _addup_repetitive_outputs_(op_descs):
"""
In backward part, an variable may be the output of more than one ops.
In this case, the variable should be the accumulation of all the outputs.
And one op may yield its multiple outputs to the same variable.
In these cases, the variable should be the accumulation of all the outputs.
`sum_op`s are added to implement the accumulate.
"""
pending_sum_ops = []
......@@ -136,29 +137,46 @@ def _addup_repetitive_outputs_(op_descs):
"sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]},
{"use_mkldnn": False}), idx))
renamed_vars[var_name] = [var_name]
for var_name in op_desc.output_arg_names():
if var_name == core.empty_var_name(
) or var_name in op_desc.input_arg_names():
# empty variable or inplace op
continue
if len(renamed_vars[var_name]) == 0:
# it's the first time we get the variable
renamed_vars[var_name] = [var_name]
else:
if len(renamed_vars[var_name]) == 1:
for param_idx, param_name in enumerate(op_desc.output_names()):
arg_names = op_desc.output(param_name)
for arg_idx, var_name in enumerate(arg_names):
if var_name == core.empty_var_name(
) or var_name in op_desc.input_arg_names():
# empty variable or inplace op
continue
if len(renamed_vars[var_name]) == 0:
# it's the first time we get the variable
renamed_vars[var_name] = [var_name]
else:
if len(renamed_vars[var_name]) == 1:
new_name = var_name + "@RENAME@" + \
str(var_rename_count[var_name])
var_rename_count[var_name] += 1
# rename original var_name
renamed_vars[var_name][0] = new_name
_rename_arg_(op_descs, var_name, new_name, 0, idx)
_rename_arg_(pending_sum_ops, var_name, new_name)
for p in op_desc.output_names()[:param_idx]:
p_arg_names = op_desc.output(p)
if var_name in p_arg_names:
op_desc.set_output(p, [
new_name if x == var_name else x
for x in p_arg_names
])
arg_names = [
new_name if x == var_name else x
for x in arg_names[:arg_idx]
] + arg_names[arg_idx:]
new_name = var_name + "@RENAME@" + \
str(var_rename_count[var_name])
var_rename_count[var_name] += 1
# rename original var_name
renamed_vars[var_name][0] = new_name
_rename_arg_(op_descs, var_name, new_name, 0, idx)
_rename_arg_(pending_sum_ops, var_name, new_name)
new_name = var_name + "@RENAME@" + \
str(var_rename_count[var_name])
var_rename_count[var_name] += 1
op_desc.rename_output(var_name, new_name)
renamed_vars[var_name].append(new_name)
arg_names[arg_idx] = new_name
op_desc.set_output(param_name, arg_names)
renamed_vars[var_name].append(new_name)
for var_name, inputs in renamed_vars.iteritems():
if len(inputs) > 1:
pending_sum_ops.append(
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c ) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -3900,7 +3914,13 @@ def transpose(x, perm, name=None):
return out
def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
def im2sequence(input,
filter_size=1,
stride=1,
padding=0,
input_image_size=None,
out_stride=1,
name=None):
"""
Extracts image patches from the input tensor to form a tensor of shape
{input.batch_size * output_height * output_width, filter_size_H *
......@@ -3937,6 +3957,15 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
padding_up = padding_down = padding_left = padding_right = padding
Default: padding = 0.
input_image_size(Variable): the input contains image real size.It's dim
is [batchsize, 2]. It is dispensable.It is just for batch inference.
out_stride(int|tuple): The scaling of image through CNN. It is
dispensable. It is valid only when input_image_size is not null.
If out_stride is tuple, it must contain two intergers,
(out_stride_H, out_stride_W). Otherwise,
the out_stride_H = out_stride_W = out_stride.
name (int): The name of this layer. It is optional.
Returns:
......@@ -3987,7 +4016,7 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
[ 5. 7. 2. 4. 1. 3. 9. 0.]
[ 7. 9. 4. 8. 3. 5. 0. 8.]]
output.dims = {8, 9}
output.dims = {8, 8}
output.lod = [[4, 4]]
......@@ -4009,18 +4038,17 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
if len(padding) == 2:
padding.append(padding[0])
padding.append(padding[1])
inputs = {"X": input}
attrs = {"kernels": filter_size, "strides": stride, "padding": padding}
if input_image_size:
if isinstance(out_stride, int):
out_stride = [out_stride, out_stride]
inputs["Y"] = input_image_size
attrs["out_stride"] = out_stride
helper = LayerHelper('im2sequence', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype())
helper.append_op(
type='im2sequence',
inputs={'X': input},
outputs={'Out': out},
attrs={
'kernels': filter_size,
'strides': stride,
'paddings': padding,
})
type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs)
return out
......
......@@ -16,23 +16,48 @@ import numpy as np
from op_test import OpTest
def get_output_shape(attrs, in_shape):
def get_output_shape(attrs, in_shape, img_real_size):
batchsize = in_shape[0]
img_height = in_shape[2]
img_width = in_shape[3]
paddings = np.array(attrs['paddings']).astype("int32")
kernels = np.array(attrs['kernels']).astype("int32")
strides = np.array(attrs['strides']).astype("int32")
output_height = np.zeros((1, batchsize)).astype("int32")
output_width = np.zeros((1, batchsize)).astype("int32")
if len(img_real_size):
out_stride = np.array(attrs['out_stride']).astype("int32")
imgreal_h = 0
imgreal_w = 0
for index in range(batchsize):
if img_real_size[index, 0] % out_stride[0] == 0:
imgreal_h = img_real_size[index, 0] / out_stride[0]
else:
imgreal_h = img_real_size[index, 0] / out_stride[0] + 1
if img_real_size[index, 0] % out_stride[1] == 0:
imgreal_w = img_real_size[index, 1] / out_stride[1]
else:
imgreal_w = img_real_size[index, 0] / out_stride[1] + 1
output_height[0,index] = \
1 + \
(imgreal_h + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
strides[0]
paddings = attrs['paddings']
kernels = attrs['kernels']
strides = attrs['strides']
output_width[0,index] = \
1 + \
(imgreal_w + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
strides[1]
else:
for index in range(batchsize):
output_height[0,index] = \
1 + \
(img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
strides[0]
output_height = \
1 + \
(img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
strides[0]
output_width = \
1 + \
(img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
strides[1]
output_width[0,index] = \
1 + \
(img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
strides[1]
return output_height, output_width
......@@ -75,22 +100,25 @@ def im2col(attrs, im, col):
im_row_offset][im_col_offset]
def Im2Sequence(inputs, attrs):
output_height, output_width = get_output_shape(attrs, inputs.shape)
def Im2Sequence(inputs, img_real_size, attrs):
output_height, output_width = get_output_shape(attrs, inputs.shape,
img_real_size)
img_channels = inputs.shape[1]
batch_size = inputs.shape[0]
out = np.zeros([
batch_size, output_height, output_width, img_channels,
attrs['kernels'][0], attrs['kernels'][1]
]).astype("float32")
for i in range(len(inputs)):
im2col(attrs, inputs[i], out[i])
out = out.reshape([
batch_size * output_height * output_width,
img_channels * attrs['kernels'][0] * attrs['kernels'][1]
])
out = []
for index in range(batch_size):
tmp = np.zeros([
output_height[0, index], output_width[0, index], img_channels,
attrs['kernels'][0], attrs['kernels'][1]
]).astype("float32")
out.append(tmp)
for index in range(len(inputs)):
im2col(attrs, inputs[index], out[index])
out[index] = out[index].reshape([
output_height[0, index] * output_width[0, index],
img_channels * attrs['kernels'][0] * attrs['kernels'][1]
])
out = np.concatenate(out, axis=0)
return out
......@@ -103,7 +131,7 @@ class TestBlockExpandOp(OpTest):
self.attrs = {
'kernels': [2, 2],
'strides': [1, 1],
'paddings': [1, 1, 1, 1]
'paddings': [1, 1, 1, 1],
}
def setUp(self):
......@@ -113,7 +141,8 @@ class TestBlockExpandOp(OpTest):
self.batch_size, self.img_channels, self.img_height, self.img_width
]).astype("float32")
out = Im2Sequence(x, self.attrs)
real_size = np.array([]).astype("float32")
out = Im2Sequence(x, real_size, self.attrs)
self.inputs = {'X': x}
self.outputs = {'Out': out}
......@@ -133,20 +162,20 @@ class TestBlockExpandOpCase2(TestBlockExpandOp):
self.attrs = {
'kernels': [2, 1],
'strides': [2, 1],
'paddings': [2, 1, 2, 1]
'paddings': [2, 1, 2, 1],
}
class TestBlockExpandOpCase3(TestBlockExpandOp):
def config(self):
self.batch_size = 3
self.batch_size = 2
self.img_channels = 1
self.img_height = 4
self.img_width = 5
self.attrs = {
'kernels': [2, 1],
'strides': [2, 1],
'paddings': [2, 0, 2, 0]
'paddings': [2, 0, 2, 0],
}
......@@ -159,9 +188,94 @@ class TestBlockExpandOpCase4(TestBlockExpandOp):
self.attrs = {
'kernels': [2, 2],
'strides': [1, 1],
'paddings': [0, 0, 0, 0]
'paddings': [0, 0, 0, 0],
}
class TestBlockExpandOpCase5(OpTest):
def config(self):
self.batch_size = 1
self.img_channels = 3
self.img_height = 4
self.img_width = 5
self.attrs = {
'kernels': [2, 1],
'strides': [2, 1],
'paddings': [2, 1, 2, 1],
'out_stride': [2, 2],
}
def setUp(self):
self.config()
self.op_type = "im2sequence"
x = np.random.uniform(0.1, 1, [
self.batch_size, self.img_channels, self.img_height, self.img_width
]).astype("float32")
real_size = np.array([[8, 10], [5, 8]]).astype("float32")
out = np.array(Im2Sequence(x, real_size, self.attrs))
self.inputs = {'X': x, 'Y': real_size} #l ??
self.outputs = {'Out': out}
def test_check_output(self):
self.check_output()
class TestBlockExpandOpCase6(OpTest):
def config(self):
self.batch_size = 3
self.img_channels = 1
self.img_height = 4
self.img_width = 5
self.attrs = {
'kernels': [2, 1],
'strides': [1, 1],
'paddings': [0, 0, 0, 0],
'out_stride': [1, 1],
}
def setUp(self):
self.config()
self.op_type = "im2sequence"
x = np.random.uniform(0.1, 1, [
self.batch_size, self.img_channels, self.img_height, self.img_width
]).astype("float32")
real_size = np.array([[8, 10], [5, 8], [5, 8]]).astype("float32")
out = np.array(Im2Sequence(x, real_size, self.attrs))
self.inputs = {'X': x, 'Y': real_size} #l ??
self.outputs = {'Out': out}
def test_check_output(self):
self.check_output()
class TestBlockExpandOpCase7(OpTest):
def config(self):
self.batch_size = 2
self.img_channels = 2
self.img_height = 3
self.img_width = 3
self.attrs = {
'kernels': [2, 2],
'strides': [1, 1],
'paddings': [1, 0, 1, 0],
'out_stride': [2, 2],
}
def setUp(self):
self.config()
self.op_type = "im2sequence"
x = np.random.uniform(0.1, 1, [
self.batch_size, self.img_channels, self.img_height, self.img_width
]).astype("float32")
real_size = np.array([[6, 6], [4, 4]]).astype("float32")
out = np.array(Im2Sequence(x, real_size, self.attrs))
self.inputs = {'X': x, 'Y': real_size}
self.outputs = {'Out': out}
def test_check_output(self):
self.check_output()
if __name__ == '__main__':
unittest.main()
#set shiftwidth=4 set expandtab set tabstop=4
......@@ -251,12 +251,16 @@ class TestBook(unittest.TestCase):
print(str(program))
def test_im2sequence(self):
print("test_im2sequence")
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[3, 128, 128], dtype='float32')
y = layers.data(name='y', shape=[], dtype='float32')
output = layers.im2sequence(
input=x, stride=[1, 1], filter_size=[2, 2])
input=x,
input_image_size=y,
stride=[1, 1],
filter_size=[2, 2],
out_stride=[1, 1])
self.assertIsNotNone(output)
print(str(program))
......
......@@ -181,6 +181,14 @@ else:
command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
if os.system(command) != 0:
raise Exception("patch core.so failed, command: %s" % command)
if '${WITH_FLUID_ONLY}'== 'OFF':
# change rpath of _swig_paddle.so.
if "@APPLE@" == "1":
command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
else:
command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
if os.system(command) != 0:
raise Exception("patch _swig_paddle.so failed, command: %s" % command)
setup(name='${PACKAGE_NAME}',
version='${PADDLE_VERSION}',
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册