提交 459d4cc8 编写于 作者: L luotao1

Merge branch 'develop' into multi-thread2

......@@ -53,7 +53,7 @@ RUN curl -s -q https://glide.sh/get | sh
# and its size is only one-third of the official one.
# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
tar -xz -C /usr/local && \
cp -rf /usr/local/TensorRT/include /usr && \
cp -rf /usr/local/TensorRT/lib /usr
......
......@@ -76,33 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85
## Installation
It is recommended to check out the
[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/docker_install_en.html)
before looking into the
[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html).
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/install/install_doc.html) on our website.
## Documentation
We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and
[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation.
We provide [English](http://paddlepaddle.org/documentation/docs/en/0.14.0/getstarted/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/index.html) documentation.
- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)
- [Deep Learning 101](https://github.com/PaddlePaddle/book)
You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/index_en.html)
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/user_guides/howto/training/cluster_howto.html)
You can run distributed training jobs on MPI clusters.
- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/multi_cluster/k8s_en.html)
You can also run distributed training jobs on Kubernetes clusters.
- [Python API](http://www.paddlepaddle.org/docs/develop/api/en/overview.html)
- [Python API](http://paddlepaddle.org/documentation/api/zh/0.14.0/fluid.html)
Our new API enables much shorter programs.
- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/dev/contribute_to_paddle_en.html)
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
We appreciate your contributions!
......
......@@ -169,14 +169,19 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
if (NOT WIN32) # windows msvc2015 support c++11 natively.
# -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake.
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
endif(NOT WIN32)
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
# in cuda9, suppress cuda warning on eigen
list(APPEND CUDA_NVCC_FLAGS "-w")
# Set :expt-relaxed-constexpr to suppress Eigen warnings
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
if (NOT WIN32)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
......@@ -187,6 +192,13 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
# nvcc 9 does not support -Os. Use Release flags instead
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif()
else(NOT WIN32)
if(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
else()
message(FATAL "Windows only support Release build now. Please set visual studio build type to Release, x64 build.")
endif()
endif(NOT WIN32)
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
......@@ -44,7 +44,7 @@ ExternalProject_Add(
# 3. keep only zlib, cares, protobuf, boringssl under "third_party",
# checkout and clean other dirs under third_party
# 4. remove .git, and package the directory.
URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz"
URL_MD5 "1f268a2aff6759839dccd256adcc91cf"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
......
......@@ -128,16 +128,13 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(module "framework")
if (NOT WIN32)
copy(framework_lib DEPS framework_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
)
else()
copy(framework_lib
set(framework_lib_deps framework_py_proto)
endif(NOT WIN32)
copy(framework_lib DEPS ${framework_lib_deps}
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
${src_dir}/${module}/ir/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir
)
endif(NOT WIN32)
set(module "memory")
copy(memory_lib
......@@ -161,7 +158,8 @@ set(module "inference")
copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
)
set(module "platform")
......
......@@ -38,7 +38,6 @@ PaddlePaddle Fluid支持两种传入数据的方式:
:maxdepth: 2
feeding_data
use_recordio_reader
Python Reader
#############
......
.. _user_guide_use_recordio_as_train_data:
############################
使用RecordIO文件作为训练数据
############################
相比于 :ref:`user_guide_use_numpy_array_as_train_data`,
:ref:`user_guide_use_recordio_as_train_data` 的性能更好;
但是用户需要先将训练数据集转换成RecordIO文件格式,再使用
:code:`fluid.layers.open_files()` 层在神经网络配置中导入 RecordIO 文件。
用户还可以使用 :code:`fluid.layers.double_buffer()` 加速数据从内存到显存的拷贝,
使用 :code:`fluid.layers.Preprocessor` 工具进行数据增强。
将训练数据转换成RecordIO文件格式
################################
:code:`fluid.recordio_writer` 中,每个记录都是一个
:code:`vector<LoDTensor>`, 即一个支持序列信息的Tensor数组。这个数组包括训练所需
的所有特征。例如对于图像分类来说,这个数组可以包含图片和分类标签。
用户可以使用 :code:`fluid.recordio_writer.convert_reader_to_recordio_file()` 可以将
:ref:`user_guide_reader` 转换成一个RecordIO文件。或者可以使用
:code:`fluid.recordio_writer.convert_reader_to_recordio_files()` 将一个
:ref:`user_guide_reader` 转换成多个RecordIO文件。
具体使用方法为:
.. code-block:: python
import paddle.fluid as fluid
import numpy
def reader_creator():
def __impl__():
for i in range(1000):
yield [
numpy.random.random(size=[3,224,224], dtype="float32"),
numpy.random.random(size=[1], dtype="int64")
]
return __impl__
img = fluid.layers.data(name="image", shape=[3, 224, 224])
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
BATCH_SIZE = 32
reader = paddle.batch(reader_creator(), batch_size=BATCH_SIZE)
fluid.recordio_writer.convert_reader_to_recordio_file(
"train.recordio", feeder=feeder, reader_creator=reader)
其中 :code:`reader_creator` 创建了一个 :code:`Reader`。
:ref:`_api_fluid_data_feeder_DataFeeder`
是将 :code:`Reader` 转换成 :code:`LoDTensor` 的工具。详细请参考
:ref:`user_guide_reader` 。
上述程序将 :code:`reader_creator` 的数据转换成了 :code:`train.recordio` 文件,
其中每一个record 含有 32 条样本。如果batch size会在训练过程中调整,
用户可以将每一个Record的样本数设置成1。并参考
:ref:`user_guide_use_recordio_as_train_data_use_op_create_batch`。
配置神经网络, 打开RecordIO文件
##############################
RecordIO文件转换好之后,用户可以使用 :code:`fluid.layers.open_files()`
打开文件,并使用 :code:`fluid.layers.read_file` 读取文件内容。
简单使用方法如下:
.. code-block:: python
import paddle.fluid as fluid
file_obj = fluid.layers.open_files(
filenames=["train.recordio"],
shape=[[3, 224, 224], [1]],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
pass_num=100
)
image, label = fluid.layers.read_file(file_obj)
其中如果设置了 :code:`pass_num` ,那么当所有数据读完后,会重新读取数据,
直到读取了 :code:`pass_num` 遍。
进阶使用
########
使用 :code:`fluid.layers.double_buffer()`
------------------------------------------
:code:`Double buffer` 使用双缓冲技术,将训练数据从内存中复制到显存中。配置双缓冲
需要使用 :code:`fluid.layers.double_buffer()` 修饰文件对象。 例如:
.. code-block:: python
import paddle.fliud as fluid
file_obj = fluid.layers.open_files(...)
file_obj = fluid.layers.double_buffer(file_obj)
image, label = fluid.layers.read_file(file_obj)
双缓冲技术可以参考
`Multiple buffering <https://en.wikipedia.org/wiki/Multiple_buffering>`_ 。
配置数据增强
------------
使用 :code:`fluid.layers.Preprocessor` 可以配置文件的数据增强方法。例如
.. code-block:: python
import paddle.fluid as fluid
file_obj = fluid.layers.open_files(...)
preprocessor = fluid.layers.Preprocessor(reader=data_file)
with preprocessor.block():
image, label = preprocessor.inputs()
image = image / 2
label = label + 1
preprocessor.outputs(image, label)
如上代码所示,使用 :code:`Preprocessor` 定义了一个数据增强模块,并在
:code:`with preprocessor.block()` 中定义了数据增强的具体操作。 用户通过配置
:code:`preprocessor.inputs()` 获得数据文件中的各个字段。 并用
:code:`preprocessor.outputs()` 标记预处理后的输出。
.. _user_guide_use_recordio_as_train_data_use_op_create_batch:
使用Op组batch
-------------
使用 :code:`fluid.layers.batch()` 可以在训练的过程中动态的组batch。例如
.. code-block:: python
import paddle.fluid as fluid
file_obj = fluid.layers.open_files(...)
file_obj = fluid.layers.batch(file_obj, batch_size=32)
img, label = fluid.layers.read_file(file_obj)
需要注意的是,如果数据集中的最后几个样本不能组成 :code:`batch_size` 大小的批量数据,
那么这几个样本直接组成一个批量数据进行训练。
读入数据的shuffle
-----------------
使用 :code:`fluid.layers.shuffle()` 可以在训练过程中动态重排训练数据。例如
.. code-block:: python
import paddle.fluid as fluid
file_obj = fluid.layers.open_files(...)
file_obj = fliud.layers.shuffle(file_obj, buffer_size=8192)
img, label = fliud.layers.read_file(file_obj)
需要注意的是:
1. :code:`shuffle` 实现方法是:
先读入 :code:`buffer_size` 条样本,再随机的选出样本进行训练。
2. :code:`shuffle` 中 :code:`buffer_size` 会占用训练内存,需要确定训练过程中内存
足够支持缓存 :code:`buffer_size` 条数据。
......@@ -172,6 +172,7 @@ paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'],
paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None))
paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
......
.tensor_util.cu
.data_type_transform.cu
\ No newline at end of file
# windows treat symbolic file as a real file, which is different with unix
# We create a hidden file and compile it instead of origin source file.
function(windows_symbolic TARGET)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
foreach(src ${windows_symbolic_SRCS})
get_filename_component(src ${src} NAME_WE)
if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
endif()
add_custom_command(OUTPUT .${src}.cu
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu"
COMMENT "create hidden file of ${src}.cu")
add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
endforeach()
endfunction()
add_subdirectory(ir)
if (NOT WIN32)
add_subdirectory(details)
......@@ -11,7 +30,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
if(WITH_GPU)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
if (WIN32)
windows_symbolic(tensor_util SRCS tensor_util.cu)
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
add_dependencies(tensor tensor_util)
else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
endif(WIN32)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
endif()
......@@ -55,7 +80,13 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function)
if(WITH_GPU)
nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
if (WIN32)
windows_symbolic(hidden_file SRCS data_type_transform.cu)
nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
add_dependencies(data_type_transform hidden_file)
else()
nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
endif(WIN32)
nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
else()
cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
......
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n")
file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
function(pass_library TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass)
file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
endfunction()
cc_library(node SRCS node.cc DEPS proto_desc)
cc_library(graph SRCS graph.cc DEPS node)
cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
cc_library(graph_to_program_pass SRCS graph_to_program_pass.cc DEPS graph pass graph_helper)
cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detector)
cc_library(attention_lstm_fuse_pass SRCS attention_lstm_fuse_pass.cc DEPS graph graph_pattern_detector)
cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass)
cc_library(fc_lstm_fuse_pass SRCS fc_lstm_fuse_pass.cc DEPS graph graph_pattern_detector)
cc_library(seq_concat_fc_fuse_pass SRCS seq_concat_fc_fuse_pass.cc DEPS graph graph_pattern_detector)
pass_library(graph_to_program_pass)
pass_library(graph_viz_pass)
pass_library(fc_fuse_pass)
pass_library(attention_lstm_fuse_pass)
pass_library(infer_clean_graph_pass)
pass_library(fc_lstm_fuse_pass)
pass_library(seq_concat_fc_fuse_pass)
set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detector graph pass graph_traits framework_proto)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
......@@ -96,17 +96,13 @@ void FindWhileOp(Graph* graph) {
auto* cell_init = graph->RetriveNode(6);
auto* hidden_init = graph->RetriveNode(8);
#define LINK_TO(node0, node1) \
node0->outputs.push_back(node1); \
node1->inputs.push_back(node0);
auto* lstm_op = graph->CreateOpNode(&op_desc);
PrepareParameters(graph, param);
LINK_TO(X, lstm_op);
LINK_TO(cell_init, lstm_op);
LINK_TO(hidden_init, lstm_op);
LINK_TO(lstm_op, LSTMOUT);
IR_NODE_LINK_TO(X, lstm_op);
IR_NODE_LINK_TO(cell_init, lstm_op);
IR_NODE_LINK_TO(hidden_init, lstm_op);
IR_NODE_LINK_TO(lstm_op, LSTMOUT);
GraphSafeRemoveNodes(graph, marked_nodes);
}
......
......@@ -21,74 +21,26 @@ namespace paddle {
namespace framework {
namespace ir {
bool VarOutLinksToOp(Node* node, const std::string& op_type) {
for (auto* out : node->outputs) {
if (out->IsOp() && out->Op()->Type() == op_type) {
return true;
}
}
return false;
}
void BuildFCPattern(PDPattern* pattern) {
// Create Operators
auto* mul_op = pattern->NewNode("mul")->assert_is_op("mul");
auto* elementwise_add_op =
pattern->NewNode("elementwise_add")->assert_is_op("elementwise_add");
// Create variables
// w
auto* mul_weight_var = pattern->NewNode("mul_weight")
->AsInput()
->assert_is_op_nth_input("mul", "Y", 0);
// x
auto* mul_tmp_var = pattern->NewNode("mul_tmp_var")
->AsInput()
->assert_is_op_nth_input("mul", "X", 0);
// intermediate variable, will be removed in the IR after fuse.
auto* mul_out_var = pattern->NewNode("mul_out")
->AsIntermediate()
->assert_is_only_output_of_op("mul")
->assert_is_op_input("elementwise_add");
// bias
auto* elementwise_add_tmp_var = pattern->NewNode("elementwise_add_tmpvar")
->assert_is_op_input("elementwise_add")
->AsInput();
// output
auto* elementwise_add_out_var = pattern->NewNode("elementwise_add_out")
->AsOutput()
->assert_is_op_output("elementwise_add");
mul_op->LinksFrom({mul_weight_var, mul_tmp_var}).LinksTo({mul_out_var});
elementwise_add_op->LinksFrom({mul_out_var, elementwise_add_tmp_var})
.LinksTo({elementwise_add_out_var});
}
// Replace the node `from` in the links to `to`
bool LinksReplace(std::vector<Node*>* links, Node* from, Node* to) {
for (auto*& n : *links) {
if (n == from) {
n = to;
return true;
}
}
return false;
}
std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init("fc", graph.get());
FusePassBase::Init("fc_fuse", graph.get());
std::unordered_set<Node*> nodes2delete;
GraphPatternDetector gpd;
BuildFCPattern(gpd.mutable_pattern());
#define GET_NODE(id) \
PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode(#id)), \
"pattern has no Node called %s", #id); \
auto* id = subgraph.at(gpd.pattern().RetrieveNode(#id)); \
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
// BuildFCPattern(gpd.mutable_pattern());
auto* x = gpd.mutable_pattern()
->NewNode("fc_fuse/x")
->AsInput()
->assert_is_op_input("mul", "X");
patterns::FC(gpd.mutable_pattern(), "fc_fuse", x, true /*with bias*/);
#define GET_NODE(id) \
PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode("fc_fuse/" #id)), \
"pattern has no Node called %s", #id); \
auto* id = subgraph.at(gpd.pattern().RetrieveNode("fc_fuse/" #id)); \
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", "fc_fuse/" #id);
int found_fc_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
......@@ -98,43 +50,33 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
// scenerio.
// FC's fusion is simple, just op fuse, no need to process the
// parameters.
GET_NODE(mul_tmp_var); // x
GET_NODE(mul_weight); // Y
GET_NODE(elementwise_add_tmpvar); // bias
GET_NODE(elementwise_add_out); // Out
GET_NODE(mul); // MUL op
GET_NODE(elementwise_add); // ELEMENT_ADD op
GET_NODE(mul_out); // tmp
GET_NODE(x); // x
GET_NODE(w); // Y
GET_NODE(fc_bias); // bias
GET_NODE(fc_out); // Out
GET_NODE(mul); // MUL op
GET_NODE(elementwise_add); // ELEMENT_ADD op
GET_NODE(mul_out); // tmp
#undef GET_NODE
// Create an FC Node.
OpDesc desc;
std::string fc_x_in = mul_tmp_var->Name();
std::string fc_Y_in = mul_weight->Name();
std::string fc_bias_in = elementwise_add_tmpvar->Name();
std::string fc_out = elementwise_add_out->Name();
std::string fc_x_in = x->Name();
std::string fc_Y_in = w->Name();
std::string fc_bias_in = fc_bias->Name();
std::string fc_out_out = fc_out->Name();
desc.SetInput("Input", std::vector<std::string>({fc_x_in}));
desc.SetInput("W", std::vector<std::string>({fc_Y_in}));
desc.SetInput("Bias", std::vector<std::string>({fc_bias_in}));
desc.SetOutput("Out", std::vector<std::string>({fc_out}));
desc.SetOutput("Out", std::vector<std::string>({fc_out_out}));
desc.SetType("fc");
auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied.
fc_node->inputs =
std::vector<Node*>({mul_tmp_var, mul_weight, elementwise_add_tmpvar});
fc_node->outputs.push_back(elementwise_add_out);
// Update link relatons
PADDLE_ENFORCE(LinksReplace(&mul_tmp_var->outputs, mul, fc_node));
PADDLE_ENFORCE(LinksReplace(&mul_weight->outputs, mul, fc_node));
PADDLE_ENFORCE(LinksReplace(&elementwise_add_tmpvar->outputs,
elementwise_add, fc_node));
PADDLE_ENFORCE(
LinksReplace(&elementwise_add_out->inputs, elementwise_add, fc_node));
GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out});
// Drop old nodes
graph->RemoveNode(mul);
graph->RemoveNode(elementwise_add);
graph->RemoveNode(mul_out); // tmp variable
IR_NODE_LINK_TO(x, fc_node);
IR_NODE_LINK_TO(w, fc_node);
IR_NODE_LINK_TO(fc_bias, fc_node);
IR_NODE_LINK_TO(fc_node, fc_out);
found_fc_count++;
};
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle {
......@@ -94,21 +95,37 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
op_desc.SetOutput("Hidden", {hidden_n->Name()});
op_desc.SetOutput("Cell", {cell_n->Name()});
op_desc.SetOutput("XX", {xx_n->Name()});
op_desc.SetOutput("BatchedGate", {"blstm_0.tmp_2"});
op_desc.SetOutput("BatchCellPreAct", {"blstm_1.tmp_2"});
op_desc.SetOutput("BatchedInput", {"blstm_0.tmp_2"});
op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse"));
op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes"));
auto* op = graph->CreateOpNode(&op_desc);
// TODO(TJ): get from attr
op_desc.SetAttr("use_seq", true);
#define TMP_NAME(x) "at.new.tmp." #x
#define OP_SET_OUT(x) op_desc.SetOutput(#x, {TMP_NAME(x)})
OP_SET_OUT(BatchedCell);
OP_SET_OUT(BatchedHidden);
OP_SET_OUT(ReorderedH0);
OP_SET_OUT(ReorderedC0);
#undef OP_SET_OUT
#define LINK_TO(a, b) \
a->outputs.push_back(b); \
b->inputs.push_back(a);
LINK_TO(input_n, op);
LINK_TO(weight_x_n, op);
LINK_TO(weight_h_n, op);
LINK_TO(bias_n, op);
LINK_TO(op, hidden_n);
#undef LINK_TO
auto* op = graph->CreateOpNode(&op_desc);
PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
auto* scope = graph->Get<Scope*>(kParamScopeAttr);
#define TMP_NEW(x) scope->Var(TMP_NAME(x))->GetMutable<LoDTensor>()
TMP_NEW(BatchedCell);
TMP_NEW(BatchedHidden);
TMP_NEW(ReorderedH0);
TMP_NEW(ReorderedC0);
#undef TMP_NEW
#undef TMP_NAME
IR_NODE_LINK_TO(input_n, op);
IR_NODE_LINK_TO(weight_x_n, op);
IR_NODE_LINK_TO(weight_h_n, op);
IR_NODE_LINK_TO(bias_n, op);
IR_NODE_LINK_TO(op, hidden_n);
return op;
};
......@@ -116,7 +133,6 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
auto fc_no_bias_handler = [&](
const GraphPatternDetector::subgraph_t& subgraph, Graph* g) {
#define GET_NODE(name__) \
std::string name__##key = name_scope + "/" + #name__; \
auto* name__##n = pattern->RetrieveNode(name__##key); \
......
......@@ -111,6 +111,11 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
return false;
}
}
for (auto& item : pdnodes2nodes_) {
for (auto& n : item.second) {
GetMarkedNodes(const_cast<Graph*>(&graph)).insert(n);
}
}
VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
return !pdnodes2nodes_.empty();
......@@ -278,7 +283,7 @@ void GraphPatternDetector::RemoveOverlappedMatch(
for (const auto& subgraph : *subgraphs) {
bool valid = true;
for (auto& item : subgraph) {
if (node_set.count(item.second)) {
if (item.first->IsIntermediate() && node_set.count(item.second)) {
valid = false;
break;
}
......@@ -334,22 +339,22 @@ PDNode& PDNode::LinksFrom(const std::vector<PDNode*>& others) {
}
PDNode* PDNode::assert_is_op() {
asserts_.emplace_back([this](Node* x) { return x && x->IsOp(); });
asserts_.emplace_back([](Node* x) { return x && x->IsOp(); });
return this;
}
PDNode* PDNode::assert_is_op(const std::string& op_type) {
asserts_.emplace_back([this, op_type](Node* x) {
asserts_.emplace_back([op_type](Node* x) {
return x && x->IsOp() && x->Op()->Type() == op_type;
});
return this;
}
PDNode* PDNode::assert_is_var() {
asserts_.emplace_back([this](Node* x) { return x && x->IsVar(); });
asserts_.emplace_back([](Node* x) { return x && x->IsVar(); });
return this;
}
PDNode* PDNode::assert_var_not_persistable() {
assert_is_var();
asserts_.emplace_back([this](Node* x) { return !x->Var()->Persistable(); });
asserts_.emplace_back([](Node* x) { return !x->Var()->Persistable(); });
return this;
}
PDNode* PDNode::assert_is_persistable_var() {
......@@ -491,14 +496,16 @@ void GraphSafeRemoveNodes(Graph* graph,
for (auto it = node->inputs.begin(); it != node->inputs.end();) {
if (nodes.count(*it)) {
it = const_cast<Node*>(node)->inputs.erase(it);
} else
} else {
it++;
}
}
for (auto it = node->outputs.begin(); it != node->outputs.end();) {
if (nodes.count(*it)) {
it = const_cast<Node*>(node)->outputs.erase(it);
} else
} else {
it++;
}
}
}
}
......
......@@ -245,6 +245,8 @@ class GraphPatternDetector {
void UniquePatterns(std::vector<subgraph_t>* subgraphs);
// Remove overlapped match subgraphs, when overlapped, keep the previous one.
// The intermediate PDNodes will be removed, so can't shared by multiple
// patterns.
void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);
// Validate whether the intermediate nodes are linked by external nodes.
......@@ -295,6 +297,10 @@ PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x);
} // namespace patterns
#define IR_NODE_LINK_TO(a, b) \
a->outputs.push_back(b); \
b->inputs.push_back(a);
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -140,8 +140,9 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3");
},
"OP0");
auto* any_var = x.mutable_pattern()->NewNode(
[](Node* node) { return node->IsVar(); }, "VAR");
auto* any_var = x.mutable_pattern()
->NewNode([](Node* node) { return node->IsVar(); }, "VAR")
->AsIntermediate();
auto* any_op1 = x.mutable_pattern()->NewNode(
[](Node* node) { return node->IsOp(); }, "OP1");
......
......@@ -13,42 +13,41 @@
// limitations under the License.
#include <algorithm>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
class InferCleanGraphPass : public Pass {
class InferCleanGraphPass : public FusePassBase {
public:
virtual ~InferCleanGraphPass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init("original_graph", graph.get());
PADDLE_ENFORCE(graph.get());
auto is_valid_node = [](Node* x) {
return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
};
std::unordered_set<Node*> invalid_nodes;
std::unordered_set<const Node*> invalid_nodes;
int valid_op = 0;
for (auto* node : graph->Nodes()) {
if (is_valid_node(node)) {
invalid_nodes.insert(node);
} else if (node->IsOp()) {
// Collect all the operators to help tracking number of operators.
++valid_op;
}
}
// remove nodes from the graph.
for (auto* node : invalid_nodes) {
graph->RemoveNode(node);
}
GraphSafeRemoveNodes(graph.get(), invalid_nodes);
// clean edges.
for (auto* node : graph->Nodes()) {
CleanEdges(&node->inputs, invalid_nodes);
CleanEdges(&node->outputs, invalid_nodes);
}
AddStatis(valid_op);
return graph;
}
......
......@@ -219,16 +219,13 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
op_desc.SetAttr("fc_activation", act->Op()->Type());
auto* op_node = graph->CreateOpNode(&op_desc);
// Add links
#define NODE_LINKS(a, b) \
a->outputs.push_back(b); \
b->inputs.push_back(a);
NODE_LINKS(fc_w, op_node);
NODE_LINKS(fc_bias, op_node);
NODE_LINKS(concat_in0, op_node);
NODE_LINKS(sequence_expand0_in, op_node);
NODE_LINKS(sequence_expand1_in, op_node);
NODE_LINKS(op_node, fc_out);
// Add links
IR_NODE_LINK_TO(fc_w, op_node);
IR_NODE_LINK_TO(fc_bias, op_node);
IR_NODE_LINK_TO(concat_in0, op_node);
IR_NODE_LINK_TO(sequence_expand0_in, op_node);
IR_NODE_LINK_TO(sequence_expand1_in, op_node);
IR_NODE_LINK_TO(op_node, fc_out);
// Clean nodes.
std::unordered_set<const Node*> marked_nodes;
......@@ -241,7 +238,6 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
marked_nodes.erase(sequence_expand0_in);
marked_nodes.erase(sequence_expand1_in);
marked_nodes.erase(fc_out);
GraphSafeRemoveNodes(graph, marked_nodes);
});
......
......@@ -10,7 +10,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library(paddle_fluid_api
SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} graph_to_program_pass)
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
......@@ -22,7 +22,7 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
#endif()
# Create static library
cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api)
cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor)
if(NOT APPLE)
# TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
......@@ -32,6 +32,7 @@ endif()
# Create shared library
cc_library(paddle_fluid_shared SHARED
SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
DEPS ${fluid_modules} paddle_fluid_api)
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
......
......@@ -33,7 +33,7 @@ function (inference_analysis_test TARGET)
endif()
cc_test(${TARGET}
SRCS "${analysis_test_SRCS}"
DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detector pass ${analysis_test_EXTRA_DEPS}
DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
endif(WITH_TESTING)
......@@ -56,25 +56,13 @@ if (NOT EXISTS ${DITU_INSTALL_DIR} AND WITH_TESTING)
endif()
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis
analysis_predictor
# ir
fc_fuse_pass
fc_lstm_fuse_pass
seq_concat_fc_fuse_pass
graph_viz_pass
infer_clean_graph_pass
graph_pattern_detector
infer_clean_graph_pass
attention_lstm_fuse_pass
paddle_inference_api
pass
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
--infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)
inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc EXTRA_DEPS paddle_inference_api)
inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc EXTRA_DEPS paddle_fluid)
inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc)
inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
......
......@@ -23,6 +23,7 @@
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h"
DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
......@@ -329,9 +330,20 @@ void TestDituRNNPrediction(bool use_analysis_and_activate_ir = false,
LOG(INFO) << "fused " << item.first << " " << item.second;
}
ASSERT_TRUE(fuse_statis.count("fc"));
EXPECT_EQ(fuse_statis.at("fc"), 1);
EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 1);
int num_ops = 0;
for (auto &node :
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
if (node->IsFunction()) {
++num_ops;
}
}
LOG(INFO) << "has num ops: " << num_ops;
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM
EXPECT_EQ(num_ops,
13); // After graph optimization, only 13 operators exists.
}
}
......@@ -348,10 +360,3 @@ TEST(Analyzer, DituRNN_multi_thread) {
} // namespace analysis
} // namespace inference
} // namespace paddle
USE_PASS(fc_fuse_pass);
USE_PASS(seq_concat_fc_fuse_pass);
USE_PASS(fc_lstm_fuse_pass);
USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass);
USE_PASS(attention_lstm_fuse_pass);
......@@ -16,6 +16,7 @@
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
namespace paddle {
namespace inference {
......@@ -33,10 +34,3 @@ TEST(FluidToIrPass, Test) {
} // namespace analysis
} // namespace inference
} // namespace paddle
USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass);
USE_PASS(attention_lstm_fuse_pass);
USE_PASS(fc_lstm_fuse_pass);
USE_PASS(seq_concat_fc_fuse_pass);
USE_PASS(fc_fuse_pass);
......@@ -18,10 +18,7 @@ if(APPLE)
endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager
graph_viz_pass fc_fuse_pass
infer_clean_graph_pass
)
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB})
if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
......
......@@ -20,6 +20,7 @@
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h"
namespace paddle {
......@@ -132,7 +133,3 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
}
} // namespace paddle
USE_PASS(fc_fuse_pass);
USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass);
......@@ -17,6 +17,7 @@
#include <glog/logging.h>
#include <sys/time.h>
#include <algorithm>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>
......
......@@ -865,8 +865,8 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = static_cast<T>(1) /
(static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
dx.device(d) = dout * ((beta * out) + temp2);
auto temp2 = temp1 * (static_cast<T>(1) - (static_cast<T>(beta) * out));
dx.device(d) = dout * ((static_cast<T>(beta) * out) + temp2);
}
};
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/attention_lstm_op.h"
#include <sys/time.h>
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
/*
* transform that computes target bounding-box regression deltas
* given proposal boxes and ground-truth boxes.
*/
template <typename T>
inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes,
const framework::Tensor& gt_boxes, const T* weights,
const bool normalized, framework::Tensor* box_delta) {
auto ex_boxes_et = framework::EigenTensor<T, 2>::From(ex_boxes);
auto gt_boxes_et = framework::EigenTensor<T, 2>::From(gt_boxes);
auto trg = framework::EigenTensor<T, 2>::From(*box_delta);
T ex_w, ex_h, ex_ctr_x, ex_ctr_y, gt_w, gt_h, gt_ctr_x, gt_ctr_y;
for (int64_t i = 0; i < box_num; ++i) {
ex_w = ex_boxes_et(i, 2) - ex_boxes_et(i, 0) + (normalized == false);
ex_h = ex_boxes_et(i, 3) - ex_boxes_et(i, 1) + (normalized == false);
ex_ctr_x = ex_boxes_et(i, 0) + 0.5 * ex_w;
ex_ctr_y = ex_boxes_et(i, 1) + 0.5 * ex_h;
gt_w = gt_boxes_et(i, 2) - gt_boxes_et(i, 0) + (normalized == false);
gt_h = gt_boxes_et(i, 3) - gt_boxes_et(i, 1) + (normalized == false);
gt_ctr_x = gt_boxes_et(i, 0) + 0.5 * gt_w;
gt_ctr_y = gt_boxes_et(i, 1) + 0.5 * gt_h;
trg(i, 0) = (gt_ctr_x - ex_ctr_x) / ex_w;
trg(i, 1) = (gt_ctr_y - ex_ctr_y) / ex_h;
trg(i, 2) = std::log(gt_w / ex_w);
trg(i, 3) = std::log(gt_h / ex_h);
if (weights) {
trg(i, 0) = trg(i, 0) / weights[0];
trg(i, 1) = trg(i, 1) / weights[1];
trg(i, 2) = trg(i, 2) / weights[2];
trg(i, 3) = trg(i, 3) / weights[3];
}
}
}
template <typename T>
void Gather(const T* in, const int in_stride, const int* index, const int num,
T* out) {
const int stride_bytes = in_stride * sizeof(T);
for (int i = 0; i < num; ++i) {
int id = index[i];
memcpy(out + i * in_stride, in + id * in_stride, stride_bytes);
}
}
} // namespace operators
} // namespace paddle
......@@ -14,6 +14,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/bbox_util.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/operators/math/math_function.h"
......@@ -133,31 +134,6 @@ void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes,
}
}
template <typename T>
void BoxToDelta(int box_num, const Tensor& ex_boxes, const Tensor& gt_boxes,
const std::vector<float>& weights, Tensor* box_delta) {
auto ex_boxes_et = framework::EigenTensor<T, 2>::From(ex_boxes);
auto gt_boxes_et = framework::EigenTensor<T, 2>::From(gt_boxes);
auto box_delta_et = framework::EigenTensor<T, 2>::From(*box_delta);
T ex_w, ex_h, ex_ctr_x, ex_ctr_y, gt_w, gt_h, gt_ctr_x, gt_ctr_y;
for (int64_t i = 0; i < box_num; ++i) {
ex_w = ex_boxes_et(i, 2) - ex_boxes_et(i, 0) + 1;
ex_h = ex_boxes_et(i, 3) - ex_boxes_et(i, 1) + 1;
ex_ctr_x = ex_boxes_et(i, 0) + 0.5 * ex_w;
ex_ctr_y = ex_boxes_et(i, 1) + 0.5 * ex_h;
gt_w = gt_boxes_et(i, 2) - gt_boxes_et(i, 0) + 1;
gt_h = gt_boxes_et(i, 3) - gt_boxes_et(i, 1) + 1;
gt_ctr_x = gt_boxes_et(i, 0) + 0.5 * gt_w;
gt_ctr_y = gt_boxes_et(i, 1) + 0.5 * gt_h;
box_delta_et(i, 0) = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0];
box_delta_et(i, 1) = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1];
box_delta_et(i, 2) = log(gt_w / ex_w) / ex_w / weights[2];
box_delta_et(i, 3) = log(gt_h / ex_h) / ex_h / weights[3];
}
}
template <typename T>
std::vector<std::vector<int>> SampleFgBgGt(
const platform::CPUDeviceContext& context, Tensor* iou,
......@@ -243,12 +219,11 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
Tensor* sampled_labels, Tensor* sampled_gts) {
int fg_num = fg_inds.size();
int bg_num = bg_inds.size();
int gt_num = fg_num + bg_num;
Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t;
int* fg_inds_data = fg_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
int* bg_inds_data = bg_inds_t.mutable_data<int>({bg_num}, context.GetPlace());
int* gt_box_inds_data =
gt_box_inds_t.mutable_data<int>({gt_num}, context.GetPlace());
gt_box_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
int* gt_label_inds_data =
gt_label_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
std::copy(fg_inds.begin(), fg_inds.end(), fg_inds_data);
......@@ -303,18 +278,20 @@ std::vector<Tensor> SampleRoisForOneImage(
// Gather boxes and labels
Tensor sampled_boxes, sampled_labels, sampled_gts;
int boxes_num = fg_inds.size() + bg_inds.size();
int fg_num = fg_inds.size();
int bg_num = bg_inds.size();
int boxes_num = fg_num + bg_num;
framework::DDim bbox_dim({boxes_num, kBoxDim});
sampled_boxes.mutable_data<T>(bbox_dim, context.GetPlace());
sampled_labels.mutable_data<int>({boxes_num}, context.GetPlace());
sampled_gts.mutable_data<T>(bbox_dim, context.GetPlace());
sampled_gts.mutable_data<T>({fg_num, kBoxDim}, context.GetPlace());
GatherBoxesLabels<T>(context, boxes, *gt_boxes, *gt_classes, fg_inds, bg_inds,
gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts);
// Compute targets
Tensor bbox_targets_single;
bbox_targets_single.mutable_data<T>(bbox_dim, context.GetPlace());
BoxToDelta<T>(boxes_num, sampled_boxes, sampled_gts, bbox_reg_weights,
BoxToDelta<T>(fg_num, sampled_boxes, sampled_gts, nullptr, false,
&bbox_targets_single);
// Scale rois
......@@ -427,7 +404,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
auto rpn_rois_lod = rpn_rois->lod().back();
auto gt_classes_lod = gt_classes->lod().back();
auto gt_boxes_lod = gt_boxes->lod().back();
for (size_t i = 0; i < n; ++i) {
for (int i = 0; i < n; ++i) {
Tensor rpn_rois_slice =
rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]);
Tensor gt_classes_slice =
......
......@@ -311,8 +311,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
context.GetPlace());
rpn_roi_probs->mutable_data<T>({scores->numel() / 4, 1},
context.GetPlace());
rpn_roi_probs->mutable_data<T>({scores->numel(), 1}, context.GetPlace());
Tensor bbox_deltas_swap, scores_swap;
bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox},
......@@ -421,7 +420,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
CPUGather<T>(ctx, proposals, keep, &bbox_sel);
CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
if (nms_thresh <= 0) {
return std::make_pair(bbox_sel, scores_sel);
return std::make_pair(bbox_sel, scores_filter);
}
Tensor keep_nms = NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta);
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include <random>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/bbox_util.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
......@@ -46,156 +47,219 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
auto in_dims = ctx->GetInputDim("DistMat");
PADDLE_ENFORCE_EQ(in_dims.size(), 2,
"The rank of Input(DistMat) must be 2.");
ctx->SetOutputDim("LocationIndex", {-1});
ctx->SetOutputDim("ScoreIndex", {-1});
ctx->SetOutputDim("TargetLabel", {-1, 1});
ctx->SetOutputDim("TargetBBox", {-1, 4});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>("DistMat")->type()),
platform::CPUPlace());
}
};
template <typename T>
class RpnTargetAssignKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* anchor_t = context.Input<Tensor>("Anchor"); // (H*W*A) * 4
auto* gt_bbox_t = context.Input<Tensor>("GtBox");
auto* dist_t = context.Input<LoDTensor>("DistMat");
auto* loc_index_t = context.Output<Tensor>("LocationIndex");
auto* score_index_t = context.Output<Tensor>("ScoreIndex");
auto* tgt_bbox_t = context.Output<Tensor>("TargetBBox");
auto* tgt_lbl_t = context.Output<Tensor>("TargetLabel");
auto lod = dist_t->lod().back();
int64_t batch_num = static_cast<int64_t>(lod.size() - 1);
int64_t anchor_num = dist_t->dims()[1];
PADDLE_ENFORCE_EQ(anchor_num, anchor_t->dims()[0]);
int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im");
float pos_threshold = context.Attr<float>("rpn_positive_overlap");
float neg_threshold = context.Attr<float>("rpn_negative_overlap");
float fg_fraction = context.Attr<float>("fg_fraction");
int fg_num_per_batch = static_cast<int>(rpn_batch_size * fg_fraction);
int64_t max_num = batch_num * anchor_num;
auto place = context.GetPlace();
tgt_bbox_t->mutable_data<T>({max_num, 4}, place);
auto* loc_index = loc_index_t->mutable_data<int>({max_num}, place);
auto* score_index = score_index_t->mutable_data<int>({max_num}, place);
Tensor tmp_tgt_lbl;
auto* tmp_lbl_data = tmp_tgt_lbl.mutable_data<int64_t>({max_num}, place);
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
iset(dev_ctx, &tmp_tgt_lbl, static_cast<int64_t>(-1));
std::random_device rnd;
std::minstd_rand engine;
int seed =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
engine.seed(seed);
int fg_num = 0;
int bg_num = 0;
for (int i = 0; i < batch_num; ++i) {
Tensor dist = dist_t->Slice(lod[i], lod[i + 1]);
Tensor gt_bbox = gt_bbox_t->Slice(lod[i], lod[i + 1]);
auto fg_bg_gt = SampleFgBgGt(dev_ctx, dist, pos_threshold, neg_threshold,
rpn_batch_size, fg_num_per_batch, engine,
tmp_lbl_data + i * anchor_num);
int cur_fg_num = fg_bg_gt[0].size();
int cur_bg_num = fg_bg_gt[1].size();
std::transform(fg_bg_gt[0].begin(), fg_bg_gt[0].end(), loc_index,
[i, anchor_num](int d) { return d + i * anchor_num; });
memcpy(score_index, loc_index, cur_fg_num * sizeof(int));
std::transform(fg_bg_gt[1].begin(), fg_bg_gt[1].end(),
score_index + cur_fg_num,
[i, anchor_num](int d) { return d + i * anchor_num; });
// get target bbox deltas
if (cur_fg_num) {
Tensor fg_gt;
T* gt_data = fg_gt.mutable_data<T>({cur_fg_num, 4}, place);
Tensor tgt_bbox = tgt_bbox_t->Slice(fg_num, fg_num + cur_fg_num);
T* tgt_data = tgt_bbox.data<T>();
Gather<T>(anchor_t->data<T>(), 4,
reinterpret_cast<int*>(&fg_bg_gt[0][0]), cur_fg_num,
tgt_data);
Gather<T>(gt_bbox.data<T>(), 4, reinterpret_cast<int*>(&fg_bg_gt[2][0]),
cur_fg_num, gt_data);
BoxToDelta<T>(cur_fg_num, tgt_bbox, fg_gt, nullptr, false, &tgt_bbox);
}
loc_index += cur_fg_num;
score_index += cur_fg_num + cur_bg_num;
fg_num += cur_fg_num;
bg_num += cur_bg_num;
}
int lbl_num = fg_num + bg_num;
PADDLE_ENFORCE_LE(fg_num, max_num);
PADDLE_ENFORCE_LE(lbl_num, max_num);
tgt_bbox_t->Resize({fg_num, 4});
loc_index_t->Resize({fg_num});
score_index_t->Resize({lbl_num});
auto* lbl_data = tgt_lbl_t->mutable_data<int64_t>({lbl_num, 1}, place);
Gather<int64_t>(tmp_lbl_data, 1, score_index_t->data<int>(), lbl_num,
lbl_data);
}
private:
void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max,
const int row, const int col, const float pos_threshold,
const float neg_threshold, int64_t* target_label_data,
const float neg_threshold, int64_t* target_label,
std::vector<int>* fg_inds, std::vector<int>* bg_inds) const {
int fg_offset = fg_inds->size();
int bg_offset = bg_inds->size();
float epsilon = 0.0001;
for (int64_t i = 0; i < row; ++i) {
const T* v = dist_data + i * col;
T max_dist = *std::max_element(v, v + col);
T max = *std::max_element(v, v + col);
for (int64_t j = 0; j < col; ++j) {
T val = dist_data[i * col + j];
if (val == max_dist) target_label_data[j] = 1;
if (std::abs(max - v[j]) < epsilon) {
target_label[j] = 1;
}
}
}
// Pick the fg/bg and count the number
// Pick the fg/bg
const T* anchor_to_gt_max_data = anchor_to_gt_max.data<T>();
for (int64_t j = 0; j < col; ++j) {
if (anchor_to_gt_max.data<T>()[j] > pos_threshold) {
target_label_data[j] = 1;
} else if (anchor_to_gt_max.data<T>()[j] < neg_threshold) {
target_label_data[j] = 0;
if (anchor_to_gt_max_data[j] >= pos_threshold) {
target_label[j] = 1;
} else if (anchor_to_gt_max_data[j] < neg_threshold) {
target_label[j] = 0;
}
if (target_label_data[j] == 1) {
fg_inds->push_back(fg_offset + j);
} else if (target_label_data[j] == 0) {
bg_inds->push_back(bg_offset + j);
if (target_label[j] == 1) {
fg_inds->push_back(j);
} else if (target_label[j] == 0) {
bg_inds->push_back(j);
}
}
}
void ReservoirSampling(const int num, const int offset,
std::minstd_rand engine,
void ReservoirSampling(const int num, std::minstd_rand engine,
std::vector<int>* inds) const {
std::uniform_real_distribution<float> uniform(0, 1);
const int64_t size = static_cast<int64_t>(inds->size() - offset);
if (size > num) {
for (int64_t i = num; i < size; ++i) {
size_t len = inds->size();
if (len > static_cast<size_t>(num)) {
for (size_t i = num; i < len; ++i) {
int rng_ind = std::floor(uniform(engine) * i);
if (rng_ind < num)
std::iter_swap(inds->begin() + rng_ind + offset,
inds->begin() + i + offset);
std::iter_swap(inds->begin() + rng_ind, inds->begin() + i);
}
inds->resize(num);
}
}
void RpnTargetAssign(const framework::ExecutionContext& ctx,
const Tensor& dist, const float pos_threshold,
const float neg_threshold, const int rpn_batch_size,
const int fg_num, std::minstd_rand engine,
std::vector<int>* fg_inds, std::vector<int>* bg_inds,
int64_t* target_label_data) const {
// std::vector<std::vector<int>> RpnTargetAssign(
std::vector<std::vector<int>> SampleFgBgGt(
const platform::CPUDeviceContext& ctx, const Tensor& dist,
const float pos_threshold, const float neg_threshold,
const int rpn_batch_size, const int fg_num, std::minstd_rand engine,
int64_t* target_label) const {
auto* dist_data = dist.data<T>();
int64_t row = dist.dims()[0];
int64_t col = dist.dims()[1];
int fg_offset = fg_inds->size();
int bg_offset = bg_inds->size();
int row = dist.dims()[0];
int col = dist.dims()[1];
std::vector<int> fg_inds;
std::vector<int> bg_inds;
std::vector<int> gt_inds;
// Calculate the max IoU between anchors and gt boxes
Tensor anchor_to_gt_max;
anchor_to_gt_max.mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(col), 1}),
platform::CPUPlace());
auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
.eigen_device();
auto x = EigenMatrix<T>::From(dist);
auto x_col_max = EigenMatrix<T>::From(anchor_to_gt_max);
x_col_max.device(place) =
x.maximum(Eigen::DSizes<int, 1>(0))
.reshape(Eigen::DSizes<int, 2>(static_cast<int64_t>(col), 1));
// Map from anchor to gt box that has highest overlap
auto place = ctx.GetPlace();
Tensor anchor_to_gt_max, anchor_to_gt_argmax;
anchor_to_gt_max.mutable_data<T>({col}, place);
int* argmax = anchor_to_gt_argmax.mutable_data<int>({col}, place);
auto x = framework::EigenMatrix<T>::From(dist);
auto x_col_max = framework::EigenVector<T>::Flatten(anchor_to_gt_max);
auto x_col_argmax =
framework::EigenVector<int>::Flatten(anchor_to_gt_argmax);
x_col_max = x.maximum(Eigen::DSizes<int, 1>(0));
x_col_argmax = x.argmax(0).template cast<int>();
// Follow the Faster RCNN's implementation
ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold,
neg_threshold, target_label_data, fg_inds, bg_inds);
neg_threshold, target_label, &fg_inds, &bg_inds);
// Reservoir Sampling
ReservoirSampling(fg_num, fg_offset, engine, fg_inds);
int bg_num = rpn_batch_size - (fg_inds->size() - fg_offset);
ReservoirSampling(bg_num, bg_offset, engine, bg_inds);
}
ReservoirSampling(fg_num, engine, &fg_inds);
int fg_num2 = static_cast<int>(fg_inds.size());
int bg_num = rpn_batch_size - fg_num2;
ReservoirSampling(bg_num, engine, &bg_inds);
void Compute(const framework::ExecutionContext& context) const override {
auto* dist = context.Input<LoDTensor>("DistMat");
auto* loc_index = context.Output<Tensor>("LocationIndex");
auto* score_index = context.Output<Tensor>("ScoreIndex");
auto* tgt_lbl = context.Output<Tensor>("TargetLabel");
auto col = dist->dims()[1];
int64_t n = dist->lod().size() == 0UL
? 1
: static_cast<int64_t>(dist->lod().back().size() - 1);
if (dist->lod().size()) {
PADDLE_ENFORCE_EQ(dist->lod().size(), 1UL,
"Only support 1 level of LoD.");
gt_inds.reserve(fg_num2);
for (int i = 0; i < fg_num2; ++i) {
gt_inds.emplace_back(argmax[fg_inds[i]]);
}
int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im");
float pos_threshold = context.Attr<float>("rpn_positive_overlap");
float neg_threshold = context.Attr<float>("rpn_negative_overlap");
float fg_fraction = context.Attr<float>("fg_fraction");
int fg_num = static_cast<int>(rpn_batch_size * fg_fraction);
int64_t* target_label_data =
tgt_lbl->mutable_data<int64_t>({n * col, 1}, context.GetPlace());
std::vector<std::vector<int>> fg_bg_gt;
fg_bg_gt.emplace_back(fg_inds);
fg_bg_gt.emplace_back(bg_inds);
fg_bg_gt.emplace_back(gt_inds);
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
iset(dev_ctx, tgt_lbl, static_cast<int>(-1));
std::vector<int> fg_inds;
std::vector<int> bg_inds;
std::random_device rnd;
std::minstd_rand engine;
int seed =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
engine.seed(seed);
if (n == 1) {
RpnTargetAssign(context, *dist, pos_threshold, neg_threshold,
rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds,
target_label_data);
} else {
auto lod = dist->lod().back();
for (size_t i = 0; i < lod.size() - 1; ++i) {
Tensor one_ins = dist->Slice(lod[i], lod[i + 1]);
RpnTargetAssign(context, one_ins, pos_threshold, neg_threshold,
rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds,
target_label_data + i * col);
}
}
int* loc_index_data = loc_index->mutable_data<int>(
{static_cast<int>(fg_inds.size())}, context.GetPlace());
int* score_index_data = score_index->mutable_data<int>(
{static_cast<int>(fg_inds.size() + bg_inds.size())},
context.GetPlace());
memcpy(loc_index_data, reinterpret_cast<int*>(&fg_inds[0]),
fg_inds.size() * sizeof(int));
memcpy(score_index_data, reinterpret_cast<int*>(&fg_inds[0]),
fg_inds.size() * sizeof(int));
memcpy(score_index_data + fg_inds.size(),
reinterpret_cast<int*>(&bg_inds[0]), bg_inds.size() * sizeof(int));
return fg_bg_gt;
}
};
class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Anchor",
"(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4].");
AddInput("GtBox", "(LoDTensor) input groud-truth bbox with shape [K, 4].");
AddInput(
"DistMat",
"(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
......@@ -241,12 +305,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
"ScoreIndex",
"(Tensor), The indexes of foreground and background anchors in all "
"RPN anchors(The rest anchors are ignored). The shape of the "
"ScoreIndex is [F + B], F and B depend on the value of input "
"tensor and attributes.");
AddOutput("TargetLabel",
"(Tensor<int64_t>), The target labels of each anchor with shape "
"[K * M, 1], "
"K and M is the same as they are in DistMat.");
"ScoreIndex is [F + B], F and B are sampled foreground and backgroud "
" number.");
AddOutput("TargetBBox",
"(Tensor<int64_t>), The target bbox deltas with shape "
"[F, 4], F is the sampled foreground number.");
AddOutput(
"TargetLabel",
"(Tensor<int64_t>), The target labels of each anchor with shape "
"[F + B, 1], F and B are sampled foreground and backgroud number.");
AddComment(R"DOC(
This operator can be, for given the IoU between the ground truth bboxes and the
anchors, to assign classification and regression targets to each prediction.
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <glog/logging.h>
#include <algorithm>
#include <iterator>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
......@@ -94,8 +95,11 @@ class RowwiseTransformIterator;
template <typename T, typename DeviceContext>
class MidWiseTransformIterator;
// NOTE(dzhwinter): ptrdiff_t in iterator is deperecated in c++17
template <typename T>
class RowwiseTransformIterator<T, platform::CPUDeviceContext> {
class RowwiseTransformIterator<T, platform::CPUDeviceContext>
: public std::iterator<std::random_access_iterator_tag, T, std::ptrdiff_t,
T *, T &> {
public:
RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {}
......@@ -126,7 +130,9 @@ class RowwiseTransformIterator<T, platform::CPUDeviceContext> {
};
template <typename T>
class MidWiseTransformIterator<T, platform::CPUDeviceContext> {
class MidWiseTransformIterator<T, platform::CPUDeviceContext>
: public std::iterator<std::random_access_iterator_tag, T, std::ptrdiff_t,
T *, T &> {
public:
MidWiseTransformIterator(const T *ptr, int n, int post)
: ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}
......@@ -479,8 +485,13 @@ void ElemwiseGradComputeNoBroadcast(
const framework::Tensor &dout, int axis, framework::Tensor *dx,
framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) {
size_t N = static_cast<size_t>(framework::product(x_dim));
#if !defined(_WIN32)
platform::ForRange<DeviceContext> for_range(
ctx.template device_context<DeviceContext>(), N);
#else
platform::ForRange<DeviceContext> for_range(
ctx.device_context<DeviceContext>(), N);
#endif // !_WIN32
for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{
x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op,
dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
......@@ -633,13 +644,13 @@ void ElementwiseGradCompute(const framework::ExecutionContext &ctx,
template <typename Functor, typename DeviceContext, typename T,
typename OutType = T>
void ElementwiseComputeEx(const framework::ExecutionContext &ctx,
const framework::Tensor *x,
const framework::Tensor *y, int axis, Functor func,
framework::Tensor *z) {
TransformFunctor<Functor, T, DeviceContext, OutType> functor(
x, y, z, ctx.template device_context<DeviceContext>(), func);
auto x_dims = x->dims();
auto y_dims_untrimed = y->dims();
PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(),
......
......@@ -92,12 +92,12 @@ class GRUUnitKernel : public framework::OpKernel<T> {
gate_data, frame_size * 3);
// calculate activited gate
Eigen::array<int, 2> extents({{batch_size, frame_size}});
Eigen::array<int, 2> u_offsets({{0, 0}});
Eigen::array<int, 2> extents{{batch_size, frame_size}};
Eigen::array<int, 2> u_offsets{{0, 0}};
ActCompute(context.Attr<int>("gate_activation"), place,
g.slice(u_offsets, extents), g.slice(u_offsets, extents));
auto u = g.slice(u_offsets, extents); // update gate
Eigen::array<int, 2> r_offsets({{0, frame_size}});
Eigen::array<int, 2> r_offsets{{0, frame_size}};
ActCompute(context.Attr<int>("gate_activation"), place,
g.slice(r_offsets, extents), g.slice(r_offsets, extents));
auto r = g.slice(r_offsets, extents); // reset gate
......@@ -107,7 +107,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
weight_data + frame_size * frame_size * 2, frame_size, 1,
gate_data + frame_size * 2, frame_size * 3);
Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
Eigen::array<int, 2> c_offsets{{0, frame_size * 2}};
ActCompute(context.Attr<int>("activation"), place,
g.slice(c_offsets, extents), g.slice(c_offsets, extents));
auto c = g.slice(c_offsets, extents); // output candidate
......@@ -171,12 +171,12 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
int batch_size = input->dims()[0];
int frame_size = hidden_prev->dims()[1];
Eigen::array<int, 2> extents({{batch_size, frame_size}});
Eigen::array<int, 2> u_offsets({{0, 0}});
Eigen::array<int, 2> extents{{batch_size, frame_size}};
Eigen::array<int, 2> u_offsets{{0, 0}};
auto u = g.slice(u_offsets, extents); // update gate
Eigen::array<int, 2> r_offsets({{0, frame_size}});
Eigen::array<int, 2> r_offsets{{0, frame_size}};
auto r = g.slice(r_offsets, extents); // reset gate
Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
Eigen::array<int, 2> c_offsets{{0, frame_size * 2}};
auto c = g.slice(c_offsets, extents); // output candidate
// backward for unactivated update gate
......
......@@ -38,7 +38,8 @@ class LabelSmoothKernel : public framework::OpKernel<T> {
auto dist = framework::EigenVector<T>::Flatten(*dist_t);
out.device(dev) =
static_cast<T>(1 - epsilon) * in +
epsilon * dist.broadcast(Eigen::DSizes<int, 1>(in_t->numel()));
static_cast<T>(epsilon) *
dist.broadcast(Eigen::DSizes<int, 1>(in_t->numel()));
} else {
out.device(dev) = static_cast<T>(1 - epsilon) * in +
static_cast<T>(epsilon / label_dim);
......
......@@ -132,6 +132,121 @@ inline void vec_scal<float, platform::jit::avx512_common>(const int n,
vec_scal<float, platform::jit::avx2>(n, a, x, y);
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = a - x[i];
}
}
template <>
inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a,
const float* x, float* y) {
#ifdef __AVX__
constexpr int block = AVX_FLOAT_BLOCK;
if (n < block) {
vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 bias = _mm256_set1_ps(a);
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_sub_ps(bias, tmp); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest == 0) {
return;
}
// can not continue move step if src and dst are inplace
for (i = n - rest; i < n; ++i) {
y[i] = a - x[i];
}
#else
vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y);
#endif
}
template <>
inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a,
const float* x, float* y) {
vec_bias_sub<float, platform::jit::avx>(n, a, x, y);
}
template <>
inline void vec_bias_sub<float, platform::jit::avx512_common>(const int n,
const float a,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_bias_sub<float, platform::jit::avx2>(n, a, x, y);
}
// out = x*y + (1-x)*z
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
for (int i = 0; i < n; ++i) {
out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
}
}
template <>
inline void vec_cross<float, platform::jit::avx>(const int n, const float* x,
const float* y, const float* z,
float* out) {
#ifdef __AVX__
constexpr int block = AVX_FLOAT_BLOCK;
if (n < block) {
vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 bias = _mm256_set1_ps(1.f);
__m256 tmpx, tmpy, tmpz;
for (i = 0; i < end; i += block) {
tmpx = _mm256_loadu_ps(x + i);
tmpy = _mm256_loadu_ps(y + i);
tmpz = _mm256_loadu_ps(z + i);
tmpy = _mm256_mul_ps(tmpx, tmpy);
tmpx = _mm256_sub_ps(bias, tmpx);
tmpz = _mm256_mul_ps(tmpx, tmpz);
tmpz = _mm256_add_ps(tmpy, tmpz);
_mm256_storeu_ps(out + i, tmpz);
}
if (rest == 0) {
return;
}
// can not continue move step if src and dst are inplace
for (i = n - rest; i < n; ++i) {
out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
}
#else
vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
#endif
}
template <>
inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x,
const float* y,
const float* z, float* out) {
vec_cross<float, platform::jit::avx>(n, x, y, z, out);
}
template <>
inline void vec_cross<float, platform::jit::avx512_common>(
const int n, const float* x, const float* y, const float* z, float* out) {
// TODO(TJ): enable me
vec_cross<float, platform::jit::avx>(n, x, y, z, out);
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
......
......@@ -17,6 +17,11 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#if defined(_WIN32)
#include <intrin.h>
#include <windows.h>
#endif // _WIN32
namespace paddle {
namespace operators {
namespace math {
......@@ -55,12 +60,38 @@ namespace math {
* FindLastSet(x) = 1 + \floor*{\log_{2}x}
* \f]
*/
#if !defined(_WIN32)
inline constexpr size_t FindLastSet(size_t x) {
return std::is_same<size_t, unsigned int>::value
? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0)
: (std::is_same<size_t, unsigned long>::value // NOLINT
? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
: (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
#else
// windows don't have built-in clz, ctz function
template <typename T>
inline int ctz(const T& value) {
DWORD trailing_zero = 0;
if (_BitScanForward(&trailing_zero, value)) {
return static_cast<int>(trailing_zero);
} else {
return static_cast<int>(0);
}
}
template <typename T>
inline int clz(const T& value) {
DWORD leadning_zero = 0;
if (_BitScanReverse(&leadning_zero, value)) {
return static_cast<int>(sizeof(T) * 8 - leadning_zero);
} else {
return static_cast<int>(0);
}
}
inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); }
#endif // !_WIN32
}
struct SimpleCode {
......
......@@ -16,13 +16,12 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace operators {
namespace math {
#define FLT_MAX __FLT_MAX__
template <typename DeviceContext, typename T>
class MaxOutFunctor {
public:
......
......@@ -18,15 +18,12 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace operators {
namespace math {
#define FLT_MAX \
__FLT_MAX__ // TODO(zcd) :It might need to be placed in another file, but I'm
// still wondering where to put it.
/*
* \brief Extracting simple operations from pooling.
* Both MaxPool and AvgPool need "initial", "compute" and "finalize"
......
......@@ -92,7 +92,7 @@ class LoDTensor2BatchFunctor {
// Calculate the start position of each batch.
// example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// num_batch = 5,
// max_seqlen = 5,
// batchIndex = {b0, b1, b2, b3, b4}
// b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
// batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
......@@ -109,7 +109,7 @@ class LoDTensor2BatchFunctor {
// where 1 is the second sequence,
// 0 is the first sequence,
// 2 is the third sequence.
// The num_batch represents batch size after rearranging the
// The max_seqlen represents batch size after rearranging the
// input LodTensor. It is also the maximum length of input sequence.
paddle::framework::LoD batch_lods;
......@@ -118,8 +118,8 @@ class LoDTensor2BatchFunctor {
batch_lods.emplace_back(std::vector<size_t>{0});
// batch_lods[0] is the start positions for batch LoDTensor
int num_batch = seq_info[0].length;
batch_lods[0].resize(static_cast<size_t>(num_batch + 1));
int max_seqlen = seq_info[0].length;
batch_lods[0].resize(static_cast<size_t>(max_seqlen + 1));
// batch_lods[1] is the raw index in the input LoDTensor
batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
// batch_lods[2] is the sort order for the input LoDTensor.
......@@ -128,7 +128,7 @@ class LoDTensor2BatchFunctor {
size_t* batch_starts = batch_lods[0].data();
size_t* seq2batch_idx = batch_lods[1].data();
batch_starts[0] = 0;
for (int n = 0; n < num_batch; n++) {
for (int n = 0; n < max_seqlen; n++) {
auto batch_id = static_cast<int>(batch_starts[n]);
for (size_t i = 0; i < seq_info.size(); ++i) {
int seq_len = seq_info[i].length;
......
......@@ -31,7 +31,7 @@ static inline int NumBlocks(const int N) {
template <typename T>
__global__ void GPUROIPoolForward(
const int nthreads, const T* input_data, const int64_t* input_rois,
const int nthreads, const T* input_data, const T* input_rois,
const float spatial_scale, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
int* roi_batch_id_data, T* output_data, int64_t* argmax_data) {
......@@ -43,7 +43,7 @@ __global__ void GPUROIPoolForward(
int c = (i / pooled_width / pooled_height) % channels;
int n = i / pooled_width / pooled_height / channels;
const int64_t* offset_input_rois = input_rois + n * kROISize;
const T* offset_input_rois = input_rois + n * kROISize;
int roi_batch_ind = roi_batch_id_data[n];
int roi_start_w = round(offset_input_rois[0] * spatial_scale);
int roi_start_h = round(offset_input_rois[1] * spatial_scale);
......@@ -93,7 +93,7 @@ __global__ void GPUROIPoolForward(
template <typename T>
__global__ void GPUROIPoolBackward(
const int nthreads, const int64_t* input_rois, const T* output_grad,
const int nthreads, const T* input_rois, const T* output_grad,
const int64_t* argmax_data, const int num_rois, const float spatial_scale,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, int* roi_batch_id_data,
......@@ -174,8 +174,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
GPUROIPoolForward<
T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
output_size, in->data<T>(), rois->data<int64_t>(), spatial_scale,
channels, height, width, pooled_height, pooled_width,
output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
height, width, pooled_height, pooled_width,
roi_batch_id_list_gpu.data<int>(), out->mutable_data<T>(ctx.GetPlace()),
argmax->mutable_data<int64_t>(ctx.GetPlace()));
}
......@@ -228,7 +228,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
if (output_grad_size > 0) {
GPUROIPoolBackward<
T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
output_grad_size, rois->data<int64_t>(), out_grad->data<T>(),
output_grad_size, rois->data<T>(), out_grad->data<T>(),
argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
width, pooled_height, pooled_width,
roi_batch_id_list_gpu.data<int>(),
......
......@@ -72,7 +72,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> {
T* output_data = out->mutable_data<T>(ctx.GetPlace());
int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace());
const int64_t* rois_data = rois->data<int64_t>();
const T* rois_data = rois->data<T>();
for (int n = 0; n < rois_num; ++n) {
int roi_batch_id = roi_batch_id_data[n];
int roi_start_w = round(rois_data[0] * spatial_scale);
......@@ -171,7 +171,7 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
}
}
const int64_t* rois_data = rois->data<int64_t>();
const T* rois_data = rois->data<T>();
const T* out_grad_data = out_grad->data<T>();
const int64_t* argmax_data = argmax->data<int64_t>();
T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <fstream>
#include <numeric>
#include <sstream>
......@@ -23,40 +22,11 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace operators {
// TODO(sidgoyal78): These function are needed by other files (save_op), move
// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
constexpr char kSEP = '/';
static bool FileExists(const std::string &filepath) {
struct stat buffer;
return (stat(filepath.c_str(), &buffer) == 0);
}
static std::string DirName(const std::string &filepath) {
auto pos = filepath.rfind(kSEP);
if (pos == std::string::npos) {
return "";
}
return filepath.substr(0, pos);
}
static void MkDir(const char *path) {
if (mkdir(path, 0755)) {
PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
}
}
static void MkDirRecursively(const char *fullpath) {
if (*fullpath == '\0') return; // empty string
if (FileExists(fullpath)) return;
MkDirRecursively(DirName(fullpath).c_str());
MkDir(fullpath);
}
class SaveCombineOp : public framework::OperatorBase {
public:
SaveCombineOp(const std::string &type,
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <fstream>
#include <numeric>
......@@ -25,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace operators {
......@@ -33,36 +33,6 @@ namespace operators {
// to directory specified.
constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
// TODO(yuyang18): If the functions below are needed by other files, move them
// to paddle::filesystem namespace.
constexpr char kSEP = '/';
static bool FileExists(const std::string &filepath) {
struct stat buffer;
return (stat(filepath.c_str(), &buffer) == 0);
}
static std::string DirName(const std::string &filepath) {
auto pos = filepath.rfind(kSEP);
if (pos == std::string::npos) {
return "";
}
return filepath.substr(0, pos);
}
static void MkDir(const char *path) {
if (mkdir(path, 0755)) {
PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
}
}
static void MkDirRecursively(const char *fullpath) {
if (*fullpath == '\0') return; // empty string
if (FileExists(fullpath)) return;
MkDirRecursively(DirName(fullpath).c_str());
MkDir(fullpath);
}
class SaveOp : public framework::OperatorBase {
public:
SaveOp(const std::string &type, const framework::VariableNameMap &inputs,
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/sequence_enumerate_op.h"
namespace paddle {
namespace operators {
class SequenceEnumerateOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("X"),
"Input(X) of SequecceEnumerate operator should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(X) of SequenceEnumerate operator should not be null.");
const auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(
x_dims.size(), 2UL,
"Input(X) of SequenceEnumerate operator's rank should be 2.");
PADDLE_ENFORCE_EQ(
x_dims[1], 1UL,
"Input(X) of SequenceEnumerate operator's 2nd dimension should be 1.");
const auto win_size = ctx->Attrs().Get<int>("win_size");
ctx->SetOutputDim("Out", {x_dims[0], win_size});
ctx->ShareLoD("X", "Out");
}
};
class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(2-D LoDTensor with the 2nd dimension equal to 1) "
"Input LoDTensor of SequenceEnumerate operator.");
AddOutput("Out",
"(2-D LoDTensor with the 2nd dimension equal to win_size) "
"Output LoDTensor of SequenceEnumerate operator.");
AddAttr<int>("win_size", "(int) The enumerate sequence window size.")
.AddCustomChecker([](const int& win_size) {
PADDLE_ENFORCE(win_size >= 2,
"The window size should be not less than 2.");
});
AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.")
.SetDefault(0);
AddComment(R"DOC(
Sequence Enumerate Operator.
Generate a new sequence for the input index sequence, which enumerates all the
sub-sequences with length `win_size` of the input.
The enumerated sequence has the same 1st dimension with variable `input`, and
the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
Examples:
Case 1:
Input:
X.lod = [[0, 3, 5]]
X.data = [[1], [2], [3], [4], [5]]
X.dims = [5, 1]
Attrs:
win_size = 2
pad_value = 0
Output:
Out.lod = [[0, 3, 5]]
Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
Out.dims = [5, 2]
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(sequence_enumerate, ops::SequenceEnumerateOp,
ops::SequenceEnumerateOpMaker);
REGISTER_OP_CPU_KERNEL(
sequence_enumerate,
ops::SequenceEnumerateKernel<paddle::platform::CPUDeviceContext, int32_t>,
ops::SequenceEnumerateKernel<paddle::platform::CPUDeviceContext, int64_t>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include "paddle/fluid/operators/sequence_enumerate_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using LoDTensor = framework::LoDTensor;
template <typename T>
__global__ void CalcOutPut(const T* in_data, const size_t* in_lod,
const size_t lod_len, const int64_t win_size,
const int64_t pad_value, T* out_data) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < in_lod[lod_len - 1]) {
int end_idx = 0;
// Get LoD interval of index
for (int i = 1; i < lod_len; ++i) {
if (index < in_lod[i]) {
end_idx = in_lod[i];
break;
}
}
for (size_t i = 0; i < win_size; ++i) {
int word_pos = index + i;
out_data[index * win_size + i] =
word_pos < end_idx ? in_data[word_pos] : pad_value;
}
}
}
template <typename T>
class SequenceEnumerateOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
auto* out = context.Output<LoDTensor>("Out");
int win_size = context.Attr<int>("win_size");
int pad_value = context.Attr<int>("pad_value");
auto in_dims = in->dims();
auto in_lod = in->lod();
PADDLE_ENFORCE_EQ(
static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
"The actual input data's size mismatched with LoD information.");
/* Generate enumerate sequence set */
auto stream = context.cuda_device_context().stream();
auto lod0 = in_lod[0];
auto in_len = in->numel();
auto in_data = in->data<T>();
auto out_data = out->mutable_data<T>(context.GetPlace());
// Copy LoD to GPU
const size_t* dev_in_lod_ptr = lod0.CUDAData(context.GetPlace());
// Calc output tensor
CalcOutPut<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
in_data, dev_in_lod_ptr, lod0.size(), win_size, pad_value, out_data);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
sequence_enumerate,
paddle::operators::SequenceEnumerateOpCUDAKernel<int32_t>,
paddle::operators::SequenceEnumerateOpCUDAKernel<int64_t>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
template <typename DeviceContext, typename T>
class SequenceEnumerateKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
auto* out = context.Output<LoDTensor>("Out");
int win_size = context.Attr<int>("win_size");
int pad_value = context.Attr<int>("pad_value");
auto in_dims = in->dims();
auto in_lod = in->lod();
PADDLE_ENFORCE_EQ(
static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
"The actual input data's size mismatched with LoD information.");
// Generate enumerate sequence set
auto lod0 = in_lod[0];
auto in_data = in->data<T>();
auto out_data = out->mutable_data<T>(context.GetPlace());
for (size_t i = 0; i < lod0.size() - 1; ++i) {
for (size_t idx = lod0[i]; idx < lod0[i + 1]; ++idx) {
for (int word_idx = 0; word_idx < win_size; ++word_idx) {
size_t word_pos = idx + word_idx;
out_data[win_size * idx + word_idx] =
word_pos < lod0[i + 1] ? in_data[word_pos] : pad_value;
}
}
}
}
};
} // namespace operators
} // namespace paddle
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cfloat>
// Disable the copy and assignment operator for a class.
#ifndef DISABLE_COPY_AND_ASSIGN
......@@ -23,3 +24,7 @@ limitations under the License. */
classname& operator=(const classname&) = delete; \
classname& operator=(classname&&) = delete
#endif
#if defined(__FLT_MAX__)
#define FLT_MAX __FLT_MAX__
#endif // __FLT_MAX__
......@@ -14,24 +14,141 @@
#pragma once
#include <cstdio>
#include <stdexcept>
#include <memory>
#include <string>
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "glog/logging.h"
#if !defined(_WIN32)
#include <dlfcn.h> // for dladdr
#include <execinfo.h> // for backtrace
#define UNUSED __attribute__((unused))
#include <dlfcn.h> // dladdr
#include <execinfo.h> // backtrace
#include <sys/stat.h>
#include <algorithm> // std::accumulate
#else
#include <Shlwapi.h>
#include <Windows.h>
#include <io.h> // _popen, _pclose
#include <windows.h>
#if defined(_WIN32)
#include <numeric> // std::accumulate in msvc
#endif
// windows version of __attribute__((unused))
#define UNUSED __pragma(warning(suppress : 4100))
static void* dlsym(void* handle, const char* symbol_name) {
#ifndef S_ISDIR // windows port for sys/stat.h
#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
#endif // S_ISDIR
static void *dlsym(void *handle, const char *symbol_name) {
FARPROC found_symbol;
found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
if (found_symbol == NULL) {
throw std::runtime_error(std::string(symbol_name) + " not found.");
}
return reinterpret_cast<void*>(found_symbol);
return reinterpret_cast<void *>(found_symbol);
}
#endif
static void *dlopen(const char *filename, int flag) {
std::string file_name(filename);
file_name.replace(0, file_name.size() - 1, '/', '\\');
HMODULE hModule = LoadLibrary(file_name.c_str());
if (!hModule) {
throw std::runtime_error(file_name + " not found.");
}
return reinterpret_cast<void *>(hModule);
}
#endif // !_WIN32
static void ExecShellCommand(const std::string &cmd, std::string *message) {
char buffer[128];
#if !defined(_WIN32)
std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
#else
std::shared_ptr<FILE> pipe(_popen(cmd.c_str(), "r"), _pclose);
#endif // _WIN32
if (!pipe) {
LOG(ERROR) << "error running command: " << cmd;
return;
}
while (!feof(pipe.get())) {
if (fgets(buffer, 128, pipe.get()) != nullptr) {
*message += buffer;
}
}
}
static bool PathExists(const std::string &path) {
#if !defined(_WIN32)
struct stat statbuf;
if (stat(path.c_str(), &statbuf) != -1) {
if (S_ISDIR(statbuf.st_mode)) {
return true;
}
}
#else
struct _stat statbuf;
if (_stat(path.c_str(), &statbuf) != -1) {
if (S_ISDIR(statbuf.st_mode)) {
return true;
}
}
#endif // !_WIN32
return false;
}
// TODO(yuyang18): If the functions below are needed by other files, move them
// to paddle::filesystem namespace.
#if !defined(_WIN32)
constexpr char kSEP = '/';
#else
constexpr char kSEP = '\\';
#endif // _WIN32
static bool FileExists(const std::string &filepath) {
#if !defined(_WIN32)
struct stat buffer;
return (stat(filepath.c_str(), &buffer) == 0);
#else
struct _stat buffer;
return (_stat(filepath.c_str(), &buffer) == 0);
#endif // !_WIN32
}
static std::string DirName(const std::string &filepath) {
auto pos = filepath.rfind(kSEP);
if (pos == std::string::npos) {
return "";
}
return filepath.substr(0, pos);
}
static void MkDir(const char *path) {
std::string path_error(path);
path_error += " mkdir failed!";
#if !defined(_WIN32)
if (mkdir(path, 0755)) {
if (errno != EEXIST) {
throw std::runtime_error(path_error);
}
}
#else
CreateDirectory(path, NULL);
auto errorno = GetLastError();
if (errorno != ERROR_ALREADY_EXISTS) {
throw std::runtime_error(path_error);
}
#endif // !_WIN32
}
static void MkDirRecursively(const char *fullpath) {
if (*fullpath == '\0') return; // empty string
if (FileExists(fullpath)) return;
MkDirRecursively(DirName(fullpath).c_str());
MkDir(fullpath);
}
......@@ -98,10 +98,9 @@ class Inferencer(object):
raise ValueError(
"inputs should be a map of {'input_name': input_var}")
with executor.scope_guard(self.scope):
results = self.exe.run(self.inference_program,
feed=inputs,
fetch_list=[self.predict_var],
with self._prog_and_scope_guard():
results = self.exe.run(feed=inputs,
fetch_list=[self.predict_var.name],
return_numpy=return_numpy)
return results
......
......@@ -145,26 +145,23 @@ def rpn_target_assign(loc,
"""
helper = LayerHelper('rpn_target_assign', **locals())
# 1. Compute the regression target bboxes
target_bbox = box_coder(
prior_box=anchor_box,
prior_box_var=anchor_var,
target_box=gt_box,
code_type='encode_center_size',
box_normalized=False)
# 2. Compute overlaps between the prior boxes and the gt boxes overlaps
# Compute overlaps between the prior boxes and the gt boxes overlaps
iou = iou_similarity(x=gt_box, y=anchor_box)
# 3. Assign target label to anchors
loc_index = helper.create_tmp_variable(dtype=anchor_box.dtype)
score_index = helper.create_tmp_variable(dtype=anchor_box.dtype)
target_label = helper.create_tmp_variable(dtype=anchor_box.dtype)
# Assign target label to anchors
loc_index = helper.create_tmp_variable(dtype='int32')
score_index = helper.create_tmp_variable(dtype='int32')
target_label = helper.create_tmp_variable(dtype='int64')
target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype)
helper.append_op(
type="rpn_target_assign",
inputs={'DistMat': iou},
inputs={'Anchor': anchor_box,
'GtBox': gt_box,
'DistMat': iou},
outputs={
'LocationIndex': loc_index,
'ScoreIndex': score_index,
'TargetLabel': target_label
'TargetLabel': target_label,
'TargetBBox': target_bbox,
},
attrs={
'rpn_batch_size_per_im': rpn_batch_size_per_im,
......@@ -173,16 +170,16 @@ def rpn_target_assign(loc,
'fg_fraction': fg_fraction
})
# 4. Reshape and gather the target entry
scores = nn.reshape(x=scores, shape=(-1, 2))
loc = nn.reshape(x=loc, shape=(-1, 4))
target_label = nn.reshape(x=target_label, shape=(-1, 1))
target_bbox = nn.reshape(x=target_bbox, shape=(-1, 4))
loc_index.stop_gradient = True
score_index.stop_gradient = True
target_label.stop_gradient = True
target_bbox.stop_gradient = True
scores = nn.reshape(x=scores, shape=(-1, 1))
loc = nn.reshape(x=loc, shape=(-1, 4))
predicted_scores = nn.gather(scores, score_index)
predicted_location = nn.gather(loc, loc_index)
target_label = nn.gather(target_label, score_index)
target_bbox = nn.gather(target_bbox, loc_index)
return predicted_scores, predicted_location, target_label, target_bbox
......
......@@ -111,6 +111,7 @@ __all__ = [
'stack',
'pad2d',
'unstack',
'sequence_enumerate',
]
......@@ -5823,6 +5824,51 @@ def flatten(x, axis=1, name=None):
return out
def sequence_enumerate(input, win_size, pad_value=0, name=None):
"""
Generate a new sequence for the input index sequence, which enumerates all the
sub-sequences with length `win_size` of the input.
The enumerated sequence has the same 1st dimension with variable `input`, and
the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
Examples:
Case 1:
Input:
X.lod = [[0, 3, 5]]
X.data = [[1], [2], [3], [4], [5]]
X.dims = [5, 1]
Attrs:
win_size = 2
pad_value = 0
Output:
Out.lod = [[0, 3, 5]]
Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
Out.dims = [5, 2]
Args:
input (Variable): The input variable which is a index sequence.
win_size (int): The window size for enumerating all sub-sequences.
pad_value (int): The padding value, default 0.
Returns:
Variable: The enumerate sequence variable which is a LoDTensor.
Examples:
.. code-block:: python
x = fluid.layers.data(shape[30, 1], dtype='int32', lod_level=1)
out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0)
"""
helper = LayerHelper('sequence_enumerate', **locals())
out = helper.create_tmp_variable(helper.input_dtype(), stop_gradient=True)
helper.append_op(
type='sequence_enumerate',
inputs={'X': input},
outputs={'Out': out},
attrs={'win_size': win_size,
'pad_value': pad_value})
def sequence_mask(x, maxlen=None, dtype='int64', name=None):
"""
**SequenceMask Layer**
......@@ -5902,6 +5948,7 @@ def stack(x, axis=0):
helper.append_op(
type='stack', inputs={'X': x}, outputs={'Y': out},
attrs={'axis': axis})
return out
......
......@@ -16,7 +16,9 @@ from __future__ import print_function
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy
import os
import cifar10_small_test_set
......@@ -89,7 +91,7 @@ def optimizer_func():
return fluid.optimizer.Adam(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname):
def train(use_cuda, train_program, parallel, params_dirname):
BATCH_SIZE = 128
EPOCH_NUM = 1
......@@ -116,7 +118,10 @@ def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_func=train_program, optimizer_func=optimizer_func, place=place)
train_func=train_program,
optimizer_func=optimizer_func,
place=place,
parallel=parallel)
trainer.train(
reader=train_reader,
......@@ -125,10 +130,13 @@ def train(use_cuda, train_program, params_dirname):
feed_order=['pixel', 'label'])
def infer(use_cuda, inference_program, params_dirname=None):
def infer(use_cuda, inference_program, parallel, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=params_dirname, place=place)
infer_func=inference_program,
param_path=params_dirname,
place=place,
parallel=parallel)
# The input's dimension of conv should be 4-D or 5-D.
# Use normilized image pixels as input data, which should be in the range
......@@ -139,22 +147,34 @@ def infer(use_cuda, inference_program, params_dirname=None):
print("infer results: ", results)
def main(use_cuda):
def main(use_cuda, parallel):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "image_classification_resnet.inference.model"
os.environ['CPU_NUM'] = str(4)
train(
use_cuda=use_cuda,
train_program=train_network,
params_dirname=save_path)
params_dirname=save_path,
parallel=parallel)
# FIXME(zcd): in the inference stage, the number of
# input data is one, it is not appropriate to use parallel.
if parallel and use_cuda:
return
os.environ['CPU_NUM'] = str(1)
infer(
use_cuda=use_cuda,
inference_program=inference_network,
params_dirname=save_path)
params_dirname=save_path,
parallel=parallel)
if __name__ == '__main__':
for use_cuda in (False, True):
main(use_cuda=use_cuda)
for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda():
continue
main(use_cuda=use_cuda, parallel=parallel)
......@@ -16,7 +16,9 @@ from __future__ import print_function
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy
import os
import cifar10_small_test_set
......@@ -68,7 +70,7 @@ def optimizer_func():
return fluid.optimizer.Adam(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname):
def train(use_cuda, train_program, parallel, params_dirname):
BATCH_SIZE = 128
train_reader = paddle.batch(
paddle.reader.shuffle(
......@@ -93,7 +95,10 @@ def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_func=train_program, place=place, optimizer_func=optimizer_func)
train_func=train_program,
place=place,
optimizer_func=optimizer_func,
parallel=parallel)
trainer.train(
reader=train_reader,
......@@ -102,10 +107,13 @@ def train(use_cuda, train_program, params_dirname):
feed_order=['pixel', 'label'])
def infer(use_cuda, inference_program, params_dirname=None):
def infer(use_cuda, inference_program, parallel, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=params_dirname, place=place)
infer_func=inference_program,
param_path=params_dirname,
place=place,
parallel=parallel)
# The input's dimension of conv should be 4-D or 5-D.
# Use normilized image pixels as input data, which should be in the range
......@@ -116,22 +124,31 @@ def infer(use_cuda, inference_program, params_dirname=None):
print("infer results: ", results)
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
def main(use_cuda, parallel):
save_path = "image_classification_vgg.inference.model"
os.environ['CPU_NUM'] = str(4)
train(
use_cuda=use_cuda,
train_program=train_network,
params_dirname=save_path)
params_dirname=save_path,
parallel=parallel)
# FIXME(zcd): in the inference stage, the number of
# input data is one, it is not appropriate to use parallel.
if parallel and use_cuda:
return
os.environ['CPU_NUM'] = str(1)
infer(
use_cuda=use_cuda,
inference_program=inference_network,
params_dirname=save_path)
params_dirname=save_path,
parallel=parallel)
if __name__ == '__main__':
for use_cuda in (False, True):
main(use_cuda=use_cuda)
for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda():
continue
main(use_cuda=use_cuda, parallel=parallel)
......@@ -64,14 +64,14 @@ def optimizer_func():
return fluid.optimizer.Adam(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname):
def train(use_cuda, train_program, parallel, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_func=train_program,
place=place,
optimizer_func=optimizer_func,
parallel=True)
parallel=parallel)
def event_handler(event):
if isinstance(event, fluid.EndEpochEvent):
......@@ -108,11 +108,14 @@ def train(use_cuda, train_program, params_dirname):
feed_order=['img', 'label'])
def infer(use_cuda, inference_program, params_dirname=None):
def infer(use_cuda, inference_program, parallel, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=params_dirname, place=place)
infer_func=inference_program,
param_path=params_dirname,
place=place,
parallel=parallel)
batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0,
......@@ -123,20 +126,32 @@ def infer(use_cuda, inference_program, params_dirname=None):
print("infer results: ", results[0])
def main(use_cuda):
def main(use_cuda, parallel):
params_dirname = "recognize_digits_conv.inference.model"
# call train() with is_local argument to run distributed train
os.environ['CPU_NUM'] = str(4)
train(
use_cuda=use_cuda,
train_program=train_program,
params_dirname=params_dirname)
params_dirname=params_dirname,
parallel=parallel)
# FIXME(zcd): in the inference stage, the number of
# input data is one, it is not appropriate to use parallel.
if parallel and use_cuda:
return
os.environ['CPU_NUM'] = str(1)
infer(
use_cuda=use_cuda,
inference_program=inference_program,
params_dirname=params_dirname)
params_dirname=params_dirname,
parallel=parallel)
if __name__ == '__main__':
# for use_cuda in (False, True):
main(use_cuda=core.is_compiled_with_cuda())
for use_cuda in (False, True):
for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda():
continue
main(use_cuda=use_cuda, parallel=parallel)
......@@ -16,6 +16,7 @@ from __future__ import print_function
import argparse
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle
import sys
import numpy
......@@ -50,11 +51,14 @@ def optimizer_func():
return fluid.optimizer.Adam(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname):
def train(use_cuda, train_program, params_dirname, parallel):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_func=train_program, place=place, optimizer_func=optimizer_func)
train_func=train_program,
place=place,
optimizer_func=optimizer_func,
parallel=parallel)
def event_handler(event):
if isinstance(event, fluid.EndEpochEvent):
......@@ -86,11 +90,14 @@ def train(use_cuda, train_program, params_dirname):
feed_order=['img', 'label'])
def infer(use_cuda, inference_program, params_dirname=None):
def infer(use_cuda, inference_program, parallel, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=params_dirname, place=place)
infer_func=inference_program,
param_path=params_dirname,
place=place,
parallel=parallel)
batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0,
......@@ -101,20 +108,32 @@ def infer(use_cuda, inference_program, params_dirname=None):
print("infer results: ", results[0])
def main(use_cuda):
def main(use_cuda, parallel):
params_dirname = "recognize_digits_mlp.inference.model"
# call train() with is_local argument to run distributed train
os.environ['CPU_NUM'] = str(4)
train(
use_cuda=use_cuda,
train_program=train_program,
params_dirname=params_dirname)
params_dirname=params_dirname,
parallel=parallel)
# FIXME(zcd): in the inference stage, the number of
# input data is one, it is not appropriate to use parallel.
if parallel and use_cuda:
return
os.environ['CPU_NUM'] = str(1)
infer(
use_cuda=use_cuda,
inference_program=inference_program,
params_dirname=params_dirname)
params_dirname=params_dirname,
parallel=parallel)
if __name__ == '__main__':
# for use_cuda in (False, True):
main(use_cuda=False)
for use_cuda in (False, True):
for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda():
continue
main(use_cuda=use_cuda, parallel=parallel)
......@@ -281,7 +281,7 @@ class TestRpnTargetAssign(unittest.TestCase):
gt_box = layers.data(
name='gt_box', shape=[4], lod_level=1, dtype='float32')
predicted_scores, predicted_location, target_label, target_bbox = layers.rpn_target_assign(
pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign(
loc=loc,
scores=scores,
anchor_box=anchor_box,
......@@ -292,15 +292,13 @@ class TestRpnTargetAssign(unittest.TestCase):
rpn_positive_overlap=0.7,
rpn_negative_overlap=0.3)
self.assertIsNotNone(predicted_scores)
self.assertIsNotNone(predicted_location)
self.assertIsNotNone(target_label)
self.assertIsNotNone(target_bbox)
assert predicted_scores.shape[1] == 2
assert predicted_location.shape[1] == 4
assert predicted_location.shape[1] == target_bbox.shape[1]
print(str(program))
self.assertIsNotNone(pred_scores)
self.assertIsNotNone(pred_loc)
self.assertIsNotNone(tgt_lbl)
self.assertIsNotNone(tgt_bbox)
assert pred_scores.shape[1] == 1
assert pred_loc.shape[1] == 4
assert pred_loc.shape[1] == tgt_bbox.shape[1]
class TestGenerateProposals(unittest.TestCase):
......
......@@ -37,7 +37,7 @@ def fusion_gru(
h0,
wh,
np.zeros(
(1, wh.shape[1]), dtype='float64'),
(1, wh.shape[1]), dtype='float32'),
is_reverse,
act_state,
act_gate)
......@@ -62,15 +62,15 @@ class TestFusionGRUOp(OpTest):
T = sum(self.lod[0])
N = len(self.lod[0])
x = np.random.rand(T, self.M).astype('float64')
wx = np.random.rand(self.M, 3 * self.D).astype('float64')
wh = np.random.rand(self.D, 3 * self.D).astype('float64')
x = np.random.rand(T, self.M).astype('float32')
wx = np.random.rand(self.M, 3 * self.D).astype('float32')
wh = np.random.rand(self.D, 3 * self.D).astype('float32')
bias = np.random.rand(
1, 3 * self.D).astype('float64') if self.with_bias else np.zeros(
(1, 3 * self.D), dtype='float64')
1, 3 * self.D).astype('float32') if self.with_bias else np.zeros(
(1, 3 * self.D), dtype='float32')
h0 = np.random.rand(
N, self.D).astype('float64') if self.with_h0 else np.zeros(
(N, self.D), dtype='float64')
N, self.D).astype('float32') if self.with_h0 else np.zeros(
(N, self.D), dtype='float32')
_, _, _, hidden = fusion_gru(
x, self.lod, h0, wx, wh, bias, self.is_reverse,
......@@ -93,7 +93,9 @@ class TestFusionGRUOp(OpTest):
}
def test_check_output(self):
self.check_output(atol=1e-8)
for use_seq in {True, False}:
self.attrs['use_seq'] = use_seq
self.check_output()
class TestFusionGRUOpNoInitial(TestFusionGRUOp):
......
......@@ -114,7 +114,9 @@ class TestFusionLSTMOp(OpTest):
}
def test_check_output(self):
self.check_output()
for use_seq in {True, False}:
self.attrs['use_seq'] = use_seq
self.check_output()
class TestFusionLSTMOpInit(TestFusionLSTMOp):
......
......@@ -177,8 +177,8 @@ def _box_to_delta(ex_boxes, gt_boxes, weights):
dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0]
dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1]
dw = (np.log(gt_w / ex_w)) / ex_w / weights[2]
dh = (np.log(gt_h / ex_h)) / ex_h / weights[3]
dw = (np.log(gt_w / ex_w)) / weights[2]
dh = (np.log(gt_h / ex_h)) / weights[3]
targets = np.vstack([dx, dy, dw, dh]).transpose()
return targets
......
......@@ -549,6 +549,13 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(out)
print(str(program))
def test_sequence_enumerate(self):
program = Program()
with program_guard(program):
x = layers.data(name="input", shape=[1], dtype='int32', lod_level=1)
out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0)
print(str(program))
if __name__ == '__main__':
unittest.main()
......@@ -61,7 +61,7 @@ class TestROIPoolOp(OpTest):
for i in range(self.rois_num):
roi = self.rois[i]
roi_batch_id = roi[0]
roi_batch_id = int(roi[0])
roi_start_w = int(cpt.round(roi[1] * self.spatial_scale))
roi_start_h = int(cpt.round(roi[2] * self.spatial_scale))
roi_end_w = int(cpt.round(roi[3] * self.spatial_scale))
......@@ -125,7 +125,7 @@ class TestROIPoolOp(OpTest):
roi = [bno, x1, y1, x2, y2]
rois.append(roi)
self.rois_num = len(rois)
self.rois = np.array(rois).astype("int64")
self.rois = np.array(rois).astype("float32")
def setUp(self):
self.op_type = "roi_pool"
......
......@@ -18,12 +18,17 @@ import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
from test_anchor_generator_op import anchor_generator_in_python
from test_generate_proposal_labels import _generate_groundtruth
from test_generate_proposal_labels import _bbox_overlaps, _box_to_delta
def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap,
rpn_negative_overlap, fg_fraction):
iou = np.transpose(iou)
def rpn_target_assign(gt_anchor_iou, rpn_batch_size_per_im,
rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
iou = np.transpose(gt_anchor_iou)
anchor_to_gt_max = iou.max(axis=1)
anchor_to_gt_argmax = iou.argmax(axis=1)
gt_to_anchor_argmax = iou.argmax(axis=0)
gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])]
anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0]
......@@ -42,59 +47,113 @@ def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap,
num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1)
bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
tgt_lbl[bg_inds] = 0
if len(bg_inds) > num_bg:
enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
tgt_lbl[enable_inds] = 0
bg_inds = np.where(tgt_lbl == 0)[0]
tgt_lbl[bg_inds] = 0
loc_index = fg_inds
score_index = np.hstack((fg_inds, bg_inds))
tgt_lbl = np.expand_dims(tgt_lbl, axis=1)
return loc_index, score_index, tgt_lbl
gt_inds = anchor_to_gt_argmax[fg_inds]
return loc_index, score_index, tgt_lbl, gt_inds
def get_anchor(n, c, h, w):
input_feat = np.random.random((n, c, h, w)).astype('float32')
anchors, _ = anchor_generator_in_python(
input_feat=input_feat,
anchor_sizes=[32., 64.],
aspect_ratios=[0.5, 1.0],
variances=[1.0, 1.0, 1.0, 1.0],
stride=[16.0, 16.0],
offset=0.5)
return anchors
def rpn_blob(anchor, gt_boxes, iou, lod, rpn_batch_size_per_im,
rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
loc_indexes = []
score_indexes = []
tmp_tgt_labels = []
tgt_bboxes = []
anchor_num = anchor.shape[0]
batch_size = len(lod) - 1
for i in range(batch_size):
b, e = lod[i], lod[i + 1]
iou_slice = iou[b:e, :]
bboxes_slice = gt_boxes[b:e, :]
loc_idx, score_idx, tgt_lbl, gt_inds = rpn_target_assign(
iou_slice, rpn_batch_size_per_im, rpn_positive_overlap,
rpn_negative_overlap, fg_fraction)
fg_bboxes = bboxes_slice[gt_inds]
fg_anchors = anchor[loc_idx]
box_deltas = _box_to_delta(fg_anchors, fg_bboxes, [1., 1., 1., 1.])
if i == 0:
loc_indexes = loc_idx
score_indexes = score_idx
tmp_tgt_labels = tgt_lbl
tgt_bboxes = box_deltas
else:
loc_indexes = np.concatenate(
[loc_indexes, loc_idx + i * anchor_num])
score_indexes = np.concatenate(
[score_indexes, score_idx + i * anchor_num])
tmp_tgt_labels = np.concatenate([tmp_tgt_labels, tgt_lbl])
tgt_bboxes = np.vstack([tgt_bboxes, box_deltas])
tgt_labels = tmp_tgt_labels[score_indexes]
return loc_indexes, score_indexes, tgt_bboxes, tgt_labels
class TestRpnTargetAssignOp(OpTest):
def setUp(self):
iou = np.random.random((10, 8)).astype("float32")
self.op_type = "rpn_target_assign"
self.inputs = {'DistMat': iou}
self.attrs = {
'rpn_batch_size_per_im': 256,
'rpn_positive_overlap': 0.95,
'rpn_negative_overlap': 0.3,
'fg_fraction': 0.25,
'fix_seed': True
}
loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 256, 0.95, 0.3,
0.25)
self.outputs = {
'LocationIndex': loc_index,
'ScoreIndex': score_index,
'TargetLabel': tgt_lbl,
}
n, c, h, w = 2, 4, 14, 14
anchor = get_anchor(n, c, h, w)
gt_num = 10
anchor = anchor.reshape(-1, 4)
anchor_num = anchor.shape[0]
def test_check_output(self):
self.check_output()
im_shapes = [[64, 64], [64, 64]]
gt_box, lod = _generate_groundtruth(im_shapes, 3, 4)
bbox = np.vstack([v['boxes'] for v in gt_box])
iou = _bbox_overlaps(bbox, anchor)
anchor = anchor.astype('float32')
bbox = bbox.astype('float32')
iou = iou.astype('float32')
loc_index, score_index, tgt_bbox, tgt_lbl = rpn_blob(
anchor, bbox, iou, [0, 4, 8], 25600, 0.95, 0.03, 0.25)
class TestRpnTargetAssignOp2(OpTest):
def setUp(self):
iou = np.random.random((10, 20)).astype("float32")
self.op_type = "rpn_target_assign"
self.inputs = {'DistMat': iou}
self.inputs = {
'Anchor': anchor,
'GtBox': (bbox, [[4, 4]]),
'DistMat': (iou, [[4, 4]]),
}
self.attrs = {
'rpn_batch_size_per_im': 128,
'rpn_positive_overlap': 0.5,
'rpn_negative_overlap': 0.5,
'fg_fraction': 0.5,
'rpn_batch_size_per_im': 25600,
'rpn_positive_overlap': 0.95,
'rpn_negative_overlap': 0.03,
'fg_fraction': 0.25,
'fix_seed': True
}
loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 128, 0.5, 0.5,
0.5)
self.outputs = {
'LocationIndex': loc_index,
'ScoreIndex': score_index,
'TargetLabel': tgt_lbl,
'LocationIndex': loc_index.astype('int32'),
'ScoreIndex': score_index.astype('int32'),
'TargetBBox': tgt_bbox.astype('float32'),
'TargetLabel': tgt_lbl.astype('int64'),
}
def test_check_output(self):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
def sequence_enumerate(input_seq, in_lod, win_size, pad_value):
lod0 = [0]
for i in range(0, len(in_lod[0])):
lod0.append(lod0[i] + in_lod[0][i])
out_seq = []
for i in range(0, len(lod0) - 1):
for idx in range(lod0[i], lod0[i + 1]):
single_seq = []
for word_idx in range(win_size):
word_pos = idx + word_idx
dat = input_seq[word_pos] if word_pos < lod0[i+1] \
else pad_value
single_seq.append(dat)
out_seq.append(single_seq)
return out_seq
class TestSequenceEnumerateOp(OpTest):
def setUp(self):
self.op_type = "sequence_enumerate"
self.init_test_case()
self.inputs = {'X': (self.in_seq, self.lod)}
self.attrs = {'win_size': self.win_size, 'pad_value': self.pad_value}
self.outputs = {'Out': (self.out_seq, self.lod)}
def test_check_output(self):
self.check_output()
def init_test_case(self):
self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
self.lod = [[9, 4, 11, 6]]
self.win_size = 2
self.pad_value = 0
out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
self.pad_value)
self.out_seq = np.array(out_seq).astype("int32")
class TesSequenceEnumerateOpInt64(TestSequenceEnumerateOp):
def init_test_case(self):
self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int64")
self.lod = [[9, 4, 11, 6]]
self.win_size = 2
self.pad_value = 0
out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
self.pad_value)
self.out_seq = np.array(out_seq).astype("int64")
class TestSequenceEnumerateOpLargeWinSize(TestSequenceEnumerateOp):
def init_test_case(self):
self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
self.lod = [[9, 4, 11, 6]]
self.win_size = 5
self.pad_value = 0
out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
self.pad_value)
self.out_seq = np.array(out_seq).astype("int32")
class TestSequenceEnumerateOpMaxWinSize(TestSequenceEnumerateOp):
def init_test_case(self):
self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
self.lod = [[9, 4, 11, 6]]
self.win_size = 30
self.pad_value = 0
out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
self.pad_value)
self.out_seq = np.array(out_seq).astype("int32")
class TestSequenceEnumerateOpLargePadValue(TestSequenceEnumerateOp):
def init_test_case(self):
self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
self.lod = [[9, 4, 11, 6]]
self.win_size = 5
self.pad_value = 5
out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
self.pad_value)
self.out_seq = np.array(out_seq).astype("int32")
if __name__ == "__main__":
unittest.main()
......@@ -1096,7 +1096,8 @@ class DistributeTranspiler(object):
self.table_name]
zero_dim = int(
math.ceil(origin_param_var.shape[0] / len(self.pserver_endpoints)))
math.ceil(origin_param_var.shape[0] / float(
len(self.pserver_endpoints))))
table_shape = list(origin_param_var.shape)
table_shape[0] = zero_dim
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册