提交 3a25ceeb 编写于 作者: Q qiaolongfei

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into update-api-reference-1

...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
| jczaja | Jacek Czaja | | jczaja | Jacek Czaja |
| JiayiFeng | Jia-Yi Feng | | JiayiFeng | Jia-Yi Feng |
| kbinias | Krzysztof Binias | | kbinias | Krzysztof Binias |
| kexinzhao | Ke-Xin Zhao |
| kuke | Yi-Bing Liu | | kuke | Yi-Bing Liu |
| lcy-seso | Ying Cao | | lcy-seso | Ying Cao |
| lipeng-unisound | Peng Li | | lipeng-unisound | Peng Li |
......
...@@ -61,6 +61,7 @@ option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) ...@@ -61,6 +61,7 @@ option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF) option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
# CMAKE_BUILD_TYPE # CMAKE_BUILD_TYPE
...@@ -193,7 +194,10 @@ set(EXTERNAL_LIBS ...@@ -193,7 +194,10 @@ set(EXTERNAL_LIBS
if(WITH_GPU) if(WITH_GPU)
include(cuda) include(cuda)
include(tensorrt) include(tensorrt)
endif(WITH_GPU) include(external/anakin)
else()
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
endif()
if(WITH_AMD_GPU) if(WITH_AMD_GPU)
find_package(HIP) find_package(HIP)
......
# Nothing in this module applies unless the Anakin integration was requested.
if(NOT WITH_ANAKIN)
  return()
endif()

# Where the downloaded Anakin release is unpacked. FORCE keeps this pinned to
# the third-party install tree on every configure run.
set(ANAKIN_INSTALL_DIR
    "${THIRD_PARTY_PATH}/install/anakin"
    CACHE PATH "Anakin install path." FORCE)

# Header root and library directory both default to the install dir; they are
# cached without FORCE, so a value already present in the cache is kept.
set(ANAKIN_INCLUDE
    "${ANAKIN_INSTALL_DIR}"
    CACHE STRING "root of Anakin header files")
set(ANAKIN_LIBRARY
    "${ANAKIN_INSTALL_DIR}"
    CACHE STRING "path of Anakin library")

# Extra compiler flags stored as a list for targets that compile against
# Anakin headers.
set(ANAKIN_COMPILE_EXTRA_FLAGS
    -Wno-error=unused-variable
    -Wno-error=format-extra-args
    -Wno-error=comment
    -Wno-error=format
    -Wno-error=switch
    -Wno-error=return-type
    -Wno-error=non-virtual-dtor
    -Wno-reorder
    -Wno-error=cpp)

# Location of the pre-built Anakin release archive.
set(ANAKIN_LIBRARY_URL
    "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz")
# Helper for Anakin: currently, to use its headers one needs to put nearly
# every directory of the header tree on the include path.
#
# fetch_include_recursively(<root_dir>)
#   root_dir - directory whose whole sub-tree is added via include_directories().
#              Nothing happens when it is not an existing directory.
#
# All path expansions are quoted so that directories containing spaces or
# semicolons are passed through as a single argument.
function(fetch_include_recursively root_dir)
  if(NOT IS_DIRECTORY "${root_dir}")
    return()
  endif()

  include_directories("${root_dir}")

  # Names are globbed relative to root_dir and re-prefixed below, so each
  # recursive call again receives a full path.
  file(GLOB children RELATIVE "${root_dir}" "${root_dir}/*")
  foreach(child IN LISTS children)
    if(IS_DIRECTORY "${root_dir}/${child}")
      fetch_include_recursively("${root_dir}/${child}")
    endif()
  endforeach()
endfunction()
# Download and unpack the pre-built Anakin release at configure time.
message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")

# Start from an empty install directory so stale files from an earlier
# download cannot be picked up.
file(REMOVE_RECURSE "${ANAKIN_INSTALL_DIR}")
file(MAKE_DIRECTORY "${ANAKIN_INSTALL_DIR}")

# The archive keeps the file name used in the URL.
get_filename_component(anakin_archive_name "${ANAKIN_LIBRARY_URL}" NAME)

# Run wget directly (no shell) so the path and URL need no shell quoting, and
# keep the exit status instead of discarding it.
execute_process(
  COMMAND wget -q "${ANAKIN_LIBRARY_URL}"
  WORKING_DIRECTORY "${ANAKIN_INSTALL_DIR}"
  RESULT_VARIABLE anakin_download_result)

if("${anakin_download_result}" STREQUAL "0")
  execute_process(
    COMMAND "${CMAKE_COMMAND}" -E tar xzf "${anakin_archive_name}"
    WORKING_DIRECTORY "${ANAKIN_INSTALL_DIR}"
    RESULT_VARIABLE anakin_extract_result)
  if(NOT "${anakin_extract_result}" STREQUAL "0")
    message(WARNING
      "Failed to extract ${anakin_archive_name} in ${ANAKIN_INSTALL_DIR} "
      "(result: ${anakin_extract_result}); Anakin targets will not build.")
  endif()
else()
  # Only a warning: configuration used to continue silently in this case.
  message(WARNING
    "Failed to download Anakin from ${ANAKIN_LIBRARY_URL} "
    "(result: ${anakin_download_result}); Anakin targets will not build.")
endif()

if(WITH_ANAKIN)
  message(STATUS "Anakin for inference is enabled")
  message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
  fetch_include_recursively("${ANAKIN_INCLUDE}")
  link_directories("${ANAKIN_LIBRARY}")
endif()
...@@ -29,6 +29,8 @@ IF(NOT ${CBLAS_FOUND}) ...@@ -29,6 +29,8 @@ IF(NOT ${CBLAS_FOUND})
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE) CACHE FILEPATH "openblas library." FORCE)
ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
SET(OPENBLAS_COMMIT "v0.2.20") SET(OPENBLAS_COMMIT "v0.2.20")
......
...@@ -155,6 +155,15 @@ copy(inference_lib DEPS paddle_fluid_shared paddle_fluid ...@@ -155,6 +155,15 @@ copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
DSTS ${dst_dir}/${module} ${dst_dir}/${module} DSTS ${dst_dir}/${module} ${dst_dir}/${module}
) )
if(WITH_CONTRIB)
set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
copy(contrib_inference_lib DEPS paddle_inference_api
SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.*
DSTS ${contrib_dst_dir} ${contrib_dst_dir}
)
endif()
set(module "platform") set(module "platform")
copy(platform_lib DEPS profiler_py_proto copy(platform_lib DEPS profiler_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
......
#!/bin/bash #!/bin/bash
python gen_doc.py layers --submodules control_flow device io nn ops tensor detection > layers.rst python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler > layers.rst
for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
do do
......
...@@ -342,6 +342,12 @@ conv2d ...@@ -342,6 +342,12 @@ conv2d
.. autofunction:: paddle.fluid.layers.conv2d .. autofunction:: paddle.fluid.layers.conv2d
:noindex: :noindex:
conv3d
------
.. autofunction:: paddle.fluid.layers.conv3d
:noindex:
sequence_pool sequence_pool
------------- -------------
...@@ -366,6 +372,12 @@ pool2d ...@@ -366,6 +372,12 @@ pool2d
.. autofunction:: paddle.fluid.layers.pool2d .. autofunction:: paddle.fluid.layers.pool2d
:noindex: :noindex:
pool3d
------
.. autofunction:: paddle.fluid.layers.pool3d
:noindex:
batch_norm batch_norm
---------- ----------
...@@ -384,6 +396,13 @@ conv2d_transpose ...@@ -384,6 +396,13 @@ conv2d_transpose
.. autofunction:: paddle.fluid.layers.conv2d_transpose .. autofunction:: paddle.fluid.layers.conv2d_transpose
:noindex: :noindex:
conv3d_transpose
----------------
.. autofunction:: paddle.fluid.layers.conv3d_transpose
:noindex:
sequence_expand sequence_expand
--------------- ---------------
...@@ -1041,3 +1060,42 @@ box_coder ...@@ -1041,3 +1060,42 @@ box_coder
.. autofunction:: paddle.fluid.layers.box_coder .. autofunction:: paddle.fluid.layers.box_coder
:noindex: :noindex:
learning_rate_scheduler
=======================
exponential_decay
-----------------
.. autofunction:: paddle.fluid.layers.exponential_decay
:noindex:
natural_exp_decay
-----------------
.. autofunction:: paddle.fluid.layers.natural_exp_decay
:noindex:
inverse_time_decay
------------------
.. autofunction:: paddle.fluid.layers.inverse_time_decay
:noindex:
polynomial_decay
----------------
.. autofunction:: paddle.fluid.layers.polynomial_decay
:noindex:
piecewise_decay
---------------
.. autofunction:: paddle.fluid.layers.piecewise_decay
:noindex:
noam_decay
----------
.. autofunction:: paddle.fluid.layers.noam_decay
:noindex:
...@@ -171,7 +171,7 @@ Pytorch chooses immediate evaluation. It avoids ever materializing a "forward gr ...@@ -171,7 +171,7 @@ Pytorch chooses immediate evaluation. It avoids ever materializing a "forward gr
## What can fluid learn from them? ## What can fluid learn from them?
TBD Please refer to `paddle/contrib/dynamic/`.
# Appendix # Appendix
......
...@@ -104,7 +104,7 @@ no changes added to commit (use "git add" and/or "git commit -a") ...@@ -104,7 +104,7 @@ no changes added to commit (use "git add" and/or "git commit -a")
➜ docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest" ➜ docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
``` ```
关于构建和测试的更多信息,请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst) 关于构建和测试的更多信息,请参见[使用Docker安装运行](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2/build_and_install/docker_install_cn.rst)
## 提交(commit) ## 提交(commit)
......
...@@ -14,3 +14,4 @@ ...@@ -14,3 +14,4 @@
# #
add_subdirectory(inference) add_subdirectory(inference)
add_subdirectory(tape)
...@@ -17,48 +17,9 @@ if(APPLE) ...@@ -17,48 +17,9 @@ if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
endif(APPLE) endif(APPLE)
set(ANAKIN_INCLUDE "" CACHE STRING "root of Anakin header files")
set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
set(inference_deps paddle_inference_api paddle_fluid_api) set(inference_deps paddle_inference_api paddle_fluid_api)
# if anakin is set enable anakin api implementation
if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
set(ANAKIN_FOUND ON)
else()
set(ANAKIN_FOUND OFF)
endif()
function(fetch_include_recursively root_dir)
if (IS_DIRECTORY ${root_dir})
include_directories(${root_dir})
endif()
file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
foreach(sub ${ALL_SUB})
if (IS_DIRECTORY ${root_dir}/${sub})
fetch_include_recursively(${root_dir}/${sub})
endif()
endforeach()
endfunction()
if (ANAKIN_FOUND)
# Anakin's code style doesn't follow google c style.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp")
message(STATUS "Anakin for inference is enabled")
message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
fetch_include_recursively(${ANAKIN_INCLUDE})
link_directories(${ANAKIN_LIBRARY})
nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
target_link_libraries(inference_anakin_api anakin anakin_saber_common)
list(APPEND inference_deps inference_anakin_api)
endif()
function(inference_api_test TARGET_NAME) function(inference_api_test TARGET_NAME)
if (WITH_TESTING) if (WITH_TESTING)
set(options "") set(options "")
...@@ -89,9 +50,17 @@ cc_test(test_paddle_inference_api ...@@ -89,9 +50,17 @@ cc_test(test_paddle_inference_api
inference_api_test(test_paddle_inference_api_impl inference_api_test(test_paddle_inference_api_impl
ARGS test_word2vec test_image_classification) ARGS test_word2vec test_image_classification)
if (ANAKIN_FOUND) if (WITH_ANAKIN AND WITH_TESTING) # only needed in CI
# Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
# so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
# compile the libinference_anakin_api.a and compile with anakin.so.
nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_link_libraries(inference_anakin_api anakin anakin_saber_common)
cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
DEPS ${inference_deps}) ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api)
target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endif() endif()
if(WITH_TESTING) if(WITH_TESTING)
......
...@@ -12,9 +12,8 @@ ...@@ -12,9 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <cuda.h>
#include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h" #include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h"
#include <cuda.h>
namespace paddle { namespace paddle {
......
...@@ -19,10 +19,9 @@ limitations under the License. */ ...@@ -19,10 +19,9 @@ limitations under the License. */
#pragma once #pragma once
// NOTE This header file do not have namespace.
//#include <test/framework/net/paddle_api.h>
#include "paddle/contrib/inference/paddle_inference_api.h" #include "paddle/contrib/inference/paddle_inference_api.h"
// from anakin
#include "framework/core/net/net.h" #include "framework/core/net/net.h"
#include "saber/saber_types.h" #include "saber/saber_types.h"
......
...@@ -12,17 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,17 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gflags/gflags.h>
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "paddle/contrib/inference/paddle_inference_api.h" #include "paddle/contrib/inference/paddle_inference_api.h"
DEFINE_string(model, "", "Directory of the inference model.");
namespace paddle { namespace paddle {
AnakinConfig GetConfig() { AnakinConfig GetConfig() {
AnakinConfig config; AnakinConfig config;
config.model_file = "./mobilenet_v2.anakin.bin"; config.model_file = FLAGS_model;
config.device = 0; config.device = 0;
config.max_batch_size = 1; config.max_batch_size = 1;
return config; return config;
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# On Apple platforms, do not treat the pessimizing-move diagnostic as an error.
if(APPLE)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
endif()

# Variable wrapper used by the tape.
cc_library(tape_variable
  SRCS variable.cc
  DEPS
    ${FLUID_CORE_MODULES}
    device_context
    framework_proto
    proto_desc
    operator)

# The tape itself; depends on the operator library list in GLOB_OP_LIB.
cc_library(tape
  SRCS tape.cc
  DEPS
    ${FLUID_CORE_MODULES}
    ${GLOB_OP_LIB}
    tape_variable)

# Unit test for the tape.
cc_test(test_tape
  SRCS test_tape.cc
  DEPS
    tape
    tape_variable)
# Dynamic Graph on Fluid
PaddlePaddle Fluid is targeting the autodiff without tape, which, however, is very
challenging and we are still a long way from there. DyNet and PyTorch provide a good design
idea, the *tape*, that significantly eases the challenge. Also, DyNet provides
a C++ API that is as convenient as Python but with higher efficiency and could
conveniently integrate with industrial/production systems. This package, `tape`,
combines the good of
1. tape from PyTorch and DyNet
2. C++ API and core from DyNet
3. rich set of operators from PaddlePaddle
## Overview
We can implement a DyNet-like Tape (see this [survey](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/survey/dynamic_graph.md))
by wrapping Paddle Fluid's `Operator` and `Variable`.
The user API is straightforward since
1. it is imperative. And it uses host language's control flow logic.
1. it avoids extra concepts such as `Scope` and `Executor`.
All of these benefits come at the cost of just adding one line `reset_global_tape`
at every iteration.
## Code Structure
In short, the `Tape` contains a vector of `OpHandle`s. And an `OpHandle` contains its
`type`, the pointers to the `Variable`s, and necessary attributes.
```c++
class Variable {
public:
VariableHandle Grad(); // returns its gradient variable
private:
framework::VarDesc desc_; // compile time infershape, necessary for lazy execution
framework::Variable var_; // run time variable, holds data memory
};
using VariableHandle = shared_ptr<Variable>;
struct OpHandle {
string type_;
map<string, vector<VariableHandle>> inputs_;
map<string, vector<VariableHandle>> outputs_;
AttributeMap attrs_;
};
class Tape {
public:
void AddOp(OpHandle); // add op
void Forward(); // execute the tape_
void Backward(); // execute the backward of the tape_
private:
vector<OpHandle> tape_;
};
```
We use `Function` to represent layers. It takes care of parameter
initialization and `AddOp` to the Tape when it is called.
```c++
class Linear {
public:
Linear(int in_dim, int out_dim, const std::string &act)
: w_(new Variable("LinearWeight")),
b_(new Variable("LinearBias")),
act_(act) {
Tape init_tape;
std::string initializer = "fill_constant";
framework::AttributeMap attrs;
attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{in_dim, out_dim};
attrs["value"] = 1.0f;
init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{out_dim};
attrs["value"] = 1.0f;
init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
init_tape.Forward();
}
VariableHandle operator()(VariableHandle input) {
VariableHandle pre_bias(new Variable("linear"));
get_global_tape().AddOp("mul",
{{"X", {input}}, {"Y", {w_}}},
{{"Out", {pre_bias}}},
{{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
VariableHandle pre_act(new Variable("linear"));
get_global_tape().AddOp("elementwise_add",
{{"X", {pre_bias}}, {"Y", {b_}}},
{{"Out", {pre_act}}},
{{"axis", 1}});
VariableHandle post_act(new Variable("linear"));
get_global_tape().AddOp(act_,
{{"X", {pre_act}}},
{{"Out", {post_act}}},
{});
return post_act;
}
std::vector<VariableHandle> Params() { return {w_, b_}; }
private:
VariableHandle w_;
VariableHandle b_;
std::string act_;
};
```
## User API
```c++
// Model function
paddle::tape::Linear linear1(3, 3, "relu"); // init weight and bias
paddle::tape::Linear linear2(3, 3, "relu"); // init weight and bias
paddle::tape::Mean mean;
// Optimizer
paddle::tape::SGD sgd(0.001);
// Data Feeder
paddle::tape::Fill data_feeder(...);
VariableHandle input(new paddle::tape::Variable("input"));
VariableHandle label(new paddle::tape::Variable("label"));
for (int i = 0; i < 2; ++i) {
reset_global_tape();
data_feeder(input, label);
auto loss = softmax(linear2(linear1(input)), label); // compile time InferShape & InferVarType
LOG(INFO) << loss.value(); // Run forward up to loss
// Run backward, store gradient of w at w->Grad()
get_global_tape().Backward(loss);
// Update w
sgd(linear1.Params());
sgd(linear2.Params());
}
```
<details>
<summary></summary>
digraph G {
subgraph cluster_0 {
node [shape=record,style=filled];
style=filled;
color=lightgrey;
linear1 [label="{type: mul | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1}} | {output |<before_bias1> Out: before_bias1}}"];
elementwise_add1 [label="{type: elementwise_add | {input | {<before_bias1>X: before_bias1 |<bias1> Y: bias1}} | {output |<before_act1> Out: before_act1}}"];
relu1 [label="{type: relu | {input | {<before_act1>X: before_act1 }} | {output |<after_act1> Out: after_act1}}"];
linear1 -> elementwise_add1->relu1;
label = "forward tape";
}
linear1:before_mul1->before_mul1
linear1:weight1->weight1
linear1:before_bias1->before_bias1
elementwise_add1:bias1->bias1
elementwise_add1:before_bias1->before_bias1
elementwise_add1:before_act1->before_act1
relu1:before_act1->before_act1
relu1:after_act1->after_act1
subgraph cluster_1 {
node [shape=record,style=filled];
style=filled;
color=lightgrey;
linear1_grad [label="{type: mul_grad | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1|<before_bias1_grad> Out_grad: before_bias1_grad}} | {output |{<before_mul1_grad>X_grad: before_mul1_grad |<weight1_grad> Y_grad: weight1_grad}}}"];
elementwise_add1_grad [label="{type: elementwise_add_grad | {input | <before_act1_grad> Out_grad: before_act1_grad} | {output |{<before_bias1_grad>X_grad: before_bias1_grad |<bias1_grad> Y_grad: bias1_grad}}}"];
relu1_grad [label="{type: relu_grad | {input |<after_act1_grad> Out_grad: after_act1_grad} | {output | {<before_act1_grad>X_grad: before_act1_grad }}}"];
linear1_grad -> elementwise_add1_grad ->relu1_grad [dir=back];
label = "backward tape";
}
relu1_grad:after_act1_grad->after_act1_grad
relu1_grad:before_act1_grad->before_act1_grad
elementwise_add1_grad:before_act1_grad->before_act1_grad
elementwise_add1_grad:before_bias1_grad->before_bias1_grad
elementwise_add1_grad:bias1_grad->bias1_grad
linear1_grad:before_mul1->before_mul1
linear1_grad:weight1->weight1
linear1_grad:before_bias1_grad->before_bias1_grad
linear1_grad:before_mul1_grad->before_mul1_grad
linear1_grad:weight1_grad->weight1_grad
subgraph cluster_2 {
node [shape=record];
label = "Linear1";
weight1
bias1
}
weight1 -> weight1_grad [ label="Grad()", style="dashed" ];
bias1 -> bias1_grad [ label="Grad()", style="dashed"];
}
</details>
![Image](https://github.com/tonyyang-svail/Paddle/blob/cpp_tap/paddle/contrib/tape/computation_graph.png)
## Code Reuse
We want to stay close to Paddle Fluid as much as possible.
### Reuse All Operators
As all Ops are registered at `OpInfoMap`, the effort of adding a new `Function`
is about 10 lines of code, similar to exposing an operator to Python.
### Reuse Compile Time InferShape and InferVarType
Note that all the symbolic information is stored at `tape::Variable::desc_`, instead
of `ProgramDesc.block.vars`, we create a temporary `BlockDesc` to do `InferShape` and
`InferVarType` every time we `AddOp` to the tape.
### Reuse Operator::Run
We use smart pointer, instead of `Scope`, to manage memory. So we create a temporary
`Scope` for every `Operator::Run()`.
## Possible Feature
### Release Memory on Backward
We can release memory aggressively. During backward, we can delete the OpHandle once
we have finished its backward. Since all variables are managed by smart pointers, the
memory is automatically released when its `ref_count` goes to 0.
### Kernel Fusion
As a symbolic representation of the Tape is constructed first before the actual
execution, it would be possible to perform graph optimization. One use case is kernel
fusion.
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/contrib/tape/tape.h"
#include "paddle/contrib/tape/variable.h"
#include "paddle/fluid/framework/type_defs.h"
namespace paddle {
namespace tape {
// Empty placeholder type; it declares no members and none of the functors
// defined below in this header derive from it.
class Function {};
// Functor that records one op of type `initializer` on the global tape,
// writing to the "Out" slot of the variable it is called with.
class Fill {
 public:
  // initializer: op type to record; attrs: attributes passed to that op.
  Fill(const std::string &initializer, const framework::AttributeMap &attrs)
      : initializer_(initializer), attrs_(attrs) {}

  // Appends the initializer op for `var` to the global tape. The op takes no
  // inputs, hence the empty input map.
  void operator()(VariableHandle var) {
    auto &tape = get_global_tape();
    tape.AddOp(initializer_, {}, {{"Out", {var}}}, attrs_);
  }

 private:
  const std::string initializer_;
  const framework::AttributeMap attrs_;
};
// Functor that records a "mean" op on the global tape.
class Mean {
 public:
  // Creates a fresh output variable named "mean", records the op reading
  // `var` as "X" and writing the new variable as "Out", and returns it.
  VariableHandle operator()(VariableHandle var) {
    VariableHandle result(new Variable("mean"));
    auto &tape = get_global_tape();
    tape.AddOp("mean", {{"X", {var}}}, {{"Out", {result}}}, {});
    return result;
  }
};
// Fully connected layer functor: mul by a weight, add a bias, then apply the
// activation op named by `act`.
class Linear {
 public:
  // Creates the weight (shape {in_dim, out_dim}) and bias (shape {out_dim})
  // and fills both with the constant 1.0 by running a private tape.
  Linear(int in_dim, int out_dim, const std::string &act)
      : w_(new Variable("LinearWeight")),
        b_(new Variable("LinearBias")),
        act_(act) {
    const std::string initializer = "fill_constant";

    // dtype and value are the same for both parameters; only the shape
    // differs between the two recorded ops.
    framework::AttributeMap attrs;
    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
    attrs["value"] = 1.0f;

    Tape init_tape;

    attrs["shape"] = std::vector<int>{in_dim, out_dim};
    init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);

    attrs["shape"] = std::vector<int>{out_dim};
    init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);

    init_tape.Forward();
  }

  // Records mul -> elementwise_add -> activation on the global tape and
  // returns the handle of the activation output.
  VariableHandle operator()(VariableHandle input) {
    auto &tape = get_global_tape();

    VariableHandle pre_bias(new Variable("linear"));
    tape.AddOp("mul",
               {{"X", {input}}, {"Y", {w_}}},
               {{"Out", {pre_bias}}},
               {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});

    VariableHandle pre_act(new Variable("linear"));
    tape.AddOp("elementwise_add",
               {{"X", {pre_bias}}, {"Y", {b_}}},
               {{"Out", {pre_act}}},
               {{"axis", 1}});

    VariableHandle post_act(new Variable("linear"));
    tape.AddOp(act_, {{"X", {pre_act}}}, {{"Out", {post_act}}}, {});

    return post_act;
  }

  // Returns the trainable parameters: weight first, then bias.
  std::vector<VariableHandle> Params() { return {w_, b_}; }

 private:
  VariableHandle w_;
  VariableHandle b_;
  std::string act_;
};
class SGD {
public:
SGD(float learning_rate) : learning_rate_(new Variable("sgd")) {
Tape init_tape;
std::string initializer = "fill_constant";
framework::AttributeMap attrs;
attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{1};
attrs["value"] = learning_rate;
init_tape.AddOp(initializer, {}, {{"Out", {learning_rate_}}}, attrs);
init_tape.Forward();
}
void operator()(VariableHandle input) {
PADDLE_ENFORCE(get_global_tape().HasBeenBackwarded(),
"optimization must happen after the backward");
Tape temp_tape;
temp_tape.AddOp("sgd",
{{"Param", {input}},
{"LearningRate", {learning_rate_}},
{"Grad", {input->Grad()}}},
{{"ParamOut", {input}}},
{});
temp_tape.Forward();
}
private:
VariableHandle learning_rate_;
};
}
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/contrib/tape/tape.h"
#include <list>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/dim.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
namespace tape {
// Returns true when `value` ends with the characters of `ending`; an empty
// `ending` always matches. Adapted from
// https://stackoverflow.com/questions/874134/find-if-string-ends-with-another-string-in-c
inline bool ends_with(std::string const &value, std::string const &ending) {
  const auto tail_len = ending.size();
  if (value.size() < tail_len) {
    return false;
  }
  // Compare only the last tail_len characters of value against ending.
  return value.compare(value.size() - tail_len, tail_len, ending) == 0;
}
// Streams a one-line summary of a VarDesc in the form
// name[type][data type]{dim,dim,...,} (every dimension is followed by a comma).
std::ostream &operator<<(std::ostream &os, const framework::VarDesc &var_desc) {
  os << var_desc.Name() << "[" << var_desc.GetType() << "]"
     << "[" << var_desc.GetDataType() << "]"
     << "{";
  const auto dims = var_desc.GetShape();
  for (const auto &dim : dims) {
    os << dim << ",";
  }
  os << "}";
  return os;
}
// Builds a log line for an op: its type followed by "slot:(var desc) " for
// every input variable and then every output variable. `attrs` is accepted
// for signature symmetry but is not printed.
std::string to_string(const std::string &type,
                      const VariableHandleMap &in_vars,
                      const VariableHandleMap &out_vars,
                      const framework::AttributeMap &attrs) {
  std::stringstream ss;
  ss << type << " ";

  // Same formatting for inputs and outputs.
  auto append_vars = [&ss](const VariableHandleMap &vars) {
    for (const auto &slot : vars) {
      for (const auto &var : slot.second) {
        ss << slot.first << ":(" << var->Desc() << ") ";
      }
    }
  };
  append_vars(in_vars);
  append_vars(out_vars);

  return ss.str();
}
// Builds an OpDesc for `type` whose input and output slots list the names of
// the given variable handles, carrying `attrs` unchanged.
framework::OpDesc CreateOpDesc(const std::string &type,
                               const VariableHandleMap &in_vars,
                               const VariableHandleMap &out_vars,
                               const framework::AttributeMap &attrs) {
  // Converts slot -> handles into slot -> variable names. A slot key is only
  // created once it has at least one variable.
  auto collect_names =
      [](const VariableHandleMap &vars) -> framework::VariableNameMap {
    framework::VariableNameMap names;
    for (const auto &slot : vars) {
      for (const auto &var : slot.second) {
        names[slot.first].emplace_back(var->Name());
      }
    }
    return names;
  };

  const framework::VariableNameMap inputs = collect_names(in_vars);
  const framework::VariableNameMap outputs = collect_names(out_vars);
  return framework::OpDesc(type, inputs, outputs, attrs);
}
// Runs InferShape and InferVarType for one op and writes the inferred
// descriptions back into the output variables' descs.
//
// The variables' descs are copied into a throw-away block of a temporary
// ProgramDesc, inference runs against that block, and the resulting output
// descs are copied back.
void InferShapeAndVarType(const std::string &type,
                          const VariableHandleMap &in_vars,
                          VariableHandleMap *out_vars,
                          const framework::AttributeMap &attrs) {
  framework::OpDesc op_desc = CreateOpDesc(type, in_vars, *out_vars, attrs);

  // Temporary block used only for this inference pass.
  framework::ProgramDesc program_desc;
  framework::BlockDesc *block_desc = program_desc.MutableBlock(0);
  PADDLE_ENFORCE(block_desc);

  // Mirror every variable's desc into the temporary block.
  auto copy_into_block = [block_desc](const VariableHandleMap &vars) {
    for (const auto &slot : vars) {
      for (const auto &var : slot.second) {
        *block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto();
      }
    }
  };
  copy_into_block(in_vars);
  copy_into_block(*out_vars);

  LOG(INFO) << "- " << to_string(type, in_vars, *out_vars, attrs);
  op_desc.InferShape(*block_desc);
  op_desc.InferVarType(block_desc);

  // Copy the inferred output descs back to the handles.
  for (auto &slot : *out_vars) {
    for (auto &var : slot.second) {
      *var->MutableDesc()->Proto() = *block_desc->Var(var->Name())->Proto();
    }
  }
  LOG(INFO) << "+ " << to_string(type, in_vars, *out_vars, attrs);
}
// Records one op on the tape without executing it.
// Shape and var-type inference runs first, updating the descs of the output
// variables; the op (type, inputs, outputs, attrs) is then appended to tape_.
// out_vars is taken by value because InferShapeAndVarType needs a mutable map.
void Tape::AddOp(const std::string &type,
const VariableHandleMap &in_vars,
VariableHandleMap out_vars,
const framework::AttributeMap &attrs) {
InferShapeAndVarType(type, in_vars, &out_vars, attrs);
tape_.emplace_back(type, in_vars, out_vars, attrs);
}
// Temporary Scope for Operator::Run()
class ScopeWrapper : public framework::Scope {
public:
ScopeWrapper(const VariableHandleMap &in_vars,
const VariableHandleMap &out_vars) {
for (auto &v : in_vars) {
for (auto &vv : v.second) {
if (!vars_.count(vv->Name())) {
vars_[vv->Name()].reset(vv->Var());
}
}
}
for (auto &v : out_vars) {
for (auto &vv : v.second) {
if (!vars_.count(vv->Name())) {
vars_[vv->Name()].reset(vv->Var());
}
}
}
}
~ScopeWrapper() {
for (auto &pair : vars_) {
pair.second.release();
}
}
};
// Executes every op recorded since the last Forward() call, in order, on the
// CPU. Must not be called after the tape has run backward.
void Tape::Forward() {
  LOG(INFO) << "Starting forward -------------------------";
  PADDLE_ENFORCE(!has_been_backwarded_);

  // current_position_ persists across calls, so ops already run are skipped.
  for (; current_position_ < tape_.size(); ++current_position_) {
    OpHandle &op = tape_[current_position_];

    // Create Output Tensor, this is only necessary for OpWithKernel
    for (auto &slot : op.outputs_) {
      for (auto &out_var : slot.second) {
        out_var->InitializeVariable();
      }
    }

    framework::OpDesc op_desc =
        CreateOpDesc(op.type_, op.inputs_, op.outputs_, op.attrs_);
    // Scope exposing exactly this op's variables for the duration of Run().
    ScopeWrapper scope(op.inputs_, op.outputs_);
    framework::OpRegistry::CreateOp(op_desc)->Run(scope, platform::CPUPlace());
  }

  LOG(INFO) << "Finishing forward -------------------------";
}
// Runs the backward pass for `target`.
// Steps: (1) run any forward ops not yet executed; (2) build a fresh backward
// tape whose first op fills target->Grad() with the constant 1.0 (FP32,
// shape {1}); (3) walk the forward tape in reverse and append the gradient
// ops produced by each op's registered GradOpMaker; (4) execute the backward
// tape and mark this tape as backwarded. May only be called once per tape.
void Tape::Backward(VariableHandle target) {
PADDLE_ENFORCE(!has_been_backwarded_);
Forward();
// TODO(tonyyang-svail): check output of last op is target
backward_tape_.reset(new Tape());
framework::AttributeMap attrs;
// FIXME(tonyyang-svail): Need to infer_data_type
attrs["dtype"] = framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{1};
attrs["value"] = 1.0f;
// Seed op: gradient of the target with respect to itself.
backward_tape_->AddOp(
"fill_constant", {}, {{"Out", {target->Grad()}}}, attrs);
// Visit forward ops from last to first.
for (auto it = tape_.rbegin(); it != tape_.rend(); ++it) {
framework::OpDesc op_desc =
CreateOpDesc(it->type_, it->inputs_, it->outputs_, it->attrs_);
std::unordered_map<std::string, std::string> grad_to_var;
// Ask the op's registered grad-op maker for the gradient op descriptions.
std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
framework::OpInfoMap::Instance()
.Get(op_desc.Type())
.GradOpMaker()(op_desc, {}, &grad_to_var, {});
// NOTE: this loop variable shadows the forward op_desc declared above.
for (auto &op_desc : grad_op_descs) {
// Name -> handle lookup for every input and output of the forward op.
std::unordered_map<std::string, VariableHandle> name2var;
for (auto &param2vars : it->inputs_) {
for (auto &a : param2vars.second) {
name2var[a->Name()] = a;
}
}
for (auto &param2vars : it->outputs_) {
for (auto &a : param2vars.second) {
name2var[a->Name()] = a;
}
}
VariableHandleMap in_vars;
VariableHandleMap out_vars;
// Pairs each name map of the grad op with the handle map to fill, so the
// same resolution loop handles inputs and outputs.
std::map<const framework::VariableNameMap *, VariableHandleMap *>
loop_over{{&op_desc->Inputs(), &in_vars},
{&op_desc->Outputs(), &out_vars}};
for (auto &each : loop_over) {
auto &vmp = *each.first;
auto &vhm = *each.second;
for (auto &p2a : vmp) {
for (auto &argu : p2a.second) {
if (name2var.count(argu)) {
// Argument is a variable of the forward op: reuse its handle.
vhm[p2a.first].push_back(name2var[argu]);
} else {
// Otherwise the name must be <forward var name> + kGradVarSuffix;
// strip the suffix and use that variable's Grad() handle.
PADDLE_ENFORCE(ends_with(argu, framework::kGradVarSuffix),
argu.c_str());
std::string name = argu.substr(
0, argu.size() - std::strlen(framework::kGradVarSuffix));
PADDLE_ENFORCE(name2var.count(name), name.c_str());
vhm[p2a.first].push_back(name2var[name]->Grad());
}
}
}
}
backward_tape_->AddOp(
op_desc->Type(), in_vars, out_vars, op_desc->GetAttrMap());
}
// TODO(tonyyang-svail): how to fill empty grad?
// TODO(tonyyang-svail): Sum var grad is necessary
}
backward_tape_->Forward();
has_been_backwarded_ = true;
}
// Returns the tape shared by the whole process. It is a function-local static,
// so it is constructed on the first call.
Tape &get_global_tape() {
  static Tape global_tape;
  return global_tape;
}
void reset_global_tape() { get_global_tape() = Tape(); }
}
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/contrib/tape/variable.h"
namespace paddle {
namespace tape {
using VariableHandleMap = std::map<std::string, std::vector<VariableHandle>>;
// One recorded operator: its type name, the handles bound to each input and
// output parameter, and its attributes. All four are copied on construction.
struct OpHandle {
  std::string type_;           // operator type name
  VariableHandleMap inputs_;   // parameter name -> input handles
  VariableHandleMap outputs_;  // parameter name -> output handles
  framework::AttributeMap attrs_;

  OpHandle(const std::string &type,
           const VariableHandleMap &in_vars,
           const VariableHandleMap &out_vars,
           const framework::AttributeMap &attrs)
      : type_(type),
        inputs_(in_vars),
        outputs_(out_vars),
        attrs_(attrs) {}
};
// Records operators as they are added and replays them on demand.
class Tape {
 public:
  // Runs shape/var-type inference for the operator and appends it to the tape.
  void AddOp(const std::string &type,
             const VariableHandleMap &in_vars,
             VariableHandleMap out_vars,
             const framework::AttributeMap &attrs);

  // Runs every recorded operator that has not been executed yet.
  void Forward();

  // Runs the pending forward ops, then builds and runs the backward tape
  // starting from the gradient of `target`. May be called only once.
  void Backward(VariableHandle target);

  // True once Backward() has completed on this tape.
  bool HasBeenBackwarded() const { return has_been_backwarded_; }

 private:
  bool has_been_backwarded_ = false;
  // Index of the next operator in `tape_` that Forward() will execute.
  size_t current_position_ = 0;

  std::vector<OpHandle> tape_;
  // Created by Backward(); holds the gradient operators.
  std::shared_ptr<Tape> backward_tape_;
};
// Returns a reference to the single, lazily constructed global tape.
Tape &get_global_tape();
// Overwrites the global tape with a default-constructed one.
void reset_global_tape();
}
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "paddle/contrib/tape/function.h"
using namespace paddle::tape;
// Smoke test: builds a two-layer perceptron followed by a mean, and runs two
// forward/backward/SGD rounds on a constant-filled input. The test contains no
// value assertions; it passes when nothing throws.
TEST(Tape, TestMLP) {
  LOG(INFO) << "TestMLP";

  Linear linear1(3, 3, "relu");
  Linear linear2(3, 3, "relu");
  Mean mean;

  SGD sgd(0.001);

  // Every input is a 3x3 FP32 tensor filled with 1.0.
  std::string initializer = "fill_constant";
  paddle::framework::AttributeMap attrs;
  attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
  attrs["shape"] = std::vector<int>{3, 3};
  attrs["value"] = 1.0f;
  Fill filler(initializer, attrs);

  const int kRounds = 2;
  for (int round = 0; round < kRounds; ++round) {
    // Each round starts from an empty global tape.
    reset_global_tape();

    VariableHandle input(new Variable("input"));
    filler(input);

    auto loss = mean(linear2(linear1(input)));

    get_global_tape().Backward(loss);

    // Update the parameters of linear1 first, then those of linear2.
    for (Linear *layer : {&linear1, &linear2}) {
      for (auto param : layer->Params()) {
        sgd(param);
      }
    }
  }
}
int main(int argc, char** argv) {
std::vector<paddle::platform::Place> places;
places.emplace_back(paddle::platform::CPUPlace());
paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/contrib/tape/variable.h"
namespace paddle {
namespace tape {
void Variable::InitializeVariable() {
LOG(INFO) << "Initialzing " << desc_.Name() << " as " << desc_.GetType();
framework::proto::VarType::Type var_type = desc_.GetType();
if (var_type == framework::proto::VarType::LOD_TENSOR) {
var_.GetMutable<framework::LoDTensor>();
} else if (var_type == framework::proto::VarType::SELECTED_ROWS) {
var_.GetMutable<framework::SelectedRows>();
} else {
PADDLE_THROW("Variable type %d is not in [LOD_TENSOR, SELECTED_ROWS]",
var_type);
}
}
}
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/framework/operator.h" // framework::kGradVarSuffix
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace tape {
class Variable;
using VariableHandle = std::shared_ptr<Variable>;
/*
* Combination of
* framework::VarDesc desc_;
* framework::Variable var_;
*/
// A tape variable: a variable description paired with the runtime variable
// that stores its value, plus a weak link to its gradient variable.
class Variable {
 public:
  // Names the variable `pre_fix` followed by the next value of a counter
  // shared by all Variable instances.
  Variable(const std::string &pre_fix)
      : desc_(pre_fix + std::to_string(count())) {}

  // With `is_grad` set, names the variable `pre_fix` followed by the gradient
  // suffix (no counter is consumed); otherwise behaves like the one-argument
  // constructor.
  Variable(const std::string &pre_fix, bool is_grad)
      : desc_(pre_fix + (is_grad ? framework::kGradVarSuffix
                                 : std::to_string(count()))) {}

  ~Variable() { LOG(INFO) << "Deleting " << Name(); }

  // Instantiate LoDTensor/SelectedRow
  void InitializeVariable();

  // Returns the gradient variable of this variable, creating it when none is
  // alive. Only a weak reference is kept here, so the gradient lives only as
  // long as some caller holds the returned handle.
  VariableHandle Grad() {
    VariableHandle grad = grad_.lock();
    if (!grad) {
      grad.reset(new Variable(desc_.Name(), true));
      grad_ = grad;
    }
    return grad;
  }

  // Stochastic Gradient Descent with Momentum
  //  VariableHandle Momentum ();

  //  void init(const std::string& initializer,
  //            const framework::AttributeMap& attrs);

  // void value() {};

  const framework::VarDesc& Desc() const { return desc_; }
  framework::VarDesc* MutableDesc() { return &desc_; }

  // TODO(tonyyang-svail): No need to expose name
  std::string Name() const { return desc_.Name(); }

  // Non-owning pointer to the runtime variable stored inside this object.
  framework::Variable* Var() { return &var_; }

 private:
  // Returns the current counter value and increments it. The counter is a
  // function-local static and is not synchronised.
  int count() {
    static int counter = 0;
    return counter++;
  }

  framework::VarDesc desc_;
  framework::Variable var_;

  std::weak_ptr<Variable> grad_;
};
}
}
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/piece.h" #include "paddle/fluid/string/piece.h"
...@@ -113,6 +114,9 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { ...@@ -113,6 +114,9 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
} }
places.emplace_back(platform::CPUPlace()); places.emplace_back(platform::CPUPlace());
platform::DeviceContextPool::Init(places); platform::DeviceContextPool::Init(places);
#ifndef PADDLE_WITH_MKLDNN
operators::math::SetNumThreads(1);
#endif
} }
void InitGLOG(const std::string &prog_name) { void InitGLOG(const std::string &prog_name) {
......
...@@ -98,6 +98,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { ...@@ -98,6 +98,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
} }
void OperatorBase::Run(const Scope& scope, const platform::Place& place) { void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
VLOG(10) << "- " << DebugStringEx(&scope);
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot run operator on place %s", place); PADDLE_THROW("Cannot run operator on place %s", place);
...@@ -107,6 +108,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -107,6 +108,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#endif #endif
} }
RunImpl(scope, place); RunImpl(scope, place);
VLOG(10) << "+ " << DebugStringEx(&scope);
} }
bool OperatorBase::HasInputs(const std::string& name) const { bool OperatorBase::HasInputs(const std::string& name) const {
......
...@@ -145,9 +145,9 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -145,9 +145,9 @@ void ParallelExecutor::BCastParamsToGPUs(
auto &dims = main_tensor.dims(); auto &dims = main_tensor.dims();
if (paddle::platform::is_gpu_place(main_tensor.place())) { if (paddle::platform::is_gpu_place(main_tensor.place())) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
std::vector<void *> buffers;
size_t numel = main_tensor.numel(); size_t numel = main_tensor.numel();
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
platform::NCCLGroupGuard guard;
for (size_t i = 0; i < member_->places_.size(); ++i) { for (size_t i = 0; i < member_->places_.size(); ++i) {
auto place = member_->places_[i]; auto place = member_->places_[i];
void *buffer; void *buffer;
...@@ -159,11 +159,21 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -159,11 +159,21 @@ void ParallelExecutor::BCastParamsToGPUs(
t->Resize(dims); t->Resize(dims);
buffer = t->mutable_data(place, main_tensor.type()); buffer = t->mutable_data(place, main_tensor.type());
} }
auto &nccl_ctx = member_->nccl_ctxs_->at(place); buffers.push_back(buffer);
platform::dynload::ncclBcast(buffer, numel, data_type, 0, }
PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
"variables' buffer size to bcast NOT equal to places");
{
platform::NCCLGroupGuard guard;
for (size_t i = 0; i < member_->places_.size(); ++i) {
auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
nccl_ctx.comm_, nccl_ctx.stream()); nccl_ctx.comm_, nccl_ctx.stream());
} }
member_->nccl_ctxs_->WaitAll(); member_->nccl_ctxs_->WaitAll();
}
#else #else
PADDLE_THROW("Not compiled with CUDA"); PADDLE_THROW("Not compiled with CUDA");
#endif #endif
......
...@@ -81,6 +81,9 @@ class Scope { ...@@ -81,6 +81,9 @@ class Scope {
// Rename variable to a new name and return the new name // Rename variable to a new name and return the new name
std::string Rename(const std::string& origin_name) const; std::string Rename(const std::string& origin_name) const;
protected:
mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
private: private:
// Call Scope::NewScope for a sub-scope. // Call Scope::NewScope for a sub-scope.
explicit Scope(Scope const* parent) : parent_(parent) {} explicit Scope(Scope const* parent) : parent_(parent) {}
...@@ -93,8 +96,6 @@ class Scope { ...@@ -93,8 +96,6 @@ class Scope {
// Caller doesn't own the returned Variable. // Caller doesn't own the returned Variable.
Variable* FindVarLocally(const std::string& name) const; Variable* FindVarLocally(const std::string& name) const;
mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
// Scope in `kids_` are owned by this class. // Scope in `kids_` are owned by this class.
mutable std::list<Scope*> kids_; mutable std::list<Scope*> kids_;
Scope const* parent_{nullptr}; Scope const* parent_{nullptr};
......
...@@ -20,16 +20,20 @@ limitations under the License. */ ...@@ -20,16 +20,20 @@ limitations under the License. */
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/pybind/pybind.h"
DEFINE_string(devices, "", "The devices to be used which is joined by comma."); DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
DEFINE_bool(init_p2p, false, "Whether to init p2p."); DEFINE_bool(init_p2p, false, "Whether to init p2p.");
DEFINE_int32(math_num_threads, 1,
"Number of threads used to run math functions.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
void Init(const std::vector<std::string> argv) { void Init(const std::vector<std::string> argv) {
framework::InitGflags(argv); framework::InitGflags(argv);
operators::math::SetNumThreads(FLAGS_math_num_threads);
// init devices // init devices
std::vector<int> devices; std::vector<int> devices;
std::string token; std::string token;
......
...@@ -271,18 +271,18 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -271,18 +271,18 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() override { void Make() override {
AddInput("X", "Input of HardShrink operator"); AddInput("X", "Input of HardShrink operator");
AddOutput("Out", "Output of HardShrink operator"); AddOutput("Out", "Output of HardShrink operator");
AddAttr<float>("threshold", "The value of threshold for HardShrink") AddAttr<float>("threshold",
"The value of threshold for HardShrink. [default: 0.5]")
.SetDefault(0.5f); .SetDefault(0.5f);
AddComment(R"DOC( AddComment(R"DOC(
HardShrink Activation Operator. :strong:`HardShrink activation operator`
$$ .. math::
out = \begin{cases} out = \begin{cases}
x, \text{if } x > \lambda \\ x, \text{if } x > \lambda \\
x, \text{if } x < -\lambda \\ x, \text{if } x < -\lambda \\
0, \text{otherwise} 0, \text{otherwise}
\end{cases} \end{cases}
$$
)DOC"); )DOC");
} }
...@@ -394,18 +394,18 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -394,18 +394,18 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() override { void Make() override {
AddInput("X", "Input of ThresholdedRelu operator"); AddInput("X", "Input of ThresholdedRelu operator");
AddOutput("Out", "Output of ThresholdedRelu operator"); AddOutput("Out", "Output of ThresholdedRelu operator");
AddAttr<float>("threshold", "The threshold location of activation") AddAttr<float>("threshold",
"The threshold location of activation. [default 1.0].")
.SetDefault(1.0f); .SetDefault(1.0f);
AddComment(R"DOC( AddComment(R"DOC(
ThresholdedRelu Activation Operator. :strong:`ThresholdedRelu activation operator`
$$ .. math::
out = \begin{cases}
out = \begin{cases}
x, \text{if } x > threshold \\ x, \text{if } x > threshold \\
0, \text{otherwise} 0, \text{otherwise}
\end{cases} \end{cases}
$$
)DOC"); )DOC");
} }
}; };
......
...@@ -23,30 +23,26 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -23,30 +23,26 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
OpComment comment; OpComment comment;
AddInput("X", AddInput("X", string::Sprintf("the left hand operand of %s operator",
string::Sprintf("(LoDTensor) the left hand operand of %s operator",
comment.type)); comment.type));
AddInput("Y", string::Sprintf( AddInput("Y", string::Sprintf("the right hand operand of %s operator",
"(LoDTensor) the right hand operand of %s operator",
comment.type)); comment.type));
AddAttr<bool>("force_cpu", AddAttr<bool>("force_cpu",
"(bool, default false) Force fill output variable to cpu " "Force fill output variable to cpu "
"memory. Otherwise, fill output variable to the running " "memory. Otherwise, fill output variable to the running "
"device") "device [default true].")
.SetDefault(false); .SetDefault(true);
AddOutput("Out", string::Sprintf( AddOutput("Out", string::Sprintf("n-dim bool tensor. Each element is %s",
"(LoDTensor) n-dim bool tensor. Each element is %s",
comment.equation)); comment.equation));
AddComment(string::Sprintf(R"DOC(%s Operator AddComment(string::Sprintf(R"DOC(
It operates element-wise on X and Y, and returns the Out. Each of them is a It operates element-wise on X and Y, and returns the Out. Each of them is a
N-dim tensor. X and Y could be any type. The each element of the Out tensor is N-dim tensor. X and Y could be any type. The each element of the Out tensor is
calculated by %s calculated by $%s$
)DOC", )DOC",
comment.type, comment.equation)); comment.equation));
AddAttr<int>("axis", AddAttr<int>(
"(int, default -1). The start dimension index " "axis",
"for broadcasting Y onto X.") "The start dimension index for broadcasting Y onto X. [default -1]")
.SetDefault(-1) .SetDefault(-1)
.EqualGreaterThan(-1); .EqualGreaterThan(-1);
} }
......
...@@ -107,7 +107,13 @@ REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker, ...@@ -107,7 +107,13 @@ REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
false> /* set false to disable empty grad */); false> /* set false to disable empty grad */);
REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad); REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>); concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, double>,
ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>,
ops::ConcatKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::ConcatKernel<paddle::platform::CPUDeviceContext, int>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
concat_grad, concat_grad,
ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>); ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, int>);
...@@ -15,7 +15,13 @@ limitations under the License. */ ...@@ -15,7 +15,13 @@ limitations under the License. */
#include "paddle/fluid/operators/concat_op.h" #include "paddle/fluid/operators/concat_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
concat, ops::ConcatKernel<paddle::platform::CUDADeviceContext, float>); concat, ops::ConcatKernel<paddle::platform::CUDADeviceContext, double>,
ops::ConcatKernel<paddle::platform::CUDADeviceContext, float>,
ops::ConcatKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::ConcatKernel<paddle::platform::CUDADeviceContext, int>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
concat_grad, concat_grad,
ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>); ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int>);
...@@ -30,19 +30,19 @@ class CumOp : public framework::OperatorWithKernel { ...@@ -30,19 +30,19 @@ class CumOp : public framework::OperatorWithKernel {
class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("X", "Input of Cumsum operator"); AddInput("X", "Input of cumsum operator");
AddOutput("Out", "Output of Cumsum operator"); AddOutput("Out", "Output of cumsum operator");
AddAttr<int>("axis", AddAttr<int>("axis",
"(int, default -1). The dimenstion to accumulate along. " "The dimenstion to accumulate along. -1 means the last "
"-1 means the last dimenstion") "dimenstion [default -1].")
.SetDefault(-1) .SetDefault(-1)
.EqualGreaterThan(-1); .EqualGreaterThan(-1);
AddAttr<bool>("exclusive", AddAttr<bool>("exclusive",
"bool, default false). Whether to perform exclusive cumsum") "Whether to perform exclusive cumsum. [default false].")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("reverse", AddAttr<bool>("reverse",
"bool, default false). If true, the cumsum is performed in " "If true, the cumsum is performed in the reversed direction. "
"the reversed direction") "[default false].")
.SetDefault(false); .SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
The cumulative sum of the elements along a given axis. The cumulative sum of the elements along a given axis.
......
...@@ -169,7 +169,8 @@ class RequestPrefetch final : public RequestBase { ...@@ -169,7 +169,8 @@ class RequestPrefetch final : public RequestBase {
auto scope = request_->GetMutableLocalScope(); auto scope = request_->GetMutableLocalScope();
auto invar = scope->FindVar(in_var_name); auto invar = scope->FindVar(in_var_name);
framework::Variable* outvar = scope->FindVar(out_var_name); // out var must be created in local scope!
framework::Variable* outvar = scope->Var(out_var_name);
request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name); request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name);
......
...@@ -85,7 +85,7 @@ class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -85,7 +85,7 @@ class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker {
.InEnum({"CUDA", "CPU", "AUTO"}) .InEnum({"CUDA", "CPU", "AUTO"})
.SetDefault("AUTO"); .SetDefault("AUTO");
AddComment(R"DOC( AddComment(R"DOC(
Returns a list of places based on flags. The list will be used for parallel Returns a list of places based on arguments. The list will be used for parallel
execution. execution.
)DOC"); )DOC");
} }
......
...@@ -62,36 +62,33 @@ class LayerNormOp : public framework::OperatorWithKernel { ...@@ -62,36 +62,33 @@ class LayerNormOp : public framework::OperatorWithKernel {
class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("X", "(LoDTensor) The input tensor."); AddInput("X", "The input tensor.");
AddInput("Scale", AddInput("Scale",
"(Tensor, optional) Scale is a 1-dimensional tensor of size " "(optional) Scale is a 1-dimensional tensor of size "
"H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])." "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
"It is applied to the output.") "It is applied to the output.")
.AsDispensable(); .AsDispensable();
AddInput("Bias", AddInput("Bias",
"(Tensor, optional) Bias is a 1-dimensional tensor of size " "(optional) Bias is a 1-dimensional tensor of size "
"H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])." "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
"It is applied to the output.") "It is applied to the output.")
.AsDispensable(); .AsDispensable();
AddOutput("Y", "(LoDTensor) Result after normalization."); AddOutput("Y", "Result after normalization.");
AddOutput("Mean", "(Tensor) Mean of the current mini batch.") AddOutput("Mean", "Mean of the current mini batch.").AsIntermediate();
.AsIntermediate(); AddOutput("Variance", "Variance of the current mini batch.")
AddOutput("Variance", "(Tensor) Variance of the current mini batch.")
.AsIntermediate(); .AsIntermediate();
AddAttr<float>("epsilon", AddAttr<float>("epsilon",
"(float, default 1e-5) Constant for " "Constant for numerical stability [default 1e-5].")
"numerical stability")
.SetDefault(1e-5) .SetDefault(1e-5)
.AddCustomChecker([](const float &epsilon) { .AddCustomChecker([](const float &epsilon) {
PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
"'epsilon' should be between 0.0 and 0.001."); "'epsilon' should be between 0.0 and 0.001.");
}); });
AddAttr<int>("begin_norm_axis", AddAttr<int>("begin_norm_axis",
"(int default:1), the " "the axis of `begin_norm_axis ... Rank(X) - 1` will be "
"axis of `begin_norm_axis ... Rank(X) - 1` will be "
"normalized. `begin_norm_axis` splits the tensor(`X`) to a " "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
"matrix [N,H].") "matrix [N,H]. [default 1].")
.SetDefault(1) .SetDefault(1)
.AddCustomChecker([](const int &begin_norm_axis) { .AddCustomChecker([](const int &begin_norm_axis) {
PADDLE_ENFORCE_GT(begin_norm_axis, 0, PADDLE_ENFORCE_GT(begin_norm_axis, 0,
...@@ -99,10 +96,14 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -99,10 +96,14 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
}); });
AddComment(R"DOC( AddComment(R"DOC(
Layer Normalization. Assume feature vectors exist on dimensions
Layer Norm has been implemented as discussed in the paper: :attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
https://arxiv.org/abs/1607.06450 along these dimensions for each feature vector :math:`a` with size
... :math:`H`, then normalize each feature vector using the corresponding
statistics. After that, apply learnable gain and bias on the normalized
tensor to scale and shift if :attr:`scale` and :attr:`shift` are set.
Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
)DOC"); )DOC");
} }
}; };
......
...@@ -20,13 +20,16 @@ ...@@ -20,13 +20,16 @@
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#include <mkl_cblas.h> #include <mkl_cblas.h>
#include <mkl_lapacke.h> #include <mkl_lapacke.h>
#include <mkl_service.h>
#include <mkl_vml_functions.h> #include <mkl_vml_functions.h>
#endif #endif
#ifdef PADDLE_USE_OPENBLAS #ifdef PADDLE_USE_OPENBLAS
#include <cblas.h> #include <cblas.h>
#ifdef LAPACK_FOUND
#include <lapacke.h> #include <lapacke.h>
#endif #endif
#endif
#ifndef LAPACK_FOUND #ifndef LAPACK_FOUND
extern "C" { extern "C" {
...@@ -46,6 +49,18 @@ namespace paddle { ...@@ -46,6 +49,18 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
// Sets the number of threads used by the BLAS backend this build was compiled
// with. Values below 1 are clamped to 1.
//   - PADDLE_USE_OPENBLAS: forwards to openblas_set_num_threads.
//   - PADDLE_WITH_MKLML:   forwards to mkl_set_num_threads.
//   - otherwise:           fails the enforce check with "To be implemented.".
// NOTE(review): being `static` in a header, every translation unit that
// includes this file gets its own copy of the function.
static void SetNumThreads(int num_threads) {
#ifdef PADDLE_USE_OPENBLAS
int real_num_threads = num_threads > 1 ? num_threads : 1;
openblas_set_num_threads(real_num_threads);
#elif defined(PADDLE_WITH_MKLML)
int real_num_threads = num_threads > 1 ? num_threads : 1;
mkl_set_num_threads(real_num_threads);
#else
// No supported BLAS backend was configured: always fails.
PADDLE_ENFORCE(false, "To be implemented.");
#endif
}
/** /**
* Matrix Descriptor of a memory buffer. * Matrix Descriptor of a memory buffer.
* *
......
...@@ -21,8 +21,10 @@ limitations under the License. */ ...@@ -21,8 +21,10 @@ limitations under the License. */
#ifdef PADDLE_USE_OPENBLAS #ifdef PADDLE_USE_OPENBLAS
#include <cblas.h> #include <cblas.h>
#ifdef LAPACK_FOUND
#include <lapacke.h> #include <lapacke.h>
#endif #endif
#endif
#ifndef LAPACK_FOUND #ifndef LAPACK_FOUND
extern "C" { extern "C" {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mean_iou_op.h"
namespace paddle {
namespace operators {
// Operator definition for mean_iou: validates that the required inputs and
// outputs are present and fixes the output shapes.
class MeanIoUOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Requires inputs Predictions and Labels and outputs OutMeanIou, OutWrong
  // and OutCorrect. OutMeanIou gets shape {1}; OutWrong and OutCorrect get
  // shape {num_classes}, taken from the `num_classes` attribute.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Predictions"),
                   "Input (Predictions) of MeanIoU op should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Labels"),
                   "Input (Labels) of MeanIoU op should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("OutMeanIou"),
                   "Output (OutMeanIou) of MeanIoU op should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("OutWrong"),
                   "Output (OutWrong) of MeanIoU op should not be null.");
    // The message names the output that is actually being checked.
    PADDLE_ENFORCE(ctx->HasOutput("OutCorrect"),
                   "Output (OutCorrect) of MeanIoU op should not be null.");

    int64_t num_classes =
        static_cast<int64_t>(ctx->Attrs().Get<int>("num_classes"));

    ctx->SetOutputDim("OutMeanIou", {1});
    ctx->SetOutputDim("OutWrong", {num_classes});
    ctx->SetOutputDim("OutCorrect", {num_classes});
  }

 protected:
  // The kernel is selected by the data type of the Predictions tensor and the
  // place the operator runs on.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Predictions")->type()),
        ctx.GetPlace());
  }
};
// Declares the inputs, outputs, attribute and user-facing documentation of
// the mean_iou operator.
class MeanIoUOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Predictions",
             "(Tensor), A Tensor of prediction results for semantic labels"
             " with type int32 or int64. The rank should be greater than 1.");
    // The first literal ends with a space so the two sentences do not run
    // together after concatenation.
    AddInput(
        "Labels",
        "(Tensor), A Tensor of ground truth labels with type int32 or int64. "
        "Its shape should be the same as Input(Predictions).");
    // The three In* inputs are optional lists (duplicable and dispensable).
    AddInput("InWrongs",
             "(vector<Tensor>), A list of Tensor with shape "
             "[num_classes]. They are used to collect wrong number among "
             "batches. Empty list is also valid here.")
        .AsDuplicable()
        .AsDispensable();
    AddInput(
        "InCorrects",
        "(vector<Tensor>), A list of Tensor with shape "
        "[num_classes]. They are used to collect correct number among batches. "
        "Empty list is also valid here.")
        .AsDuplicable()
        .AsDispensable();
    AddInput("InMeanIou",
             "(vector<Tensor>), A list of Tensor that Output(mean_iou) should "
             "be added to. Empty list is also valid here.")
        .AsDuplicable()
        .AsDispensable();
    // OutMeanIou is a single, non-duplicable output, so it is documented as a
    // Tensor rather than a vector of Tensors.
    AddOutput("OutMeanIou",
              "(Tensor), A Tensor representing the"
              " mean intersection-over-union with shape [1].");
    AddOutput("OutWrong", "(Tensor), A Tensor with shape [num_classes]. ");
    AddOutput("OutCorrect", "(Tensor), A Tensor with shape [num_classes]. ");
    AddAttr<int>("num_classes", "(int), The possible number of labels.");

    AddComment(R"DOC(
mean-IOU Operator.
Mean Intersection-Over-Union is a common evaluation metric for
semantic image segmentation, which first computes the IOU for each
semantic class and then computes the average over classes.
IOU is defined as follows:
IOU = true_positive / (true_positive + false_positive + false_negative).
It is based on pixel level area while "IOU Similarity Operator"
is based on area of rectangle.
)DOC");
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Register the mean_iou operator with its proto maker. EmptyGradOpMaker is
// supplied as the grad-op maker (presumably meaning the op has no gradient —
// confirm against the framework).
REGISTER_OPERATOR(mean_iou, ops::MeanIoUOp, ops::MeanIoUOpMaker,
paddle::framework::EmptyGradOpMaker);
// Register the CPU kernels for the supported integer element types.
// NOTE(review): int and int32_t are presumably the same type on the supported
// platforms, which would make one of the first two entries redundant — confirm.
REGISTER_OP_CPU_KERNEL(mean_iou, ops::MeanIoUKernel<int>,
ops::MeanIoUKernel<int32_t>,
ops::MeanIoUKernel<int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/mean_iou_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
// Loop header for use inside CUDA kernels. Each thread starts at its global
// 1-D index (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total
// number of launched threads (blockDim.x * gridDim.x) while the index is
// below n. `i` names the int loop variable that the macro declares.
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
// Accumulates per-class "wrong" and "correct" counts for `count` samples.
//
// Each block first tallies into dynamically sized shared memory laid out as
// [wrong counts (num_classes ints) | correct counts (num_classes ints)], so
// the launch must provide at least 2 * num_classes ints of shared memory.
// The per-block tallies are then added atomically to the global `wrong` and
// `correct` arrays.
//
// A sample whose prediction equals its label increments correct[pred];
// otherwise both wrong[pred] and wrong[label] are incremented. Prediction and
// label values are used directly as indices without any range check.
template <typename T>
__global__ void CountCUDAKernel(const int num_classes, const int count,
                                const T* predictions, const T* labels,
                                int* wrong, int* correct) {
  extern __shared__ int block_cache[];
  int* const block_wrong = block_cache;
  int* const block_correct = block_cache + num_classes;

  // Zero both halves of the shared tally before any thread counts into it.
  const int cache_len = num_classes * 2;
  for (int slot = threadIdx.x; slot < cache_len; slot += blockDim.x) {
    block_cache[slot] = 0;
  }
  __syncthreads();

  CUDA_1D_KERNEL_LOOP(idx, count) {
    const T pred = predictions[idx];
    const T label = labels[idx];
    if (pred != label) {
      atomicAdd(block_wrong + pred, 1);
      atomicAdd(block_wrong + label, 1);
    } else {
      atomicAdd(block_correct + pred, 1);
    }
  }
  // All threads of the block must finish counting before the flush below.
  __syncthreads();

  // Flush this block's tallies into the global accumulators.
  for (int cls = threadIdx.x; cls < num_classes; cls += blockDim.x) {
    atomicAdd(wrong + cls, block_wrong[cls]);
    atomicAdd(correct + cls, block_correct[cls]);
  }
}
// Turns per-class wrong/correct counts into a mean IoU.
//
// For every class, ious[c] = correct[c] / (wrong[c] + correct[c]) when the
// denominator is positive, otherwise 0. Classes with a positive denominator
// are counted in a shared-memory counter; thread 0 then sums ious[] and adds
// sum / valid_class_count to iou[0].
//
// The valid-class counter lives in shared memory, i.e. it is per block; the
// caller visible in this file launches the kernel with a single block.
// If no class is valid the final division has a zero divisor.
__global__ void ComputeIoUCUDAKernel(const int num_classes, int* wrong,
                                     int* correct, float* ious, float* iou) {
  __shared__ int num_valid;
  if (threadIdx.x == 0) {
    num_valid = 0;
  }
  __syncthreads();

  CUDA_1D_KERNEL_LOOP(cls, num_classes) {
    const int n_wrong = wrong[cls];
    const int n_correct = correct[cls];
    const int total = n_wrong + n_correct;
    if (total <= 0) {
      ious[cls] = 0;
    } else {
      atomicAdd(&num_valid, 1);
      ious[cls] = static_cast<float>(n_correct) / total;
    }
  }
  // Make every per-class IoU and the valid counter visible to thread 0.
  __syncthreads();

  if (threadIdx.x != 0) {
    return;
  }
  float total_iou = 0;
  for (int cls = 0; cls < num_classes; ++cls) {
    total_iou += ious[cls];
  }
  iou[0] += total_iou / num_valid;
}
// CUDA implementation of the mean_iou operator for element type T.
//
// Compute() zeroes OutWrong, OutCorrect and OutMeanIou, adds every tensor of
// the optional InWrongs / InCorrects / InMeanIou inputs onto them, counts the
// wrong/correct samples of the current batch with CountCUDAKernel, and
// finally lets ComputeIoUCUDAKernel add the resulting mean IoU to OutMeanIou.
template <typename T>
class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& place = *ctx.template device_context<platform::CUDADeviceContext>()
                       .eigen_device();
    // get input and output tensor
    auto* predictions = ctx.Input<Tensor>("Predictions");
    auto* labels = ctx.Input<Tensor>("Labels");
    auto* out_mean_iou = ctx.Output<Tensor>("OutMeanIou");
    auto* out_wrong = ctx.Output<Tensor>("OutWrong");
    auto* out_correct = ctx.Output<Tensor>("OutCorrect");
    int num_classes = static_cast<int>(ctx.Attr<int>("num_classes"));

    // Get data ptr
    const T* predictions_data = predictions->data<T>();
    const T* labels_data = labels->data<T>();
    int* out_wrong_data = out_wrong->mutable_data<int>(ctx.GetPlace());
    int* out_correct_data = out_correct->mutable_data<int>(ctx.GetPlace());
    float* out_mean_iou_data =
        out_mean_iou->mutable_data<float>(ctx.GetPlace());

    // Get Eigen tensor
    auto out_mean_iou_t = EigenTensor<float, 1>::From(*out_mean_iou);
    auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong);
    auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);

    // Temporary tensor holding the per-class IoU values.
    Tensor ious;
    float* ious_data = ious.mutable_data<float>(
        {static_cast<int64_t>(num_classes)}, ctx.GetPlace());
    auto ious_t = EigenTensor<float, 1>::From(ious);

    // Init out_wrong, out_correct and out_mean_iou
    out_wrong_t.device(place) = out_wrong_t.constant(0);
    out_correct_t.device(place) = out_correct_t.constant(0);
    out_mean_iou_t.device(place) = out_mean_iou_t.constant(0.0f);

    // Collect previous wrong, correct and mean_iou. The indices are size_t to
    // match the unsigned size() of the input lists, as in the CPU kernel.
    auto in_mean_ious = ctx.MultiInput<Tensor>("InMeanIou");
    for (size_t i = 0; i < in_mean_ious.size(); ++i) {
      out_mean_iou_t.device(place) +=
          EigenTensor<float, 1>::From(*in_mean_ious[i]);
    }
    auto in_wrongs = ctx.MultiInput<Tensor>("InWrongs");
    for (size_t i = 0; i < in_wrongs.size(); ++i) {
      out_wrong_t.device(place) += EigenTensor<int, 1>::From(*in_wrongs[i]);
    }
    auto in_corrects = ctx.MultiInput<Tensor>("InCorrects");
    for (size_t i = 0; i < in_corrects.size(); ++i) {
      out_correct_t.device(place) +=
          EigenTensor<int, 1>::From(*in_corrects[i]);
    }

    // compute
    auto stream = ctx.cuda_device_context().stream();
    int block = PADDLE_CUDA_NUM_THREADS;
    int grid = (predictions->numel() + block - 1) / block;
    // Dynamic shared memory for CountCUDAKernel: it indexes 2 * num_classes
    // ints; the original size expression (with one extra int) is kept.
    int cache_size = (num_classes * 2 + 1) * sizeof(int);
    // The kernel takes the element count as int, so narrow it explicitly.
    CountCUDAKernel<T><<<grid, block, cache_size, stream>>>(
        num_classes, static_cast<int>(predictions->numel()), predictions_data,
        labels_data, out_wrong_data, out_correct_data);
    ctx.device_context().Wait();
    // A single block is used because the kernel keeps its valid-class counter
    // in shared memory.
    ComputeIoUCUDAKernel<<<1, block, 0, stream>>>(num_classes, out_wrong_data,
                                                  out_correct_data, ious_data,
                                                  out_mean_iou_data);
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Register the CUDA kernels of mean_iou for the supported integer element
// types.
// NOTE(review): int and int32_t are presumably the same type on the supported
// platforms, which would make one of these entries redundant — confirm.
REGISTER_OP_CUDA_KERNEL(mean_iou, ops::MeanIoUCUDAOpKernel<int>,
ops::MeanIoUCUDAOpKernel<int64_t>,
ops::MeanIoUCUDAOpKernel<int32_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
// CPU implementation of the mean_iou operator for element type T.
//
// Compute() zeroes OutWrong, OutCorrect and OutMeanIou, adds every tensor of
// the optional InWrongs / InCorrects / InMeanIou inputs onto them, counts the
// wrong/correct samples of the current batch, and adds
//   sum_c(correct[c] / (wrong[c] + correct[c])) / #classes with a non-zero
//   denominator
// to OutMeanIou[0].
//
// Every prediction and label must lie in [0, num_classes); this is enforced
// because the values are used as indices into the count buffers.
template <typename T>
class MeanIoUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
                       .eigen_device();
    // get input and output tensor
    auto* predictions = ctx.Input<Tensor>("Predictions");
    auto* labels = ctx.Input<Tensor>("Labels");
    auto* out_mean_iou = ctx.Output<Tensor>("OutMeanIou");
    auto* out_wrong = ctx.Output<Tensor>("OutWrong");
    auto* out_correct = ctx.Output<Tensor>("OutCorrect");
    int num_classes = static_cast<int>(ctx.Attr<int>("num_classes"));

    // get data ptr
    const T* predictions_data = predictions->data<T>();
    const T* labels_data = labels->data<T>();
    float* out_mean_iou_data =
        out_mean_iou->mutable_data<float>(ctx.GetPlace());
    int* out_wrong_data = out_wrong->mutable_data<int>(ctx.GetPlace());
    int* out_correct_data = out_correct->mutable_data<int>(ctx.GetPlace());

    // get eigen tensor
    auto out_mean_iou_t = EigenTensor<float, 1>::From(*out_mean_iou);
    auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong);
    auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);

    // Tmp tensor
    Tensor denominator;
    Tensor valid_count;
    Tensor iou_sum;

    // get data ptr of tmp tensor
    int* denominator_data = denominator.mutable_data<int>(
        {static_cast<int64_t>(num_classes)}, ctx.GetPlace());
    int* valid_count_data = valid_count.mutable_data<int>({1}, ctx.GetPlace());
    float* iou_sum_data = iou_sum.mutable_data<float>({1}, ctx.GetPlace());

    // get eigen tensor of tmp tensor
    auto denominator_t = EigenTensor<int, 1>::From(denominator);
    auto valid_count_t = EigenTensor<int, 1>::From(valid_count);
    auto iou_sum_t = EigenTensor<float, 1>::From(iou_sum);

    // init out_wrong, out_correct and out_mean_iou
    out_wrong_t = out_wrong_t.constant(0);
    out_correct_t = out_correct_t.constant(0);
    out_mean_iou_t = out_mean_iou_t.constant(0);

    // collect pre wrong, correct and mean_iou
    auto in_mean_ious = ctx.MultiInput<Tensor>("InMeanIou");
    for (size_t i = 0; i < in_mean_ious.size(); ++i) {
      out_mean_iou_t.device(place) +=
          EigenTensor<float, 1>::From(*in_mean_ious[i]);
    }
    auto in_wrongs = ctx.MultiInput<Tensor>("InWrongs");
    for (size_t i = 0; i < in_wrongs.size(); ++i) {
      out_wrong_t.device(place) += EigenTensor<int, 1>::From(*in_wrongs[i]);
    }
    auto in_corrects = ctx.MultiInput<Tensor>("InCorrects");
    for (size_t i = 0; i < in_corrects.size(); ++i) {
      out_correct_t.device(place) +=
          EigenTensor<int, 1>::From(*in_corrects[i]);
    }

    // compute
    const int64_t num_samples = predictions->numel();
    for (int64_t i = 0; i < num_samples; ++i) {
      const T pred = predictions_data[i];
      const T label = labels_data[i];
      // Both values index buffers of length num_classes; reject anything
      // outside [0, num_classes) instead of writing out of bounds.
      PADDLE_ENFORCE(pred >= 0 && pred < num_classes,
                     "The prediction %d is out of range [0, %d).", pred,
                     num_classes);
      PADDLE_ENFORCE(label >= 0 && label < num_classes,
                     "The label %d is out of range [0, %d).", label,
                     num_classes);
      if (pred == label) {
        out_correct_data[pred] += 1;
      } else {
        out_wrong_data[label] += 1;
        out_wrong_data[pred] += 1;
      }
    }

    denominator_t = out_wrong_t + out_correct_t;
    // Number of classes that have a non-zero denominator.
    valid_count_t =
        (denominator_t > denominator_t.constant(0)).cast<int>().sum();

    // Replace zero denominators by 1 so the element-wise division below is
    // defined for every class.
    for (int i = 0; i < num_classes; ++i) {
      if (denominator_data[i] == 0) {
        denominator_data[i] = 1;
      }
    }

    iou_sum_t =
        (out_correct_t.cast<float>() / denominator_t.cast<float>()).sum();
    // NOTE(review): if no class is valid, valid_count_data[0] is 0 and this
    // divides by zero (the CUDA kernel behaves the same way).
    out_mean_iou_data[0] += (iou_sum_data[0] / valid_count_data[0]);
  }
};
} // namespace operators
} // namespace paddle
...@@ -62,26 +62,46 @@ class MultiplexOp : public framework::OperatorWithKernel { ...@@ -62,26 +62,46 @@ class MultiplexOp : public framework::OperatorWithKernel {
class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker { class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("Ids", "The index tensor of multiplex operator."); AddInput("Ids",
AddInput("X", "The candidate tensors of multiplex operator.") "Tensor<int32>, index variable which is a 2-D tensor with shape "
"[M, 1] where M is the batch size.");
AddInput("X",
"A list of variables to gather from. All variables have the same "
"shape and the rank is at least 2.")
.AsDuplicable(); .AsDuplicable();
AddOutput("Out", "The output tensor of multiplex operator."); AddOutput("Out", "The output tensor of multiplex operator.");
AddComment(R"DOC( AddComment(R"DOC(
Multiplex Operator. Referring to the given index variable, this layer selects rows from the
input variables to construct a multiplex variable. Assuming that there are
Multiplex multiple tensors according to the index provided by the index tensor. :math:`m` input variables and :math:`I_i` represents the i-th input
variable and :math:`i` is in [0, :math:`m`). All input variables are
Ids: the index tensor. tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`].
X[0 : N - 1]: the candidate tensors for output (N >= 2). Please note that rank of the input tensor should be at least 2. Each input
For each index i from 0 to batchSize - 1, the output is the i-th row of the variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`]
where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2`
* ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input
variable. The given index variable should be a 2-D tensor with shape
[:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable.
Then the output variable will be a tensor with shape [:math:`d_0`,
:math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D
matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th
row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
* Ids: the index tensor.
* X[0 : N - 1]: the candidate tensors for output (N >= 2).
* For each index i from 0 to batchSize - 1, the output is the i-th row of the
the (Ids[i])-th tensor. the (Ids[i])-th tensor.
For i-th row of the output tensor: For i-th row of the output tensor:
$$y[i] = x_{k}[i]$$ $$
y[i] = x_{k}[i]
$$
where `y` is the output tensor, `x_{k}` is the k-th input tensor, where $y$ is the output tensor, $x_{k}$ is the k-th input tensor,
and `k = Ids[i]`. and $k = Ids[i]$.
)DOC"); )DOC");
} }
......
...@@ -78,11 +78,15 @@ class CreateRecordIOReaderOp : public framework::OperatorBase { ...@@ -78,11 +78,15 @@ class CreateRecordIOReaderOp : public framework::OperatorBase {
class CreateRecordIOReaderOpMaker : public FileReaderMakerBase { class CreateRecordIOReaderOpMaker : public FileReaderMakerBase {
protected: protected:
void Apply() override { void Apply() override {
AddAttr<std::string>("filename", "The filename of record io reader"); AddAttr<std::string>(
"filename",
"The filename of record file. This file will given to reader.");
AddComment(R"DOC( AddComment(R"DOC(
CreateRecordIOReader Operator Open a recordio file and return the reader object. The returned reader object
is thread-safe.
Create a reader from a record io file NOTE: This is a very low-level API. It is used for debugging data file or
training. Please use `open_files` instead of this API for production usage.
)DOC"); )DOC");
} }
}; };
......
...@@ -54,7 +54,7 @@ std::unique_ptr<framework::ReaderBase> CreateReaderByFileName( ...@@ -54,7 +54,7 @@ std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
} }
void FileReaderMakerBase::Make() { void FileReaderMakerBase::Make() {
AddOutput("Out", "(ReaderHolder) The created random reader.").AsDuplicable(); AddOutput("Out", "(ReaderHolder): The created random reader.").AsDuplicable();
AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes."); AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"ranks", "ranks",
......
...@@ -78,23 +78,23 @@ class RowConvOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -78,23 +78,23 @@ class RowConvOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("X", AddInput("X",
"(LoDTensor), the input(X) is a LodTensor, which supports " "the input(X) is a LodTensor, which supports "
"variable time-length input sequences. The underlying tensor " "variable time-length input sequences. The underlying tensor "
"in this LoDTensor is a matrix with shape (T x N), where T " "in this LoDTensor is a matrix with shape (T x N), where T "
"is the total time steps in this mini-batch and N is the input " "is the total time steps in this mini-batch and N is the input "
"data dimension."); "data dimension.");
AddInput("Filter", AddInput("Filter",
"(Tensor), the input(Filter) is a learnable parameter. It " "the input(Filter) is a learnable parameter. It "
"is a 2-D tensor with shape (future_context x N), where, " "is a 2-D tensor with shape (future_context x N), where, "
"future_context is the future context length and N is the data " "future_context is the future context length and N is the data "
"dimension."); "dimension.");
AddOutput("Out", AddOutput("Out",
"(LoDTensor), the output(Out) is a LodTensor, which supports " "the output(Out) is a LodTensor, which supports "
"variable time-length input sequences. The underlying tensor " "variable time-length input sequences. The underlying tensor "
"in this LodTensor is a matrix with shape T x N, i.e., the " "in this LodTensor is a matrix with shape T x N, i.e., the "
"same shape as X."); "same shape as X.");
AddComment(R"DOC( AddComment(R"DOC(
Row-convolution Operator. :strong:`Row-convolution operator`
The row convolution is called lookahead convolution. This operator was The row convolution is called lookahead convolution. This operator was
introduced in the following paper for DeepSpeech2: introduced in the following paper for DeepSpeech2:
...@@ -114,9 +114,23 @@ and a filter ($W$) of size $context \times d$, ...@@ -114,9 +114,23 @@ and a filter ($W$) of size $context \times d$,
the output sequence is convolved as: the output sequence is convolved as:
$$ $$
out_{i, :} = \sum_{j=i}^{i + context} in_{j,:} \dot W_{i-j, :} out_{i, :} = \\sum_{j=i}^{i + context} in_{j,:} \\cdot W_{i-j, :}
$$ $$
In the above equation:
* $Out_{i}$: The i-th row of output variable with shape [1, D].
* $\\tau$: Future context size.
* $X_{j}$: The j-th row of input variable with shape [1, D].
* $W_{i-j}$: The (i-j)-th row of parameters with shape [1, D].
More details about row_conv please refer to
the design document
https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 .
)DOC"); )DOC");
} }
}; };
......
...@@ -115,4 +115,7 @@ USE_CPU_ONLY_OP(concat); ...@@ -115,4 +115,7 @@ USE_CPU_ONLY_OP(concat);
REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker); REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker);
REGISTER_OP_CPU_KERNEL(split, REGISTER_OP_CPU_KERNEL(split,
ops::SplitOpKernel<paddle::platform::CPUPlace, float>); ops::SplitOpKernel<paddle::platform::CPUPlace, double>,
ops::SplitOpKernel<paddle::platform::CPUPlace, float>,
ops::SplitOpKernel<paddle::platform::CPUPlace, int64_t>,
ops::SplitOpKernel<paddle::platform::CPUPlace, int>);
...@@ -15,4 +15,7 @@ limitations under the License. */ ...@@ -15,4 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/split_op.h" #include "paddle/fluid/operators/split_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
split, ops::SplitOpKernel<paddle::platform::CUDADeviceContext, float>); split, ops::SplitOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::SplitOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::SplitOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SplitOpKernel<paddle::platform::CUDADeviceContext, int>);
...@@ -86,32 +86,24 @@ class UniformRandomOp : public framework::OperatorWithKernel { ...@@ -86,32 +86,24 @@ class UniformRandomOp : public framework::OperatorWithKernel {
class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker { class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddOutput("Out", "(Tensor) The output tensor of uniform random op"); AddOutput("Out", "The output tensor of uniform random op");
AddComment(R"DOC( AddComment(R"DOC(
Uniform random operator.
This operator initializes a tensor with random values sampled from a This operator initializes a tensor with random values sampled from a
uniform distribution. uniform distribution. The random result is in set [min, max].
)DOC"); )DOC");
AddAttr<std::vector<int>>("shape", AddAttr<std::vector<int>>("shape", "The shape of the output tensor");
"(vector<int>) The shape of the output tensor"); AddAttr<float>("min", "Minimum value of uniform random. [default -1.0].")
AddAttr<float>("min",
"(float, default -1.0) "
"Minimum value of uniform random")
.SetDefault(-1.0f); .SetDefault(-1.0f);
AddAttr<float>("max", AddAttr<float>("max", "Maximun value of uniform random. [default 1.0].")
"(float, default 1.0) "
"Maximun value of uniform random")
.SetDefault(1.0f); .SetDefault(1.0f);
AddAttr<int>("seed", AddAttr<int>("seed",
"(int, default 0) "
"Random seed used for generating samples. " "Random seed used for generating samples. "
"0 means use a seed generated by the system." "0 means use a seed generated by the system."
"Note that if seed is not 0, this operator will always " "Note that if seed is not 0, this operator will always "
"generate the same random numbers every time.") "generate the same random numbers every time. [default 0].")
.SetDefault(0); .SetDefault(0);
AddAttr<int>("dtype", "(int, default 5(FP32)) Output tensor data type") AddAttr<int>("dtype", "Output tensor data type. [default 5(FP32)].")
.SetDefault(framework::proto::VarType::FP32); .SetDefault(framework::proto::VarType::FP32);
} }
}; };
......
...@@ -322,7 +322,6 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -322,7 +322,6 @@ class DeviceTracerImpl : public DeviceTracer {
DisableActivity(); DisableActivity();
dynload::cuptiUnsubscribe(subscriber_); dynload::cuptiUnsubscribe(subscriber_);
CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_)); CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
PADDLE_ENFORCE(dynload::cuptiFinalize());
enabled_ = false; enabled_ = false;
} }
......
...@@ -72,7 +72,6 @@ extern void *cupti_dso_handle; ...@@ -72,7 +72,6 @@ extern void *cupti_dso_handle;
__macro(cuptiGetResultString); \ __macro(cuptiGetResultString); \
__macro(cuptiActivityGetNumDroppedRecords); \ __macro(cuptiActivityGetNumDroppedRecords); \
__macro(cuptiActivityFlushAll); \ __macro(cuptiActivityFlushAll); \
__macro(cuptiFinalize); \
__macro(cuptiSubscribe); \ __macro(cuptiSubscribe); \
__macro(cuptiUnsubscribe); \ __macro(cuptiUnsubscribe); \
__macro(cuptiEnableCallback); \ __macro(cuptiEnableCallback); \
......
...@@ -41,6 +41,11 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) { ...@@ -41,6 +41,11 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
} }
} }
// NOTE(minqiyang): according to the ncclGroupEnd documentations:
// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
// ncclGroupEnd will wait for all communicators to be initialized, which will
// cause blocking problem when a runtime_error was thrown, so try only guard
// NCCL actions when use it.
class NCCLGroupGuard { class NCCLGroupGuard {
public: public:
static std::mutex &NCCLMutex() { static std::mutex &NCCLMutex() {
......
...@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef MATHFUNCTIONS_H_ #pragma once
#define MATHFUNCTIONS_H_
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#include <mkl_cblas.h> #include <mkl_cblas.h>
...@@ -21,7 +20,7 @@ limitations under the License. */ ...@@ -21,7 +20,7 @@ limitations under the License. */
#include <mkl_vml_functions.h> #include <mkl_vml_functions.h>
#endif #endif
#if defined(PADDLE_USE_VECLIB) #ifdef PADDLE_USE_VECLIB
extern "C" { extern "C" {
#include <cblas.h> #include <cblas.h>
#include <clapack.h> #include <clapack.h>
...@@ -30,8 +29,10 @@ extern "C" { ...@@ -30,8 +29,10 @@ extern "C" {
#ifdef PADDLE_USE_OPENBLAS #ifdef PADDLE_USE_OPENBLAS
#include <cblas.h> #include <cblas.h>
#ifdef LAPACK_FOUND
#include <lapacke.h> #include <lapacke.h>
#endif #endif
#endif
#ifndef LAPACK_FOUND #ifndef LAPACK_FOUND
extern "C" { extern "C" {
...@@ -126,5 +127,3 @@ template <class T> ...@@ -126,5 +127,3 @@ template <class T>
void vTanh(const int n, const T* a, T* r); void vTanh(const int n, const T* a, T* r);
} // namespace paddle } // namespace paddle
#endif // MATHFUNCTIONS_H_
...@@ -132,7 +132,8 @@ EOF ...@@ -132,7 +132,8 @@ EOF
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-DWITH_ANAKIN=ON
} }
function abort(){ function abort(){
......
...@@ -20,6 +20,7 @@ from ..framework import Program, Variable, Operator ...@@ -20,6 +20,7 @@ from ..framework import Program, Variable, Operator
from ..layer_helper import LayerHelper, unique_name from ..layer_helper import LayerHelper, unique_name
from ..initializer import force_init_on_cpu from ..initializer import force_init_on_cpu
from ops import logical_and, logical_not, logical_or from ops import logical_and, logical_not, logical_or
import numpy
__all__ = [ __all__ = [
'split_lod_tensor', 'split_lod_tensor',
...@@ -913,37 +914,40 @@ def create_array(dtype): ...@@ -913,37 +914,40 @@ def create_array(dtype):
dtype=dtype) dtype=dtype)
def less_than(x, y, force_cpu=True, cond=None, **ignored): @templatedoc()
def less_than(x, y, force_cpu=None, cond=None, **ignored):
""" """
**Less than** ${comment}
This layer returns the truth value of :math:`x < y` elementwise. >>> import paddle.fluid as fluid
>>> less = fluid.layers.less_than(x=label, y=limit)
Args: Args:
x(Variable): First operand of *less_than* x(${x_type}): ${x_comment}.
y(Variable): Second operand of *less_than* y(${y_type}): ${y_comment}.
force_cpu(Bool|True): The output data will be on CPU if set true. force_cpu(${force_cpu_type}): ${force_cpu_comment}.
cond(Variable|None): Optional output variable to store the result of *less_than* cond(Variable|None): Optional output variable to store the result of *less_than*
Returns: Returns:
Variable: The tensor variable storing the output of *less_than*. ${out_comment}.
Examples:
.. code-block:: python
less = fluid.layers.less_than(x=label, y=limit)
""" """
helper = LayerHelper("less_than", **locals()) helper = LayerHelper("less_than", **locals())
if cond is None: if cond is None:
cond = helper.create_tmp_variable(dtype='bool') cond = helper.create_tmp_variable(dtype='bool')
cond.stop_gradient = True cond.stop_gradient = True
attrs = dict()
if force_cpu is not None:
attrs['force_cpu'] = force_cpu
elif force_init_on_cpu():
attrs['force_cpu'] = force_init_on_cpu()
helper.append_op( helper.append_op(
type='less_than', type='less_than',
inputs={'X': [x], inputs={'X': [x],
'Y': [y]}, 'Y': [y]},
outputs={'Out': [cond]}, outputs={'Out': [cond]},
attrs={'force_cpu': force_cpu or force_init_on_cpu()}) attrs=attrs)
return cond return cond
...@@ -1009,8 +1013,28 @@ def array_read(array, i): ...@@ -1009,8 +1013,28 @@ def array_read(array, i):
def shrink_memory(x, i, table): def shrink_memory(x, i, table):
""" """
This function creates an operator to shrink_rnn_memory using the RankTable This function creates an operator to shrink rnn memory using the RankTable
as mentioned in the input parameter. as mentioned in the input parameter.
NOTE: This is a very low-level API. It is used by DynamicRNN only.
Since the Dynamic RNN is implemented without padding, the sequences
will be sorted by order, and the length of valid memory will shrink after
each time step.
Args:
x(Variable): The memory object in the previous time step.
i(Variable): The step count variable. A int scalar as LoDTensor.
table(Variable): The RNNRankTable object.
Returns:
the memory variable after shrink.
Examples:
Since this is a very low-level API, no example is provided.
Please refer to the implementation of class DynamicRNN for detailed
usage.
""" """
helper = LayerHelper('shrink_memory', **locals()) helper = LayerHelper('shrink_memory', **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_tmp_variable(dtype=x.dtype)
...@@ -1246,6 +1270,34 @@ class IfElseBlockGuard(object): ...@@ -1246,6 +1270,34 @@ class IfElseBlockGuard(object):
class IfElse(object): class IfElse(object):
"""
if-else control flow.
Args:
cond (Variable): condition used to compare.
name (str, default None): The name of this layer.
Examples:
.. code-block:: python
limit = fluid.layers.fill_constant_batch_size_like(
input=label, dtype='int64', shape=[1], value=5.0)
cond = fluid.layers.less_than(x=label, y=limit)
ie = fluid.layers.IfElse(cond)
with ie.true_block():
true_image = ie.input(image)
hidden = fluid.layers.fc(input=true_image, size=100, act='tanh')
prob = fluid.layers.fc(input=hidden, size=10, act='softmax')
ie.output(prob)
with ie.false_block():
false_image = ie.input(image)
hidden = fluid.layers.fc(
input=false_image, size=200, act='tanh')
prob = fluid.layers.fc(input=hidden, size=10, act='softmax')
ie.output(prob)
prob = ie()
"""
OUT_IF_ELSE_BLOCKS = 0 OUT_IF_ELSE_BLOCKS = 0
IN_IF_ELSE_TRUE_BLOCKS = 1 IN_IF_ELSE_TRUE_BLOCKS = 1
IN_IF_ELSE_FALSE_BLOCKS = 2 IN_IF_ELSE_FALSE_BLOCKS = 2
...@@ -1348,6 +1400,38 @@ class IfElse(object): ...@@ -1348,6 +1400,38 @@ class IfElse(object):
class DynamicRNN(object): class DynamicRNN(object):
"""
The dynamic RNN can process a batch of sequence data. The length of each
sample sequence can be different. This API automatically processes them in
batch.
The input lod must be set. Please refer to `lod_tensor`.
>>> import paddle.fluid as fluid
>>> data = fluid.layers.data(name='sentence', dtype='int64', lod_level=1)
>>> embedding = fluid.layers.embedding(input=data, size=[65535, 32],
>>> is_sparse=True)
>>>
>>> drnn = fluid.layers.DynamicRNN()
>>> with drnn.block():
>>> word = drnn.step_input(embedding)
>>> prev = drnn.memory(shape=[200])
>>> hidden = fluid.layers.fc(input=[word, prev], size=200, act='relu')
>>> drnn.update_memory(prev, hidden) # set prev to hidden
>>> drnn.output(hidden)
>>>
>>> # last is the last time step of rnn. It is the encoding result.
>>> last = fluid.layers.sequence_last_step(drnn())
The dynamic RNN will unfold sequence into timesteps. Users need to define
how to process each time step during the :code:`with` block.
The `memory` is used for staging data across time steps. The initial value of
memory can be zero or another variable.
The dynamic RNN can mark multiple variables as its output. Use `drnn()` to
get the output sequence.
"""
BEFORE_RNN = 0 BEFORE_RNN = 0
IN_RNN = 1 IN_RNN = 1
AFTER_RNN = 2 AFTER_RNN = 2
...@@ -1370,6 +1454,15 @@ class DynamicRNN(object): ...@@ -1370,6 +1454,15 @@ class DynamicRNN(object):
self.mem_link = [] self.mem_link = []
def step_input(self, x): def step_input(self, x):
"""
Mark a sequence as a dynamic RNN input.
Args:
x(Variable): The input sequence.
Returns:
The current timestep in the input sequence.
"""
self._assert_in_rnn_block_("step_input") self._assert_in_rnn_block_("step_input")
if not isinstance(x, Variable): if not isinstance(x, Variable):
raise TypeError( raise TypeError(
...@@ -1413,6 +1506,15 @@ class DynamicRNN(object): ...@@ -1413,6 +1506,15 @@ class DynamicRNN(object):
return array_read(array=input_array, i=self.step_idx) return array_read(array=input_array, i=self.step_idx)
def static_input(self, x): def static_input(self, x):
"""
Mark a variable as a RNN input. The input will not be scattered into
time steps.
Args:
x(Variable): The input variable.
Returns:
The input variable that can access in RNN.
"""
self._assert_in_rnn_block_("static_input") self._assert_in_rnn_block_("static_input")
if not isinstance(x, Variable): if not isinstance(x, Variable):
raise TypeError( raise TypeError(
...@@ -1434,6 +1536,10 @@ class DynamicRNN(object): ...@@ -1434,6 +1536,10 @@ class DynamicRNN(object):
@contextlib.contextmanager @contextlib.contextmanager
def block(self): def block(self):
"""
The block for user to define operators in RNN. See the class docstring
for more details.
"""
if self.status != DynamicRNN.BEFORE_RNN: if self.status != DynamicRNN.BEFORE_RNN:
raise ValueError("rnn.block() can only be invoke once") raise ValueError("rnn.block() can only be invoke once")
self.step_idx = fill_constant( self.step_idx = fill_constant(
...@@ -1460,6 +1566,9 @@ class DynamicRNN(object): ...@@ -1460,6 +1566,9 @@ class DynamicRNN(object):
x=each_array, table=self.lod_rank_table)) x=each_array, table=self.lod_rank_table))
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
"""
Get the output of RNN. This API should only be invoked after RNN.block()
"""
if self.status != DynamicRNN.AFTER_RNN: if self.status != DynamicRNN.AFTER_RNN:
raise ValueError(("Output of the dynamic RNN can only be visited " raise ValueError(("Output of the dynamic RNN can only be visited "
"outside the rnn block.")) "outside the rnn block."))
...@@ -1474,6 +1583,70 @@ class DynamicRNN(object): ...@@ -1474,6 +1583,70 @@ class DynamicRNN(object):
value=0.0, value=0.0,
need_reorder=False, need_reorder=False,
dtype='float32'): dtype='float32'):
"""
Create a memory variable for dynamic rnn.
If the :code:`init` is not None, :code:`memory` will be initialized by
this variable. The :code:`need_reorder` is used to reorder the memory as
the input variable. It should be set to true when the initialized memory
depends on the input sample.
For example,
>>> import paddle.fluid as fluid
>>> sentence = fluid.layers.data(
>>> name='sentence', dtype='float32', shape=[32])
>>> boot_memory = fluid.layers.data(
>>> name='boot', dtype='float32', shape=[10])
>>>
>>> drnn = fluid.layers.DynamicRNN()
>>> with drnn.block():
>>> word = drnn.step_input(sentence)
>>> memory = drnn.memory(init=boot_memory, need_reorder=True)
>>> hidden = fluid.layers.fc(
>>> input=[word, memory], size=10, act='tanh')
>>> drnn.update_memory(ex_mem=memory, new_mem=hidden)
>>> drnn.output(hidden)
>>> rnn_output = drnn()
Otherwise, if :code:`shape`, :code:`value`, :code:`dtype` are set, the
:code:`memory` will be initialized by this :code:`value`.
For example,
>>> import paddle.fluid as fluid
>>> sentence = fluid.layers.data(
>>> name='sentence', dtype='float32', shape=[32])
>>>
>>> drnn = fluid.layers.DynamicRNN()
>>> with drnn.block():
>>> word = drnn.step_input(sentence)
>>> memory = drnn.memory(shape=[10], dtype='float32', value=0)
>>> hidden = fluid.layers.fc(
>>> input=[word, memory], size=10, act='tanh')
>>> drnn.update_memory(ex_mem=memory, new_mem=hidden)
>>> drnn.output(hidden)
>>> rnn_output = drnn()
Args:
init(Variable|None): The initialized variable.
shape(list|tuple): The memory shape. NOTE the shape does not contain
batch_size.
value(float): the initialized value.
need_reorder(bool): True if the initialized memory depends on the
input sample.
dtype(str|numpy.dtype): The data type of the initialized memory.
Returns:
the memory variable.
"""
self._assert_in_rnn_block_('memory') self._assert_in_rnn_block_('memory')
if init is not None: if init is not None:
if not isinstance(init, Variable): if not isinstance(init, Variable):
...@@ -1541,6 +1714,16 @@ class DynamicRNN(object): ...@@ -1541,6 +1714,16 @@ class DynamicRNN(object):
return self.memory(init=init) return self.memory(init=init)
def update_memory(self, ex_mem, new_mem): def update_memory(self, ex_mem, new_mem):
"""
Update the memory from ex_mem to new_mem. NOTE that the shape and data
type of :code:`ex_mem` and :code:`new_mem` must be same.
Args:
ex_mem(Variable): the memory variable.
new_mem(Variable): the plain variable generated in RNN block.
Returns:
None
"""
self._assert_in_rnn_block_('update_memory') self._assert_in_rnn_block_('update_memory')
if not isinstance(ex_mem, Variable): if not isinstance(ex_mem, Variable):
raise TypeError("The input arg `ex_mem` of update_memory() must " raise TypeError("The input arg `ex_mem` of update_memory() must "
...@@ -1558,6 +1741,15 @@ class DynamicRNN(object): ...@@ -1558,6 +1741,15 @@ class DynamicRNN(object):
self.mem_link.append((new_mem, mem_array)) self.mem_link.append((new_mem, mem_array))
def output(self, *outputs): def output(self, *outputs):
"""
Mark the RNN output variables.
Args:
outputs: The output variables.
Returns:
None
"""
self._assert_in_rnn_block_('output') self._assert_in_rnn_block_('output')
parent_block = self._parent_block_() parent_block = self._parent_block_()
for each in outputs: for each in outputs:
......
...@@ -210,53 +210,68 @@ def bipartite_match(dist_matrix, ...@@ -210,53 +210,68 @@ def bipartite_match(dist_matrix,
dist_threshold=None, dist_threshold=None,
name=None): name=None):
""" """
**Bipartite matchint operator** This operator implements a greedy bipartite matching algorithm, which is
used to obtain the matching with the maximum distance based on the input
This operator is a greedy bipartite matching algorithm, which is used to
obtain the matching with the maximum distance based on the input
distance matrix. For input 2D matrix, the bipartite matching algorithm can distance matrix. For input 2D matrix, the bipartite matching algorithm can
find the matched column for each row, also can find the matched row for find the matched column for each row (matched means the largest distance),
each column. And this operator only calculate matched indices from column also can find the matched row for each column. And this operator only
to row. For each instance, the number of matched indices is the number of calculate matched indices from column to row. For each instance,
of columns of the input ditance matrix. the number of matched indices is the column number of the input distance
matrix.
There are two outputs to save matched indices and distance.
A simple description, this algothrim matched the best (maximum distance) There are two outputs, matched indices and distance.
A simple description, this algorithm matched the best (maximum distance)
row entity to the column entity and the matched indices are not duplicated row entity to the column entity and the matched indices are not duplicated
in each row of ColToRowMatchIndices. If the column entity is not matched in each row of ColToRowMatchIndices. If the column entity is not matched
any row entity, set -1 in ColToRowMatchIndices. any row entity, set -1 in ColToRowMatchIndices.
Please note that the input DistMat can be LoDTensor (with LoD) or Tensor. NOTE: the input DistMat can be LoDTensor (with LoD) or Tensor.
If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size. If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size.
If Tensor, the height of ColToRowMatchIndices is 1. If Tensor, the height of ColToRowMatchIndices is 1.
NOTE: This API is a very low level API. It is used by :code:`ssd_loss`
layer. Please consider to use :code:`ssd_loss` instead.
Args: Args:
dist_matrix(Variable): This input is a 2-D LoDTensor with shape dist_matrix(Variable): This input is a 2-D LoDTensor with shape
[K, M]. It is pair-wise distance matrix between the entities [K, M]. It is pair-wise distance matrix between the entities
represented by each row and each column. For example, assumed one represented by each row and each column. For example, assumed one
entity is A with shape [K], another entity is B with shape [M]. The entity is A with shape [K], another entity is B with shape [M]. The
dist_matirx[i][j] is the distance between A[i] and B[j]. The bigger dist_matrix[i][j] is the distance between A[i] and B[j]. The bigger
the distance is, the better macthing the pairs are. Please note, the distance is, the better matching the pairs are.
This tensor can contain LoD information to represent a batch of
inputs. One instance of this batch can contain different numbers of NOTE: This tensor can contain LoD information to represent a batch
entities. of inputs. One instance of this batch can contain different numbers
of entities.
match_type(string|None): The type of matching method, should be match_type(string|None): The type of matching method, should be
'bipartite' or 'per_prediction', 'bipartite' by defalut. 'bipartite' or 'per_prediction'. [default 'bipartite'].
dist_threshold(float|None): If `match_type` is 'per_prediction', dist_threshold(float|None): If `match_type` is 'per_prediction',
this threshold is to determine the extra matching bboxes based this threshold is to determine the extra matching bboxes based
on the maximum distance, 0.5 by defalut. on the maximum distance, 0.5 by default.
Returns: Returns:
match_indices(Variable): A 2-D Tensor with shape [N, M] in int type. tuple: a tuple with two elements is returned. The first is
matched_indices, the second is matched_distance.
The matched_indices is a 2-D Tensor with shape [N, M] in int type.
N is the batch size. If match_indices[i][j] is -1, it N is the batch size. If match_indices[i][j] is -1, it
means B[j] does not match any entity in i-th instance. means B[j] does not match any entity in i-th instance.
Otherwise, it means B[j] is matched to row Otherwise, it means B[j] is matched to row
match_indices[i][j] in i-th instance. The row number of match_indices[i][j] in i-th instance. The row number of
i-th instance is saved in match_indices[i][j]. i-th instance is saved in match_indices[i][j].
match_distance(Variable): A 2-D Tensor with shape [N, M] in float type.
N is batch size. If match_indices[i][j] is -1, The matched_distance is a 2-D Tensor with shape [N, M] in float type
. N is batch size. If match_indices[i][j] is -1,
match_distance[i][j] is also -1.0. Otherwise, assumed match_distance[i][j] is also -1.0. Otherwise, assumed
match_distance[i][j] = d, and the row offsets of each instance match_distance[i][j] = d, and the row offsets of each instance
are called LoD. Then match_distance[i][j] = dist_matrix[d+LoD[i]][j]. are called LoD. Then match_distance[i][j] =
dist_matrix[d+LoD[i]][j].
Examples:
>>> x = fluid.layers.data(name='x', shape=[4], dtype='float32')
>>> y = fluid.layers.data(name='y', shape=[4], dtype='float32')
>>> iou = fluid.layers.iou_similarity(x=x, y=y)
>>> matched_indices, matched_dist = fluid.layers.bipartite_match(iou)
""" """
helper = LayerHelper('bipartite_match', **locals()) helper = LayerHelper('bipartite_match', **locals())
match_indices = helper.create_tmp_variable(dtype='int32') match_indices = helper.create_tmp_variable(dtype='int32')
...@@ -364,7 +379,7 @@ def ssd_loss(location, ...@@ -364,7 +379,7 @@ def ssd_loss(location,
normalize=True, normalize=True,
sample_size=None): sample_size=None):
""" """
**Multi-box loss layer for object dection algorithm of SSD** **Multi-box loss layer for object detection algorithm of SSD**
This layer is to compute dection loss for SSD given the location offset This layer is to compute dection loss for SSD given the location offset
predictions, confidence predictions, prior boxes and ground-truth boudding predictions, confidence predictions, prior boxes and ground-truth boudding
...@@ -372,21 +387,35 @@ def ssd_loss(location, ...@@ -372,21 +387,35 @@ def ssd_loss(location,
is a weighted sum of the localization loss (or regression loss) and is a weighted sum of the localization loss (or regression loss) and
confidence loss (or classification loss) by performing the following steps: confidence loss (or classification loss) by performing the following steps:
1. Find matched boundding box by bipartite matching algorithm. 1. Find matched bounding box by bipartite matching algorithm.
1.1 Compute IOU similarity between ground-truth boxes and prior boxes. 1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
1.2 Compute matched boundding box by bipartite matching algorithm. 1.2 Compute matched boundding box by bipartite matching algorithm.
2. Compute confidence for mining hard examples 2. Compute confidence for mining hard examples
2.1. Get the target label based on matched indices. 2.1. Get the target label based on matched indices.
2.2. Compute confidence loss. 2.2. Compute confidence loss.
3. Apply hard example mining to get the negative example indices and update 3. Apply hard example mining to get the negative example indices and update
the matched indices. the matched indices.
4. Assign classification and regression targets 4. Assign classification and regression targets
4.1. Encoded bbox according to the prior boxes. 4.1. Encoded bbox according to the prior boxes.
4.2. Assign regression targets. 4.2. Assign regression targets.
4.3. Assign classification targets. 4.3. Assign classification targets.
5. Compute the overall objective loss. 5. Compute the overall objective loss.
5.1 Compute confidence loss. 5.1 Compute confidence loss.
5.1 Compute localization loss. 5.1 Compute localization loss.
5.3 Compute the overall weighted loss. 5.3 Compute the overall weighted loss.
Args: Args:
...@@ -421,39 +450,36 @@ def ssd_loss(location, ...@@ -421,39 +450,36 @@ def ssd_loss(location,
mining_type (str): The hard example mining type, should be 'hard_example' mining_type (str): The hard example mining type, should be 'hard_example'
or 'max_negative', now only support `max_negative`. or 'max_negative', now only support `max_negative`.
normalize (bool): Whether to normalize the SSD loss by the total number normalize (bool): Whether to normalize the SSD loss by the total number
of output locations, True by defalut. of output locations, True by default.
sample_size (int): The max sample size of negative box, used only when sample_size (int): The max sample size of negative box, used only when
mining_type is 'hard_example'. mining_type is 'hard_example'.
Returns: Returns:
Variable: The weighted sum of the localization loss and confidence loss, The weighted sum of the localization loss and confidence loss, with \
with shape [N * Np, 1], N and Np are the same as they are shape [N * Np, 1], N and Np are the same as they are in `location`.
in `location`.
Raises: Raises:
ValueError: If mining_type is 'hard_example', now only support ValueError: If mining_type is 'hard_example', now only support mining \
mining type of `max_negative`. type of `max_negative`.
Examples: Examples:
.. code-block:: python >>> pb = fluid.layers.data(
>>> name='prior_box',
pb = layers.data( >>> shape=[10, 4],
name='prior_box', >>> append_batch_size=False,
shape=[10, 4], >>> dtype='float32')
append_batch_size=False, >>> pbv = fluid.layers.data(
dtype='float32') >>> name='prior_box_var',
pbv = layers.data( >>> shape=[10, 4],
name='prior_box_var', >>> append_batch_size=False,
shape=[10, 4], >>> dtype='float32')
append_batch_size=False, >>> loc = fluid.layers.data(name='target_box', shape=[10, 4], dtype='float32')
dtype='float32') >>> scores = fluid.layers.data(name='scores', shape=[10, 21], dtype='float32')
loc = layers.data(name='target_box', shape=[10, 4], dtype='float32') >>> gt_box = fluid.layers.data(
scores = layers.data(name='scores', shape=[10, 21], dtype='float32') >>> name='gt_box', shape=[4], lod_level=1, dtype='float32')
gt_box = layers.data( >>> gt_label = fluid.layers.data(
name='gt_box', shape=[4], lod_level=1, dtype='float32') >>> name='gt_label', shape=[1], lod_level=1, dtype='float32')
gt_label = layers.data( >>> loss = fluid.layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
name='gt_label', shape=[1], lod_level=1, dtype='float32')
loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
""" """
helper = LayerHelper('ssd_loss', **locals()) helper = LayerHelper('ssd_loss', **locals())
......
...@@ -292,6 +292,7 @@ def _copy_reader_create_op_(block, op): ...@@ -292,6 +292,7 @@ def _copy_reader_create_op_(block, op):
return new_op return new_op
@templatedoc(op_type='create_recordio_file_reader')
def open_recordio_file(filename, def open_recordio_file(filename,
shapes, shapes,
lod_levels, lod_levels,
...@@ -299,34 +300,30 @@ def open_recordio_file(filename, ...@@ -299,34 +300,30 @@ def open_recordio_file(filename,
pass_num=1, pass_num=1,
for_parallel=True): for_parallel=True):
""" """
Open a RecordIO file ${comment}
This layer takes a RecordIO file to read from and returns a Reader Variable.
Via the Reader Variable, we can get data from the given RecordIO file.
Args: Args:
filename(str): The RecordIO file's name. filename(${filename_type}): ${filename_comment}.
shapes(list): List of tuples which declaring data shapes. shapes(list): List of tuples which declaring data shapes.
lod_levels(list): List of ints which declaring data lod_level. lod_levels(${lod_levels_type}): ${lod_levels_comment}.
dtypes(list): List of strs which declaring data type. dtypes(list): List of strs which declaring data type.
pass_num(int): Number of passes to run. pass_num(int): Number of passes to run.
for_parallel(Bool): Set it as True if you are going to run for_parallel(Bool): Set it as True if you are going to run
subsequent operators in parallel. subsequent operators in parallel.
Returns: Returns:
Variable: A Reader Variable via which we can get RecordIO file data. ${out_comment}.
Examples: Examples:
.. code-block:: python
reader = fluid.layers.io.open_recordio_file( >>> import paddle.fluid as fluid
filename='./data.recordio', >>> reader = fluid.layers.io.open_recordio_file(
shapes=[(3,224,224), (1)], >>> filename='./data.recordio',
lod_levels=[0, 0], >>> shapes=[(3,224,224), (1)],
dtypes=['float32', 'int64']) >>> lod_levels=[0, 0],
>>> dtypes=['float32', 'int64'])
# Via the reader, we can use 'read_file' layer to get data: >>> # Via the reader, we can use 'read_file' layer to get data:
image, label = fluid.layers.io.read_file(reader) >>> image, label = fluid.layers.io.read_file(reader)
""" """
dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
shape_concat = [] shape_concat = []
...@@ -554,6 +551,29 @@ def batch(reader, batch_size): ...@@ -554,6 +551,29 @@ def batch(reader, batch_size):
def double_buffer(reader, place=None, name=None): def double_buffer(reader, place=None, name=None):
"""
Wrap a reader with a double buffer. The data will be copied to the target
place through a double-buffer queue. If the target place is None, the place
that the executor runs on will be used.
Args:
reader(Variable): the reader variable need to be wrapped.
place(Place): the place of target data. Default is the same place that
the executor runs on.
name(str): Variable name. None if the user does not care.
Returns:
wrapped reader with double buffer.
Examples:
>>> reader = fluid.layers.open_files(filenames=['somefile'],
>>> shapes=[[-1, 784], [-1, 1]],
>>> dtypes=['float32', 'int64'])
>>> reader = fluid.layers.double_buffer(reader)
>>> img, label = fluid.layers.read_file(reader)
"""
attrs = dict() attrs = dict()
if place is not None: if place is not None:
attrs['place'] = str(place).upper() attrs['place'] = str(place).upper()
...@@ -587,6 +607,26 @@ def read_file(file_obj): ...@@ -587,6 +607,26 @@ def read_file(file_obj):
class Preprocessor(object): class Preprocessor(object):
"""
A block for data pre-processing in reader.
Args:
reader (Variable): A reader variable.
name (str, default None): The name of the reader.
Examples:
.. code-block:: python
preprocessor = fluid.layers.io.Preprocessor(reader=reader)
with preprocessor.block():
img, lbl = preprocessor.inputs()
img_out = img / 2
lbl_out = lbl + 1
preprocessor.outputs(img_out, lbl_out)
data_file = fluid.layers.io.double_buffer(preprocessor())
"""
BEFORE_SUB_BLOCK = 0 BEFORE_SUB_BLOCK = 0
IN_SUB_BLOCK = 1 IN_SUB_BLOCK = 1
AFTER_SUB_BLOCK = 2 AFTER_SUB_BLOCK = 2
......
此差异已折叠。
...@@ -40,8 +40,6 @@ __activations__ = [ ...@@ -40,8 +40,6 @@ __activations__ = [
'relu6', 'relu6',
'pow', 'pow',
'stanh', 'stanh',
'hard_shrink',
'thresholded_relu',
'hard_sigmoid', 'hard_sigmoid',
'swish', 'swish',
] ]
...@@ -64,11 +62,9 @@ __all__ = [ ...@@ -64,11 +62,9 @@ __all__ = [
'logical_or', 'logical_or',
'logical_xor', 'logical_xor',
'logical_not', 'logical_not',
'uniform_random',
'uniform_random_batch_size_like', 'uniform_random_batch_size_like',
'gaussian_random', 'gaussian_random',
'gaussian_random_batch_size_like', 'gaussian_random_batch_size_like',
'cumsum',
'scatter', 'scatter',
'sum', 'sum',
'slice', 'slice',
...@@ -79,3 +75,88 @@ __all__ = [ ...@@ -79,3 +75,88 @@ __all__ = [
for _OP in set(__all__): for _OP in set(__all__):
globals()[_OP] = generate_layer_fn(_OP) globals()[_OP] = generate_layer_fn(_OP)
__all__ += ["uniform_random"]
_uniform_random_ = generate_layer_fn('uniform_random')
def uniform_random(shape, dtype=None, min=None, max=None, seed=None):
    # Thin wrapper around the generated layer function `_uniform_random_`:
    # only the arguments that are not None are forwarded as keywords;
    # arguments left as None are simply omitted from the call.
    #
    # The arguments are captured explicitly instead of iterating over
    # locals(). Iterating locals() also picks up helper locals (such as
    # `kwargs` itself), and calling locals() again inside the loop can change
    # the dict being iterated on interpreters where it is the live frame dict.
    # (No docstring here: __doc__ is assigned right after the definition.)
    supplied = {
        'shape': shape,
        'dtype': dtype,
        'min': min,
        'max': max,
        'seed': seed,
    }
    kwargs = {name: val for name, val in supplied.items() if val is not None}
    return _uniform_random_(**kwargs)
uniform_random.__doc__ = _uniform_random_.__doc__ + """
Examples:
>>> result = fluid.layers.uniform_random(shape=[32, 784])
"""
__all__ += ['hard_shrink']
_hard_shrink_ = generate_layer_fn('hard_shrink')
def hard_shrink(x, threshold=None):
    # Thin wrapper around the generated layer function `_hard_shrink_`:
    # only the arguments that are not None are forwarded as keywords.
    #
    # The arguments are captured explicitly instead of iterating over
    # locals(). Iterating locals() also picks up helper locals (such as
    # `kwargs` itself), and calling locals() again inside the loop can change
    # the dict being iterated on interpreters where it is the live frame dict.
    # (No docstring here: __doc__ is assigned right after the definition.)
    supplied = {'x': x, 'threshold': threshold}
    kwargs = {name: val for name, val in supplied.items() if val is not None}
    return _hard_shrink_(**kwargs)
hard_shrink.__doc__ = _hard_shrink_.__doc__ + """
Examples:
>>> data = fluid.layers.data(name="input", shape=[784])
>>> result = fluid.layers.hard_shrink(x=data, threshold=0.3)
"""
__all__ += ['cumsum']
_cum_sum_ = generate_layer_fn('cumsum')
def cumsum(x, axis=None, exclusive=None, reverse=None):
    # Thin wrapper around the generated layer function `_cum_sum_`:
    # only the arguments that are not None are forwarded as keywords.
    #
    # The arguments are captured explicitly instead of iterating over
    # locals(). Iterating locals() also picks up helper locals (such as
    # `kwargs` itself), and calling locals() again inside the loop can change
    # the dict being iterated on interpreters where it is the live frame dict.
    # (No docstring here: __doc__ is assigned right after the definition.)
    supplied = {
        'x': x,
        'axis': axis,
        'exclusive': exclusive,
        'reverse': reverse,
    }
    kwargs = {name: val for name, val in supplied.items() if val is not None}
    return _cum_sum_(**kwargs)
cumsum.__doc__ = _cum_sum_.__doc__ + """
Examples:
>>> data = fluid.layers.data(name="input", shape=[32, 784])
>>> result = fluid.layers.cumsum(data, axis=0)
"""
__all__ += ['thresholded_relu']
_thresholded_relu_ = generate_layer_fn('thresholded_relu')
def thresholded_relu(x, threshold=None):
    # Thin wrapper around the generated layer function `_thresholded_relu_`:
    # only the arguments that are not None are forwarded as keywords.
    #
    # The arguments are captured explicitly instead of iterating over
    # locals(). Iterating locals() also picks up helper locals (such as
    # `kwargs` itself), and calling locals() again inside the loop can change
    # the dict being iterated on interpreters where it is the live frame dict.
    # (No docstring here: __doc__ is assigned right after the definition.)
    supplied = {'x': x, 'threshold': threshold}
    kwargs = {name: val for name, val in supplied.items() if val is not None}
    # The result must be returned: the previous version dropped it, so the
    # wrapper always evaluated to None.
    return _thresholded_relu_(**kwargs)
thresholded_relu.__doc__ = _thresholded_relu_.__doc__ + """
Examples:
>>> data = fluid.layers.data(name="input", shape=[1])
>>> result = fluid.layers.thresholded_relu(data, threshold=0.4)
"""
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unlessf required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
...@@ -69,7 +69,12 @@ def create_parameter(shape, ...@@ -69,7 +69,12 @@ def create_parameter(shape,
is_bias=False, is_bias=False,
default_initializer=None): default_initializer=None):
""" """
Create a parameter Create a parameter. The parameter is a learnable variable, which can have
gradient, and can be optimized.
NOTE: this is a very low-level API. This API is useful when you create
operators by yourself instead of using layers.
Args: Args:
shape(list[int]): shape of the parameter shape(list[int]): shape of the parameter
dtype(string): element type of the parameter dtype(string): element type of the parameter
...@@ -81,7 +86,12 @@ def create_parameter(shape, ...@@ -81,7 +86,12 @@ def create_parameter(shape,
default_initializer(Initializer): initializer for the parameter default_initializer(Initializer): initializer for the parameter
Returns: Returns:
Parameter: the created parameter the created parameter.
Examples:
>>> W = fluid.layers.create_parameter(shape=[784, 200], dtype='float32')
>>> data = fluid.layers.data(name="img", shape=[64, 784], append_batch_size=False)
>>> hidden = fluid.layers.matmul(x=data, y=W)
""" """
helper = LayerHelper("create_parameter", **locals()) helper = LayerHelper("create_parameter", **locals())
if attr is None: if attr is None:
......
...@@ -76,8 +76,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, ...@@ -76,8 +76,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
emb_layers.append(mark_embedding) emb_layers.append(mark_embedding)
hidden_0_layers = [ hidden_0_layers = [
fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
for emb in emb_layers
] ]
hidden_0 = fluid.layers.sums(input=hidden_0_layers) hidden_0 = fluid.layers.sums(input=hidden_0_layers)
...@@ -94,8 +93,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, ...@@ -94,8 +93,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
for i in range(1, depth): for i in range(1, depth):
mix_hidden = fluid.layers.sums(input=[ mix_hidden = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
]) ])
lstm = fluid.layers.dynamic_lstm( lstm = fluid.layers.dynamic_lstm(
......
...@@ -41,8 +41,8 @@ function(py_test_modules TARGET_NAME) ...@@ -41,8 +41,8 @@ function(py_test_modules TARGET_NAME)
endfunction() endfunction()
list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_warpctc_op)
list(REMOVE_ITEM TEST_OPS test_dist_train) list(REMOVE_ITEM TEST_OPS test_dist_train)
#list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
#list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
# TODO(wuyi): this test hungs on CI, will add it back later # TODO(wuyi): this test hungs on CI, will add it back later
list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op) list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
foreach(TEST_OP ${TEST_OPS}) foreach(TEST_OP ${TEST_OPS})
...@@ -50,3 +50,5 @@ foreach(TEST_OP ${TEST_OPS}) ...@@ -50,3 +50,5 @@ foreach(TEST_OP ${TEST_OPS})
endforeach(TEST_OP) endforeach(TEST_OP)
py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL) py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
py_test_modules(test_dist_train MODULES test_dist_train SERIAL) py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import unittest
import numpy as np
from op_test import OpTest
def compute_mean_iou(predictions, labels, num_classes, in_wrongs, in_corrects,
                     in_mean_ious):
    """NumPy reference computation used to check the `mean_iou` operator.

    Args:
        predictions: integer array of predicted class ids.
        labels: integer array of ground-truth class ids; must have the same
            shape as `predictions`.
        num_classes: number of classes (length of the per-class counters).
        in_wrongs: list of (name, array) pairs; each array is added to the
            per-class "wrong" counter before counting.
        in_corrects: list of (name, array) pairs; each array is added to the
            per-class "correct" counter before counting.
        in_mean_ious: list of (name, value) pairs; each value is added to
            the computed mean IOU.

    Returns:
        A tuple (mean_iou, out_wrong, out_correct).
    """
    assert predictions.shape == labels.shape
    flat_pred = predictions.flatten()
    flat_label = labels.flatten()

    # Per-class counters, seeded with the carried-in partial results.
    out_wrong = np.zeros([num_classes]).astype("int32")
    out_correct = np.zeros([num_classes]).astype("int32")
    for _, carried_wrong in in_wrongs:
        out_wrong += carried_wrong
    for _, carried_correct in in_corrects:
        out_correct += carried_correct

    # A match counts once as "correct" for its class; a mismatch counts as
    # "wrong" for both the predicted class and the labelled class.
    # np.add.at accumulates repeated indices, unlike fancy-index `+=`.
    hit = flat_pred == flat_label
    miss = ~hit
    np.add.at(out_correct, flat_pred[hit], 1)
    np.add.at(out_wrong, flat_pred[miss], 1)
    np.add.at(out_wrong, flat_label[miss], 1)

    totals = out_wrong + out_correct
    # Only classes that occurred at all take part in the average.
    valid_count = (totals != 0).sum()
    # Replace zero totals by one so the per-class division is well defined.
    safe_totals = np.where(totals > 0, totals, np.ones(totals.shape))
    mean_iou = (out_correct / safe_totals).sum() / valid_count

    for _, carried_iou in in_mean_ious:
        mean_iou += carried_iou
    return mean_iou, out_wrong, out_correct
class TestMeanIOUOp(OpTest):
    """Checks the `mean_iou` operator against `compute_mean_iou`.

    Subclasses override `config` to change the problem size and the number
    of carried-in partial results.
    """

    def _named_randoms(self, prefix, count, make_value):
        # Build [(prefix + index, value), ...], drawing one value per entry
        # in index order.
        return [(prefix % idx, make_value()) for idx in range(count)]

    def setUp(self):
        """Draw random inputs and fill in inputs, attrs and expected outputs."""
        self.config()
        self.op_type = "mean_iou"

        predictions = np.random.randint(0, self.num_classes,
                                        self.image_size).astype("int32")
        labels = np.random.randint(0, self.num_classes,
                                   self.image_size).astype("int32")

        def random_counts():
            return np.random.randint(
                0, 10, [self.num_classes]).astype("int32")

        def random_iou():
            return np.random.uniform(0, 1, [1]).astype("float32")

        in_wrongs = self._named_randoms("in_wrong_%d", self.in_wrong_num,
                                        random_counts)
        in_corrects = self._named_randoms("in_correct_%d",
                                          self.in_correct_num, random_counts)
        in_mean_ious = self._named_randoms("in_mean_iou_%d",
                                           self.in_mean_iou_num, random_iou)

        self.inputs = {
            'Predictions': predictions,
            'Labels': labels,
            'InWrongs': in_wrongs,
            'InCorrects': in_corrects,
            'InMeanIou': in_mean_ious
        }
        self.attrs = {'num_classes': long(self.num_classes)}

        expected = compute_mean_iou(predictions, labels, self.num_classes,
                                    in_wrongs, in_corrects, in_mean_ious)
        mean_iou, out_wrong, out_correct = expected
        self.outputs = {
            'OutMeanIou': mean_iou,
            'OutWrong': out_wrong,
            'OutCorrect': out_correct
        }

    def config(self):
        """Default case: 10 classes, 128x128 input, no carried-in results."""
        self.num_classes = 10
        self.image_size = [128, 128]
        self.in_wrong_num = 0
        self.in_correct_num = 0
        self.in_mean_iou_num = 0

    def test_check_output(self):
        """Run the output check inherited from OpTest."""
        self.check_output()
class TestCase1(TestMeanIOUOp):
    """Variant with 5 classes, a 100x128 input and two carried-in results
    of each kind (wrong counts, correct counts, mean IOUs)."""

    def config(self):
        # Problem size.
        self.image_size = [100, 128]
        self.num_classes = 5
        # Number of carried-in partial results of each kind.
        self.in_mean_iou_num = 2
        self.in_correct_num = 2
        self.in_wrong_num = 2
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册