Commit 136a5919 authored by typhoonzero

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into no_counter_on_pserver

......@@ -156,6 +156,7 @@ include(rdma) # set rdma libraries
include(flags) # set paddle compile flags
include(version) # set PADDLE_VERSION
include(coveralls) # set code coverage
include(inference_lib) # add paddle fluid inference libraries
include_directories("${PADDLE_SOURCE_DIR}")
......
......@@ -28,9 +28,3 @@ endif()
add_dependencies(eigen3 extern_eigen3)
LIST(APPEND external_project_dependencies eigen3)
IF(NOT WITH_C_API AND WITH_FLUID)
INSTALL(FILES ${EIGEN_INCLUDE_DIR}/Eigen/Core DESTINATION third_party/eigen3/Eigen)
INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/Eigen/src DESTINATION third_party/eigen3/Eigen)
INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/unsupported/Eigen DESTINATION third_party/eigen3/unsupported)
ENDIF()
......@@ -52,7 +52,7 @@ ADD_DEPENDENCIES(gflags extern_gflags)
LIST(APPEND external_project_dependencies gflags)
IF(WITH_C_API OR WITH_FLUID)
IF(WITH_C_API)
INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
IF(ANDROID)
INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI})
......
......@@ -68,7 +68,7 @@ LINK_LIBRARIES(glog gflags)
LIST(APPEND external_project_dependencies glog)
IF(WITH_C_API OR WITH_FLUID)
IF(WITH_C_API)
INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog)
IF(ANDROID)
INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI})
......
......@@ -250,7 +250,7 @@ IF(NOT PROTOBUF_FOUND)
SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
CACHE FILEPATH "protoc library." FORCE)
IF(WITH_C_API OR WITH_FLUID)
IF(WITH_C_API)
INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
IF(ANDROID)
INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
......
......@@ -52,6 +52,7 @@ ExternalProject_Add(
-DWITH_TORCH=OFF
-DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-DBUILD_SHARED=ON
-DBUILD_TESTS=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
......
......@@ -179,20 +179,24 @@ function(cc_library TARGET_NAME)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (cc_library_SRCS)
if (cc_library_SHARED OR cc_library_shared) # build *.so
if(cc_library_SRCS)
if(cc_library_SHARED OR cc_library_shared) # build *.so
add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
else()
add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
endif()
if (cc_library_DEPS)
if(cc_library_DEPS)
# Don't need link libwarpctc.so
if ("${cc_library_DEPS};" MATCHES "warpctc;")
if("${cc_library_DEPS};" MATCHES "warpctc;")
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
endif()
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
endif()
# cpplint code style
......
# make package for paddle fluid shared and static library
function(copy TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DSTS DEPS)
cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
endif()
math(EXPR len "${copy_lib_SRCS_len} - 1")
add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
foreach(index RANGE ${len})
list(GET copy_lib_SRCS ${index} src)
list(GET copy_lib_DSTS ${index} dst)
add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}")
if(IS_DIRECTORY ${src})
add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp -r "${src}" "${dst}")
else()
add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp "${src}" "${dst}")
endif()
endforeach()
endfunction()
# third party
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3")
copy(eigen3_lib
SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags")
copy(gflags_lib
SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog")
copy(glog_lib
SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
)
IF(NOT PROTOBUF_FOUND)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
copy(protobuf_lib
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY}
DSTS ${dst_dir} ${dst_dir}/lib
)
ENDIF(NOT PROTOBUF_FOUND)
# paddle fluid module
set(src_dir "${PADDLE_SOURCE_DIR}/paddle")
set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle")
set(module "framework")
copy(framework_lib DEPS framework_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/framework/framework.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
)
set(module "memory")
copy(memory_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
)
set(module "inference")
copy(inference_lib DEPS paddle_fluid_shared
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/inference/libpaddle_fluid.so
DSTS ${dst_dir}/${module} ${dst_dir}/${module}
)
set(module "platform")
copy(platform_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
)
set(module "string")
copy(string_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
)
add_custom_target(inference_lib_dist DEPENDS
inference_lib framework_lib memory_lib platform_lib string_lib
gflags_lib glog_lib protobuf_lib eigen3_lib)
......@@ -13,7 +13,7 @@ PaddlePaddle offers both pip and Docker installation methods:
pip_install_cn.rst
docker_install_cn.rst
../../howto/dev/build_cn.md
build_cn.md
Build Process
+++++++++++++
......
......@@ -13,7 +13,7 @@ You can choose either pip or Docker to complete your install:
pip_install_en.rst
docker_install_en.rst
../../howto/dev/build_en.md
build_en.md
Build from Source
......
## Auto Gradient Checker Design
## Auto Gradient Check Design
## Backgraound:
- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right:
1. you should get the right backpropagation formula according to the forward computation.
2. you should implement it right in CPP.
3. it's difficult to prepare test data.
## Background:
- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right because of the following challenges:
1. The backpropagation formula should be correct according to the forward computation.
2. The implementation of the above should be correct in C++.
3. It is difficult to prepare unbiased test data.
- Auto gradient checking gets a numerical gradient by forward Operator and use it as a reference of the backward Operator's result. It has several advantages:
1. numerical gradient checker only need forward operator.
2. user only need to prepare the input data for forward Operator.
- Auto gradient checking gets a numerical gradient using forward Operator and uses it as a reference for the backward Operator's result. It has several advantages:
1. The numerical gradient checker only needs the forward operator.
2. The user only needs to prepare the input data for the forward Operator and does not need to worry about the backward Operator.
## Mathematical Theory
The following two document from Stanford has a detailed explanation of how to get numerical gradient and why it's useful.
The following documents from Stanford have a detailed explanation of how to compute the numerical gradient and why it is useful.
- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
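For reference, the quantity the checker computes is the standard centered-difference approximation described in the documents above (the implementation further below uses exactly this formula):

$$\frac{\partial f}{\partial x_i} \approx \frac{f(x_1, \dots, x_i + \delta, \dots, x_n) - f(x_1, \dots, x_i - \delta, \dots, x_n)}{2\delta}$$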
## Numeric Gradient Implementation
## Numerical Gradient Implementation
### Python Interface
```python
def get_numerical_gradient(op,
......@@ -27,73 +27,76 @@ def get_numerical_gradient(op,
delta=0.005,
local_scope=None):
"""
Get Numeric Gradient for an operator's input.
Get Numerical Gradient for the input of an operator.
:param op: C++ operator instance, could be an network
:param op: C++ operator instance, which can be a network.
:param input_values: The input variables. Should be a dictionary whose key is the
variable name, and value is numpy array.
variable name, and value is a numpy array.
:param output_name: The final output variable name.
:param input_to_check: The input variable with respect to which to compute the gradient.
:param delta: The perturbation value for numeric gradient method. The
smaller delta is, the more accurate result will get. But if that delta is
too small, it will suffer from numerical stability problem.
:param input_to_check: The input variable with respect to which the gradient has to be computed.
:param delta: The perturbation value for numerical gradient method. The
smaller the delta, the more accurate the result. But if the delta is too
small, it will suffer from the numerical stability problem.
:param local_scope: The local scope used for get_numeric_gradient.
:return: The gradient array in numpy format.
"""
```
### Explaination:
### Explanation:
- Why need `output_name`
- An Operator may have multiple Output, one can get independent gradient from each Output. So caller should specify the name of the output variable.
- Why do we need an `output_name`
- An Operator may have multiple Outputs, and one can compute an independent gradient from each Output. So the caller should specify the name of the output variable.
- Why need `input_to_check`
- One operator may have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times.
- Why do we need `input_to_check`
- One operator can have multiple inputs. The Gradient Op can calculate the gradients of these inputs at the same time, but the Numerical Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need gradients for multiple inputs, you can call `get_numeric_gradient` multiple times, each time with a different input.
### Core Algorithm Implementation
```python
# we only compute gradient of one element a time.
# we only compute the gradient of one element a time.
# we use a for loop to compute the gradient of each element.
for i in xrange(tensor_size):
# get one input element by its index i.
origin = tensor_to_check.get_float_element(i)
# get one input element using the index i.
original = tensor_to_check.get_float_element(i)
# add delta to it, run op and then get the new value of the result tensor.
x_pos = origin + delta
# add delta to it, run the forward op and then
# get the new value of the result tensor.
x_pos = original + delta
tensor_to_check.set_float_element(i, x_pos)
y_pos = get_output()
# plus delta to this element, run op and get the new value of the result tensor.
x_neg = origin - delta
# Subtract delta from this element, run the op again
# and get the new value of the result tensor.
x_neg = original - delta
tensor_to_check.set_float_element(i, x_neg)
y_neg = get_output()
# restore old value
tensor_to_check.set_float_element(i, origin)
tensor_to_check.set_float_element(i, original)
# compute the gradient of this element and store it into a numpy array.
# compute the gradient of this element and store
# it into a numpy array.
gradient_flat[i] = (y_pos - y_neg) / delta / 2
# reshape the gradient result to the shape of the source tensor.
return gradient_flat.reshape(tensor_to_check.get_dims())
```
## Auto Graident Checker Framework
## Auto Gradient Check Framework
Each Operator Kernel has three kinds of Gradient:
1. Numerical gradient
2. CPU kernel gradient
3. GPU kernel gradient (if supported)
3. GPU kernel gradient (if supported by the device)
The numerical gradient only relies on forward Operator. So we use the numerical gradient as the reference value. And the gradient checking is performed in the following three steps:
The numerical gradient only relies on the forward Operator, so we use the numerical gradient as the reference value. The gradient checking is performed in the following three steps:
1. calculate the numerical gradient
2. calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient
3. calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient (if supported)
1. Calculate the numerical gradient
2. Calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient.
3. Calculate the GPU kernel gradient with the backward Operator and compare it with the numerical gradient (if supported).
#### Python Interface
......@@ -109,26 +112,27 @@ The numerical gradient only relies on forward Operator. So we use the numerical
"""
:param forward_op: used to create backward_op
:param input_vars: numpy value of input variable. The following
computation will use these variables.
:param inputs_to_check: the input variable with respect to which to compute the gradient.
computation will use these variables.
:param inputs_to_check: the input variable with respect to which the
gradient will be computed.
:param output_name: The final output variable name.
:param max_relative_error: The relative tolerance parameter.
:param no_grad_set: used when create backward ops
:param no_grad_set: used to create backward ops
:param only_cpu: only compute and check gradient on cpu kernel.
:return:
"""
```
### How to check if two numpy array is close enough?
if `abs_numerical_grad` is nearly zero, then use abs error for numerical_grad
### How to check if two numpy arrays are close enough?
If `abs_numerical_grad` is nearly zero, then use the absolute error for numerical_grad.
```python
numerical_grad = ...
operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
abs_numerical_grad = numpy.abs(numerical_grad)
# if abs_numerical_grad is nearly zero, then use abs error for numeric_grad, not relative
# error.
# if abs_numerical_grad is nearly zero, then use abs error for
# numeric_grad, instead of relative error.
abs_numerical_grad[abs_numerical_grad < 1e-3] = 1
diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad
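# Putting the pieces together (a comment-only restatement of the check above,
# not additional logic): with d_i = abs(numerical_grad_i) when that is >= 1e-3
# and d_i = 1 otherwise, diff_mat_i = abs(numerical_grad_i - operator_grad_i) / d_i.
# The maximum of diff_mat is then compared against max_relative_error, the
# relative tolerance parameter described in the interface above.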
......@@ -137,10 +141,10 @@ max_diff = numpy.max(diff_mat)
#### Notes:
The input data for the auto gradient checker should be reasonable, to avoid numerical stability problems.
#### Refs:
#### References:
- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
# C++ Data Feeding
When training with the Paddle V2 API, data feeding depends entirely on Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required.
In this document we present the fundamental design of the C++ data feeding process, which includes data reading, shuffling, and batching.
## Reader
A new concept named 'Reader' is introduced. `Reader` is a family of derived classes that can be held by a `Variable` and are used to read or process file data.
### `ReaderBase`
`ReaderBase` is the abstract base class of all readers. It defines the interface shared by all readers.
```cpp
class ReaderBase {
public:
explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
PADDLE_ENFORCE(!shapes_.empty());
}
// Read the next batch of data. (A 'batch' can consist of only one instance.)
virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
// Show whether the next batch exists.
virtual bool HasNext() const = 0;
// Reinitialize the reader and read the file from the beginning.
virtual void ReInit() = 0;
// Get the shape of one of the data items read in.
DDim shape(size_t idx) const;
// Get the shapes of all the data read in.
std::vector<DDim> shapes() const { return shapes_; }
// Set the shapes of the data to be read in.
void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
virtual ~ReaderBase() {}
protected:
std::vector<DDim> shapes_;
};
```
### `FileReader` and `DecoratedReader`
These two classes are derived from `ReaderBase` and are themselves further derived by specific readers. That is to say, in our design there are two kinds of readers: file readers and decorated readers. A file reader reads from a file of some specific format and yields only one instance of data at a time, e.g. a RecordIO reader or a JPEG reader. A decorated reader takes another reader (either a file reader or a decorated reader) as its 'underlying reader'. It gets data from its underlying reader, does some processing on it (shuffling or batching), and then yields the processed data. The output of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
All the readers share exactly the same interface defined in `ReaderBase`, so they can be decorated more than once: we can **shuffle** a reader's output and then **batch** the shuffled output. The interface consistency also allows related ops to use readers without knowing exactly what they are. A minimal sketch of this decorator composition is given below.
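The decorator composition can be sketched as follows. This is a simplified, self-contained illustration of the pattern rather than the actual Paddle classes: `Instance`, `Reader`, and `ShuffleReaderSketch` are stand-in names, and the real readers operate on `LoDTensor`s and carry shape information as shown in `ReaderBase` above.

```cpp
#include <algorithm>
#include <random>
#include <vector>

struct Instance {
  std::vector<float> data;  // stand-in for a LoDTensor
};

// Simplified reader interface (a cut-down version of ReaderBase above).
class Reader {
 public:
  virtual bool HasNext() const = 0;
  virtual void ReadNext(std::vector<Instance>* out) = 0;
  virtual ~Reader() {}
};

// A decorated reader wraps an underlying reader and post-processes its output.
class ShuffleReaderSketch : public Reader {
 public:
  ShuffleReaderSketch(Reader* underlying, size_t buffer_size)
      : underlying_(underlying), buffer_size_(buffer_size) {}

  bool HasNext() const override {
    return !buffer_.empty() || underlying_->HasNext();
  }

  // Fill an internal buffer from the underlying reader, shuffle it once,
  // then yield one instance per call. Callers are expected to check
  // HasNext() before calling ReadNext(), as with the real readers.
  void ReadNext(std::vector<Instance>* out) override {
    if (buffer_.empty()) {
      while (buffer_.size() < buffer_size_ && underlying_->HasNext()) {
        std::vector<Instance> one;
        underlying_->ReadNext(&one);
        buffer_.insert(buffer_.end(), one.begin(), one.end());
      }
      std::shuffle(buffer_.begin(), buffer_.end(), rng_);
    }
    out->assign(1, buffer_.back());
    buffer_.pop_back();
  }

 private:
  Reader* underlying_;
  size_t buffer_size_;
  std::vector<Instance> buffer_;
  std::mt19937 rng_{std::random_device{}()};
};
```

Because `ShuffleReaderSketch` is itself a `Reader`, it can in turn be wrapped by a batching decorator, which is exactly the repeated decoration described above.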
### `ReaderHolder`
Different readers belong to different class types. This leads to a problem: how can we put them into `Variable`s and fetch them out with a unified method? For example, if a Variable holds a `BatchReader`, we cannot get it with the following code:
```cpp
var->Get<ReaderBase>("batch_reader");
```
we have to write:
```cpp
var->Get<BatchReader>("batch_reader");
```
This would require us to know the reader's exact type every time we fetch a reader from a variable, which is nearly impossible.
To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which erases the reader's type. With `ReaderHolder` we are able to fetch all types of readers via `var->Get<ReaderHolder>("...")` and treat the obtained object as a reader. A sketch of this type-erasure wrapper follows.
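The type erasure can be sketched as follows; this is a simplified illustration rather than the exact Paddle implementation (the `Reset`/`Get` member names and the use of `std::unique_ptr` are assumptions), with `ReaderBase`'s interface copied in reduced form from the definition above:

```cpp
#include <memory>
#include <vector>

struct LoDTensor {};  // stand-in for paddle::framework::LoDTensor

// Reduced form of the ReaderBase interface shown earlier.
class ReaderBase {
 public:
  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
  virtual bool HasNext() const = 0;
  virtual void ReInit() = 0;
  virtual ~ReaderBase() {}
};

// ReaderHolder hides the concrete reader type: a Variable only needs to know
// that it holds "a reader", not which derived class that reader is.
class ReaderHolder {
 public:
  void Reset(ReaderBase* reader) { reader_.reset(reader); }
  ReaderBase* Get() const { return reader_.get(); }

  // Forward the reader interface so the holder can be used like a reader.
  void ReadNext(std::vector<LoDTensor>* out) { reader_->ReadNext(out); }
  bool HasNext() const { return reader_->HasNext(); }
  void ReInit() { reader_->ReInit(); }

 private:
  std::unique_ptr<ReaderBase> reader_;
};
```

With this wrapper in place, `var->Get<ReaderHolder>("...")` works uniformly no matter which concrete reader was stored.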
## Related Operators
To create and invoke readers, some new ops are introduced:
### `CreateReaderOp`
Each reader has its own creating op. A file reader's creating op has no input and yields the created file reader as its output. A decorated reader's creating op takes the underlying reader as its input and yields the new decorated reader.
### `ReadOp`
A reader is only a Variable; it cannot trigger the reading process by itself, so we add a `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader's `ReadNext()` function and gets a new batch of data (or only one instance of data, if we use a file reader directly). The output data of a reader is a `std::vector<LoDTensor>`, so the `ReadOp` also needs to split this vector and move the LoDTensors into their respective output Variables. A simplified sketch of this splitting step is shown below.
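The splitting step can be sketched like this. It is only an illustration of the idea: the function name, the way output variables are passed in, and the stand-in types are assumptions, not the actual operator code.

```cpp
#include <cassert>
#include <utility>
#include <vector>

struct LoDTensor {};  // stand-in for paddle::framework::LoDTensor

class Reader {
 public:
  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
  virtual ~Reader() {}
};

// What a ReadOp does on each run: pull one batch from the reader and scatter
// the resulting tensors into the op's output variables, one tensor per output.
void RunReadOpOnce(Reader* reader,
                   const std::vector<LoDTensor*>& output_tensors) {
  std::vector<LoDTensor> batch;
  reader->ReadNext(&batch);
  // The reader yields one LoDTensor per output slot.
  assert(batch.size() == output_tensors.size());
  for (size_t i = 0; i < batch.size(); ++i) {
    *output_tensors[i] = std::move(batch[i]);
  }
}
```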
......@@ -144,8 +144,9 @@ ch = fluid.make_channel(dtype=INT, buffer_size)
# Now write three elements to the channel
with fluid.while(steps=buffer_size):
fluid.send(ch, step)
fluid.close_channel(ch)
fluid.close_channel(ch)
with fluid.while(steps=buffer_size):
fluid.print(fluid.recv(ch))
```
......
......@@ -10,8 +10,7 @@ The following example shows the usage of `fluid.switch`.
a = fluid.Var(10)
b = fluid.Var(0)
switch = fluid.switch()
with switch.block():
with switch() as switch:
with switch.case(fluid.less_equal(a, 10)):
fluid.print("Case 1")
with switch.case(fluid.larger(a, 0)):
......
../../CONTRIBUTING.md
\ No newline at end of file
Development Standards
=====================
.. toctree::
:maxdepth: 1
contribute_to_paddle_cn.md
write_docs_cn.rst
Development
------------
.. toctree::
:maxdepth: 1
new_layer_en.rst
contribute_to_paddle_en.md
write_docs_en.rst
#############################################
How to Contribute to / Modify Documentation
#############################################
#################################
How to Contribute Documentation
#################################
PaddlePaddle's documentation consists of English documentation ``doc`` and Chinese documentation ``doc_cn``. Both are generated by `sphinx`_ driven through `cmake`_, and the generated documentation is stored in the ``doc`` and ``doc_cn`` subdirectories of the build directory, respectively.
The documentation can also be built with the PaddlePaddle tools; in that case all files are placed in the organized directory .ppo_workspace/content.
......
##################
########################
Contribute Documentation
##################
########################
PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
Both are built by `cmake`_ and `sphinx`_ ; the compiled documentation is stored under the ``doc`` and ``doc_cn`` directories.
......
......@@ -4,7 +4,7 @@
PaddlePaddle is a deep learning platform that originated at Baidu. It provides deep learning researchers with a rich set of APIs to easily complete tasks such as configuring neural networks and training models.
This document introduces the basic concepts of using PaddlePaddle and shows how to use it to solve a classic linear regression problem.
Before using this document, please follow the `installation guide <../build_and_install/index_cn.html>`_ to install PaddlePaddle.
Before using this document, please follow the `installation guide <../../build_and_install/index_cn.html>`_ to install PaddlePaddle.
Configuring the Network
......
Getting Started
===============
.. _quick_install:
Quick Install
+++++++++++++
PaddlePaddle supports quick installation with pip. It currently supports CentOS 6 or later, Ubuntu 14.04 or later, and MacOS 10.12, with Python 2.7 installed.
Run the following command to install the cpu_avx_openblas version:
.. code-block:: bash
pip install paddlepaddle
If you need the GPU-enabled version (cuda7.5_cudnn5_avx_openblas), run:
.. code-block:: bash
pip install paddlepaddle-gpu
For more detailed installation and build instructions, see:
.. toctree::
:maxdepth: 1
build_and_install/index_cn.rst
.. _quick_start:
Quick Start
+++++++++++
Create a file named housing.py and paste the following Python code:
.. code-block:: python
import paddle.v2 as paddle
# Initialize PaddlePaddle.
paddle.init(use_gpu=False, trainer_count=1)
# Configure the neural network.
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
# Infer using provided test data.
probs = paddle.infer(
output_layer=y_predict,
parameters=paddle.dataset.uci_housing.model(),
input=[item for item in paddle.dataset.uci_housing.test()()])
for i in xrange(len(probs)):
print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
Run :code:`python housing.py` and voila! It should print out a list of predictions for the test housing data.
.. toctree::
:maxdepth: 1
quickstart_cn.rst
concepts/use_concepts_cn.rst
GET STARTED
============
.. _quick_install:
Quick Install
----------------------
You can use pip to install PaddlePaddle with a single command. It supports
CentOS 6 or later, Ubuntu 14.04 or later, and MacOS 10.12, with Python 2.7 installed.
Simply run the following command to install the cpu_avx_openblas version:
.. code-block:: bash
pip install paddlepaddle
If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
.. code-block:: bash
pip install paddlepaddle-gpu
For more details about installation and build:
.. toctree::
:maxdepth: 1
build_and_install/index_en.rst
.. _quick_start:
Quick Start
+++++++++++
Create a new file called housing.py, and paste this Python
code:
.. code-block:: python
import paddle.v2 as paddle
# Initialize PaddlePaddle.
paddle.init(use_gpu=False, trainer_count=1)
# Configure the neural network.
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
# Infer using provided test data.
probs = paddle.infer(
output_layer=y_predict,
parameters=paddle.dataset.uci_housing.model(),
input=[item for item in paddle.dataset.uci_housing.test()()])
for i in xrange(len(probs)):
print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
Run :code:`python housing.py` and voila! It should print out a list of predictions
for the test housing data.
quickstart_en.rst
Quick Start
===========
Quick Install
-------------
PaddlePaddle supports quick installation with pip. It currently supports CentOS 6 or later, Ubuntu 14.04 or later, and MacOS 10.12, with Python 2.7 installed.
Run the following command to install the cpu_avx_openblas version:
.. code-block:: bash
pip install paddlepaddle
If you need the GPU-enabled version (cuda7.5_cudnn5_avx_openblas), run:
.. code-block:: bash
pip install paddlepaddle-gpu
For more detailed installation and build instructions, see :ref:`install_steps`.
Quick Use
---------
Create a file named housing.py and paste the following Python code:
.. code-block:: python
import paddle.v2 as paddle
# Initialize PaddlePaddle.
paddle.init(use_gpu=False, trainer_count=1)
# Configure the neural network.
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
# Infer using provided test data.
probs = paddle.infer(
output_layer=y_predict,
parameters=paddle.dataset.uci_housing.model(),
input=[item for item in paddle.dataset.uci_housing.test()()])
for i in xrange(len(probs)):
print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
Run :code:`python housing.py` and voila! It should print out a list of predictions for the test housing data.
Quick Start
============
Quick Install
-------------
You can use pip to install PaddlePaddle with a single command. It supports
CentOS 6 or later, Ubuntu 14.04 or later, and MacOS 10.12, with Python 2.7 installed.
Simply run the following command to install the cpu_avx_openblas version:
.. code-block:: bash
pip install paddlepaddle
If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
.. code-block:: bash
pip install paddlepaddle-gpu
For more details about installation and build: :ref:`install_steps` .
Quick Use
---------
Create a new file called housing.py, and paste this Python
code:
.. code-block:: python
import paddle.v2 as paddle
# Initialize PaddlePaddle.
paddle.init(use_gpu=False, trainer_count=1)
# Configure the neural network.
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
# Infer using provided test data.
probs = paddle.infer(
output_layer=y_predict,
parameters=paddle.dataset.uci_housing.model(),
input=[item for item in paddle.dataset.uci_housing.test()()])
for i in xrange(len(probs)):
print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
Run :code:`python housing.py` and voila! It should print out a list of predictions
for the test housing data.
## Building the PaddlePaddle Inference Library
## Installing and Building the C-API Inference Library
### Overview
......
PaddlePaddle C-API
C-API Inference Library
=======================
.. toctree::
......
## C-API Usage Workflow
This document describes the overall usage workflow of the PaddlePaddle C-API.
......
# Distributed Training
## Overview
This article explains how to use PaddlePaddle to run distributed training on different cluster frameworks. The distributed training architecture is shown in the figure below:
<img src="https://user-images.githubusercontent.com/13348433/31772175-5f419eca-b511-11e7-9db7-5231fe3d9ccb.png" width="500">
- Data shard: the data used to train the neural network is split into multiple partitions, and each partition is used by one trainer.
- Trainer: after starting, each trainer reads its partition of the data, performs the "forward" and "backward" computation of the neural network, and communicates with the parameter servers. After training on a certain amount of data, it uploads the computed gradients and then downloads the optimized, updated parameters.
- Parameter server: each parameter server stores only a part of all the parameters of the neural network. Parameter servers receive the gradients uploaded by the trainers, perform the parameter optimization update, and then send the updated parameters back to the trainers.
In this way, the distributed cooperation of trainers and parameter servers completes SGD training of the neural network. PaddlePaddle supports both synchronous stochastic gradient descent (SGD) and asynchronous SGD.
When training with synchronous SGD, PaddlePaddle uses a synchronization barrier so that gradient submission and parameter updates are executed in order. With asynchronous SGD, parameters are updated without waiting for all trainers to submit their gradients, which greatly improves computational parallelism: parameter servers do not depend on each other and receive gradients and update parameters in parallel; a parameter server does not wait for all trainers to submit gradients before taking the next step; and trainers do not depend on each other either, running model training in parallel. Although asynchronous SGD increases the parallelism of parameter updates, it cannot guarantee that parameters are updated synchronously: at any given time the parameters stored on one parameter server may be newer than those on another, so compared with synchronous SGD the gradients are noisier.
## Preparations
1. Prepare your compute cluster. A compute cluster usually consists of a group of Linux servers (from a few machines up to thousands). The servers can reach each other over a local area network (LAN), and each server has an IP address (or a DNS-resolvable hostname) that is unique within the cluster. Each machine in the cluster is usually called a "node".
1. Install PaddlePaddle on all nodes of the cluster. If you want to enable GPUs, you also need to install the corresponding GPU driver and CUDA on those nodes. For the various installation methods, see [build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html). We recommend the [Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html) installation method to install PaddlePaddle quickly.
After installation, run the following command to check the installed version (for the Docker installation, enter the Docker container first: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
```bash
$ paddle version
PaddlePaddle 0.10.0, compiled with
with_avx: ON
with_gpu: OFF
with_double: OFF
with_python: ON
with_rdma: OFF
with_timer: OFF
```
## Command-line arguments
We take the code in `doc/howto/usage/cluster/src/word2vec` as an example to show how to run distributed training with the PaddlePaddle v2 API.
We take the code in `doc/howto/cluster/src/word2vec` as an example to show how to run distributed training with the PaddlePaddle v2 API.
## Command-line arguments
### Starting the parameter server
Run the following command to start a parameter server, which then waits to exchange data with the trainers:
```bash
......@@ -167,22 +133,3 @@ test.txt-00002
- `train_data_dir`: directory containing the training data; it can be mounted from distributed storage or downloaded locally before the job starts.
- `test_data_dir`: directory containing the test dataset.
## Using distributed computing platforms or tools
PaddlePaddle can build distributed training jobs on a variety of distributed computing platforms, including:
- [Kubernetes](http://kubernetes.io) Google's open-source container-cluster scheduling framework, a complete solution for large-scale production clusters.
- [OpenMPI](https://www.open-mpi.org) A mature high-performance parallel computing framework.
- [Fabric](http://www.fabfile.org) A cluster management tool. You can use `Fabric` to write scripts that submit and manage cluster jobs.
For each cluster platform, we describe how to start and stop cluster jobs separately. All of these examples can be found in [cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2).
When a training job is scheduled onto the cluster, the distributed computing platform usually provides the parameters that the job needs to run, such as the node ID, IP address, and the total number of nodes, through an API or environment variables.
## Running on different clusters
- [Fabric cluster](fabric_cn.md)
- [OpenMPI cluster](openmpi_cn.md)
- [Kubernetes single machine](k8s_cn.md)
- [Kubernetes distributed](k8s_distributed_cn.md)
- [Kubernetes cluster training on AWS](k8s_aws_cn.md)
# Distributed Training
## Introduction
In this article, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed training job:
<img src="https://user-images.githubusercontent.com/13348433/31772146-41523d84-b511-11e7-8a12-a69fd136c283.png" width="500">
- Data shard: the training data is split into multiple partitions; each trainer uses its partition of the whole dataset to do the training job.
- Trainer: each trainer reads its data shard and trains the neural network. It then uploads the calculated "gradients" to the parameter servers and waits for the parameters to be optimized on the parameter server side. When that finishes, the trainer downloads the optimized parameters and continues training.
- Parameter server: every parameter server stores part of the whole neural network model data. Parameter servers run the optimization calculations when gradients are uploaded from trainers, and then send the updated parameters back to the trainers.
PaddlePaddle supports both synchronous stochastic gradient descent (SGD) and asynchronous SGD.
When training with synchronous SGD, PaddlePaddle uses an internal "synchronization barrier" which makes gradient uploads and parameter downloads happen in strict order. On the other hand, asynchronous SGD does not wait for all trainers to finish uploading at a single step, which increases the parallelism of distributed training: parameter servers do not depend on each other and do parameter optimization concurrently; parameter servers do not wait for trainers, so trainers also do their work concurrently. However, asynchronous SGD introduces more randomness and noise into the gradients.
## Preparations
1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
After installation, you can check the version by typing the below command (run a docker container if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
```bash
$ paddle version
PaddlePaddle 0.10.0rc, compiled with
with_avx: ON
with_gpu: OFF
with_double: OFF
with_python: ON
with_rdma: OFF
with_timer: OFF
```
We'll take `doc/howto/usage/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
## Command-line arguments
We'll take `doc/howto/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
### Starting parameter server
Type the below command to start a parameter server which will wait for trainers to connect:
......@@ -171,21 +138,3 @@ Your workspace may looks like:
- `train_data_dir`: contains the training data. Mount it from a storage service or copy the training data here.
- `test_data_dir`: contains the test data.
## Use cluster platforms or cluster management tools
PaddlePaddle supports running jobs on several platforms including:
- [Kubernetes](http://kubernetes.io) open-source system for automating deployment, scaling, and management of containerized applications from Google.
- [OpenMPI](https://www.open-mpi.org) Mature high performance parallel computing framework.
- [Fabric](http://www.fabfile.org) A cluster management tool. Write scripts to submit jobs or manage the cluster.
We'll introduce cluster job management on these platforms. The examples can be found under [cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2).
These cluster platforms provide an API or environment variables for the training processes when the job is dispatched to different nodes, such as the node ID, IP address, and the total number of nodes.
## Use different clusters
- [fabric](fabric_en.md)
- [openmpi](openmpi_en.md)
- [kubernetes](k8s_en.md)
- [kubernetes on AWS](k8s_aws_en.md)
Distributed Training
====================
This section explains how to use PaddlePaddle to run distributed training on different cluster frameworks. The distributed training architecture is shown in the figure below:
.. image:: src/ps_cn.png
:width: 500
- Data shard: the data used to train the neural network is split into multiple partitions, and each partition is used by one trainer.
- Trainer: after starting, each trainer reads its partition of the data, performs the "forward" and "backward" computation of the neural network, and communicates with the parameter servers. After training on a certain amount of data, it uploads the computed gradients and then downloads the optimized, updated parameters.
- Parameter server: each parameter server stores only a part of all the parameters of the neural network. Parameter servers receive the gradients uploaded by the trainers, perform the parameter optimization update, and then send the updated parameters back to the trainers.
In this way, the distributed cooperation of trainers and parameter servers completes SGD training of the neural network. PaddlePaddle supports both synchronous stochastic gradient descent (SGD) and asynchronous SGD.
When training with synchronous SGD, PaddlePaddle uses a synchronization barrier so that gradient submission and parameter updates are executed in order. With asynchronous SGD, parameters are updated without waiting for all trainers to submit their gradients, which greatly improves computational parallelism: parameter servers do not depend on each other and receive gradients and update parameters in parallel; a parameter server does not wait for all trainers to submit gradients before taking the next step; and trainers do not depend on each other either, running model training in parallel. Although asynchronous SGD increases the parallelism of parameter updates, it cannot guarantee that parameters are updated synchronously: at any given time the parameters stored on one parameter server may be newer than those on another, so compared with synchronous SGD the gradients are noisier.
.. toctree::
:maxdepth: 1
preparations_cn.md
cmd_argument_cn.md
multi_cluster/index_cn.rst
Distributed Training
====================
In this section, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed training job:
.. image:: src/ps_en.png
:width: 500
- Data shard: the training data is split into multiple partitions; each trainer uses its partition of the whole dataset to do the training job.
- Trainer: each trainer reads its data shard and trains the neural network. It then uploads the calculated "gradients" to the parameter servers and waits for the parameters to be optimized on the parameter server side. When that finishes, the trainer downloads the optimized parameters and continues training.
- Parameter server: every parameter server stores part of the whole neural network model data. Parameter servers run the optimization calculations when gradients are uploaded from trainers, and then send the updated parameters back to the trainers.
PaddlePaddle supports both synchronous stochastic gradient descent (SGD) and asynchronous SGD.
When training with synchronous SGD, PaddlePaddle uses an internal "synchronization barrier" which makes gradient uploads and parameter downloads happen in strict order. On the other hand, asynchronous SGD does not wait for all trainers to finish uploading at a single step, which increases the parallelism of distributed training: parameter servers do not depend on each other and do parameter optimization concurrently; parameter servers do not wait for trainers, so trainers also do their work concurrently. However, asynchronous SGD introduces more randomness and noise into the gradients.
.. toctree::
:maxdepth: 1
preparations_en.md
cmd_argument_en.md
multi_cluster/index_en.rst
Running on Different Clusters
=============================
PaddlePaddle can build distributed training jobs on a variety of distributed computing platforms, including:
- `Kubernetes <http://kubernetes.io>`_ Google's open-source container-cluster scheduling framework, a complete solution for large-scale production clusters.
- `OpenMPI <https://www.open-mpi.org>`_ A mature high-performance parallel computing framework.
- `Fabric <http://www.fabfile.org>`_ A cluster management tool. You can use `Fabric` to write scripts that submit and manage cluster jobs.
For each cluster platform, we describe how to start and stop cluster jobs separately. All of these examples can be found at `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ .
When a training job is scheduled onto the cluster, the distributed computing platform usually provides the parameters that the job needs to run, such as the node ID, IP address, and the total number of nodes, through an API or environment variables.
.. toctree::
:maxdepth: 1
fabric_cn.md
openmpi_cn.md
k8s_cn.md
k8s_distributed_cn.md
k8s_aws_cn.md
Use different clusters
======================
PaddlePaddle supports running jobs on several platforms including:
- `Kubernetes <http://kubernetes.io>`_ open-source system for automating deployment, scaling, and management of containerized applications from Google.
- `OpenMPI <https://www.open-mpi.org>`_ Mature high performance parallel computing framework.
- `Fabric <http://www.fabfile.org>`_ A cluster management tool. Write scripts to submit jobs or manage the cluster.
We'll introduce cluster job management on these platforms. The examples can be found under `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ .
These cluster platforms provide an API or environment variables for the training processes when the job is dispatched to different nodes, such as the node ID, IP address, and the total number of nodes.
.. toctree::
:maxdepth: 1
fabric_en.md
openmpi_en.md
k8s_en.md
k8s_aws_en.md
## Preparations
1. Prepare your compute cluster. A compute cluster usually consists of a group of Linux servers (from a few machines up to thousands). The servers can reach each other over a local area network (LAN), and each server has an IP address (or a DNS-resolvable hostname) that is unique within the cluster. Each machine in the cluster is usually called a "node".
1. Install PaddlePaddle on all nodes of the cluster. If you want to enable GPUs, you also need to install the corresponding GPU driver and CUDA on those nodes. For the various installation methods, see [build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html). We recommend the [Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html) installation method to install PaddlePaddle quickly.
After installation, run the following command to check the installed version (for the Docker installation, enter the Docker container first: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
```bash
$ paddle version
PaddlePaddle 0.10.0, compiled with
with_avx: ON
with_gpu: OFF
with_double: OFF
with_python: ON
with_rdma: OFF
with_timer: OFF
```
## Preparations
1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
After installation, you can check the version by typing the below command (run a docker container if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
```bash
$ paddle version
PaddlePaddle 0.10.0rc, compiled with
with_avx: ON
with_gpu: OFF
with_double: OFF
with_python: ON
with_rdma: OFF
with_timer: OFF
```
.. _cmd_line_index:
Setting command-line parameters
Command-line parameter settings
===============================
.. toctree::
......
../../../CONTRIBUTING.md
\ No newline at end of file
Advanced Guide
Advanced Usage
==============
Usage Notes
-----------
.. toctree::
:maxdepth: 1
usage/cmd_parameter/index_cn.rst
usage/cluster/cluster_train_cn.md
usage/capi/index_cn.rst
Development Standards
---------------------
.. toctree::
:maxdepth: 1
dev/contribute_to_paddle_cn.md
dev/write_docs_cn.rst
Model Configuration
-------------------
.. toctree::
:maxdepth: 1
deep_model/rnn/index_cn.rst
Performance Tuning
------------------
.. toctree::
:maxdepth: 1
cmd_parameter/index_cn.rst
cluster/index_cn.rst
capi/index_cn.rst
rnn/index_cn.rst
optimization/gpu_profiling_cn.rst
HOW TO
=======
Usage
-------
.. toctree::
:maxdepth: 1
usage/cmd_parameter/index_en.rst
usage/cluster/cluster_train_en.md
Development
------------
.. toctree::
:maxdepth: 1
dev/new_layer_en.rst
dev/contribute_to_paddle_en.md
dev/write_docs_en.rst
Configuration
-------------
.. toctree::
:maxdepth: 1
deep_model/rnn/index_en.rst
Optimization
-------------
.. toctree::
:maxdepth: 1
cmd_parameter/index_en.rst
cluster/index_en.rst
rnn/index_en.rst
optimization/gpu_profiling_en.rst
====================================
GPU Performance Profiling and Tuning
====================================
======================
GPU Performance Tuning
======================
.. contents::
......
......@@ -5,6 +5,8 @@ PaddlePaddle Documentation
:maxdepth: 1
getstarted/index_cn.rst
build_and_install/index_cn.rst
howto/index_cn.rst
dev/index_cn.rst
api/index_cn.rst
faq/index_cn.rst
......@@ -5,5 +5,7 @@ PaddlePaddle Documentation
:maxdepth: 1
getstarted/index_en.rst
build_and_install/index_en.rst
howto/index_en.rst
dev/index_en.rst
api/index_en.rst
......@@ -20,10 +20,13 @@ endif()
cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
cc_test(variable_test SRCS variable_test.cc)
cc_library(threadpool SRCS threadpool.cc DEPS enforce)
......@@ -92,11 +95,4 @@ cc_test(init_test SRCS init_test.cc DEPS init)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
if(NOT WITH_C_API AND WITH_FLUID)
file(GLOB FRAMEWORK_HEADERS *.h)
install(FILES ${FRAMEWORK_HEADERS} DESTINATION include/paddle/framework)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/framework.pb.h DESTINATION include/paddle/framework)
install(FILES details/cow_ptr.h details/op_registry.h DESTINATION include/paddle/framework/details)
endif()
cc_test(channel_test SRCS channel_test.cc)
......@@ -186,9 +186,8 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
: prog_(prog), desc_(desc) {
need_update_ = true;
for (auto &op : other.ops_) {
ops_.emplace_back(new OpDesc(*op, this));
ops_.emplace_back(new OpDesc(*op->Proto(), prog, this));
}
for (auto &it : other.vars_) {
auto *var = new VarDesc(*it.second);
vars_[it.first].reset(var);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -22,6 +22,28 @@ limitations under the License. */
using paddle::framework::Channel;
using paddle::framework::MakeChannel;
using paddle::framework::CloseChannel;
using paddle::framework::details::Buffered;
using paddle::framework::details::UnBuffered;
void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
unsigned sum_send = 0;
std::thread t([&]() {
for (int i = 0; i < 5; i++) {
EXPECT_EQ(ch->Send(&i), true);
sum_send += i;
}
});
for (int i = 0; i < 5; i++) {
int recv;
EXPECT_EQ(ch->Receive(&recv), true);
EXPECT_EQ(recv, i);
}
CloseChannel(ch);
t.join();
EXPECT_EQ(sum_send, 10U);
delete ch;
}
TEST(Channel, MakeAndClose) {
using paddle::framework::details::Buffered;
......@@ -60,6 +82,86 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) {
delete ch;
}
// This tests that a channel must return false
// on send and receive performed after closing the channel.
// Receive will only return false after close when queue is empty.
// By creating separate threads for sending and receiving, we make this
// function able to test both buffered and unbuffered channels.
void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
const size_t data = 5;
std::thread send_thread{[&]() {
size_t i = data;
EXPECT_EQ(ch->Send(&i), true); // should not block
}};
std::thread recv_thread{[&]() {
size_t i;
EXPECT_EQ(ch->Receive(&i), true); // should not block
EXPECT_EQ(i, data);
}};
send_thread.join();
recv_thread.join();
// After closing send should return false. Receive should
// also return false as there is no data in queue.
CloseChannel(ch);
send_thread = std::thread{[&]() {
size_t i = data;
EXPECT_EQ(ch->Send(&i), false); // should return false
}};
recv_thread = std::thread{[&]() {
size_t i;
// should return false because channel is closed and queue is empty
EXPECT_EQ(ch->Receive(&i), false);
}};
send_thread.join();
recv_thread.join();
}
TEST(Channel, SendReceiveClosedBufferedChannelPanics) {
size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
SendReceiveWithACloseChannelShouldPanic(ch);
delete ch;
}
TEST(Channel, SendReceiveClosedUnBufferedChannelPanics) {
auto ch = MakeChannel<size_t>(0);
SendReceiveWithACloseChannelShouldPanic(ch);
delete ch;
}
TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
for (size_t i = 0; i < buffer_size; ++i) {
EXPECT_EQ(ch->Send(&i), true); // sending should not block
}
size_t out;
for (size_t i = 0; i < buffer_size / 2; ++i) {
EXPECT_EQ(ch->Receive(&out), true); // receiving should not block
EXPECT_EQ(out, i);
}
CloseChannel(ch);
for (size_t i = buffer_size / 2; i < buffer_size; ++i) {
EXPECT_EQ(ch->Receive(&out),
true); // receiving should return residual values.
EXPECT_EQ(out, i);
}
for (size_t i = 0; i < buffer_size; ++i) {
EXPECT_EQ(ch->Receive(&out),
false); // receiving on closed channel should return false
}
delete ch;
}
TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
......@@ -74,7 +176,7 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
sum += i;
}
});
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
EXPECT_EQ(sum, 45U);
CloseChannel(ch);
......@@ -82,31 +184,17 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
delete ch;
}
TEST(Channel, SimpleUnbufferedChannelTest) {
TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) {
auto ch = MakeChannel<int>(0);
unsigned sum_send = 0;
std::thread t([&]() {
for (int i = 0; i < 5; i++) {
EXPECT_EQ(ch->Send(&i), true);
sum_send += i;
}
});
for (int i = 0; i < 5; i++) {
int recv;
EXPECT_EQ(ch->Receive(&recv), true);
EXPECT_EQ(recv, i);
}
RecevingOrderEqualToSendingOrder(ch);
}
CloseChannel(ch);
t.join();
EXPECT_EQ(sum_send, 10U);
delete ch;
TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel) {
auto ch = MakeChannel<int>(10);
RecevingOrderEqualToSendingOrder(ch);
}
// This tests that closing a buffered channel also unblocks
// any receivers waiting on the channel
TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
auto ch = MakeChannel<int>(1);
void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
......@@ -117,15 +205,14 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
t[i] = std::thread(
[&](bool *p) {
int data;
// All reads should return false
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
// Verify that all threads are blocked
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
......@@ -134,7 +221,7 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
......@@ -142,13 +229,12 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
// This tests that closing a buffered channel also unblocks
// any senders waiting for channel to have write space
TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(1);
void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
using paddle::framework::details::Buffered;
using paddle::framework::details::UnBuffered;
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
......@@ -168,34 +254,56 @@ TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that atleast 4 threads are blocked
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (thread_ended[i] == false) ct++;
if (dynamic_cast<Buffered<int> *>(ch)) {
// If ch is Buffered, atleast 4 threads must be blocked.
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (!thread_ended[i]) ct++;
}
EXPECT_GE(ct, 4);
} else {
// If ch is UnBuffered, all the threads should be blocked.
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
// Atleast 4 threads must be blocked
EXPECT_GE(ct, 4);
// Explicitly close the thread
// This should unblock all senders
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Verify that only 1 send was successful
ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (send_success[i]) ct++;
if (dynamic_cast<Buffered<int> *>(ch)) {
// Verify that only 1 send was successful
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
for (size_t i = 0; i < num_threads; i++) t[i].join();
}
// This tests that closing a buffered channel also unblocks
// any receivers waiting on the channel
TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
auto ch = MakeChannel<int>(1);
ChannelCloseUnblocksReceiversTest(ch);
delete ch;
}
// This tests that closing a buffered channel also unblocks
// any senders waiting for channel to have write space
TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(1);
ChannelCloseUnblocksSendersTest(ch);
delete ch;
}
......@@ -203,40 +311,7 @@ TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
// unblocks any receivers waiting for senders
TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
auto ch = MakeChannel<int>(0);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked becausew of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the thread
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
ChannelCloseUnblocksReceiversTest(ch);
delete ch;
}
......@@ -244,40 +319,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
// unblocks any senders waiting for receivers
TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(0);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked becausew of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data = 10;
EXPECT_EQ(ch->Send(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the thread
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
ChannelCloseUnblocksReceiversTest(ch);
delete ch;
}
......@@ -340,3 +382,129 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
EXPECT_EQ(sum_receive, 28U);
delete ch;
}
// This tests that destroying a channel unblocks
// any senders waiting for channel to have write space
void ChannelDestroyUnblockSenders(Channel<int> *ch) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
bool send_success[num_threads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
*success = ch->Send(&data);
*ended = true;
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
bool is_buffered_channel = false;
if (dynamic_cast<Buffered<int> *>(ch)) is_buffered_channel = true;
if (is_buffered_channel) {
// If channel is buffered, verify that atleast 4 threads are blocked
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (thread_ended[i] == false) ct++;
}
// Atleast 4 threads must be blocked
EXPECT_GE(ct, 4);
} else {
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
// Explicitly destroy the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Count the number of successful sends
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (send_success[i]) ct++;
}
if (is_buffered_channel) {
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
} else {
// In unbuffered channel, no send should be successful
EXPECT_EQ(ct, 0);
}
// Join all threads
for (size_t i = 0; i < num_threads; i++) t[i].join();
}
// This tests that destroying a channel also unblocks
// any receivers waiting on the channel
void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
// All reads should return false
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// delete the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
}
TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
size_t buffer_size = 1;
auto ch = MakeChannel<int>(buffer_size);
ChannelDestroyUnblockReceivers(ch);
}
TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) {
size_t buffer_size = 1;
auto ch = MakeChannel<int>(buffer_size);
ChannelDestroyUnblockSenders(ch);
}
// This tests that destroying an unbuffered channel also
// unblocks any receivers waiting for senders
TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) {
auto ch = MakeChannel<int>(0);
ChannelDestroyUnblockReceivers(ch);
}
TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) {
auto ch = MakeChannel<int>(0);
ChannelDestroyUnblockSenders(ch);
}
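Taken together, the tests above pin down the Channel contract: Close() unblocks pending senders and receivers, and deleting the channel waits for in-flight calls before returning. The following is a minimal, hedged usage sketch of that API as exercised by these tests (the header path is assumed; error handling and threading are omitted):
#include "paddle/framework/channel.h"  // assumed header for Channel/MakeChannel
void ChannelUsageSketch() {
  auto* ch = paddle::framework::MakeChannel<int>(1);  // capacity 1 -> Buffered<int>
  int in = 42;
  bool sent = ch->Send(&in);          // true: the buffer has space
  int out = 0;
  bool received = ch->Receive(&out);  // true: out == 42
  ch->Close();                        // further Send/Receive return false
  delete ch;                          // waits for any in-flight Send/Receive
  (void)sent;
  (void)received;
}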
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <condition_variable>
#include <deque>
#include <mutex>
......@@ -24,6 +25,14 @@ namespace paddle {
namespace framework {
namespace details {
// Four of the properties of Buffered Channel:
// - A send to a full channel blocks temporarily until a receive from the
// channel or the channel is closed.
// - A receive from an empty channel blocks temporarily until a send to the
// channel or the channel is closed.
// - A send to a closed channel returns false immediately.
// - A receive from a closed channel returns false immediately.
template <typename T>
class Buffered : public paddle::framework::Channel<T> {
friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
......@@ -41,8 +50,11 @@ class Buffered : public paddle::framework::Channel<T> {
std::mutex mu_;
std::condition_variable empty_cond_var_;
std::condition_variable full_cond_var_;
std::condition_variable destructor_cond_var_;
std::deque<T> channel_;
bool closed_;
std::atomic<bool> closed_{false};
std::atomic<unsigned> send_ctr{0};
std::atomic<unsigned> recv_ctr{0};
Buffered(size_t cap) : cap_(cap), closed_(false) {
PADDLE_ENFORCE_GT(cap, 0);
......@@ -53,35 +65,52 @@ class Buffered : public paddle::framework::Channel<T> {
template <typename T>
bool Buffered<T>::Send(T* item) {
bool ret = false;
if (closed_) {
return ret;
}
send_ctr++;
std::unique_lock<std::mutex> lock(mu_);
full_cond_var_.wait(lock,
[this]() { return channel_.size() < cap_ || closed_; });
bool ret = false;
if (!closed_) {
channel_.push_back(std::move(*item));
lock.unlock();
empty_cond_var_.notify_one();
ret = true;
}
send_ctr--;
destructor_cond_var_.notify_one();
return ret;
}
template <typename T>
bool Buffered<T>::Receive(T* item) {
bool ret = false;
// Once the channel has been closed and all data has been consumed,
// just return false. Don't even try acquiring the mutex.
if (closed_ && channel_.empty()) {
return false;
}
recv_ctr++;
std::unique_lock<std::mutex> lock(mu_);
empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
bool ret = false;
if (!channel_.empty()) {
*item = std::move(channel_.front());
channel_.pop_front();
full_cond_var_.notify_one();
ret = true;
}
recv_ctr--;
destructor_cond_var_.notify_one();
return ret;
}
template <typename T>
void Buffered<T>::Close() {
if (closed_) {
return;
}
std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
NotifyAllParticipants(&lock);
......@@ -93,6 +122,12 @@ Buffered<T>::~Buffered() {
closed_ = true;
channel_.clear();
NotifyAllParticipants(&lock);
// The destructor must wait for all readers and writers to complete their tasks.
// The channel has been closed, so no new readers or writers will be accepted.
lock.lock();
destructor_cond_var_.wait(
lock, [this]() { return send_ctr == 0 && recv_ctr == 0; });
}
template <typename T>
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......
......@@ -23,6 +23,13 @@ namespace paddle {
namespace framework {
namespace details {
// Four of the properties of UnBuffered Channel:
// - A send to a channel blocks temporarily until a receive from the
// channel or the channel is closed.
// - A receive from a channel blocks temporarily until a send to the
// channel or the channel is closed.
// - A send to a closed channel returns false immediately.
// - A receive from a closed channel returns false immediately.
template <typename T>
class UnBuffered : public paddle::framework::Channel<T> {
friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
......@@ -45,9 +52,11 @@ class UnBuffered : public paddle::framework::Channel<T> {
// A transaction occurs only when both are true
std::atomic<bool> reader_found_{false}, writer_found_{false};
std::condition_variable cv_channel_;
std::condition_variable_any cv_reader_, cv_writer_;
std::condition_variable_any cv_reader_, cv_writer_, cv_destructor_;
T* item{nullptr};
std::atomic<bool> closed_{false};
std::atomic<unsigned> send_ctr{0};
std::atomic<unsigned> recv_ctr{0};
UnBuffered() : closed_(false) {}
......@@ -58,6 +67,11 @@ class UnBuffered : public paddle::framework::Channel<T> {
// be sent from a writer to a reader.
template <typename T>
bool UnBuffered<T>::Send(T* data) {
bool ret = false;
if (closed_) {
return ret;
}
send_ctr++;
// Prevent other writers from entering
std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
writer_found_ = true;
......@@ -66,7 +80,6 @@ bool UnBuffered<T>::Send(T* data) {
cv_writer_.wait(cv_lock,
[this]() { return reader_found_ == true || closed_; });
cv_reader_.notify_one();
bool ret = false;
if (!closed_) {
std::unique_lock<std::mutex> channel_lock(mu_ch_);
item = data;
......@@ -78,6 +91,8 @@ bool UnBuffered<T>::Send(T* data) {
ret = true;
}
writer_found_ = false;
send_ctr--;
cv_destructor_.notify_one();
return ret;
}
......@@ -85,6 +100,12 @@ bool UnBuffered<T>::Send(T* data) {
// data that was sent by a writer is read from a reader.
template <typename T>
bool UnBuffered<T>::Receive(T* data) {
bool ret = false;
// If channel is closed, we don't even want any reader to enter.
// Unlike a buffered channel, an unbuffered channel does not allow
// readers to read after closing because there is no buffer to be consumed.
if (closed_) return ret;
recv_ctr++;
// Prevent other readers from entering
std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
reader_found_ = true;
......@@ -93,7 +114,6 @@ bool UnBuffered<T>::Receive(T* data) {
cv_reader_.wait(cv_lock,
[this]() { return writer_found_ == true || closed_; });
cv_writer_.notify_one();
bool ret = false;
if (!closed_) {
std::unique_lock<std::mutex> lock_ch{mu_ch_};
// Reader should wait for the writer to first write its data
......@@ -107,6 +127,8 @@ bool UnBuffered<T>::Receive(T* data) {
cv_channel_.notify_one();
}
reader_found_ = false;
recv_ctr--;
cv_destructor_.notify_one();
return ret;
}
......@@ -114,6 +136,9 @@ bool UnBuffered<T>::Receive(T* data) {
// that take place once the channel is closed.
template <typename T>
void UnBuffered<T>::Close() {
if (closed_) {
return;
}
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
......@@ -129,6 +154,9 @@ UnBuffered<T>::~UnBuffered() {
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
lock.lock();
cv_destructor_.wait(lock,
[this]() { return send_ctr == 0 && recv_ctr == 0; });
}
// This function notifies all the readers, writers and
......
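The unbuffered variant is a pure rendezvous: a Send only completes once a Receive is in progress on another thread (or the channel is closed), which is why the send_ctr/recv_ctr bookkeeping above must drain before the destructor may return. A small sketch of that pairing, assuming the same Channel interface (thread management simplified; not part of the diff):
#include <thread>
#include "paddle/framework/channel.h"  // assumed header for Channel/MakeChannel
void UnbufferedRendezvousSketch() {
  auto* ch = paddle::framework::MakeChannel<int>(0);  // capacity 0 -> UnBuffered<int>
  std::thread receiver([&]() {
    int value = 0;
    ch->Receive(&value);  // unblocks the sender below; value == 7 on success
  });
  int data = 7;
  ch->Send(&data);        // blocks until the receiver above arrives
  receiver.join();
  ch->Close();
  delete ch;
}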
......@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/reader.h"
#include "paddle/platform/place.h"
#include "paddle/platform/profiler.h"
......@@ -52,11 +53,13 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarDesc::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else if (var_type == proto::VarDesc::READER) {
var->GetMutable<ReaderHolder>();
} else {
PADDLE_THROW(
"Variable type %d is not in "
"[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE,"
" PLACE_LIST]",
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER]",
var_type);
}
}
......
......@@ -116,7 +116,7 @@ message LoDTensorArrayDesc {
optional int32 lod_level = 2 [ default = 0 ];
}
message Reader { repeated LoDTensorDesc lod_tensor = 1; }
message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; }
message VarDesc {
enum VarType {
......@@ -136,7 +136,7 @@ message VarDesc {
optional LoDTensorDesc lod_tensor = 4;
optional TensorDesc selected_rows = 5;
optional LoDTensorArrayDesc tensor_array = 6;
optional Reader reader = 7;
optional ReaderDesc reader = 7;
}
message BlockDesc {
......
......@@ -48,12 +48,26 @@ namespace framework {
*/
struct LoD : public std::vector<Vector<size_t>> {
using std::vector<Vector<size_t>>::vector;
platform::Place place() const {
if (this->size() == 0) {
// Not initialized yet.
return platform::CPUPlace();
} else {
return this->front().place();
}
}
void CopyFromCUDA() {
for (auto it = this->begin(); it != this->end(); ++it) {
it->CopyFromCUDA();
}
}
void CopyToPeer(platform::Place place) {
for (auto it = this->begin(); it != this->end(); ++it) {
it->CopyToPeer(place);
}
}
};
std::ostream& operator<<(std::ostream& os, const LoD& lod);
......
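The new LoD helpers simply delegate to each underlying Vector<size_t> level, so a whole LoD can be queried for or moved to a place in one call. A hedged sketch of the intended use (not from the source; assumes the usual framework headers):
#include "paddle/framework/lod_tensor.h"  // declares LoD
void LoDPlaceSketch() {
  paddle::framework::LoD lod{{0, 2, 5}};           // one level with two sequences
  auto where = lod.place();                        // CPUPlace() until copied to a device
  lod.CopyToPeer(paddle::platform::CUDAPlace(0));  // copy every level to the target place
  lod.CopyFromCUDA();                              // and back to the host side
  (void)where;
}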
......@@ -28,28 +28,6 @@ __global__ void test(size_t* a, int size) {
}
}
TEST(Vector, Normal) {
using namespace paddle::framework;
using namespace paddle::platform;
using namespace paddle::memory;
paddle::framework::InitDevices();
paddle::framework::Vector<size_t> vec({1, 2, 3});
size_t* ptr = vec.data();
for (size_t i = 0; i < vec.size(); ++i) {
EXPECT_EQ(vec[i], *(ptr + i));
}
vec.clear();
vec.CopyFromCUDA();
std::vector<size_t> v = {1, 2, 3};
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], vec[i]);
}
}
TEST(LoD, data) {
paddle::framework::InitDevices();
......
......@@ -40,26 +40,35 @@ class Vector : public std::vector<T> {
Vector() {}
Vector(const std::vector<T> &v) : std::vector<T>(v) {} // NOLINT
virtual ~Vector() {
#ifdef PADDLE_WITH_CUDA
if (cuda_ptr_ != nullptr) {
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
}
#endif
}
inline platform::Place place() const { return place_; }
/*! Return a pointer to constant memory block. */
inline const T *data(platform::Place place) const;
/*! Return a pointer to mutable memory block. */
inline T *mutable_data(platform::Place place);
// TODO(dzhwinter): below interfaces should be removed
/* Get device vector */
T *cuda_data() {
CopyToCUDA();
PADDLE_ENFORCE_NOT_NULL(
cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
return static_cast<T *>(cuda_ptr_);
return static_cast<T *>(cuda_ptr_.get());
}
/* Get host vector */
T *data() { return std::vector<T>::data(); }
const T *data() const { return std::vector<T>::data(); }
T *data(const platform::Place &place) {
if (platform::is_cpu_place(place)) {
return data();
} else {
return cuda_data();
}
}
/* Synchronize host vector to device vector */
void CopyToCUDA();
/* Synchronize device vector to host vector */
......@@ -68,25 +77,73 @@ class Vector : public std::vector<T> {
void CopyToPeer(platform::Place);
private:
void *cuda_ptr_ = nullptr;
std::shared_ptr<void> cuda_ptr_;
size_t cuda_size_ = 0; // device vector numel
platform::CUDAPlace place_;
};
template <typename T>
void Vector<T>::CopyToCUDA() {
inline const T *Vector<T>::data(platform::Place place) const {
if (platform::is_cpu_place(place)) {
return std::vector<T>::data();
} else if (platform::is_gpu_place(place)) {
if (cuda_ptr_ == nullptr) {
return nullptr;
}
if (boost::get<platform::CUDAPlace>(place) == place_) {
return static_cast<const T *>(cuda_ptr_.get());
} else {
PADDLE_THROW(
"Unmatched place. Please use `mutable_data` copy lod to the target "
"Place first.");
}
} else {
PADDLE_THROW("Unsupport Place.");
}
}
template <typename T>
inline T *Vector<T>::mutable_data(platform::Place place) {
if (platform::is_cpu_place(place)) {
return std::vector<T>::data();
} else if (platform::is_gpu_place(place)) {
if (boost::get<platform::CUDAPlace>(place) != place_) {
place_ = boost::get<platform::CUDAPlace>(place);
}
#ifdef PADDLE_WITH_CUDA
if (cuda_size_ < this->size()) {
if (cuda_ptr_ != nullptr) {
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
cuda_ptr_.reset(
memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
memory::PlainDeleter<void, platform::CUDAPlace>(place_));
}
cuda_ptr_ =
memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T));
cuda_size_ = this->size();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_);
memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
static_cast<const void *>(this->data()),
this->size() * sizeof(T), ctx->stream());
ctx->Wait();
return static_cast<T *>(cuda_ptr_.get());
#else
return nullptr;
#endif
} else {
PADDLE_THROW("Unsupport Place.");
}
}
template <typename T>
void Vector<T>::CopyToCUDA() {
#ifdef PADDLE_WITH_CUDA
if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
cuda_ptr_.reset(
memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
memory::PlainDeleter<void, platform::CUDAPlace>(place_));
}
cuda_size_ = this->size();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_);
memory::Copy(place_, cuda_ptr_, platform::CPUPlace(),
memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
static_cast<const void *>(this->data()),
this->size() * sizeof(T), ctx->stream());
ctx->Wait();
......@@ -104,32 +161,32 @@ void Vector<T>::CopyFromCUDA() {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_);
memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
ctx->stream());
static_cast<const void *>(cuda_ptr_.get()),
this->size() * sizeof(T), ctx->stream());
ctx->Wait();
#endif
}
template <typename T>
void Vector<T>::CopyToPeer(platform::Place peer_place) {
void Vector<T>::CopyToPeer(platform::Place place) {
#ifdef PADDLE_WITH_CUDA
auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
void *peer_cuda_ptr = memory::Alloc<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(peer_place), this->size() * sizeof(T));
memory::Copy(boost::get<platform::CUDAPlace>(peer_place), peer_cuda_ptr,
place_, cuda_ptr_, this->size() * sizeof(T), ctx->stream());
if (boost::get<platform::CUDAPlace>(place) != place_) {
place_ = boost::get<platform::CUDAPlace>(place);
}
if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
cuda_ptr_.reset(
memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
memory::PlainDeleter<void, platform::CUDAPlace>(place_));
}
cuda_size_ = this->size();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_);
memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
static_cast<const void *>(this->data()),
this->size() * sizeof(T), ctx->stream());
ctx->Wait();
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
place_ = boost::get<platform::CUDAPlace>(peer_place);
cuda_ptr_ = peer_cuda_ptr;
#endif
}
template class Vector<int>;
template class Vector<unsigned>;
template class Vector<size_t>;
template class Vector<int64_t>;
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cuda.h>
#include <cuda_runtime.h>
#include "gtest/gtest.h"
#include "paddle/framework/init.h"
#include "paddle/framework/mixed_vector.h"
using namespace paddle::framework;
using namespace paddle::platform;
using namespace paddle::memory;
template <typename T>
__global__ void test(T* data, int size) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
i += blockDim.x * gridDim.x) {
data[i] *= 2;
}
}
TEST(Vector, Normal) {
// fill the device context pool.
InitDevices();
Vector<size_t> vec({1, 2, 3});
size_t* ptr = vec.data();
for (size_t i = 0; i < vec.size(); ++i) {
EXPECT_EQ(vec[i], *(ptr + i));
}
vec.clear();
vec.CopyFromCUDA();
std::vector<size_t> v = {1, 2, 3};
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], vec[i]);
}
}
TEST(Vector, MultipleCopy) {
InitDevices();
Vector<size_t> vec({1, 2, 3});
CUDAPlace place(0);
vec.mutable_data(place);
auto vec2 = Vector<size_t>(vec);
{
const size_t* ptr = vec2.data(CPUPlace());
for (size_t i = 0; i < vec2.size(); ++i) {
EXPECT_EQ(*(ptr + i), vec[i]);
}
}
test<size_t><<<3, 3>>>(vec2.mutable_data(place), vec2.size());
vec2.CopyFromCUDA();
{
const size_t* ptr = vec2.data(CPUPlace());
for (size_t i = 0; i < vec2.size(); ++i) {
EXPECT_EQ(*(ptr + i), vec[i] * 2);
}
}
}
......@@ -72,6 +72,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
void SetDim(const std::string &name, const DDim &dim) override;
std::vector<DDim> GetRepeatedDims(const std::string &name) const override;
void SetRepeatedDims(const std::string &name,
const std::vector<DDim> &dims) override;
const OpDesc &op_;
const BlockDesc &block_;
};
......@@ -120,11 +125,10 @@ OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block)
// restore attrs_
for (const proto::OpDesc::Attr &attr : desc_.attrs()) {
std::string attr_name = attr.name();
// The sub_block referred to by the BLOCK attr hasn't been added
// to the ProgramDesc class yet, so we skip setting the BLOCK attr here.
if (attr.type() != proto::AttrType::BLOCK) {
attrs_[attr_name] = GetAttrValue(attr);
} else {
auto bid = attr.block_idx();
attrs_[attr_name] = prog->MutableBlock(bid);
}
}
this->block_ = block;
......@@ -457,23 +461,48 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
auto var = block_.FindVarRecursive(name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
DDim res;
try {
auto shape = var->GetShape();
if (shape.empty()) {
return framework::make_ddim({0UL});
} else {
return framework::make_ddim(var->GetShape());
}
res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape);
} catch (...) {
VLOG(5) << "GetDim of variable " << name << " error";
std::rethrow_exception(std::current_exception());
}
return res;
}
std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
const std::string &name) const {
auto var = block_.FindVarRecursive(name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
std::vector<DDim> res;
try {
auto shapes = var->GetShapes();
for (const auto &s : shapes) {
res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s));
}
} catch (...) {
VLOG(5) << "GetRepeatedDim of variable " << name << " error.";
std::rethrow_exception(std::current_exception());
}
return res;
}
void CompileTimeInferShapeContext::SetDim(const std::string &name,
const DDim &dim) {
block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
block_.FindVarRecursive(name)->SetShape(vectorize(dim));
}
void CompileTimeInferShapeContext::SetRepeatedDims(
const std::string &name, const std::vector<DDim> &dims) {
auto var = block_.FindVarRecursive(name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
std::vector<std::vector<int64_t>> dim_vec(dims.size());
std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize);
var->SetShapes(dim_vec);
}
bool CompileTimeInferShapeContext::IsRuntime() const { return false; }
proto::VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
......
......@@ -320,8 +320,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
if (length == 0) {
return false;
}
PADDLE_ENFORCE_EQ(length, 1UL, "Input %s should have more than one inputs",
name);
PADDLE_ENFORCE_EQ(length, 1UL,
"Input %s should not have more than one inputs", name);
auto ipt = ins[0];
auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
return var != nullptr;
......@@ -333,8 +333,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
if (length == 0) {
return false;
}
PADDLE_ENFORCE_EQ(length, 1UL, "Output %s should have more than one inputs",
name);
PADDLE_ENFORCE_EQ(length, 1UL,
"Output %s should not have more than one inputs", name);
auto ipt = outs[0];
auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
return var != nullptr;
......@@ -421,8 +421,22 @@ class RuntimeInferShapeContext : public InferShapeContext {
} else if (var->IsType<SelectedRows>()) {
return var->Get<SelectedRows>().GetCompleteDims();
} else {
PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.",
name, var->Type().name());
PADDLE_THROW(
"Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's "
"type_id is %s.",
name, var->Type().name());
}
}
std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
Variable* var = scope_.FindVar(name);
if (var->IsType<ReaderHolder>()) {
return var->Get<ReaderHolder>().shapes();
} else {
PADDLE_THROW(
"Only ReaderHolder support 'GetRepeatedDims', but Variable %s's "
"type_id is %s.",
name, var->Type().name());
}
}
......@@ -438,6 +452,19 @@ class RuntimeInferShapeContext : public InferShapeContext {
}
}
void SetRepeatedDims(const std::string& name,
const std::vector<DDim>& dims) override {
Variable* var = scope_.FindVar(name);
if (var->IsType<ReaderHolder>()) {
var->GetMutable<ReaderHolder>()->set_shapes(dims);
} else {
PADDLE_THROW(
"Only ReaderHolder support 'SetRepeatedDims', but Variable %s's "
"type_id is %s.",
name, var->Type().name());
}
}
proto::VarDesc::VarType GetVarType(const std::string& name) const override {
auto* var = scope_.FindVar(name);
return ToVarType(var->Type());
......
......@@ -43,11 +43,20 @@ ProgramDesc::ProgramDesc() {
ProgramDesc::ProgramDesc(const ProgramDesc &o) {
desc_ = o.desc_;
for (int i = 0; i < desc_.blocks_size(); ++i) {
auto *block = desc_.mutable_blocks(i);
blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
}
for (auto &block : blocks_) {
for (auto *op : block->AllOps()) {
for (const auto &attr : op->Proto()->attrs()) {
if (attr.type() == proto::AttrType::BLOCK) {
size_t blk_idx = attr.block_idx();
op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx));
}
}
}
}
}
ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
......@@ -55,6 +64,16 @@ ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
for (auto &block_desc : *desc_.mutable_blocks()) {
blocks_.emplace_back(new BlockDesc(this, &block_desc));
}
for (auto &block : blocks_) {
for (auto *op : block->AllOps()) {
for (const auto &attr : op->Proto()->attrs()) {
if (attr.type() == proto::AttrType::BLOCK) {
size_t blk_idx = attr.block_idx();
op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx));
}
}
}
}
}
ProgramDesc::ProgramDesc(const std::string &binary_str) {
......
......@@ -49,11 +49,28 @@ bool IsTarget(const proto::OpDesc& op_desc) {
return false;
}
void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
int block_id) {
// TODO(tonyyang-svail):
// - will change to use multiple blocks for RNN op and Cond Op
int GetSubBlockIndex(const proto::OpDesc& op_desc) {
for (auto& attr : op_desc.attrs()) {
if (attr.type() == proto::AttrType::BLOCK) {
PADDLE_ENFORCE(attr.has_block_idx());
return attr.block_idx();
}
}
return -1;
}
bool HasSubBlock(const proto::OpDesc& op_desc) {
return GetSubBlockIndex(op_desc) > 0;
}
// block_id is the idx of the current block in the input desc
// parent_block_id is the idx of the parent of the current block
// in the output desc; -1 means the current block is the global block
// dependent_vars is passed recursively from the parent block to
// the child block to help pruning
void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
int block_id, int parent_block_id,
std::set<std::string>& dependent_vars) {
auto& block = input.blocks(block_id);
auto& ops = block.ops();
......@@ -72,11 +89,9 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
expect_fetch = (op_desc.type() == kFetchOpType);
}
std::set<std::string> dependent_vars;
std::vector<bool> should_run;
for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) {
auto& op_desc = *op_iter;
if (IsTarget(op_desc) || HasDependentVar(op_desc, dependent_vars)) {
// insert its input to the dependency graph
for (auto& var : op_desc.inputs()) {
......@@ -84,7 +99,6 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
dependent_vars.insert(argu);
}
}
should_run.push_back(true);
} else {
should_run.push_back(false);
......@@ -95,45 +109,81 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
// we reverse the should_run vector
std::reverse(should_run.begin(), should_run.end());
*output = input;
auto* op_field = output->mutable_blocks(block_id)->mutable_ops();
// copy the current block from input to output
auto* block_field = output->mutable_blocks();
*block_field->Add() = input.blocks(block_id);
int output_block_id = output->blocks_size() - 1;
auto* output_block = output->mutable_blocks(output_block_id);
output_block->set_idx(output_block_id);
output_block->set_parent_idx(parent_block_id);
auto* op_field = output_block->mutable_ops();
op_field->Clear();
for (size_t i = 0; i < should_run.size(); ++i) {
if (should_run[i]) {
*op_field->Add() = input.blocks(block_id).ops(i);
auto* op = op_field->Add();
*op = input.blocks(block_id).ops(i);
if (HasSubBlock(*op)) {
// create sub_block_dependent_vars here to help prune the sub block
std::set<std::string> sub_block_dependent_vars;
for (auto& var : op->inputs()) {
for (auto& argu : var.arguments()) {
sub_block_dependent_vars.insert(argu);
}
}
for (auto& var : op->outputs()) {
for (auto& argu : var.arguments()) {
sub_block_dependent_vars.insert(argu);
}
}
// GetSubBlockIndex(*op) is the idx of the sub_block in the input desc
// output_block_id is the idx of the current block in the output desc
prune_impl(input, output, GetSubBlockIndex(*op), output_block_id,
sub_block_dependent_vars);
}
}
}
// remove the VarDescs in BlockDesc that are not referenced in
// the pruned OpDescs
std::unordered_map<std::string, proto::VarDesc> var_map;
auto* var_field = output->mutable_blocks(block_id)->mutable_vars();
auto* var_field = output->mutable_blocks(output_block_id)->mutable_vars();
for (const auto& var : *var_field) {
var_map[var.name()] = var;
}
var_field->Clear();
std::set<std::string> var_names;
for (const auto& op : *op_field) {
// add VarDescs of all input arguments for each OpDesc
auto& input_field = op.inputs();
for (auto& input_var : input_field) {
for (auto& arg : input_var.arguments()) {
*var_field->Add() = var_map[arg];
if (var_map.count(arg) != 0) {
var_names.insert(arg);
}
}
}
// add VarDescs of all output arguments for each OpDesc
auto& output_field = op.outputs();
for (auto& output_var : output_field) {
for (auto& arg : output_var.arguments()) {
*var_field->Add() = var_map[arg];
if (var_map.count(arg) != 0) {
var_names.insert(arg);
}
}
}
}
var_field->Clear();
for (const auto& name : var_names) {
*var_field->Add() = var_map[name];
}
}
// TODO(fengjiayi): Prune() could be done in place to avoid unnecessary copies
void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) {
prune_impl(input, output, 0);
std::set<std::string> dependent_vars;
output->clear_blocks();
prune_impl(input, output, 0, -1, dependent_vars);
}
void inference_optimize_impl(const proto::ProgramDesc& input,
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/framework/reader.h"
namespace paddle {
namespace framework {
DDim ReaderBase::shape(size_t idx) const {
PADDLE_ENFORCE_LT(
idx, shapes_.size(),
"Cannot get the %d'th shape, 'shapes_' only has %d elements.", idx,
shapes_.size());
return shapes_[idx];
}
void ShuffleReader::ReadNext(std::vector<LoDTensor>* out) {
if (iteration_pos_ >= buffer_.size()) {
// Reload buffer with new data
buffer_.clear();
buffer_.reserve(buffer_size_);
for (int i = 0; i < buffer_size_; ++i) {
if (reader_->HasNext()) {
buffer_.push_back(std::vector<LoDTensor>());
reader_->ReadNext(&buffer_.back());
} else {
break;
}
}
// TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
// optimized.
std::random_shuffle(buffer_.begin(), buffer_.end());
iteration_pos_ = 0;
}
out->clear();
if (!buffer_.empty()) {
std::swap(*out, buffer_[iteration_pos_++]);
}
// If buffer_ is empty, 'out' will be returned as an empty vector.
}
void BatchReader::ReadNext(std::vector<LoDTensor>* out) {
buffer_.clear();
buffer_.reserve(batch_size_);
for (int i = 0; i < batch_size_; ++i) {
if (reader_->HasNext()) {
buffer_.push_back(std::vector<LoDTensor>());
reader_->ReadNext(&buffer_.back());
} else {
break;
}
}
// Concat instances
out->clear();
if (buffer_.empty()) {
// If buffer_ is empty, 'out' will be returned as an empty vector.
return;
}
int out_num = buffer_[0].size();
out->reserve(out_num);
for (int j = 0; j < out_num; ++j) {
// Merge shapes and check data types
std::type_index batch_type = buffer_[0][j].type();
DDim batch_shape = buffer_[0][j].dims();
for (size_t i = 1; i < buffer_.size(); ++i) {
std::type_index ins_type = buffer_[i][j].type();
DDim ins_shape = buffer_[i][j].dims();
PADDLE_ENFORCE_EQ(batch_type, ins_type);
PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
slice_ddim(ins_shape, 1, ins_shape.size()));
PADDLE_ENFORCE_GT(ins_shape[0], 0);
batch_shape[0] += ins_shape[0];
}
LoDTensor out_tensor;
out_tensor.Resize(batch_shape);
out_tensor.mutable_data(platform::CPUPlace(), batch_type);
int64_t dst_offset = 0;
// Merge lod and data
LoD batch_lod;
std::vector<size_t> top_level_lod({0});
for (size_t i = 0; i < buffer_.size(); ++i) {
DDim ins_shape = buffer_[i][j].dims();
LoD ins_lod = buffer_[i][j].lod();
if (i == 0) {
batch_lod = ins_lod;
} else {
PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
auto& lod_level = batch_lod[level_idx];
for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
}
}
}
top_level_lod.push_back(
top_level_lod.back() +
(ins_lod.empty() ? ins_shape[0] : (ins_lod[0].size() - 1)));
Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
Copy(buffer_[i][j], platform::CPUPlace(), &dst);
dst_offset += ins_shape[0];
}
batch_lod.insert(batch_lod.begin(), top_level_lod);
out_tensor.set_lod(batch_lod);
out->push_back(out_tensor);
}
}
} // namespace framework
} // namespace paddle
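The LoD merging in BatchReader::ReadNext is the subtle part: per-instance offsets are shifted by the running row count, and an extra top level is prepended to record how many top-level sequences each instance contributed. A hand-worked illustration, computed from the code above (not taken from the source):
// instance 0: dims {2, d}, lod {{0, 2}}     -> 2 rows, 1 top-level sequence
// instance 1: dims {3, d}, lod {{0, 1, 3}}  -> 3 rows, 2 top-level sequences
// batched:    dims {5, d}, lod {{0, 1, 3},      // prepended level: sequences per instance
//                               {0, 2, 3, 5}}   // original level, instance 1 shifted by 2 rows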
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/framework/ddim.h"
#include "paddle/framework/lod_tensor_array.h"
namespace paddle {
namespace framework {
class ReaderBase {
public:
explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
PADDLE_ENFORCE(!shapes_.empty());
}
virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
virtual bool HasNext() const = 0;
virtual void ReInit() = 0;
DDim shape(size_t idx) const;
std::vector<DDim> shapes() const { return shapes_; }
void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
virtual ~ReaderBase() {}
protected:
std::vector<DDim> shapes_;
};
class FileReader : public ReaderBase {
public:
explicit FileReader(const std::vector<DDim>& shapes) : ReaderBase(shapes) {}
};
class DecoratedReader : public ReaderBase {
public:
explicit DecoratedReader(ReaderBase* reader)
: ReaderBase(reader->shapes()), reader_(reader) {
PADDLE_ENFORCE_NOT_NULL(reader_);
}
bool HasNext() const override { return reader_->HasNext(); }
void ReInit() override { reader_->ReInit(); }
protected:
ReaderBase* reader_;
};
// file readers
template <typename T>
class RandomDataGenerator : public FileReader {
public:
RandomDataGenerator(const std::vector<DDim>& shapes, float min, float max)
: FileReader(shapes), min_(min), max_(max) {
PADDLE_ENFORCE_LE(
min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
unsigned int seed = std::random_device()();
engine_.seed(seed);
dist_ = std::uniform_real_distribution<float>(min_, max_);
}
void ReadNext(std::vector<LoDTensor>* out) override {
out->clear();
out->reserve(shapes_.size());
for (const DDim& shape : shapes_) {
PADDLE_ENFORCE_GE(
shape.size(), 2,
"The rank of reader's output data should be 2 at least.(Now it's %d)",
shape.size());
LoDTensor out_tensor;
out_tensor.Resize(shape);
T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
int64_t numel = product(shape);
for (int64_t i = 0; i < numel; ++i) {
data[i] = dist_(engine_);
}
out->push_back(out_tensor);
}
}
bool HasNext() const override { return true; }
void ReInit() override { return; }
private:
float min_;
float max_;
std::minstd_rand engine_;
std::uniform_real_distribution<float> dist_;
};
// decorated readers
class ShuffleReader : public DecoratedReader {
public:
ShuffleReader(ReaderBase* reader, int buffer_size)
: DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
buffer_.reserve(buffer_size);
}
void ReadNext(std::vector<LoDTensor>* out) override;
private:
int buffer_size_;
std::vector<std::vector<LoDTensor>> buffer_;
size_t iteration_pos_;
};
class BatchReader : public DecoratedReader {
public:
BatchReader(ReaderBase* reader, int batch_size)
: DecoratedReader(reader), batch_size_(batch_size) {
buffer_.reserve(batch_size_);
}
void ReadNext(std::vector<LoDTensor>* out) override;
private:
int batch_size_;
std::vector<std::vector<LoDTensor>> buffer_;
};
// The ReaderHolder is used as readers' unified wrapper,
// making it easier to access different type readers in Variables.
class ReaderHolder {
public:
void Reset(ReaderBase* reader) { reader_.reset(reader); }
ReaderBase* Get() const { return reader_.get(); }
void ReadNext(std::vector<LoDTensor>* out) { reader_->ReadNext(out); }
bool HasNext() const { return reader_->HasNext(); }
void ReInit() { reader_->ReInit(); }
DDim shape(size_t idx) const { return reader_->shape(idx); }
std::vector<DDim> shapes() const { return reader_->shapes(); }
void set_shapes(const std::vector<DDim>& shapes) {
reader_->set_shapes(shapes);
}
private:
std::unique_ptr<ReaderBase> reader_;
};
} // namespace framework
} // namespace paddle
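Putting the pieces of this header together, a file reader can be wrapped by the decorated readers and handed to a ReaderHolder so operators access all readers uniformly. A minimal sketch under that assumption (ownership and error handling simplified; the composition itself is illustrative, not from the source):
#include "paddle/framework/reader.h"
void ReaderCompositionSketch() {
  using namespace paddle::framework;
  std::vector<DDim> shapes = {make_ddim({4, 2})};  // rank >= 2, as RandomDataGenerator requires
  // random source -> shuffle with a 32-instance buffer -> batches of 8 instances
  auto* random = new RandomDataGenerator<float>(shapes, 0.0f, 1.0f);
  auto* shuffled = new ShuffleReader(random, 32);
  auto* batched = new BatchReader(shuffled, 8);
  ReaderHolder holder;
  holder.Reset(batched);  // the holder owns the outermost reader
  std::vector<LoDTensor> batch;
  if (holder.HasNext()) {
    holder.ReadNext(&batch);  // batch[0] holds the concatenated tensor
  }
}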
......@@ -32,6 +32,16 @@ std::vector<DDim> InferShapeContext::GetInputsDim(
return GetDims(arg_names);
}
std::vector<DDim> InferShapeContext::GetReaderDims(
const std::string &name) const {
const std::vector<std::string> &arg_names = Inputs(name);
PADDLE_ENFORCE_EQ(
arg_names.size(), 1UL,
"Reader input '%s' should hold one element, but now it holds %d", name,
arg_names.size());
return this->GetRepeatedDims(arg_names[0]);
}
DDim InferShapeContext::GetInputsElementDim(const std::string &name,
int idx) const {
const std::vector<std::string> &names = Inputs(name);
......@@ -52,6 +62,16 @@ void InferShapeContext::SetOutputsDim(const std::string &name,
SetDims(names, dims);
}
void InferShapeContext::SetReaderDims(const std::string &name,
const std::vector<DDim> &dims) {
const std::vector<std::string> &arg_names = Outputs(name);
PADDLE_ENFORCE_EQ(
arg_names.size(), 1UL,
"Reader output '%s' should hold one element, but now it holds %d", name,
arg_names.size());
return this->SetRepeatedDims(arg_names[0], dims);
}
std::vector<DDim> InferShapeContext::GetDims(
const std::vector<std::string> &names) const {
std::vector<DDim> ret;
......@@ -61,6 +81,7 @@ std::vector<DDim> InferShapeContext::GetDims(
[this](const std::string &name) { return this->GetDim(name); });
return ret;
}
void InferShapeContext::SetDims(const std::vector<std::string> &names,
const std::vector<DDim> &dims) {
size_t length = names.size();
......@@ -72,14 +93,17 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
SetDim(names[i], dims[i]);
}
}
std::vector<proto::VarDesc::VarType> InferShapeContext::GetInputsVarType(
const std::string &name) const {
return GetVarTypes(Inputs(name));
}
std::vector<proto::VarDesc::VarType> InferShapeContext::GetOutputsVarType(
const std::string &name) const {
return GetVarTypes(Outputs(name));
}
std::vector<proto::VarDesc::VarType> InferShapeContext::GetVarTypes(
const std::vector<std::string> &names) const {
std::vector<proto::VarDesc::VarType> retv;
......
......@@ -36,12 +36,13 @@ class InferShapeContext {
virtual bool HasOutputs(const std::string &name) const = 0;
DDim GetInputDim(const std::string &name) const;
std::vector<DDim> GetInputsDim(const std::string &name) const;
std::vector<DDim> GetReaderDims(const std::string &name) const;
DDim GetInputsElementDim(const std::string &name, int idx) const;
void SetOutputDim(const std::string &name, const DDim &dim);
void SetOutputsDim(const std::string &name, const std::vector<DDim> &dims);
void SetReaderDims(const std::string &name, const std::vector<DDim> &dims);
virtual AttrReader Attrs() const = 0;
virtual const std::vector<std::string> &Inputs(
......@@ -61,6 +62,9 @@ class InferShapeContext {
protected:
virtual DDim GetDim(const std::string &name) const = 0;
virtual void SetDim(const std::string &name, const DDim &dim) = 0;
virtual std::vector<DDim> GetRepeatedDims(const std::string &name) const = 0;
virtual void SetRepeatedDims(const std::string &name,
const std::vector<DDim> &dims) = 0;
std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
std::vector<proto::VarDesc::VarType> GetVarTypes(
......
......@@ -21,7 +21,8 @@ limitations under the License. */
#include <queue>
#include <thread>
#include <vector>
#include "glog/logging.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle {
......@@ -31,7 +32,7 @@ namespace framework {
// number of threads.
class ThreadPool {
public:
typedef std::packaged_task<void()> Task;
using Task = std::packaged_task<std::unique_ptr<platform::EnforceNotMet>()>;
// Returns the singleton of ThreadPool.
static ThreadPool* GetInstance();
......@@ -52,9 +53,28 @@ class ThreadPool {
// std::future::wait().
template <typename Callback>
std::future<void> Run(Callback fn) {
auto f = this->RunAndGetException(fn);
return std::async(std::launch::deferred, ExceptionHandler(std::move(f)));
}
template <typename Callback>
std::future<std::unique_ptr<platform::EnforceNotMet>> RunAndGetException(
Callback fn) {
std::unique_lock<std::mutex> lock(mutex_);
Task task(std::bind(fn));
std::future<void> f = task.get_future();
Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> {
try {
fn();
return nullptr;
} catch (platform::EnforceNotMet ex) {
return std::unique_ptr<platform::EnforceNotMet>(
new platform::EnforceNotMet(ex));
} catch (...) {
LOG(FATAL)
<< "Unexpected exception is catched in thread pool. All "
"throwable exception in Fluid should be an EnforceNotMet.";
}
});
std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
tasks_.push(std::move(task));
lock.unlock();
scheduled_.notify_one();
......@@ -65,6 +85,22 @@ class ThreadPool {
void Wait();
private:
struct ExceptionHandler {
mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
explicit ExceptionHandler(
std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
: future_(std::move(f)) {}
void operator()() const {
auto ex = this->future_.get();
if (ex != nullptr) {
LOG(FATAL) << "The exception is thrown inside the thread pool. You "
"should use RunAndGetException to handle the exception.\n"
"The default exception handler is LOG(FATAL)."
<< ex->what();
}
}
};
DISABLE_COPY_AND_ASSIGN(ThreadPool);
explicit ThreadPool(int num_threads);
......
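With this change a task can no longer swallow an EnforceNotMet silently: Run() defers a LOG(FATAL) handler, while RunAndGetException() hands the exception back to the caller. A hedged usage sketch based on the declarations above (header path assumed; relies on PADDLE_THROW raising platform::EnforceNotMet, as elsewhere in Fluid):
#include <future>
#include <memory>
#include "paddle/framework/threadpool.h"  // assumed header for ThreadPool
void ThreadPoolSketch() {
  auto* pool = paddle::framework::ThreadPool::GetInstance();
  // Fire-and-wait: any EnforceNotMet thrown inside fn ends up in LOG(FATAL).
  std::future<void> done = pool->Run([]() { /* normal work */ });
  done.wait();
  // Inspect the exception yourself instead of letting the pool abort.
  auto maybe_error = pool->RunAndGetException(
      []() { PADDLE_THROW("illustrative failure"); });
  std::unique_ptr<paddle::platform::EnforceNotMet> err = maybe_error.get();
  if (err != nullptr) {
    LOG(ERROR) << err->what();
  }
}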
......@@ -57,10 +57,13 @@ size_t VarDesc::GetTensorDescNum() const {
void VarDesc::SetShapes(
const std::vector<std::vector<int64_t>> &multiple_dims) {
PADDLE_ENFORCE_EQ(multiple_dims.size(), GetTensorDescNum(),
"The number of given shapes(%d) doesn't equal to the "
"number of sub tensor.",
multiple_dims.size(), GetTensorDescNum());
if (multiple_dims.size() != GetTensorDescNum()) {
VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size()
<< ") doesn't match the existing tensor number("
<< GetTensorDescNum()
<< "). The Reader is going to be reinitialized.";
SetTensorDescNum(multiple_dims.size());
}
std::vector<proto::TensorDesc *> tensors = mutable_tensor_descs();
for (size_t i = 0; i < multiple_dims.size(); ++i) {
VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims());
......@@ -87,10 +90,14 @@ void VarDesc::SetDataType(proto::DataType data_type) {
void VarDesc::SetDataTypes(
const std::vector<proto::DataType> &multiple_data_type) {
PADDLE_ENFORCE_EQ(multiple_data_type.size(), GetTensorDescNum(),
"The number of given data types(%d) doesn't equal to the "
"number of sub tensor.",
multiple_data_type.size(), GetTensorDescNum());
if (multiple_data_type.size() != GetTensorDescNum()) {
VLOG(3) << "WARNING: The number of given data types("
<< multiple_data_type.size()
<< ") doesn't match the existing tensor number("
<< GetTensorDescNum()
<< "). The Reader is going to be reinitialized.";
SetTensorDescNum(multiple_data_type.size());
}
std::vector<proto::TensorDesc *> tensor_descs = mutable_tensor_descs();
for (size_t i = 0; i < multiple_data_type.size(); ++i) {
tensor_descs[i]->set_data_type(multiple_data_type[i]);
......@@ -127,10 +134,14 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
}
void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
PADDLE_ENFORCE_EQ(multiple_lod_level.size(), GetTensorDescNum(),
"The number of given data types(%d) doesn't equal to the "
"number of sub tensor.",
multiple_lod_level.size(), GetTensorDescNum());
if (multiple_lod_level.size() != GetTensorDescNum()) {
VLOG(3) << "WARNING: The number of given lod_levels("
<< multiple_lod_level.size()
<< ") doesn't match the existing tensor number("
<< GetTensorDescNum()
<< "). The Reader is going to be reinitialized.";
SetTensorDescNum(multiple_lod_level.size());
}
switch (desc_.type()) {
case proto::VarDesc::READER: {
size_t i = 0;
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/reader.h"
#include "paddle/framework/selected_rows.h"
#include "paddle/framework/variable.h"
......@@ -31,6 +32,8 @@ inline proto::VarDesc::VarType ToVarType(std::type_index type) {
return proto::VarDesc_VarType_LOD_TENSOR_ARRAY;
} else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
return proto::VarDesc_VarType_SELECTED_ROWS;
} else if (type.hash_code() == typeid(ReaderHolder).hash_code()) {
return proto::VarDesc_VarType_READER;
} else {
PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
}
......@@ -40,7 +43,7 @@ template <typename Visitor>
inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
switch (ToVarType(var.Type())) {
case proto::VarDesc_VarType_LOD_TENSOR:
visitor(var.Get<framework::LoDTensor>());
visitor(var.Get<LoDTensor>());
return;
case proto::VarDesc_VarType_LOD_RANK_TABLE:
visitor(var.Get<LoDRankTable>());
......@@ -51,6 +54,9 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
case proto::VarDesc_VarType_SELECTED_ROWS:
visitor(var.Get<SelectedRows>());
return;
case proto::VarDesc_VarType_READER:
visitor(var.Get<ReaderHolder>());
return;
default:
PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
}
......
......@@ -212,6 +212,10 @@ TEST(compareSparse, NeuralNetwork) {
}
int main(int argc, char** argv) {
// FIXME(tonyyang-svail):
// Turn off this test due to a CI failure:
// https://paddleci.ngrok.io/viewLog.html?buildId=27608&buildTypeId=Paddle_PrCi&tab=buildLog&_focus=10430
return 0;
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
initPython(argc, argv);
......
......@@ -4,25 +4,14 @@ cc_library(paddle_fluid_api
SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
# Merge all modules into a single static library
# Create static library
cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
# Create shared library
add_library(paddle_fluid_shared SHARED io.cc)
target_circle_link_libraries(paddle_fluid_shared
ARCHIVE_START
${GLOB_OP_LIB}
ARCHIVE_END
${FLUID_CORE_MODULES})
SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
# install library & headers
if(NOT WITH_C_API AND WITH_FLUID)
install(FILES io.h DESTINATION include/paddle/inference)
install(TARGETS paddle_fluid_shared DESTINATION lib)
endif()
cc_library(paddle_fluid_shared SHARED
SRCS io.cc
DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END)
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
if(WITH_TESTING)
add_subdirectory(tests/book)
......
......@@ -21,6 +21,17 @@ limitations under the License. */
namespace paddle {
namespace inference {
void ReadBinaryFile(const std::string& filename, std::string& contents) {
VLOG(3) << "loading model from " << filename;
std::ifstream inputfs(filename, std::ios::in | std::ios::binary);
inputfs.seekg(0, std::ios::end);
contents.clear();
contents.resize(inputfs.tellg());
inputfs.seekg(0, std::ios::beg);
inputfs.read(&contents[0], contents.size());
inputfs.close();
}
bool IsParameter(const framework::VarDesc* var,
const framework::ProgramDesc& main_program) {
if (var->Persistable()) {
......@@ -44,12 +55,15 @@ bool IsParameter(const framework::VarDesc* var,
void LoadPersistables(framework::Executor& executor,
framework::Scope& scope,
const framework::ProgramDesc& main_program,
const std::string& dirname,
const framework::ProgramDesc& main_program) {
const std::string& param_filename) {
const framework::BlockDesc& global_block = main_program.Block(0);
framework::ProgramDesc* load_program = new framework::ProgramDesc();
framework::BlockDesc* load_block = load_program->MutableBlock(0);
std::vector<std::string> paramlist;
for (auto* var : global_block.AllVars()) {
if (IsParameter(var, main_program)) {
VLOG(3) << "parameter's name: " << var->Name();
......@@ -61,15 +75,33 @@ void LoadPersistables(framework::Executor& executor,
new_var->SetLoDLevel(var->GetLoDLevel());
new_var->SetPersistable(true);
// append_op
framework::OpDesc* op = load_block->AppendOp();
op->SetType("load");
op->SetOutput("Out", {new_var->Name()});
op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
op->CheckAttrs();
if (!param_filename.empty()) {
paramlist.push_back(new_var->Name());
} else {
// append_op
framework::OpDesc* op = load_block->AppendOp();
op->SetType("load");
op->SetOutput("Out", {new_var->Name()});
op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
op->CheckAttrs();
}
}
}
if (!param_filename.empty()) {
// sort paramlist to have consistent ordering
std::sort(paramlist.begin(), paramlist.end());
// append just the load_combine op
framework::OpDesc* op = load_block->AppendOp();
op->SetType("load_combine");
op->SetOutput("Out", paramlist);
op->SetAttr("file_path", {param_filename});
op->CheckAttrs();
}
executor.Run(*load_program, &scope, 0, true, true);
VLOG(3) << "Ran loading successfully";
delete load_program;
}
......@@ -77,20 +109,29 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope,
const std::string& dirname) {
std::string model_filename = dirname + "/__model__";
LOG(INFO) << "loading model from " << model_filename;
std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
std::string program_desc_str;
inputfs.seekg(0, std::ios::end);
program_desc_str.resize(inputfs.tellg());
inputfs.seekg(0, std::ios::beg);
LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
inputfs.read(&program_desc_str[0], program_desc_str.size());
inputfs.close();
ReadBinaryFile(model_filename, program_desc_str);
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str));
LoadPersistables(executor, scope, *main_program, dirname, "");
return main_program;
}
std::unique_ptr<framework::ProgramDesc> Load(
framework::Executor& executor,
framework::Scope& scope,
const std::string& prog_filename,
const std::string& param_filename) {
std::string model_filename = prog_filename;
std::string program_desc_str;
ReadBinaryFile(model_filename, program_desc_str);
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str));
LoadPersistables(executor, scope, dirname, *main_program);
LoadPersistables(executor, scope, *main_program, "", param_filename);
return main_program;
}
......
......@@ -26,12 +26,18 @@ namespace inference {
void LoadPersistables(framework::Executor& executor,
framework::Scope& scope,
const framework::ProgramDesc& main_program,
const std::string& dirname,
const framework::ProgramDesc& main_program);
const std::string& param_filename);
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope,
const std::string& dirname);
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope,
const std::string& prog_filename,
const std::string& param_filename);
} // namespace inference
} // namespace paddle
set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
cc_test(test_inference_recognize_digits_mlp
SRCS test_inference_recognize_digits.cc
DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
set_tests_properties(test_inference_recognize_digits_mlp
PROPERTIES DEPENDS test_recognize_digits)
function(inference_test TARGET_NAME)
set(options "")
set(oneValueArgs "")
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
set(arg_list "")
if(inference_test_ARGS)
foreach(arg ${inference_test_ARGS})
list(APPEND arg_list "_${arg}")
endforeach()
else()
list(APPEND arg_list "_")
endif()
foreach(arg ${arg_list})
string(REGEX REPLACE "^_$" "" arg "${arg}")
cc_test(test_inference_${TARGET_NAME}${arg}
SRCS test_inference_${TARGET_NAME}.cc
DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
set_tests_properties(test_inference_${TARGET_NAME}${arg}
PROPERTIES DEPENDS test_${TARGET_NAME})
endforeach()
endfunction(inference_test)
inference_test(recognize_digits ARGS mlp)
inference_test(image_classification ARGS vgg resnet)
inference_test(label_semantic_roles)
inference_test(rnn_encoder_decoder)
inference_test(recommender_system)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <time.h>
#include "paddle/framework/lod_tensor.h"
#include "paddle/inference/io.h"
template <typename T>
void SetupTensor(paddle::framework::LoDTensor& input,
paddle::framework::DDim dims,
T lower,
T upper) {
srand(time(0));
T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
for (int i = 0; i < input.numel(); ++i) {
input_ptr[i] =
(static_cast<T>(rand()) / static_cast<T>(RAND_MAX)) * (upper - lower) +
lower;
}
}
template <typename T>
void SetupTensor(paddle::framework::LoDTensor& input,
paddle::framework::DDim dims,
std::vector<T>& data) {
CHECK_EQ(paddle::framework::product(dims), data.size());
T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
memcpy(input_ptr, data.data(), input.numel() * sizeof(T));
}
template <typename T>
void SetupLoDTensor(paddle::framework::LoDTensor& input,
paddle::framework::LoD& lod,
T lower,
T upper) {
input.set_lod(lod);
int dim = lod[0][lod[0].size() - 1];
SetupTensor<T>(input, {dim, 1}, lower, upper);
}
template <typename T>
void SetupLoDTensor(paddle::framework::LoDTensor& input,
paddle::framework::DDim dims,
paddle::framework::LoD lod,
std::vector<T>& data) {
const size_t level = lod.size() - 1;
CHECK_EQ(dims[0], (lod[level]).back());
input.set_lod(lod);
SetupTensor<T>(input, dims, data);
}
template <typename T>
void CheckError(paddle::framework::LoDTensor& output1,
paddle::framework::LoDTensor& output2) {
// Check lod information
EXPECT_EQ(output1.lod(), output2.lod());
EXPECT_EQ(output1.dims(), output2.dims());
EXPECT_EQ(output1.numel(), output2.numel());
T err = static_cast<T>(0);
if (typeid(T) == typeid(float)) {
err = 1E-3;
} else if (typeid(T) == typeid(double)) {
err = 1E-6;
} else {
err = 0;
}
size_t count = 0;
for (int64_t i = 0; i < output1.numel(); ++i) {
if (fabs(output1.data<T>()[i] - output2.data<T>()[i]) > err) {
count++;
}
}
EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
}
template <typename Place, bool IsCombined = false>
void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
// 1. Define place, executor, scope and inference_program
auto place = Place();
auto executor = paddle::framework::Executor(place);
auto* scope = new paddle::framework::Scope();
// 2. Initialize the inference_program and load parameters
std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
if (IsCombined) {
// All parameters are saved in a single file.
// Hard-code the file names of the program and parameters in the unittest.
// Users are free to specify a different filename.
std::string prog_filename = "__model_combined__";
std::string param_filename = "__params_combined__";
inference_program = paddle::inference::Load(executor,
*scope,
dirname + "/" + prog_filename,
dirname + "/" + param_filename);
} else {
// Parameters are saved in separate files sited in the specified `dirname`.
inference_program = paddle::inference::Load(executor, *scope, dirname);
}
// 3. Get the feed_target_names and fetch_target_names
const std::vector<std::string>& feed_target_names =
inference_program->GetFeedTargetNames();
const std::vector<std::string>& fetch_target_names =
inference_program->GetFetchTargetNames();
// 4. Prepare inputs: set up maps for feed targets
std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
for (size_t i = 0; i < feed_target_names.size(); ++i) {
// Please make sure that cpu_feeds[i] is right for feed_target_names[i]
feed_targets[feed_target_names[i]] = cpu_feeds[i];
}
// 5. Define Tensor to get the outputs: set up maps for fetch targets
std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
for (size_t i = 0; i < fetch_target_names.size(); ++i) {
fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
}
// 6. Run the inference program
executor.Run(*inference_program, scope, feed_targets, fetch_targets);
delete scope;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
TEST(inference, image_classification) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
// 0. Call `paddle::framework::InitDevices()` to initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
int64_t batch_size = 1;
paddle::framework::LoDTensor input;
// Use normalized image pixels as input data,
// which should be in the range [0.0, 1.0].
SetupTensor<float>(input,
{batch_size, 3, 32, 32},
static_cast<float>(0),
static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
#endif
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
TEST(inference, label_semantic_roles) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
// 0. Call `paddle::framework::InitDevices()` to initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
paddle::framework::LoDTensor word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1,
ctx_p2, mark;
paddle::framework::LoD lod{{0, 4, 10}};
SetupLoDTensor(word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(
predicate, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(ctx_n2, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(ctx_n1, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(ctx_0, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(ctx_p1, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(ctx_p2, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(mark, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&word);
cpu_feeds.push_back(&predicate);
cpu_feeds.push_back(&ctx_n2);
cpu_feeds.push_back(&ctx_n1);
cpu_feeds.push_back(&ctx_0);
cpu_feeds.push_back(&ctx_p1);
cpu_feeds.push_back(&ctx_p2);
cpu_feeds.push_back(&mark);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.lod();
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.lod();
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
#endif
}
......@@ -13,93 +13,56 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <time.h>
#include <sstream>
#include "gflags/gflags.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/inference/io.h"
#include "test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
template <typename Place, typename T>
void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
// 1. Define place, executor and scope
auto place = Place();
auto executor = paddle::framework::Executor(place);
auto* scope = new paddle::framework::Scope();
// 2. Initialize the inference_program and load all parameters from file
auto inference_program = paddle::inference::Load(executor, *scope, dirname);
// 3. Get the feed_target_names and fetch_target_names
const std::vector<std::string>& feed_target_names =
inference_program->GetFeedTargetNames();
const std::vector<std::string>& fetch_target_names =
inference_program->GetFetchTargetNames();
// 4. Prepare inputs: set up maps for feed targets
std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
for (size_t i = 0; i < feed_target_names.size(); ++i) {
// Please make sure that cpu_feeds[i] is right for feed_target_names[i]
feed_targets[feed_target_names[i]] = cpu_feeds[i];
TEST(inference, recognize_digits) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
// 5. Define Tensor to get the outputs: set up maps for fetch targets
std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
for (size_t i = 0; i < fetch_target_names.size(); ++i) {
fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
}
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
// 6. Run the inference program
executor.Run(*inference_program, scope, feed_targets, fetch_targets);
// 0. Call `paddle::framework::InitDevices()` to initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
delete scope;
}
int64_t batch_size = 1;
template <typename T>
void SetupTensor(paddle::framework::LoDTensor& input,
paddle::framework::DDim dims,
T lower,
T upper) {
srand(time(0));
T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
for (int i = 0; i < input.numel(); ++i) {
input_ptr[i] =
(static_cast<T>(rand()) / static_cast<T>(RAND_MAX)) * (upper - lower) +
lower;
}
}
paddle::framework::LoDTensor input;
// Use normalized image pixels as input data,
// which should be in the range [-1.0, 1.0].
SetupTensor<float>(input,
{batch_size, 1, 28, 28},
static_cast<float>(-1),
static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
template <typename T>
void CheckError(paddle::framework::LoDTensor& output1,
paddle::framework::LoDTensor& output2) {
// Check lod information
EXPECT_EQ(output1.lod(), output2.lod());
EXPECT_EQ(output1.dims(), output2.dims());
EXPECT_EQ(output1.numel(), output2.numel());
T err = static_cast<T>(0);
if (typeid(T) == typeid(float)) {
err = 1E-3;
} else if (typeid(T) == typeid(double)) {
err = 1E-6;
} else {
err = 0;
}
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
size_t count = 0;
for (int64_t i = 0; i < output1.numel(); ++i) {
if (fabs(output1.data<T>()[i] - output2.data<T>()[i]) > err) {
count++;
}
}
EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
#endif
}
TEST(inference, recognize_digits) {
TEST(inference, recognize_digits_combine) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
......@@ -123,7 +86,7 @@ TEST(inference, recognize_digits) {
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace, float>(
TestInference<paddle::platform::CPUPlace, true>(
dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();
......@@ -133,7 +96,7 @@ TEST(inference, recognize_digits) {
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace, float>(
TestInference<paddle::platform::CUDAPlace, true>(
dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
TEST(inference, recommender_system) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
// 0. Call `paddle::framework::InitDevices()` to initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
int64_t batch_size = 1;
paddle::framework::LoDTensor user_id, gender_id, age_id, job_id, movie_id,
category_id, movie_title;
// Use the first data from paddle.dataset.movielens.test() as input
std::vector<int64_t> user_id_data = {1};
SetupTensor<int64_t>(user_id, {batch_size, 1}, user_id_data);
std::vector<int64_t> gender_id_data = {1};
SetupTensor<int64_t>(gender_id, {batch_size, 1}, gender_id_data);
std::vector<int64_t> age_id_data = {0};
SetupTensor<int64_t>(age_id, {batch_size, 1}, age_id_data);
std::vector<int64_t> job_id_data = {10};
SetupTensor<int64_t>(job_id, {batch_size, 1}, job_id_data);
std::vector<int64_t> movie_id_data = {783};
SetupTensor<int64_t>(movie_id, {batch_size, 1}, movie_id_data);
std::vector<int64_t> category_id_data = {10, 8, 9};
SetupLoDTensor<int64_t>(category_id, {3, 1}, {{0, 3}}, category_id_data);
std::vector<int64_t> movie_title_data = {1069, 4140, 2923, 710, 988};
SetupLoDTensor<int64_t>(movie_title, {5, 1}, {{0, 5}}, movie_title_data);
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&user_id);
cpu_feeds.push_back(&gender_id);
cpu_feeds.push_back(&age_id);
cpu_feeds.push_back(&job_id);
cpu_feeds.push_back(&movie_id);
cpu_feeds.push_back(&category_id);
cpu_feeds.push_back(&movie_title);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
#endif
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
TEST(inference, rnn_encoder_decoder) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
// 0. Call `paddle::framework::InitDevices()` to initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
paddle::framework::LoDTensor word_data, trg_word;
paddle::framework::LoD lod{{0, 4, 10}};
SetupLoDTensor(
word_data, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(
trg_word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&word_data);
cpu_feeds.push_back(&trg_word);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.lod();
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.lod();
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
#endif
}
......@@ -14,10 +14,3 @@ cc_library(paddle_memory
system_allocator)
cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
if(NOT WITH_C_API AND WITH_FLUID)
file(GLOB MEMORY_HEADERS *.h)
file(GLOB MEMORY_DETAIL_HEADERS detail/*.h)
install(FILES ${MEMORY_HEADERS} DESTINATION include/paddle/memory)
install(FILES ${MEMORY_DETAIL_HEADERS} DESTINATION include/paddle/memory/detail)
endif()
......@@ -81,5 +81,23 @@ class PODDeleter {
Place place_;
};
/**
* \brief Free a memory block in one place whose type does not meet the POD requirement
*
* \note In some cases, a custom deleter is used to
* deallocate the memory automatically for
* std::unique_ptr<T> in tensor.h.
*
*/
template <typename T, typename Place>
class PlainDeleter {
public:
explicit PlainDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
private:
Place place_;
};
} // namespace memory
} // namespace paddle
......@@ -62,7 +62,7 @@ function(op_library TARGET)
endif()
# Define operators that don't need pybind here.
foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "create_reader_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
......@@ -155,6 +155,7 @@ op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function)
op_library(cos_sim_op DEPS cos_sim_functor)
op_library(parallel_do_op DEPS executor)
op_library(create_reader_op DEPS reader)
# Register multiple kernels to pybind
if (WITH_GPU)
......@@ -185,7 +186,7 @@ list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
op_library(${src})
endforeach()
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(create_random_data_generator);\n")
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
......
......@@ -62,7 +62,7 @@ class CompareOpKernel
z->mutable_data<T>(context.GetPlace());
int axis = context.Attr<int>("axis");
ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, axis,
z);
Functor(), z);
}
};
......
......@@ -41,6 +41,21 @@ class ConditionalOp : public framework::OperatorBase {
});
return retv;
}
bool ScalarCondition(
const std::vector<const framework::LoDTensor *> &ips) const {
if (!(ips.size() == 1UL && ips[0]->IsInitialized())) {
PADDLE_THROW("should have one initialized input as condition");
}
if (!(ips[0]->type().hash_code() == typeid(bool).hash_code() &&
ips[0]->numel() == 1)) {
PADDLE_THROW(
"condition input's data type should be bool, "
"numel should be 1, actual numel is %d",
ips[0]->numel());
}
return ips[0]->data<bool>()[0];
}
};
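A standalone sketch (plain C++, not part of the commit) of the two run predicates that the Run() overrides below select between via the new is_scalar_condition attribute; tensor emptiness is modeled here by element counts:

#include <algorithm>
#include <cstdint>
#include <vector>

// Scalar mode: run the sub-block iff the single bool condition is true.
// Default mode: run iff every input tensor is non-empty.
bool NeedRun(bool is_scalar_condition, bool scalar_flag,
             const std::vector<int64_t>& input_numels) {
  if (is_scalar_condition) {
    return scalar_flag;
  }
  return std::all_of(input_numels.begin(), input_numels.end(),
                     [](int64_t numel) { return numel != 0; });
}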
class ConditionalBlockOp : public ConditionalOp {
......@@ -53,9 +68,15 @@ class ConditionalBlockOp : public ConditionalOp {
void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto xs = InputTensors(scope);
bool need_run = std::all_of(
xs.begin(), xs.end(),
[](const framework::LoDTensor *t) { return t->numel() != 0; });
bool need_run;
if (Attr<bool>("is_scalar_condition")) {
need_run = ScalarCondition(xs);
} else {
need_run = std::all_of(
xs.begin(), xs.end(),
[](const framework::LoDTensor *t) { return t->numel() != 0; });
}
if (need_run) {
auto *scope_var = scope.FindVar(Output("Scope"));
......@@ -88,6 +109,10 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
"scope is std::vector<Scope*>");
AddAttr<framework::BlockDesc *>(
"sub_block", "The step block of conditional block operator");
AddAttr<bool>("is_scalar_condition",
"the input X is used as scalar "
"condition")
.SetDefault(false);
AddComment(R"DOC(Conditional block operator
Run the sub-block if X is not empty. Params is the other inputs and Out is the
......@@ -106,9 +131,15 @@ class ConditionalBlockGradOp : public ConditionalOp {
void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto xs = this->InputTensors(scope);
bool need_run = std::all_of(
xs.begin(), xs.end(),
[](const framework::LoDTensor *t) { return t->numel() != 0; });
bool need_run;
if (Attr<bool>("is_scalar_condition")) {
need_run = ScalarCondition(xs);
} else {
need_run = std::all_of(
xs.begin(), xs.end(),
[](const framework::LoDTensor *t) { return t->numel() != 0; });
}
if (need_run) {
auto *scope_var = scope.FindVar(Input("Scope"));
......@@ -182,6 +213,7 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
grad_op->SetOutput(framework::GradVarName("Params"),
InputGrad("Params", false));
grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]);
grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition"));
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/framework/op_registry.h"
#include "paddle/framework/reader.h"
namespace paddle {
namespace operators {
static std::vector<framework::DDim> RestoreShapes(
const std::vector<int>& shape_concat, const std::vector<int>& ranks) {
std::vector<framework::DDim> res;
int offset = 0;
for (int len : ranks) {
auto start_it = shape_concat.begin() + offset;
auto end_it = start_it + len;
res.push_back(framework::make_ddim(std::vector<int>(start_it, end_it)));
offset += len;
}
return res;
}
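A minimal standalone sketch (plain std::vector, no Paddle types; not part of the commit) of the splitting rule RestoreShapes applies to the shape_concat/ranks attributes:

#include <cassert>
#include <vector>

// Each entry of `ranks` consumes that many values from `shape_concat`.
int main() {
  std::vector<int> shape_concat = {2, 3, 4, 5, 6};
  std::vector<int> ranks = {3, 2};
  std::vector<std::vector<int>> shapes;
  int offset = 0;
  for (int len : ranks) {
    shapes.emplace_back(shape_concat.begin() + offset,
                        shape_concat.begin() + offset + len);
    offset += len;
  }
  assert((shapes[0] == std::vector<int>{2, 3, 4}));  // first output: rank 3
  assert((shapes[1] == std::vector<int>{5, 6}));     // second output: rank 2
  return 0;
}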
// general infershape for file readers
class CreateFileReaderInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"The output file reader should not be null.");
const auto shape_concat =
ctx->Attrs().Get<std::vector<int>>("shape_concat");
const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
ctx->SetReaderDims("Out", shapes);
}
};
// general infershape for decorated readers
class CreateDecoratedReaderInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
"Input(UnderlyingReader) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"The output decorated reader should not be null.");
ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));
}
};
// general var type inference for all readers
class CreateReaderInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
std::string reader_name = op_desc.Output("Out")[0];
framework::VarDesc* reader = block->FindVarRecursive(reader_name);
reader->SetType(framework::proto::VarDesc::READER);
}
};
template <typename T>
class CreateRandomDataGeneratorOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
void Run(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
const auto& ranks = Attr<std::vector<int>>("ranks");
PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
static_cast<int>(shape_concat.size()),
"The sum of all ranks should be equal to the "
"length of shape_concat.");
std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(new framework::RandomDataGenerator<T>(shapes, Attr<float>("min"),
Attr<float>("max")));
}
};
class CreateRandomDataGeneratorOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(op_proto, op_checker) {
AddOutput("Out", "(ReaderHolder) The created random reader.");
AddAttr<std::vector<int>>("shape_concat",
"The concat of all data's shapes.");
AddAttr<std::vector<int>>(
"ranks",
"The ranks of each data."
"e.g."
"shape_concat = [2,3,4,5,6]"
"ranks = [3,2]"
"It means the reader will generate two data each time,"
"whose shapes are [2,3,4] and [5,6] respectively.");
AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
AddComment(R"DOC(
CreateRandomDataGenerator Operator
This Op creates a random reader.
The reader generates random data instead of actually reading from files.
The generated data follow a uniform distribution between 'min' and 'max'.
)DOC");
}
};
class CreateShuffleReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
void Run(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(new framework::ShuffleReader(underlying_reader.Get(),
Attr<int>("buffer_size")));
}
};
class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker {
public:
CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(op_proto, op_checker) {
AddInput(
"UnderlyingReader",
"(ReaderHolder) The underlying reader for creating a shuffle reader.");
AddOutput("Out", "(ReaderHolder) The created shuffle reader.");
AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
AddComment(R"DOC(
CreateShuffleReader Operator
A shuffle reader takes another reader as its 'underlying reader'
and yields the underlying reader's outputs in a shuffled order.
)DOC");
}
};
class CreateBatchReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
void Run(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(new framework::BatchReader(underlying_reader.Get(),
Attr<int>("batch_size")));
}
};
class CreateBatchReaderOpMaker : public framework::OpProtoAndCheckerMaker {
public:
CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(op_proto, op_checker) {
AddInput(
"UnderlyingReader",
"(ReaderHolder) The underlying reader for creating a batch reader.");
AddOutput("Out", "(ReaderHolder) The created batch reader.");
AddAttr<int>("batch_size",
"How many instances the batch reader yields each time.")
.GreaterThan(0);
AddComment(R"DOC(
CreateBatchReader Operator
A batch reader takes another reader as its 'underlying reader',
gathers the underlying reader's outputs and then yields them in batches.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(create_random_data_generator,
ops::CreateRandomDataGeneratorOp<float>,
ops::CreateFileReaderInferShape,
ops::CreateRandomDataGeneratorOpMaker,
paddle::framework::EmptyGradOpMaker,
ops::CreateReaderInferVarType);
REGISTER_OPERATOR(create_shuffle_reader, ops::CreateShuffleReaderOp,
ops::CreateDecoratedReaderInferShape,
ops::CreateShuffleReaderOpMaker,
paddle::framework::EmptyGradOpMaker,
ops::CreateReaderInferVarType);
REGISTER_OPERATOR(create_batch_reader, ops::CreateBatchReaderOp,
ops::CreateDecoratedReaderInferShape,
ops::CreateBatchReaderOpMaker,
paddle::framework::EmptyGradOpMaker,
ops::CreateReaderInferVarType);
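A hedged sketch (not in the commit) of how the three readers compose at the C++ level, mirroring the Reset() calls above. The constructor arguments are taken from those calls, but the assumption that ShuffleReader/BatchReader accept a framework::ReaderBase* underlying reader and that ReaderBase is their common base type is inferred, not verified; ownership handling is simplified for illustration:

#include <vector>

#include "paddle/framework/reader.h"

// random data -> shuffle(buffer_size) -> batch(batch_size), i.e. the chain a
// program would build with create_random_data_generator, create_shuffle_reader
// and create_batch_reader. The buffer and batch sizes are placeholders.
paddle::framework::ReaderBase* BuildRandomShuffledBatchReader(
    const std::vector<paddle::framework::DDim>& shapes) {
  auto* random = new paddle::framework::RandomDataGenerator<float>(
      shapes, /*min=*/0.0f, /*max=*/1.0f);
  auto* shuffled =
      new paddle::framework::ShuffleReader(random, /*buffer_size=*/100);
  return new paddle::framework::BatchReader(shuffled, /*batch_size=*/32);
}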
......@@ -80,6 +80,14 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
// resize output dims
output->Resize({static_cast<int64_t>(host_out_lod0.back()), 1});
if (host_out_lod0.back() == 0) {
output->Resize({1, 1});
output->mutable_data<T>(ctx.GetPlace());
math::SetConstant<platform::CUDADeviceContext, T> set_constant;
set_constant(ctx.template device_context<platform::CUDADeviceContext>(),
output, -1);
}
}
};
......
......@@ -16,6 +16,8 @@ limitations under the License. */
#include <string.h>
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
......@@ -65,9 +67,14 @@ class CTCAlignKernel : public framework::OpKernel<T> {
framework::LoD output_lod;
output_lod.push_back(output_lod0);
output->set_lod(output_lod);
// resize output dims
output->Resize({static_cast<int64_t>(output_lod0.back()), 1});
// for empty sequence
if (output_lod0.back() == 0) {
output->Resize({1, 1});
output_data = output->mutable_data<T>(ctx.GetPlace());
output_data[0] = -1;
}
}
};
......
......@@ -35,7 +35,8 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
AddFunctor<T>(), z);
}
};
......
......@@ -35,7 +35,8 @@ class ElementwiseDivKernel : public framework::OpKernel<T> {
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
DivFunctor<T>(), z);
}
};
......
......@@ -35,7 +35,8 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> {
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
MaxFunctor<T>(), z);
}
};
......
......@@ -35,7 +35,8 @@ class ElementwiseMinKernel : public framework::OpKernel<T> {
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
MinFunctor<T>(), z);
}
};
......
......@@ -34,7 +34,8 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
MulFunctor<T>(), z);
}
};
......
......@@ -365,10 +365,10 @@ template <typename Functor, typename DeviceContext, typename T,
typename OutType = T>
void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
const framework::Tensor* x,
const framework::Tensor* y, int axis,
const framework::Tensor* y, int axis, Functor func,
framework::Tensor* z) {
TransformFunctor<Functor, T, DeviceContext, OutType> functor(
x, y, z, ctx.template device_context<DeviceContext>(), Functor());
x, y, z, ctx.template device_context<DeviceContext>(), func);
auto x_dims = x->dims();
auto y_dims = y->dims();
......
......@@ -36,7 +36,8 @@ class ElementwisePowKernel : public framework::OpKernel<T> {
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
PowFunctor<T>(), z);
}
};
......
......@@ -34,7 +34,8 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
SubFunctor<T>(), z);
}
};
......
......@@ -21,13 +21,6 @@ using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename T>
using EigenMatrixMapRowMajor = Eigen::Map<
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
template <typename T>
using ConstEigenMatrixMapRowMajor = Eigen::Map<
const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
class LayerNormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......@@ -108,7 +101,6 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC(
Layer Normalization.
Layer Norm has been implemented as discussed in the paper:
https://arxiv.org/abs/1607.06450
...
......@@ -116,75 +108,6 @@ https://arxiv.org/abs/1607.06450
}
};
template <typename T>
class LayerNormKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
auto *output = ctx.Output<Tensor>("Y");
auto *mean = ctx.Output<Tensor>("Mean");
auto *var = ctx.Output<Tensor>("Variance");
output->mutable_data<T>(ctx.GetPlace());
mean->mutable_data<T>(ctx.GetPlace());
var->mutable_data<T>(ctx.GetPlace());
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
auto input_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
auto mean_map = EigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
auto var_map = EigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
auto output_map = EigenMatrixMapRowMajor<T>(output->data<T>(), left, right);
auto square = [](T ele) { return ele * ele; };
auto add_epsilon = [epsilon](T ele) { return ele + epsilon; };
mean_map = input_map.rowwise().mean();
var_map = (input_map - mean_map.replicate(1, right))
.unaryExpr(square)
.rowwise()
.mean()
.unaryExpr(add_epsilon);
auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
// TODO(zcd): Reconsider output_map: is it appropriate that `output_map`
// and `input_map` point to the same memory?
auto inv_std = var_map.unaryExpr(inv_std_func);
if (scale && bias) {
auto scale_map =
ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
output_map = (input_map - mean_map.replicate(1, right))
.cwiseProduct(inv_std.replicate(1, right))
.cwiseProduct(scale_map.replicate(left, 1)) +
bias_map.replicate(left, 1);
} else if (scale) {
auto scale_map =
ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
output_map = (input_map - mean_map.replicate(1, right))
.cwiseProduct(inv_std.replicate(1, right))
.cwiseProduct(scale_map.replicate(left, 1));
} else if (bias) {
auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
output_map = (input_map - mean_map.replicate(1, right))
.cwiseProduct(inv_std.replicate(1, right)) +
bias_map.replicate(left, 1);
} else {
output_map = (input_map - mean_map.replicate(1, right))
.cwiseProduct(inv_std.replicate(1, right));
}
}
};
class LayerNormGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......@@ -237,125 +160,6 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
}
};
template <typename T>
class LayerNormGradKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<Tensor>("X");
const auto *mean = ctx.Input<Tensor>("Mean");
const auto *var = ctx.Input<Tensor>("Variance");
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto &x_dims = x->dims();
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
// init output
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto x_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
auto d_y_map = ConstEigenMatrixMapRowMajor<T>(d_y->data<T>(), left, right);
auto mean_map = ConstEigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
auto var_map = ConstEigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace());
auto d_bias_map = EigenMatrixMapRowMajor<T>(d_bias->data<T>(), 1, right);
d_bias_map = d_y_map.colwise().sum();
}
if (d_scale) {
d_scale->mutable_data<T>(ctx.GetPlace());
auto d_scale_map =
EigenMatrixMapRowMajor<T>(d_scale->data<T>(), 1, right);
auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
// There are two equations to compute d_scale: one uses "Y" and the other
// does not use "Y".
d_scale_map =
((x_map - mean_map.replicate(1, right))
.cwiseProduct(
var_map.unaryExpr(inv_std_func).replicate(1, right))
.cwiseProduct(d_y_map))
.colwise()
.sum();
}
if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace());
auto d_x_map = EigenMatrixMapRowMajor<T>(d_x->data<T>(), left, right);
auto triple_product_func = [](T ele) { return ele * ele * ele; };
auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
// TODO(zcd): this code can be refined
if (d_scale) {
auto scale_map =
ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
// dy_dx
auto dx_end = var_map.unaryExpr(inv_std_func)
.replicate(1, right)
.cwiseProduct(d_y_map)
.cwiseProduct(scale_map.replicate(left, 1));
// dy_dmean_dx
auto dx_mean = (T(-1.0) / right) *
var_map.unaryExpr(inv_std_func)
.replicate(1, right)
.cwiseProduct(d_y_map)
.cwiseProduct(scale_map.replicate(left, 1))
.rowwise()
.sum()
.replicate(1, right);
// dy_var_dx
auto dvar_end_part = (x_map - mean_map.replicate(1, right))
.cwiseProduct(scale_map.replicate(left, 1))
.cwiseProduct(d_y_map)
.rowwise()
.sum();
auto dvar_end = var_map.unaryExpr(inv_std_func)
.unaryExpr(triple_product_func)
.cwiseProduct(dvar_end_part)
.replicate(1, right);
auto dx_var =
(T(-1.0) / right) *
(x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
d_x_map = dx_end + dx_mean + dx_var;
} else {
// dy_dx
auto dx_end = var_map.unaryExpr(inv_std_func)
.replicate(1, right)
.cwiseProduct(d_y_map);
// dy_dmean_dx
auto dx_mean = (T(-1.0) / right) *
var_map.unaryExpr(inv_std_func)
.replicate(1, right)
.cwiseProduct(d_y_map)
.rowwise()
.sum()
.replicate(1, right);
// dy_var_dx
auto dvar_end_part = (x_map - mean_map.replicate(1, right))
.cwiseProduct(d_y_map)
.rowwise()
.sum();
auto dvar_end = var_map.unaryExpr(inv_std_func)
.unaryExpr(triple_product_func)
.cwiseProduct(dvar_end_part)
.replicate(1, right);
auto dx_var =
(T(-1.0) / right) *
(x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
d_x_map = dx_end + dx_mean + dx_var;
}
}
}
};
} // namespace operators
} // namespace paddle
......@@ -363,8 +167,9 @@ namespace ops = paddle::operators;
REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
layer_norm_grad, ops::LayerNormGradOp);
REGISTER_OP_CPU_KERNEL(
layer_norm,
ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>);
layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::LayerNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
layer_norm_grad,
ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>);
ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/layer_norm_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
layer_norm,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
layer_norm_grad,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, double>);
......@@ -16,19 +16,222 @@ limitations under the License. */
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/elementwise_op_function.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename T>
struct SubAndSquareFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); }
};
template <typename T>
struct DivAndSqrtFunctor {
explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; }
inline HOSTDEVICE T operator()(T a, T b) const {
return a / (sqrt(b + epsilon_));
}
private:
T epsilon_;
};
template <typename T>
struct MulFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a * b; }
};
template <typename T>
struct AddFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
};
template <typename T>
struct SubFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a - b; }
};
template <typename T>
struct MulInvVarFunctor {
inline HOSTDEVICE T operator()(T a, T b) const {
return a * std::sqrt(1.0 / b);
}
};
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename DeviceContext, typename T>
class LayerNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
void Compute(const framework::ExecutionContext &ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
auto *scale = ctx.Input<Tensor>("Scale");
auto *bias = ctx.Input<Tensor>("Bias");
auto x = *ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Y");
auto *mean = ctx.Output<Tensor>("Mean");
auto *var = ctx.Output<Tensor>("Variance");
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
const auto x_dims = x.dims();
y->mutable_data<T>(ctx.GetPlace());
mean->mutable_data<T>(ctx.GetPlace());
var->mutable_data<T>(ctx.GetPlace());
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
framework::DDim matrix_shape({left, right});
x.Resize(matrix_shape);
Tensor out;
out.ShareDataWith(*y);
out.Resize(matrix_shape);
auto &dev_ctx = ctx.template device_context<DeviceContext>();
math::RowwiseMean<DeviceContext, T> row_mean;
// get mean
row_mean(dev_ctx, x, mean);
// get variance
ElementwiseComputeEx<SubAndSquareFunctor<T>, DeviceContext, T>(
ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor<T>(), &out);
row_mean(dev_ctx, out, var);
// get x_norm
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &out);
ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
ctx, &out, var, /*axis*/ 0,
DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &out);
if (scale) {
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &out, scale, /*axis*/ 1, MulFunctor<T>(), &out);
}
if (bias) {
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
ctx, &out, bias, /*axis*/ 1, AddFunctor<T>(), &out);
}
}
};
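For reference, a restatement (the standard layer normalization from the cited paper, not taken from the diff) of what the elementwise decomposition above computes for each row of the flattened [left, right] matrix, with H = right, \gamma = Scale and \beta = Bias:

\[
\mu = \frac{1}{H}\sum_{i=1}^{H} x_i, \qquad
\sigma^2 = \frac{1}{H}\sum_{i=1}^{H} (x_i - \mu)^2, \qquad
y_i = \gamma_i \, \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta_i
\]

row_mean yields \mu, SubAndSquareFunctor followed by row_mean yields \sigma^2, DivAndSqrtFunctor applies the epsilon-stabilized normalization, and MulFunctor/AddFunctor apply \gamma and \beta when Scale/Bias are provided.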
template <typename DeviceContext, typename T>
class LayerNormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
void Compute(const framework::ExecutionContext &ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
auto x = *ctx.Input<Tensor>("X");
auto *y = ctx.Input<Tensor>("Y");
auto *mean = ctx.Input<Tensor>("Mean");
auto *var = ctx.Input<Tensor>("Variance");
auto *scale = ctx.Input<Tensor>("Scale");
auto *bias = ctx.Input<Tensor>("Bias");
auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
// init output
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
const auto &x_dims = x.dims();
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
framework::DDim matrix_shape({left, right});
d_y.Resize(matrix_shape);
auto &dev_ctx = ctx.template device_context<DeviceContext>();
math::ColwiseSum<DeviceContext, T> colwise_sum;
Tensor temp;
Tensor temp_norm;
if (d_scale || d_x) {
x.Resize(matrix_shape);
temp.mutable_data<T>(matrix_shape, ctx.GetPlace());
if (!(bias && scale)) {
temp_norm.ShareDataWith(*y);
temp_norm.Resize(matrix_shape);
} else {
temp_norm.mutable_data<T>(matrix_shape, ctx.GetPlace());
// get x_norm
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &temp_norm);
ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
ctx, &temp_norm, var, /*axis*/ 0,
DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &temp_norm);
}
}
if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace());
colwise_sum(dev_ctx, d_y, d_bias);
}
if (d_scale) {
d_scale->mutable_data<T>(ctx.GetPlace());
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor<T>(), &temp);
colwise_sum(dev_ctx, temp, d_scale);
}
if (d_x) {
framework::DDim vec_shape({left});
d_x->mutable_data<T>(ctx.GetPlace());
auto dx_dim = d_x->dims();
Tensor temp_vec;
temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace());
math::RowwiseMean<DeviceContext, T> row_mean;
if (d_scale) {
// dy_dx
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &d_y, scale, /*axis*/ 1, MulFunctor<T>(), &temp);
framework::Copy(temp, ctx.GetPlace(), ctx.device_context(), d_x);
// dy_dmean_dx
row_mean(dev_ctx, temp, &temp_vec);
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor<T>(), d_x);
// dy_var_dx
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
} else {
// dy_dx
framework::Copy(d_y, ctx.GetPlace(), ctx.device_context(), d_x);
// dy_dmean_dx
row_mean(dev_ctx, d_y, &temp_vec);
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor<T>(), d_x);
// dy_var_dx
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
}
// dy_var_dx
row_mean(dev_ctx, temp, &temp_vec);
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor<T>(), &temp);
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
ctx, d_x, &temp, /*axis*/ 0, SubFunctor<T>(), d_x);
ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
ctx, d_x, var, /*axis*/ 0,
DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), d_x);
d_x->Resize(dx_dim);
}
}
};
} // namespace operators
......
......@@ -331,6 +331,12 @@ template struct RowwiseAdd<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, float>;
template struct ColwiseSum<platform::CPUDeviceContext, double>;
template struct RowwiseSum<platform::CPUDeviceContext, float>;
template struct RowwiseSum<platform::CPUDeviceContext, double>;
template struct RowwiseMean<platform::CPUDeviceContext, float>;
template struct RowwiseMean<platform::CPUDeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -325,6 +325,31 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
vector->data<double>());
}
template struct RowwiseSum<platform::CUDADeviceContext, float>;
// template struct RowwiseSum<platform::CUDADeviceContext, double>;
// TODO(zcd): This follows the ColwiseSum specialization; needs confirmation.
// The generic RowwiseSum<platform::CUDADeviceContext, double> failed in debug
// mode, and only for this case, so it is reimplemented below via gemv.
template <>
void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
const platform::CUDADeviceContext& context, const framework::Tensor& input,
framework::Tensor* vector) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]);
framework::Tensor one;
one.mutable_data<double>({size}, context.GetPlace());
SetConstant<platform::CUDADeviceContext, double> set;
set(context, &one, static_cast<double>(1.0));
gemv<platform::CUDADeviceContext, double>(
context, true, static_cast<int>(in_dims[1]), static_cast<int>(in_dims[0]),
1.0, one.data<double>(), input.data<double>(), 0.0,
vector->data<double>());
}
template struct RowwiseMean<platform::CUDADeviceContext, float>;
template struct RowwiseMean<platform::CUDADeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -128,6 +128,18 @@ struct ColwiseSum {
framework::Tensor* vec);
};
template <typename DeviceContext, typename T>
struct RowwiseSum {
void operator()(const DeviceContext& context, const framework::Tensor& input,
framework::Tensor* vec);
};
template <typename DeviceContext, typename T>
struct RowwiseMean {
void operator()(const DeviceContext& context, const framework::Tensor& input,
framework::Tensor* vec);
};
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -87,6 +87,88 @@ class ColwiseSum<platform::CPUDeviceContext, T> {
}
};
template <typename DeviceContext, typename T>
void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input,
framework::Tensor* out) {
auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out);
vec.device(*context.eigen_device()) = in.mean(Eigen::array<int, 1>({{1}}));
}
// TODO(zcd): This follows the ColwiseSum specialization; needs confirmation.
// Specialize for CPU, since Eigen implements a general reduce. However, a
// rowwise mean can be implemented directly, and the general reduce has a huge
// overhead on CPU.
template <typename T>
class RowwiseMean<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height);
auto inv_size = 1.0 / size;
T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
T sum = 0;
for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
sum += in_buf[i * size + j];
}
out_buf[i] = sum * inv_size;
}
}
};
template <typename DeviceContext, typename T>
void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input,
framework::Tensor* out) {
auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out);
vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{1}}));
}
// TODO(zcd): This follows the ColwiseSum specialization; needs confirmation.
// Specialize for CPU, since Eigen implements a general reduce. However, a
// rowwise sum can be implemented directly, and the general reduce has a huge
// overhead on CPU.
template <typename T>
class RowwiseSum<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size);
T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
T sum = 0;
for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
sum += in_buf[i * size + j];
}
out_buf[i] = sum;
}
}
};
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -287,6 +287,9 @@ TEST_F(NCCLTester, ncclBcastOp) {
}
int main(int argc, char **argv) {
// FIXME(tonyyang-svail):
// Due to a driver issue on our CI, this test is disabled for now
return 0;
const int dev_count = p::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
......
......@@ -76,18 +76,25 @@ inline void CopyOrShare(const framework::Variable &src,
if (src.IsType<LoDTensor>()) {
if (src.Get<LoDTensor>().place() == dst_place) {
dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod());
} else {
Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
framework::LoD lod(src.Get<LoDTensor>().lod());
lod.CopyToPeer(dst_place);
dst->GetMutable<LoDTensor>()->set_lod(lod);
}
} else if (src.IsType<SelectedRows>()) {
auto &src_sr = src.Get<SelectedRows>();
auto *dst_sr = dst->GetMutable<SelectedRows>();
dst_sr->set_rows(src_sr.rows());
dst_sr->set_height(src_sr.height());
if (src_sr.value().place() == dst_place) {
dst_sr->mutable_value()->ShareDataWith(src_sr.value());
dst_sr->set_rows(src_sr.rows());
} else {
Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
framework::Vector<int64_t> lod(src_sr.rows());
lod.CopyToPeer(dst_place);
dst_sr->set_rows(lod);
}
} else {
PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
......@@ -145,6 +152,9 @@ class ParallelDoOp : public framework::OperatorBase {
auto *sub_scope = sub_scopes[i];
auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
framework::Copy(src, place, dst);
framework::LoD lod(src.lod());
lod.CopyToPeer(place);
dst->set_lod(lod);
}
}
WaitOnPlaces(places);
......@@ -248,17 +258,19 @@ class ParallelDoGradOp : public framework::OperatorBase {
const std::vector<framework::Scope *> &sub_scopes,
const platform::PlaceList &places) const {
for (auto &s : Outputs(framework::GradVarName(kParameters))) {
VLOG(3) << "Accumulating " << s;
if (s == framework::kEmptyVarName) continue;
std::string tmp_name;
auto *tmp = sub_scopes[0]->Var(&tmp_name);
for (size_t i = 1; i < sub_scopes.size(); ++i) {
CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp);
WaitOnPlace(places[0]);
WaitOnPlaces(places);
auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
framework::AttributeMap{});
VLOG(3) << sum_op->DebugStringEx(sub_scopes[0]);
VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
sum_op->Run(*sub_scopes[0], places[0]);
WaitOnPlace(places[0]);
}
......@@ -334,16 +346,9 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
std::vector<std::string> input{kParameters, kInputs};
std::vector<std::string> output{kOutputs};
PADDLE_ENFORCE(ctx->HasInputs(kParameters));
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
PADDLE_ENFORCE(ctx->HasInputs(kInputs));
for (auto &s : output) {
PADDLE_ENFORCE(ctx->HasInputs(s));
}
PADDLE_ENFORCE(ctx->HasInputs(kOutputs));
ctx->SetOutputsDim(framework::GradVarName(kParameters),
ctx->GetInputsDim(kParameters));
......@@ -360,10 +365,14 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
ctx->SetDims({ig_name}, {i_dims[i]});
}
if (ctx->HasInputs(kParameters)) {
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
ctx->SetOutputsDim(framework::GradVarName(kParameters),
ctx->GetInputsDim(kParameters));
auto p_dims = ctx->GetInputsDim(kParameters);
auto pg_names = ctx->Outputs(framework::GradVarName(kParameters));
for (size_t i = 0; i < pg_names.size(); ++i) {
auto &pg_name = pg_names[i];
if (pg_name == framework::kEmptyVarName) {
continue;
}
ctx->SetDims({pg_name}, {p_dims[i]});
}
}
};
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/framework/op_registry.h"
#include "paddle/framework/reader.h"
namespace paddle {
namespace operators {
class ReadInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Reader"),
"The ReadOp must take a reader as input.");
PADDLE_ENFORCE(ctx->HasOutputs("Out"),
"The ReadOp should be assigned with output.");
std::vector<framework::DDim> reader_dims = ctx->GetReaderDims("Reader");
std::vector<std::string> out_names = ctx->Outputs("Out");
PADDLE_ENFORCE_EQ(
reader_dims.size(), out_names.size(),
"The reader's dim number doesn't match the output number.");
ctx->SetOutputsDim("Out", reader_dims);
}
};
class ReadInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
std::string reader_name = op_desc.Input("Reader")[0];
std::vector<std::string> out_names = op_desc.Output("Out");
framework::VarDesc* reader = block->FindVarRecursive(reader_name);
auto dtypes = reader->GetDataTypes();
PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
for (size_t i = 0; i < dtypes.size(); ++i) {
framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
out.SetType(framework::proto::VarDesc::LOD_TENSOR);
out.SetDataType(dtypes[i]);
}
}
};
class ReadOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
void Run(const framework::Scope& scope,
const platform::Place& dev_place) const override {
framework::ReaderHolder* reader =
scope.FindVar(Input("Reader"))->GetMutable<framework::ReaderHolder>();
if (!reader->HasNext()) {
reader->ReInit();
PADDLE_ENFORCE(
reader->HasNext(),
"Reader can not read the next data even it has been re-initialized.");
}
std::vector<std::string> out_arg_names = Outputs("Out");
std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins);
PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
for (size_t i = 0; i < ins.size(); ++i) {
auto* out =
scope.FindVar(out_arg_names[i])->GetMutable<framework::LoDTensor>();
out->ShareDataWith(ins[i]);
out->set_lod(ins[i].lod());
}
}
};
class ReadOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ReadOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(op_proto, op_checker) {
AddInput("Reader", "(ReaderHolder) The executed reader.");
AddOutput("Out", "(LoDTensor) The output data.").AsDuplicable();
AddComment(R"DOC(
Read Operator
Execute a given reader once and output data.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(read, ops::ReadOp, ops::ReadInferShape, ops::ReadOpMaker,
paddle::framework::EmptyGradOpMaker, ops::ReadInferVarType);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/target_assign_op.h"
namespace paddle {
namespace operators {
class TargetAssignOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
// check inputs
PADDLE_ENFORCE(ctx->HasInput("EncodedGTBBox"),
"Input(EncodedGTBBox) of TargetAssignOp should not be null");
PADDLE_ENFORCE(ctx->HasInput("GTScoreLabel"),
"Input(GTScoreLabel) of TargetAssignOp should not be null");
PADDLE_ENFORCE(ctx->HasInput("MatchIndices"),
"Input(MatchIndices) of TargetAssignOp should not be null");
PADDLE_ENFORCE(ctx->HasInput("NegIndices"),
"Input(NegIndices) of TargetAssignOp should not be null");
// check outputs
PADDLE_ENFORCE(
ctx->HasOutput("PredBBoxLabel"),
"Output(PredBBoxLabel) of TargetAssignOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("PredBBoxWeight"),
"Output(PredBBoxWeight) of TargetAssignOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("PredScoreLabel"),
"Output(PredScoreLabel) of TargetAssignOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("PredScoreWeight"),
"Output(PredScoreWeight) of TargetAssignOp should not be null.");
auto blabel_dims = ctx->GetInputDim("EncodedGTBBox");
auto slabel_dims = ctx->GetInputDim("GTScoreLabel");
auto mi_dims = ctx->GetInputDim("MatchIndices");
auto neg_dims = ctx->GetInputDim("NegIndices");
PADDLE_ENFORCE_EQ(blabel_dims.size(), 3UL,
"The rank of Input(EncodedGTBBox) must be 3.");
PADDLE_ENFORCE_EQ(slabel_dims.size(), 2UL,
"The rank of Input(GTScoreLabel) must be 2.");
PADDLE_ENFORCE_EQ(mi_dims.size(), 2UL,
"The rank of Input(MatchIndices) must be 2.");
PADDLE_ENFORCE_EQ(neg_dims.size(), 2UL,
"The rank of Input(NegIndices) must be 2.");
PADDLE_ENFORCE_EQ(blabel_dims[0], slabel_dims[0],
"The 1st dimension (means the total number of "
"ground-truth bounding boxes) of Input(EncodedGTBBox) "
"and Input(GTScoreLabel) must be the same.");
PADDLE_ENFORCE_EQ(blabel_dims[1], mi_dims[1],
"The 2nd dimension (means the number of priod boxes) "
"of Input(EncodedGTBBox) and "
"Input(MatchIndices) must be the same.");
PADDLE_ENFORCE_EQ(blabel_dims[2], 4,
"The 3rd dimension of Input(EncodedGTBBox) must be 4.");
auto n = mi_dims[0];
auto np = mi_dims[1];
ctx->SetOutputDim("PredBBoxLabel", {n, np, 4});
ctx->SetOutputDim("PredBBoxWeight", {n, np, 1});
ctx->SetOutputDim("PredScoreLabel", {n, np, 1});
ctx->SetOutputDim("PredScoreWeight", {n, np, 1});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>("EncodedGTBBox")->type()),
ctx.device_context());
}
};
class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
public:
TargetAssignOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("EncodedGTBBox",
"(LoDTensor), The encoded ground-truth bounding boxes with shape "
"[Ng, Np, 4], where Ng is the total number of ground-truth boxes "
"in this mini-batch, Np the number of predictions, 4 is the "
"number of coordinate in [xmin, ymin, xmax, ymax] layout.");
AddInput("GTScoreLabel",
"(LoDTensor, default LoDTensor<int>), The input ground-truth "
"labels with shape [Ng, 1], where the Ng is the same as it in "
"the input of EncodedGTBBox.");
AddInput("MatchIndices",
"(Tensor, default Tensor<int>), The input matched indices "
"with shape [N, Np], where N is the batch size, Np is the same "
"as it in the input of EncodedGTBBox. If MatchIndices[i][j] "
"is -1, the j-th prior box is not matched to any ground-truh "
"box in i-th instance.");
AddInput("NegIndices",
"(LoDTensor, default LoDTensor<int>), The input negative example "
"indices with shape [Neg, 1], where is the total number of "
"negative example indices.");
AddAttr<int>("background_label",
"(int, default 0), Label index of background class.")
.SetDefault(0);
AddOutput("PredBBoxLabel",
"(Tensor), The output encoded ground-truth labels "
"with shape [N, Np, 4], N is the batch size and Np, 4 is the "
"same as they in input of EncodedGTBBox. If MatchIndices[i][j] "
"is -1, the PredBBoxLabel[i][j][:] is the encoded ground-truth "
"box for background_label in i-th instance.");
AddOutput("PredBBoxWeight",
"(Tensor), The weight for PredBBoxLabel with the shape "
"of [N, Np, 1]");
AddOutput("PredScoreLabel",
"(Tensor, default Tensor<int>), The output score labels for "
"each predictions with shape [N, Np, 1]. If MatchIndices[i][j] "
"is -1, PredScoreLabel[i][j] = background_label.");
AddOutput("PredScoreWeight",
"(Tensor), The weight for PredScoreLabel with the shape "
"of [N, Np, 1]");
AddComment(R"DOC(
Given the encoded boxes between prior boxes and ground-truth boxes, together
with the ground-truth class labels, this operator assigns classification
and regression targets to each prior box as well as a weight to each
prior box. The weights are used to specify which prior boxes will not
contribute to the training loss.
For each instance, the output `PredBBoxLabel`, `PredBBoxWeight`,
`PredScoreLabel` and `PredScoreWeight` are assigned based on `MatchIndices`.
Assuming that the row offset for each instance in `EncodedGTBBox` is called lod,
this operator assigns classification/regression targets by performing the
following steps:
1. Assigning all outputs based on `MatchIndices`:
If id = MatchIndices[i][j] > 0,
PredBBoxLabel[i][j] = EncodedGTBBox[lod[i] + id][j]
PredBBoxWeight[i][j] = 1.
PredScoreLabel[i][j] = GTScoreLabel[lod[i] + id]
PredScoreWeight[i][j] = 1.
Otherwise,
PredBBoxLabel[i][j] = [0., 0., 0., 0.]
PredBBoxWeight[i][j] = 0.
PredScoreLabel[i][j] = background_label
PredScoreWeight[i][j] = 0.
2. Assigning PredScoreWeight based on `NegIndices`:
Assuming that the row offset for each instance in `NegIndices` is called neg_lod,
for the i-th instance and all ids of NegIndices in this instance:
PredScoreLabel[i][id] = background_label
PredScoreWeight[i][id] = 1.0
)DOC");
}
};
template <typename T>
struct NegTargetAssignFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx, const int* neg_indices,
const size_t* lod, const int num, const int num_prior_box,
const int background_label, int* out_label, T* out_label_wt) {
for (int i = 0; i < num; ++i) {
for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
int id = neg_indices[j];
out_label[i * num_prior_box + id] = background_label;
out_label_wt[i * num_prior_box + id] = static_cast<T>(1.0);
}
}
}
};
template struct NegTargetAssignFunctor<platform::CPUDeviceContext, float>;
template struct NegTargetAssignFunctor<platform::CPUDeviceContext, double>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(target_assign, ops::TargetAssignOp,
ops::TargetAssignOpMaker);
REGISTER_OP_CPU_KERNEL(
target_assign,
ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, float>,
ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/target_assign_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void NegTargetAssignKernel(const int* neg_indices, const size_t* lod,
const int num, const int num_prior_box,
const int background_label,
int* out_label, T* out_label_wt) {
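// One CUDA block per instance: threads of block bidx stride over that
// instance's negative indices in [lod[bidx], lod[bidx + 1]) and mark the
// corresponding prior boxes as background with label weight 1.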
int bidx = blockIdx.x;
int st = lod[bidx];
int ed = lod[bidx + 1];
int row_start = bidx * num_prior_box;
for (int i = st + threadIdx.x; i < ed; i += blockDim.x) {
int id = row_start + neg_indices[i];
out_label[id] = background_label;
out_label_wt[id] = 1.;
}
}
template <typename T>
struct NegTargetAssignFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const int* neg_indices, const size_t* lod, const int num,
const int num_prior_box, const int background_label,
int* out_label, T* out_label_wt) {
const int block_size = 256;
const int grid_size = num;
NegTargetAssignKernel<T><<<grid_size, block_size, 0, ctx.stream()>>>(
neg_indices, lod, num, num_prior_box, background_label, out_label,
out_label_wt);
}
};
template struct NegTargetAssignFunctor<platform::CUDADeviceContext, float>;
template struct NegTargetAssignFunctor<platform::CUDADeviceContext, double>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
target_assign,
ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, float>,
ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/platform/assert.h"
#include "paddle/platform/for_range.h"
namespace paddle {
namespace operators {
template <typename T>
struct TargetAssignFunctor {
const T* gt_box_;
const int* gt_label_;
const int* match_indices_;
const size_t* lod_;
const int background_label_;
const int64_t num_;
const int64_t num_prior_box_;
T* out_box_;
T* out_box_wt_;
int* out_label_;
T* out_label_wt_;
TargetAssignFunctor(const T* gt_box, const int* gt_label,
const int* match_indices, const size_t* lod,
const int background_label, const int64_t num,
const int64_t np, T* out_box, T* out_box_wt,
int* out_label, T* out_label_wt)
: gt_box_(gt_box),
gt_label_(gt_label),
match_indices_(match_indices),
lod_(lod),
background_label_(background_label),
num_(num),
num_prior_box_(np),
out_box_(out_box),
out_box_wt_(out_box_wt),
out_label_(out_label),
out_label_wt_(out_label_wt) {}
HOSTDEVICE void operator()(size_t i) const {
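// i enumerates the flattened [num_, num_prior_box_] grid: row is the
// instance index and col is the prior-box index.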
int row = i / num_prior_box_;
int col = i - row * num_prior_box_;
size_t row_off = lod_[row];
int offset = row * num_prior_box_ + col;
int id = match_indices_[offset];
T* obox = out_box_ + offset * 4;
int* olabel = out_label_ + offset;
T* obox_wt = out_box_wt_ + offset;
T* olabel_wt = out_label_wt_ + offset;
if (id > -1) {
const T* gtbox = gt_box_ + ((row_off + id) * num_prior_box_ + col) * 4;
obox[0] = gtbox[0];
obox[1] = gtbox[1];
obox[2] = gtbox[2];
obox[3] = gtbox[3];
olabel[0] = gt_label_[row_off + id];
obox_wt[0] = static_cast<T>(1.);
olabel_wt[0] = static_cast<T>(1.);
} else {
obox[0] = static_cast<T>(0.);
obox[1] = static_cast<T>(0.);
obox[2] = static_cast<T>(0.);
obox[3] = static_cast<T>(0.);
olabel[0] = background_label_;
obox_wt[0] = static_cast<T>(0.);
olabel_wt[0] = static_cast<T>(0.);
}
}
};
template <typename DeviceContext, typename T>
struct NegTargetAssignFunctor {
void operator()(const platform::DeviceContext& ctx, const int* neg_indices,
const size_t* lod, const int num, const int num_prior_box,
const int background_label, int* out_label,
T* out_label_wt) const;
};
template <typename DeviceContext, typename T>
class TargetAssignKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* enc_gt_box = ctx.Input<framework::LoDTensor>("EncodedGTBBox");
auto* gt_label = ctx.Input<framework::LoDTensor>("GTScoreLabel");
auto* match_indices = ctx.Input<framework::Tensor>("MatchIndices");
auto* neg_indices = ctx.Input<framework::LoDTensor>("NegIndices");
auto* out_box = ctx.Output<framework::Tensor>("PredBBoxLabel");
auto* out_box_wt = ctx.Output<framework::Tensor>("PredBBoxWeight");
auto* out_label = ctx.Output<framework::Tensor>("PredScoreLabel");
auto* out_label_wt = ctx.Output<framework::Tensor>("PredScoreWeight");
PADDLE_ENFORCE_EQ(enc_gt_box->lod().size(), 1UL);
PADDLE_ENFORCE_EQ(gt_label->lod().size(), 1UL);
PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
int background_label = ctx.Attr<int>("background_label");
const T* box_data = enc_gt_box->data<T>();
const int* label_data = gt_label->data<int>();
const int* match_idx_data = match_indices->data<int>();
const int* neg_idx_data = neg_indices->data<int>();
T* obox_data = out_box->mutable_data<T>(ctx.GetPlace());
T* obox_wt_data = out_box_wt->mutable_data<T>(ctx.GetPlace());
int* olabel_data = out_label->mutable_data<int>(ctx.GetPlace());
T* olabel_wt_data = out_label_wt->mutable_data<T>(ctx.GetPlace());
int64_t num = match_indices->dims()[0];
int64_t num_prior_box = match_indices->dims()[1];
auto gt_lod = enc_gt_box->lod().back();
auto gt_label_lod = gt_label->lod().back();
auto neg_lod = neg_indices->lod().back();
for (size_t i = 0; i < gt_lod.size(); ++i) {
PADDLE_ENFORCE_EQ(gt_lod.data()[i], gt_label_lod.data()[i]);
}
size_t* gt_lod_data = gt_lod.data(ctx.GetPlace());
size_t* neg_lod_data = neg_lod.data(ctx.GetPlace());
TargetAssignFunctor<T> functor(box_data, label_data, match_idx_data,
gt_lod_data, background_label, num,
num_prior_box, obox_data, obox_wt_data,
olabel_data, olabel_wt_data);
auto& device_ctx = ctx.template device_context<DeviceContext>();
platform::ForRange<DeviceContext> for_range(device_ctx,
num * num_prior_box);
for_range(functor);
NegTargetAssignFunctor<DeviceContext, T> neg_trg_functor;
neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, num, num_prior_box,
background_label, olabel_data, olabel_wt_data);
}
};
} // namespace operators
} // namespace paddle
......@@ -39,11 +39,3 @@ nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)
cc_library(profiler SRCS profiler.cc DEPS device_context)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
if(NOT WITH_C_API AND WITH_FLUID)
file(GLOB PLATFORM_HEADERS *.h)
file(GLOB PLATFORM_dynload_HEADERS dynload/*.h)
install(FILES ${PLATFORM_HEADERS} DESTINATION include/paddle/platform)
install(FILES ${PLATFORM_HEADERS} DESTINATION include/paddle/platform/dynload)
install(FILES details/device_ptr_cast.h DESTINATION include/paddle/platform/details)
endif()
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
......
......@@ -127,6 +127,9 @@ TEST(NCCL, all_reduce) {
} // namespace paddle
int main(int argc, char** argv) {
// FIXME(tonyyang-svail):
// Due to the driver issue on our CI, disable for now
return 0;
dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
......
......@@ -223,8 +223,6 @@ void BindVarDsec(py::module &m) {
.def("set_shapes", &VarDesc::SetShapes)
.def("set_dtype", &VarDesc::SetDataType)
.def("set_dtypes", &VarDesc::SetDataTypes)
.def("set_tensor_num", &VarDesc::SetTensorDescNum)
.def("tensor_num", &VarDesc::GetTensorDescNum)
.def("shape", &VarDesc::GetShape, py::return_value_policy::reference)
.def("shapes", &VarDesc::GetShapes, py::return_value_policy::reference)
.def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
......
......@@ -2,9 +2,3 @@ cc_library(stringpiece SRCS piece.cc)
cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
cc_test(to_string_test SRCS to_string_test.cc)
if(NOT WITH_C_API AND WITH_FLUID)
file(GLOB STRING_HEADERS *.h)
install(FILES ${STRING_HEADERS} DESTINATION include/paddle/string)
install(FILES tinyformat/tinyformat.h DESTINATION include/paddle/string/tinyformat)
endif()
......@@ -324,6 +324,9 @@ class DistributeTranspiler:
pass
return orig_shape
def _op_input_var(self, op, varname):
pass
def _is_op_on_pserver(self, endpoint, all_ops, idx):
"""
Recursively check if the op needs to run on the current server.
......@@ -333,29 +336,35 @@ class DistributeTranspiler:
p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
]
op = all_ops[idx]
if op.inputs.has_key("Param"):
if op.inputs["Param"].name in param_names:
input_names = set(op.input_names)
# TODO(typhoonzero): use the Param and Grad input names to identify
# whether the operator is an optimization operator; a better way is needed.
if "Param" in input_names:
if op.input("Param")[0] in param_names:
return True
else:
for n in param_names:
if same_or_split_var(n, op.inputs[
"Param"].name) and n != op.inputs["Param"].name:
if same_or_split_var(n, op.input("Param")[0]) \
and n != op.input("Param")[0]:
return True
return False
else:
j = idx - 1
while j >= 0:
prev_op = all_ops[j]
prev_output_names = [o.name for o in prev_op.outputs.values()]
prev_input_names = [o.name for o in prev_op.inputs.values()]
# prev_output_names = [o.name for o in prev_op.outputs.values()]
# prev_input_names = [o.name for o in prev_op.inputs.values()]
# NOTE(typhoonzero): consider list input/output
prev_output_names = prev_op.desc.output_arg_names()
prev_input_names = prev_op.desc.input_arg_names()
found1 = False
found2 = False
for _, v in op.inputs.iteritems():
if v.name in prev_output_names:
for varname in op.desc.input_arg_names():
if varname in prev_output_names:
found1 = self._is_op_on_pserver(endpoint, all_ops, j)
# later ops may produce output for prev op's next batch use.
for _, v in op.outputs.iteritems():
if v.name in prev_input_names:
for varname in op.desc.output_arg_names():
if varname in prev_input_names:
found2 = self._is_op_on_pserver(endpoint, all_ops, j)
if found1 or found2:
return True
......@@ -366,11 +375,11 @@ class DistributeTranspiler:
new_inputs = dict()
# update param/grad shape first, then other inputs like
# moment can use the updated shape
for key, var in opt_op.inputs.iteritems():
for key in opt_op.input_names:
if key == "Grad":
grad_block = None
for g in self.param_grad_ep_mapping[endpoint]["grads"]:
if same_or_split_var(g.name, var.name):
if same_or_split_var(g.name, opt_op.input(key)[0]):
grad_block = g
break
if not grad_block:
......@@ -400,7 +409,7 @@ class DistributeTranspiler:
# param is already created on global program
param_block = None
for p in self.param_grad_ep_mapping[endpoint]["params"]:
if same_or_split_var(p.name, var.name):
if same_or_split_var(p.name, opt_op.input(key)[0]):
param_block = p
break
if not param_block:
......@@ -413,11 +422,12 @@ class DistributeTranspiler:
new_inputs[key] = tmpvar
for key, var in opt_op.inputs.iteritems():
for key in opt_op.input_names:
if key in ["Param", "Grad"]:
continue
# update accumulator variable shape
param_shape = new_inputs["Param"].shape
var = program.global_block().vars[opt_op.input(key)[0]]
new_shape = self._get_optimizer_input_shape(opt_op.type, key,
var.shape, param_shape)
tmpvar = program.global_block().create_var(
......@@ -436,30 +446,44 @@ class DistributeTranspiler:
shape=new_shape)
# change output's ParamOut variable
opt_op.outputs["ParamOut"] = new_inputs["Param"]
outputs = self._get_output_map_from_op(program.global_block(), opt_op)
outputs["ParamOut"] = new_inputs["Param"]
program.global_block().append_op(
type=opt_op.type,
inputs=new_inputs,
outputs=opt_op.outputs,
outputs=outputs,
attrs=opt_op.attrs)
def _append_pserver_non_opt_ops(self, program, pserver_program, opt_op):
# Append the ops for parameters that do not need to be optimized/updated
for _, var in opt_op.inputs.iteritems():
program.global_block().create_var(
name=var.name,
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
pserver_program.global_block().create_var(
name=var.name,
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
inputs = self._get_input_map_from_op(self.program.global_block().vars,
opt_op)
for var in inputs.itervalues():
if type(var) == list:
varlist = var
else:
varlist = [var]
for var in varlist:
# TODO(typhoonzero): will remove the lines below later.
program.global_block().create_var(
name=var.name,
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
if not pserver_program.global_block().vars.has_key(var.name):
pserver_program.global_block().create_var(
name=var.name,
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
outputs = self._get_output_map_from_op(self.program.global_block().vars,
opt_op)
program.global_block().append_op(
type=opt_op.type,
inputs=opt_op.inputs,
outputs=opt_op.outputs,
inputs=inputs,
outputs=outputs,
attrs=opt_op.attrs)
def get_pserver_program(self, endpoint):
......@@ -503,7 +527,7 @@ class DistributeTranspiler:
self.optimize_ops, idx)
if not is_op_on_pserver:
continue
if opt_op.inputs.has_key("Grad"):
if "Grad" in opt_op.desc.input_arg_names():
self._append_pserver_ops(optimize_sub_program, pserver_program,
opt_op, endpoint)
else:
......@@ -530,6 +554,30 @@ class DistributeTranspiler:
pserver_program.sync_with_cpp()
return pserver_program
def _get_input_map_from_op(self, varmap, op):
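"""Map each input slot name of op to its Variable(s) from varmap: a slot holding a single variable maps to that Variable, otherwise to a list of Variables."""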
iomap = dict()
for key in op.input_names:
vars = []
for varname in op.input(key):
vars.append(varmap[varname])
if len(vars) == 1:
iomap[key] = vars[0]
else:
iomap[key] = vars
return iomap
def _get_output_map_from_op(self, varmap, op):
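"""Map each output slot name of op to its Variable(s) from varmap, with the same single-variable/list convention as _get_input_map_from_op."""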
iomap = dict()
for key in op.output_names:
vars = []
for varname in op.output(key):
vars.append(varmap[varname])
if len(vars) == 1:
iomap[key] = vars[0]
else:
iomap[key] = vars
return iomap
def get_startup_program(self, endpoint, pserver_program):
"""
Get startup program for current parameter server.
......@@ -560,17 +608,21 @@ class DistributeTranspiler:
# 2. rename op outputs
for op in orig_s_prog.global_block().ops:
new_inputs = dict()
new_outputs = dict()
# do not append startup op if var is not on this pserver
op_on_pserver = False
for key, var in op.outputs.iteritems():
newname, _ = _get_splited_name_and_shape(var.name)
for key in op.output_names:
newname, _ = _get_splited_name_and_shape(op.output(key)[0])
if newname:
op_on_pserver = True
new_outputs[key] = created_var_map[newname]
elif var.name in pserver_vars:
elif op.output(key)[0] in pserver_vars:
op_on_pserver = True
new_outputs[key] = pserver_vars[var.name]
new_outputs[key] = pserver_vars[op.output(key)[0]]
# most startup program ops have no inputs
new_inputs = self._get_input_map_from_op(pserver_vars, op)
if op_on_pserver:
if op.type in [
......@@ -579,7 +631,7 @@ class DistributeTranspiler:
op.attrs["shape"] = new_outputs["Out"].shape
s_prog.global_block().append_op(
type=op.type,
inputs=op.inputs,
inputs=new_inputs,
outputs=new_outputs,
attrs=op.attrs)
return s_prog
......@@ -47,27 +47,13 @@ def as_numpy(tensor):
return [as_numpy(t) for t in tensor]
assert isinstance(tensor, core.LoDTensor)
lod = tensor.lod()
tensor_data = np.array(tensor)
if len(lod) == 0:
ans = tensor_data
else:
raise RuntimeError("LoD Calculate lacks unit tests and buggy")
# elif len(lod) == 1:
# ans = []
# idx = 0
# while idx < len(lod) - 1:
# ans.append(tensor_data[lod[idx]:lod[idx + 1]])
# idx += 1
# else:
# for l in reversed(lod):
# ans = []
# idx = 0
# while idx < len(l) - 1:
# ans.append(tensor_data[l[idx]:l[idx + 1]])
# idx += 1
# tensor_data = ans
# ans = tensor_data
return ans
if len(lod) > 0:
raise RuntimeError(
"Some of your featched tensors hold LoD information. \
They can not be completely cast to Python ndarray. \
Please set the parameter 'return_numpy' as 'False' to \
return LoDTensor itself directly.")
return np.array(tensor)
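A minimal sketch of how a caller is expected to react to this error; `exe`, `feeder`, `data` and `some_lod_var` are assumed to come from an existing training script (and `np` from `import numpy as np`), so this is a fragment rather than a complete program.
# Fetch a variable that carries LoD information: pass return_numpy=False so
# the Executor returns the LoDTensor itself instead of trying to cast it.
outs = exe.run(fluid.default_main_program(),
               feed=feeder.feed(data),
               fetch_list=[some_lod_var],
               return_numpy=False)
lod_tensor = outs[0]
print(lod_tensor.lod())          # the LoD is preserved
dense = np.array(lod_tensor)     # raw data only; the LoD is dropped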
def has_feed_operators(block, feed_targets, feed_holder_name):
......@@ -306,7 +292,6 @@ class Executor(object):
core.get_fetch_variable(scope, fetch_var_name, i)
for i in xrange(len(fetch_list))
]
if return_numpy:
outs = as_numpy(outs)
return outs
......@@ -768,6 +768,9 @@ class Block(object):
raise e
self.desc.remove_op(start, end + 1)
def slice_ops(self, start, end):
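"""Return the block's ops in the half-open index range [start, end)."""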
return list(self.ops)[start:end]
def prepend_op(self, *args, **kwargs):
op_desc = self.desc.prepend_op()
op = Operator(self, op_desc, *args, **kwargs)
......
......@@ -342,7 +342,11 @@ def save_inference_model(dirname,
prepend_feed_ops(inference_program, feeded_var_names)
append_fetch_ops(inference_program, fetch_var_names)
model_file_name = dirname + "/__model__"
if save_file_name is None:
model_file_name = dirname + "/__model__"
else:
model_file_name = dirname + "/__model_combined__"
with open(model_file_name, "wb") as f:
f.write(inference_program.desc.serialize_to_string())
......@@ -384,7 +388,11 @@ def load_inference_model(dirname, executor, load_file_name=None):
if not os.path.isdir(dirname):
raise ValueError("There is no directory named '%s'", dirname)
model_file_name = dirname + "/__model__"
if load_file_name is None:
model_file_name = dirname + "/__model__"
else:
model_file_name = dirname + "/__model_combined__"
with open(model_file_name, "rb") as f:
program_desc_str = f.read()
......
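A hedged sketch of the two save/load modes introduced above; the directory, feed/fetch names and the combined file name are illustrative, and the calls mirror the ones used in the recognize_digits test later in this change.
# Default mode: one file per parameter, program desc written to "__model__".
fluid.io.save_inference_model(save_dirname, ["img"], [prediction], exe)
[program, feed_names, fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)

# Combined mode: parameters in one file, program desc in "__model_combined__".
fluid.io.save_inference_model(save_dirname, ["img"], [prediction], exe,
                              save_file_name="__params_combined__")
[program, feed_names, fetch_targets] = fluid.io.load_inference_model(
    save_dirname, exe, "__params_combined__")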
......@@ -18,6 +18,7 @@ from tensor import assign, fill_constant
from .. import core
from ..framework import Program, Variable, Operator
from ..layer_helper import LayerHelper, unique_name
from ops import logical_and, logical_not, logical_or
__all__ = [
'split_lod_tensor',
......@@ -27,6 +28,7 @@ __all__ = [
'StaticRNNMemoryLink',
'WhileGuard',
'While',
'Switch',
'lod_rank_table',
'max_sequence_len',
'topk',
......@@ -36,6 +38,7 @@ __all__ = [
'array_write',
'create_array',
'less_than',
'equal',
'array_read',
'shrink_memory',
'array_length',
......@@ -274,21 +277,20 @@ class ParallelDo(object):
parent_block = self.parent_block()
local_inputs = set()
for op in current_block.ops:
for oname in op.output_names:
for out_var_name in op.output(oname):
local_inputs.add(out_var_name)
params = list()
for var in self.inputs:
local_inputs.add(var.name)
params = list()
for op in current_block.ops:
for iname in op.input_names:
for in_var_name in op.input(iname):
if in_var_name not in local_inputs:
params.append(in_var_name)
for oname in op.output_names:
for out_var_name in op.output(oname):
local_inputs.add(out_var_name)
params = list(set(params))
return [parent_block.var(name) for name in params]
......@@ -973,6 +975,36 @@ def less_than(x, y, cond=None, **ignored):
return cond
def equal(x, y, cond=None, **ignored):
"""
**equal**
This layer returns the truth value of :math:`x == y` elementwise.
Args:
x(Variable): First operand of *equal*
y(Variable): Second operand of *equal*
cond(Variable|None): Optional output variable to store the result of *equal*
Returns:
Variable: The tensor variable storing the output of *equal*.
Examples:
.. code-block:: python
cond = fluid.layers.equal(x=label, y=limit)
"""
helper = LayerHelper("equal", **locals())
if cond is None:
cond = helper.create_tmp_variable(dtype='bool')
cond.stop_gradient = True
helper.append_op(
type='equal', inputs={'X': [x],
'Y': [y]}, outputs={'Out': [cond]})
return cond
def array_read(array, i):
"""This function performs the operation to read the data in as an
LOD_TENSOR_ARRAY.
......@@ -1063,11 +1095,12 @@ class ConditionalBlockGuard(BlockGuard):
class ConditionalBlock(object):
def __init__(self, inputs, name=None):
def __init__(self, inputs, is_scalar_condition=False, name=None):
for each_input in inputs:
if not isinstance(each_input, Variable):
raise TypeError("Each input should be variable")
self.inputs = inputs
self.is_scalar_condition = is_scalar_condition
self.helper = LayerHelper('conditional_block', name=name)
def block(self):
......@@ -1112,7 +1145,66 @@ class ConditionalBlock(object):
},
outputs={'Out': out_list,
'Scope': [step_scope]},
attrs={'sub_block': inside_block})
attrs={
'sub_block': inside_block,
'is_scalar_condition': self.is_scalar_condition
})
class Switch(object):
def __init__(self, name=None):
self.helper = LayerHelper('switch', name=name)
self.inside_scope = False
self.pre_not_conditions = []
def case(self, condition):
"""create a new block for this condition
"""
if not self.inside_scope:
raise ValueError("case should be called inside with")
if len(self.pre_not_conditions) == 0:
cond_block = ConditionalBlock([condition], is_scalar_condition=True)
not_cond = logical_not(x=condition)
self.pre_not_conditions.append(not_cond)
else:
pre_cond_num = len(self.pre_not_conditions)
pre_not_cond = self.pre_not_conditions[pre_cond_num - 1]
new_not_cond = logical_and(
x=pre_not_cond, y=logical_not(x=condition))
self.pre_not_conditions.append(new_not_cond)
cond_block = ConditionalBlock(
[logical_and(
x=pre_not_cond, y=condition)],
is_scalar_condition=True)
return ConditionalBlockGuard(cond_block)
def default(self):
"""create a default case for this switch
"""
pre_cond_num = len(self.pre_not_conditions)
if pre_cond_num == 0:
raise ValueError("there should be at least one condition")
cond_block = ConditionalBlock(
[self.pre_not_conditions[pre_cond_num - 1]],
is_scalar_condition=True)
return ConditionalBlockGuard(cond_block)
def __enter__(self):
"""
Set the flag indicating that we are now inside the switch block.
:return:
"""
self.inside_scope = True
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.inside_scope = False
if exc_type is not None:
return False # re-raise exception
return True
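A minimal usage sketch of Switch, written as user code via fluid.layers; the variables `step` and `lr` are assumed to exist already (e.g. created with create_global_var), and the pattern mirrors the one used by piecewise_decay later in this change.
# Assumed: `step` is a float32 Variable holding the global step and `lr` is a
# persistable Variable that will receive the selected value.
boundary = fluid.layers.fill_constant(shape=[1], dtype='float32', value=10000.0)
warmup_lr = fluid.layers.fill_constant(shape=[1], dtype='float32', value=0.5)
base_lr = fluid.layers.fill_constant(shape=[1], dtype='float32', value=0.1)
with fluid.layers.Switch() as switch:
    with switch.case(fluid.layers.less_than(x=step, y=boundary)):
        fluid.layers.assign(input=warmup_lr, output=lr)
    with switch.default():
        fluid.layers.assign(input=base_lr, output=lr)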
class IfElseBlockGuard(object):
......
......@@ -92,7 +92,7 @@ def fc(input,
.. math::
Out = Act({\sum_{i=0}^{N-1}W_iX_i + b})
Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
In the above equation:
......@@ -410,12 +410,12 @@ def dynamic_lstmp(input,
"""
**Dynamic LSTMP Layer**
LSTMP (LSTM with recurrent projection) layer has a separate projection
layer after the LSTM layer, projecting the original hidden state to a
lower-dimensional one, which is proposed to reduce the number of total
parameters and furthermore computational complexity for the LSTM,
especially for the case that the size of output units is relatively
large (https://research.google.com/pubs/archive/43905.pdf).
LSTMP (LSTM with recurrent projection) layer has a separate projection
layer after the LSTM layer, projecting the original hidden state to a
lower-dimensional one, which is proposed to reduce the number of total
parameters and furthermore computational complexity for the LSTM,
especially for the case that the size of output units is relatively
large (https://research.google.com/pubs/archive/43905.pdf).
The formula is as follows:
......@@ -441,27 +441,27 @@ def dynamic_lstmp(input,
the matrix of weights from the input gate to the input).
* :math:`W_{ic}`, :math:`W_{fc}`, :math:`W_{oc}`: Diagonal weight \
matrices for peephole connections. In our implementation, \
we use vectors to represent these diagonal weight matrices.
we use vectors to represent these diagonal weight matrices.
* :math:`b`: Denotes bias vectors (e.g. :math:`b_i` is the input gate \
bias vector).
bias vector).
* :math:`\sigma`: The activation, such as logistic sigmoid function.
* :math:`i, f, o` and :math:`c`: The input gate, forget gate, output \
gate, and cell activation vectors, respectively, all of which have \
the same size as the cell output activation vector :math:`h`.
the same size as the cell output activation vector :math:`h`.
* :math:`h`: The hidden state.
* :math:`r`: The recurrent projection of the hidden state.
* :math:`r`: The recurrent projection of the hidden state.
* :math:`\\tilde{c_t}`: The candidate hidden state, whose \
computation is based on the current input and previous hidden state.
* :math:`\odot`: The element-wise product of the vectors.
* :math:`\odot`: The element-wise product of the vectors.
* :math:`act_g` and :math:`act_h`: The cell input and cell output \
activation functions and `tanh` is usually used for them.
activation functions and `tanh` is usually used for them.
* :math:`\overline{act_h}`: The activation function for the projection \
output, usually using `identity` or same as :math:`act_h`.
Set `use_peepholes` to `False` to disable peephole connection. The formula
is omitted here, please refer to the paper
http://www.bioinf.jku.at/publications/older/2604.pdf for details.
Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}`
operations on the input :math:`x_{t}` are NOT included in this operator.
Users can choose to use fully-connected layer before LSTMP layer.
......@@ -479,8 +479,8 @@ def dynamic_lstmp(input,
- Hidden-hidden weight = {:math:`W_{ch}, W_{ih}, \
W_{fh}, W_{oh}`}.
- The shape of hidden-hidden weight is (P x 4D),
where P is the projection size and D the hidden
- The shape of hidden-hidden weight is (P x 4D),
where P is the projection size and D the hidden
size.
- Projection weight = {:math:`W_{rh}`}.
- The shape of projection weight is (D x P).
......@@ -525,9 +525,9 @@ def dynamic_lstmp(input,
hidden_dim, proj_dim = 512, 256
fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
act=None, bias_attr=None)
proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out,
size=hidden_dim * 4,
proj_size=proj_dim,
proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out,
size=hidden_dim * 4,
proj_size=proj_dim,
use_peepholes=False,
is_reverse=True,
cell_activation="tanh",
......@@ -2525,7 +2525,8 @@ def ctc_greedy_decoder(input, blank, name=None):
interval [0, num_classes + 1).
Returns:
Variable: CTC greedy decode result.
Variable: CTC greedy decode result. If all the sequences in result were
empty, the result LoDTensor will be [-1] with LoD [[0]] and dims [1, 1].
Examples:
.. code-block:: python
......
......@@ -61,6 +61,10 @@ __all__ = [
'clip_by_norm',
'softmax',
'sequence_softmax',
'logical_and',
'logical_or',
'logical_xor',
'logical_not',
] + __activations__
for _OP in set(__all__):
......
......@@ -15,7 +15,10 @@
import layers
from framework import Variable
__all__ = ['exponential_decay', 'natural_exp_decay', 'inverse_time_decay']
__all__ = [
'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
'polynomial_decay', 'piecewise_decay'
]
"""
When training a model, it's often useful to decay the
learning rate during training process, this is called
......@@ -101,7 +104,7 @@ def inverse_time_decay(learning_rate,
```python
if staircase:
decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
else
else:
decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
```
Args:
......@@ -123,3 +126,98 @@ def inverse_time_decay(learning_rate,
div_res = layers.floor(x=div_res)
return learning_rate / (1 + decay_rate * div_res)
def polynomial_decay(learning_rate,
global_step,
decay_steps,
end_learning_rate=0.0001,
power=1.0,
cycle=False):
"""Applies polynomial decay to the initial learning rate.
```python
if cycle:
decay_steps = decay_steps * ceil(global_step / decay_steps)
else:
global_step = min(global_step, decay_steps)
decayed_learning_rate = (learning_rate - end_learning_rate) *
(1 - global_step / decay_steps) ^ power +
end_learning_rate
```
Args:
learning_rate: A scalar float32 value or a Variable. This
will be the initial learning rate during training
global_step: A Variable that record the training step.
decay_steps: A Python `int32` number.
end_learning_rate: A Python `float` number.
power: A Python `float` number
cycle: Boolean. If set true, decay the learning rate every decay_steps.
Returns:
The decayed learning rate
"""
if not isinstance(global_step, Variable):
raise ValueError("global_step is required for inverse_time_decay.")
if cycle:
div_res = layers.ceil(x=(global_step / decay_steps))
zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
with layers.Switch() as switch:
with switch.case(layers.equal(x=global_step, y=zero_var)):
layers.assign(input=one_var, output=div_res)
decay_steps = decay_steps * div_res
else:
decay_steps_var = layers.fill_constant(
shape=[1], dtype='float32', value=float(decay_steps))
global_step = layers.elementwise_min(x=global_step, y=decay_steps_var)
return (learning_rate - end_learning_rate) * \
((1 - global_step / decay_steps) ** power) + end_learning_rate
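A hypothetical usage sketch; the step counter shown here is illustrative, and in practice it would be incremented by the training loop (e.g. through the optimizer's global_step handling).
# Assumed: a float32 step counter Variable; how it gets incremented is omitted.
global_step = layers.create_global_var(
    shape=[1], value=0.0, dtype='float32', persistable=True, name="step")
decayed_lr = polynomial_decay(
    learning_rate=0.01,
    global_step=global_step,
    decay_steps=10000,
    end_learning_rate=0.0001,
    power=1.0,
    cycle=True)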
def piecewise_decay(global_step, boundaries, values):
"""Applies piecewise decay to the initial learning rate.
```python
boundaries = [10000, 20000]
values = [1.0, 0.5, 0.1]
if step < 10000:
learning_rate = 1.0
elif step >= 10000 and step < 20000:
learning_rate = 0.5
else:
learning_rate = 0.1
```
"""
if len(values) - len(boundaries) != 1:
raise ValueError("len(values) - len(boundaries) should be 1")
if not isinstance(global_step, Variable):
raise ValueError("global_step is required for piecewise_decay.")
lr = layers.create_global_var(
shape=[1],
value=0.0,
dtype='float32',
persistable=True,
name="learning_rate")
with layers.Switch() as switch:
for i in range(len(boundaries)):
boundary_val = layers.fill_constant(
shape=[1], dtype='float32', value=float(boundaries[i]))
value_var = layers.fill_constant(
shape=[1], dtype='float32', value=float(values[i]))
with switch.case(layers.less_than(global_step, boundary_val)):
layers.assign(value_var, lr)
last_value_var = layers.fill_constant(
shape=[1], dtype='float32', value=float(values[len(values) - 1]))
with switch.default():
layers.assign(last_value_var, lr)
return lr
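A hypothetical usage sketch matching the pseudocode in the docstring above, again assuming an externally maintained step counter.
global_step = layers.create_global_var(
    shape=[1], value=0.0, dtype='float32', persistable=True, name="step")
decayed_lr = piecewise_decay(global_step=global_step,
                             boundaries=[10000, 20000],
                             values=[1.0, 0.5, 0.1])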
......@@ -92,14 +92,13 @@ class ControlFlowGraph(object):
live_in = defaultdict(set)
live_out = defaultdict(set)
while True:
for i in range(self.op_size):
for i in range(self.op_size, 0, -1):
live_in[i] = set(self._live_in[i])
live_out[i] = set(self._live_out[i])
self._live_in[i] = self._uses[i] | (
self._live_out[i] - self._defs[i])
for s in self._successors[i]:
self._live_out[i] |= self._live_in[s]
self._live_in[i] = self._uses[i] | (
self._live_out[i] - self._defs[i])
if self._reach_fixed_point(live_in, live_out):
break
......@@ -145,7 +144,6 @@ class ControlFlowGraph(object):
if op.type() == "while" or op.type() == "while_grad":
continue
block_desc = op.block()
self.current_block_desc = block_desc
is_forward = i < self._forward_num
if self.pool:
defs_can_optimize = filter(
......@@ -156,6 +154,9 @@ class ControlFlowGraph(object):
for x in defs_can_optimize
]
for x, x_shape in out_pair:
# If x is both in uses and defs, it can not be optimized!
if x in self._uses[i]:
continue
for index, cache_pair in enumerate(self.pool):
cache_var = cache_pair[0]
cache_shape = cache_pair[1]
......@@ -208,17 +209,17 @@ def get_cfgs(input_program):
while_sub_block_ids = []
while_grad_sub_block_ids = []
while_op_output = set()
while_block_id_pair = []
while_op_dict = {}
for i in range(op_size):
op = block_desc.op(i)
if op.type() == "while":
while_sub_block_ids.append(op.attr("sub_block").id)
while_op_output.update(op.output_arg_names())
while_op_dict[op.attr("sub_block").id] = op
elif op.type() == "while_grad":
while_grad_sub_block_ids.append(op.attr("sub_block").id)
while_op_output.update(op.output_arg_names())
while_op_dict[op.attr("sub_block").id] = op
# Find while/while_grad block pair
for grad_id in while_grad_sub_block_ids:
......@@ -240,6 +241,10 @@ def get_cfgs(input_program):
for i in range(while_grad_block_op_size):
while_block_ops.append(while_grad_block.op(i))
while_op_output = set()
while_op_output.update(while_op_dict[parent_id].output_arg_names())
while_op_output.update(while_op_dict[grad_id].output_arg_names())
ops_list.append((while_block_ops, while_block_op_size, while_op_output))
# Process rest while block ops
......@@ -250,9 +255,15 @@ def get_cfgs(input_program):
for i in range(while_block_op_size):
while_block_ops.append(while_block.op(i))
ops_list.append((while_block_ops, while_block_op_size))
while_op_output = set()
while_op_output.update(while_op_dict[parent_id].output_arg_names())
ops_list.append((while_block_ops, while_block_op_size, while_op_output))
cfgs = [ControlFlowGraph(input_program, i, j, k) for i, j, k in ops_list]
cfgs = [
ControlFlowGraph(input_program, ops, forward_num, skip_opt)
for ops, forward_num, skip_opt in ops_list
]
return cfgs
......
......@@ -190,6 +190,8 @@ class Optimizer(object):
# Create any accumulators
program = loss.block.program
with program_guard(program, startup_program):
global_block = framework.default_main_program().global_block()
start = len(global_block.ops)
self.helper = LayerHelper(self.__class__.__name__)
self._create_accumulators(loss.block,
[p[0] for p in parameters_and_grads])
......@@ -203,19 +205,14 @@ class Optimizer(object):
param_and_grad)
optimize_ops.append(optimize_op)
# Returned list of ops can include more ops in addition
# to optimization ops
return_ops = optimize_ops
# Get custom finish ops for subclasses
# FIXME: Need to fix this once we figure out how to handle dependencies
finish_ops = self._finish_update(loss.block)
if finish_ops is not None:
return_ops += finish_ops
self._finish_update(loss.block)
if self._global_step is not None:
return_ops.append(self._increment_global_step(loss.block))
return return_ops
self._increment_global_step(loss.block)
end = len(global_block.ops)
return global_block.slice_ops(start, end)
def minimize(self,
loss,
......
recognize_digits_*.inference.model
*.inference.model
......@@ -16,6 +16,8 @@ import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import contextlib
import unittest
import math
import sys
def main(use_cuda):
......@@ -58,6 +60,8 @@ def main(use_cuda):
print(avg_loss_value)
if avg_loss_value[0] < 10.0:
return
if math.isnan(float(avg_loss_value)):
sys.exit("got NaN loss, training failed.")
raise AssertionError("Fit a line cost is too large, {0:2.2}".format(
avg_loss_value[0]))
......
......@@ -16,8 +16,11 @@ from __future__ import print_function
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import unittest
import contextlib
import math
import sys
import numpy
import unittest
def resnet_cifar10(input, depth=32):
......@@ -89,10 +92,7 @@ def vgg16_bn_drop(input):
return fc2
def main(net_type, use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
def train(net_type, use_cuda, save_dirname):
classdim = 10
data_shape = [3, 32, 32]
......@@ -111,12 +111,14 @@ def main(net_type, use_cuda):
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=predict, label=label)
# Test program
test_program = fluid.default_main_program().clone()
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
BATCH_SIZE = 128
PASS_NUM = 1
......@@ -125,6 +127,9 @@ def main(net_type, use_cuda):
paddle.dataset.cifar.train10(), buf_size=128 * 10),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
......@@ -132,18 +137,70 @@ def main(net_type, use_cuda):
loss = 0.0
for pass_id in range(PASS_NUM):
accuracy.reset(exe)
for data in train_reader():
loss, acc = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost] + accuracy.metrics)
pass_acc = accuracy.eval(exe)
print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
pass_acc))
return
raise AssertionError(
"Image classification loss is too large, {0:2.2}".format(loss))
for batch_id, data in enumerate(train_reader()):
exe.run(feed=feeder.feed(data))
if (batch_id % 10) == 0:
acc_list = []
avg_loss_list = []
for tid, test_data in enumerate(test_reader()):
loss_t, acc_t = exe.run(program=test_program,
feed=feeder.feed(test_data),
fetch_list=[avg_cost, acc])
if math.isnan(float(loss_t)):
sys.exit("got NaN loss, training failed.")
acc_list.append(float(acc_t))
avg_loss_list.append(float(loss_t))
break # Use 1 segment for speeding up CI
acc_value = numpy.array(acc_list).mean()
avg_loss_value = numpy.array(avg_loss_list).mean()
print(
'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
format(pass_id, batch_id + 1,
float(avg_loss_value), float(acc_value)))
if acc_value > 0.01: # Low threshold for speeding up CI
fluid.io.save_inference_model(save_dirname, ["pixel"],
[predict], exe)
return
def infer(use_cuda, save_dirname=None):
if save_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# Use fluid.io.load_inference_model to obtain the inference program desc,
# the feed_target_names (the names of variables that will be fed
# data using feed operators), and the fetch_targets (variables that
# we want to obtain data from using fetch operators).
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
# The input's dimension of conv should be 4-D or 5-D.
tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32")
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
results = exe.run(inference_program,
feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets)
print("infer results: ", results[0])
def main(net_type, use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
# Directory for saving the trained model
save_dirname = "image_classification_" + net_type + ".inference.model"
train(net_type, use_cuda, save_dirname)
infer(use_cuda, save_dirname)
class TestImageClassification(unittest.TestCase):
......
......@@ -18,7 +18,9 @@ import numpy as np
import paddle.v2 as paddle
import paddle.v2.dataset.conll05 as conll05
import paddle.v2.fluid as fluid
import contextlib
import time
import unittest
word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
......@@ -127,7 +129,15 @@ def to_lodtensor(data, place):
return res
def main():
def create_random_lodtensor(lod, place, low, high):
data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod([lod])
return res
def train(use_cuda, save_dirname=None):
# define network topology
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
......@@ -175,8 +185,8 @@ def main():
paddle.reader.shuffle(
paddle.dataset.conll05.test(), buf_size=8192),
batch_size=BATCH_SIZE)
# place = fluid.CPUPlace()
place = fluid.CUDAPlace(0)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
feeder = fluid.DataFeeder(
feed_list=[
word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
......@@ -211,12 +221,102 @@ def main():
if batch_id != 0:
print("second per batch: " + str((time.time() - start_time)
/ batch_id))
# exit early for CI
exit(0)
# Set the threshold low to speed up the CI test
if float(pass_precision) > 0.05:
if save_dirname is not None:
fluid.io.save_inference_model(save_dirname, [
'word_data', 'verb_data', 'ctx_n2_data',
'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
'ctx_p2_data', 'mark_data'
], [feature_out], exe)
return
batch_id = batch_id + 1
def infer(use_cuda, save_dirname=None):
if save_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# Use fluid.io.load_inference_model to obtain the inference program desc,
# the feed_target_names (the names of variables that will be fed
# data using feed operators), and the fetch_targets (variables that
# we want to obtain data from using fetch operators).
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
lod = [0, 4, 10]
ts_word = create_random_lodtensor(lod, place, low=0, high=1)
ts_pred = create_random_lodtensor(lod, place, low=0, high=1)
ts_ctx_n2 = create_random_lodtensor(lod, place, low=0, high=1)
ts_ctx_n1 = create_random_lodtensor(lod, place, low=0, high=1)
ts_ctx_0 = create_random_lodtensor(lod, place, low=0, high=1)
ts_ctx_p1 = create_random_lodtensor(lod, place, low=0, high=1)
ts_ctx_p2 = create_random_lodtensor(lod, place, low=0, high=1)
ts_mark = create_random_lodtensor(lod, place, low=0, high=1)
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
assert feed_target_names[0] == 'word_data'
assert feed_target_names[1] == 'verb_data'
assert feed_target_names[2] == 'ctx_n2_data'
assert feed_target_names[3] == 'ctx_n1_data'
assert feed_target_names[4] == 'ctx_0_data'
assert feed_target_names[5] == 'ctx_p1_data'
assert feed_target_names[6] == 'ctx_p2_data'
assert feed_target_names[7] == 'mark_data'
results = exe.run(inference_program,
feed={
feed_target_names[0]: ts_word,
feed_target_names[1]: ts_pred,
feed_target_names[2]: ts_ctx_n2,
feed_target_names[3]: ts_ctx_n1,
feed_target_names[4]: ts_ctx_0,
feed_target_names[5]: ts_ctx_p1,
feed_target_names[6]: ts_ctx_p2,
feed_target_names[7]: ts_mark
},
fetch_list=fetch_targets,
return_numpy=False)
print(results[0].lod())
np_data = np.array(results[0])
print("Inference Shape: ", np_data.shape)
print("Inference results: ", np_data)
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
# Directory for saving the trained model
save_dirname = "label_semantic_roles.inference.model"
train(use_cuda, save_dirname)
infer(use_cuda, save_dirname)
class TestLabelSemanticRoles(unittest.TestCase):
def test_cuda(self):
with self.scope_prog_guard():
main(use_cuda=True)
def test_cpu(self):
with self.scope_prog_guard():
main(use_cuda=False)
@contextlib.contextmanager
def scope_prog_guard(self):
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
yield
if __name__ == '__main__':
main()
unittest.main()
......@@ -18,6 +18,8 @@ import paddle.v2 as paddle
import sys
import numpy
import unittest
import math
import sys
def parse_arg():
......@@ -65,6 +67,7 @@ def conv_net(img, label):
pool_size=2,
pool_stride=2,
act="relu")
conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
......@@ -75,7 +78,7 @@ def conv_net(img, label):
return loss_net(conv_pool_2, label)
def train(nn_type, use_cuda, parallel, save_dirname):
def train(nn_type, use_cuda, parallel, save_dirname, save_param_filename):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
......@@ -140,18 +143,22 @@ def train(nn_type, use_cuda, parallel, save_dirname):
avg_loss_val = numpy.array(avg_loss_set).mean()
if float(acc_val) > 0.85: # test acc > 85%
if save_dirname is not None:
fluid.io.save_inference_model(save_dirname, ["img"],
[prediction], exe)
fluid.io.save_inference_model(
save_dirname, ["img"], [prediction],
exe,
save_file_name=save_param_filename)
return
else:
print(
'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
format(pass_id, batch_id + 1,
float(avg_loss_val), float(acc_val)))
if math.isnan(float(avg_loss_val)):
sys.exit("got NaN loss, training failed.")
raise AssertionError("Loss of recognize digits is too large")
def infer(use_cuda, save_dirname=None):
def infer(use_cuda, save_dirname=None, param_filename=None):
if save_dirname is None:
return
......@@ -162,13 +169,14 @@ def infer(use_cuda, save_dirname=None):
# the feed_target_names (the names of variables that will be fed
# data using feed operators), and the fetch_targets (variables that
# we want to obtain data from using fetch operators).
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
[inference_program, feed_target_names, fetch_targets
] = fluid.io.load_inference_model(save_dirname, exe, param_filename)
# The input's dimension of conv should be 4-D or 5-D.
# Use normalized image pixels as input data, which should be in the range [-1.0, 1.0].
batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0,
[1, 1, 28, 28]).astype("float32")
[batch_size, 1, 28, 28]).astype("float32")
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
......@@ -178,36 +186,45 @@ def infer(use_cuda, save_dirname=None):
print("infer results: ", results[0])
def main(use_cuda, parallel, nn_type):
def main(use_cuda, parallel, nn_type, combine):
if not use_cuda and not parallel:
save_dirname = "recognize_digits_" + nn_type + ".inference.model"
save_filename = None
if combine:
save_filename = "__params_combined__"
else:
save_dirname = None
save_filename = None
train(
nn_type=nn_type,
use_cuda=use_cuda,
parallel=parallel,
save_dirname=save_dirname)
infer(use_cuda=use_cuda, save_dirname=save_dirname)
save_dirname=save_dirname,
save_param_filename=save_filename)
infer(
use_cuda=use_cuda,
save_dirname=save_dirname,
param_filename=save_filename)
class TestRecognizeDigits(unittest.TestCase):
pass
def inject_test_method(use_cuda, parallel, nn_type):
def inject_test_method(use_cuda, parallel, nn_type, combine):
def __impl__(self):
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
main(use_cuda, parallel, nn_type)
main(use_cuda, parallel, nn_type, combine)
fn = 'test_{0}_{1}_{2}'.format(nn_type, 'cuda'
if use_cuda else 'cpu', 'parallel'
if parallel else 'normal')
fn = 'test_{0}_{1}_{2}_{3}'.format(nn_type, 'cuda'
if use_cuda else 'cpu', 'parallel'
if parallel else 'normal', 'combine'
if combine else 'separate')
setattr(TestRecognizeDigits, fn, __impl__)
......@@ -216,7 +233,10 @@ def inject_all_tests():
for use_cuda in (False, True):
for parallel in (False, True):
for nn_type in ('mlp', 'conv'):
inject_test_method(use_cuda, parallel, nn_type)
inject_test_method(use_cuda, parallel, nn_type, True)
# One unit-test for saving parameters as separate files
inject_test_method(False, False, 'mlp', False)
inject_all_tests()
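Side note on the combined-parameter path exercised above: when `save_file_name` is given, the parameters are saved as one combined file under `save_dirname` (hence the `__params_combined__` name) rather than one file per parameter, and the same filename has to be handed back at load time. A minimal sketch of the round trip, reusing `prediction` and `exe` from the training code and mirroring the paths used in this test:
# Sketch only: save into a single combined parameter file, then load it back.
# `prediction` and `exe` come from the training code above.
save_dirname = "recognize_digits_mlp.inference.model"
param_file = "__params_combined__"
fluid.io.save_inference_model(
    save_dirname, ["img"], [prediction], exe, save_file_name=param_file)
[program, feed_names, fetch_targets] = fluid.io.load_inference_model(
    save_dirname, exe, param_file)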
......
......@@ -12,9 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import sys
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.core as core
import paddle.v2.fluid as fluid
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
......@@ -102,7 +104,8 @@ def get_mov_combined_features():
CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
category_id = layers.data(name='category_id', shape=[1], dtype='int64')
category_id = layers.data(
name='category_id', shape=[1], dtype='int64', lod_level=1)
mov_categories_emb = layers.embedding(
input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
......@@ -112,7 +115,8 @@ def get_mov_combined_features():
MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64')
mov_title_id = layers.data(
name='movie_title', shape=[1], dtype='int64', lod_level=1)
mov_title_emb = layers.embedding(
input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
......@@ -142,23 +146,22 @@ def model():
scale_infer = layers.scale(x=inference, scale=5.0)
label = layers.data(name='score', shape=[1], dtype='float32')
square_cost = layers.square_error_cost(input=scale_infer, label=label)
avg_cost = layers.mean(x=square_cost)
return avg_cost
return scale_infer, avg_cost
def train(use_cuda, save_dirname):
scale_infer, avg_cost = model()
# test program
test_program = fluid.default_main_program().clone()
def main():
cost = model()
sgd_optimizer = SGDOptimizer(learning_rate=0.2)
opts = sgd_optimizer.minimize(cost)
opts = sgd_optimizer.minimize(avg_cost)
if USE_GPU:
place = core.CUDAPlace(0)
else:
place = core.CPUPlace()
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = Executor(place)
exe.run(framework.default_startup_program())
......@@ -167,6 +170,8 @@ def main():
paddle.reader.shuffle(
paddle.dataset.movielens.train(), buf_size=8192),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
feeding = {
'user_id': 0,
......@@ -182,7 +187,7 @@ def main():
def func_feed(feeding, data):
feed_tensors = {}
for (key, idx) in feeding.iteritems():
tensor = core.LoDTensor()
tensor = fluid.LoDTensor()
if key != "category_id" and key != "movie_title":
if key == "score":
numpy_data = np.array(map(lambda x: x[idx], data)).astype(
......@@ -209,14 +214,117 @@ def main():
PASS_NUM = 100
for pass_id in range(PASS_NUM):
for data in train_reader():
outs = exe.run(framework.default_main_program(),
for batch_id, data in enumerate(train_reader()):
# train a mini-batch
outs = exe.run(program=fluid.default_main_program(),
feed=func_feed(feeding, data),
fetch_list=[cost])
fetch_list=[avg_cost])
out = np.array(outs[0])
if out[0] < 6.0:
# if avg cost less than 6.0, we think our code is good.
exit(0)
main()
if (batch_id + 1) % 10 == 0:
avg_cost_set = []
for test_data in test_reader():
avg_cost_np = exe.run(program=test_program,
feed=func_feed(feeding, test_data),
fetch_list=[avg_cost])
avg_cost_set.append(avg_cost_np[0])
break # test only 1 segment for speeding up CI
# get test avg_cost
test_avg_cost = np.array(avg_cost_set).mean()
if test_avg_cost < 6.0:
# if avg_cost less than 6.0, we think our code is good.
if save_dirname is not None:
fluid.io.save_inference_model(save_dirname, [
"user_id", "gender_id", "age_id", "job_id",
"movie_id", "category_id", "movie_title"
], [scale_infer], exe)
return
if math.isnan(float(out[0])):
sys.exit("got NaN loss, training failed.")
def infer(use_cuda, save_dirname=None):
if save_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# Use fluid.io.load_inference_model to obtain the inference program desc,
# the feed_target_names (the names of variables that will be fed
# data using feed operators), and the fetch_targets (variables that
# we want to obtain data from using fetch operators).
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
def create_lod_tensor(data, lod=None):
tensor = fluid.LoDTensor()
if lod is None:
# Tensor, the shape is [batch_size, 1]
index = 0
lod_0 = [index]
for l in range(len(data)):
index += 1
lod_0.append(index)
lod = [lod_0]
tensor.set_lod(lod)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
tensor.set(flattened_data, place)
return tensor
# Use the first data from paddle.dataset.movielens.test() as input
assert feed_target_names[0] == "user_id"
user_id = create_lod_tensor([[1]])
assert feed_target_names[1] == "gender_id"
gender_id = create_lod_tensor([[1]])
assert feed_target_names[2] == "age_id"
age_id = create_lod_tensor([[0]])
assert feed_target_names[3] == "job_id"
job_id = create_lod_tensor([[10]])
assert feed_target_names[4] == "movie_id"
movie_id = create_lod_tensor([[783]])
assert feed_target_names[5] == "category_id"
category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
assert feed_target_names[6] == "movie_title"
movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
[[0, 5]])
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
results = exe.run(inference_program,
feed={
feed_target_names[0]: user_id,
feed_target_names[1]: gender_id,
feed_target_names[2]: age_id,
feed_target_names[3]: job_id,
feed_target_names[4]: movie_id,
feed_target_names[5]: category_id,
feed_target_names[6]: movie_title
},
fetch_list=fetch_targets,
return_numpy=False)
print("inferred score: ", np.array(results[0]))
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
# Directory for saving the inference model
save_dirname = "recommender_system.inference.model"
train(use_cuda, save_dirname)
infer(use_cuda, save_dirname)
if __name__ == '__main__':
main(USE_GPU)
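As a reference point, the `create_lod_tensor` helper defined inside infer() above packs a batch of variable-length sequences into one flattened LoDTensor, with offset-style LoD marking sequence boundaries; `[[0, 3]]` for `category_id` means a single sequence occupying rows 0..3. A small hypothetical extension of the same idea for a two-movie batch:
# Hypothetical two-movie batch: the first movie has 3 category ids, the
# second has 2, so the offsets [0, 3, 5] split the flattened [5, 1] tensor
# into two sequences. (Assumes it runs inside infer(), where `place` and
# create_lod_tensor are defined.)
two_movie_categories = create_lod_tensor(
    [[10], [8], [9], [4], [7]], [[0, 3, 5]])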
......@@ -18,6 +18,10 @@ import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
import contextlib
import math
import sys
import unittest
from paddle.v2.fluid.executor import Executor
dict_size = 30000
......@@ -145,7 +149,7 @@ def seq_to_seq_net():
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
return avg_cost
return avg_cost, prediction
def to_lodtensor(data, place):
......@@ -163,8 +167,16 @@ def to_lodtensor(data, place):
return res
def main():
avg_cost = seq_to_seq_net()
def create_random_lodtensor(lod, place, low, high):
data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod([lod])
return res
def train(use_cuda, save_dirname=None):
[avg_cost, prediction] = seq_to_seq_net()
optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
optimizer.minimize(avg_cost)
......@@ -174,7 +186,7 @@ def main():
paddle.dataset.wmt14.train(dict_size), buf_size=1000),
batch_size=batch_size)
place = core.CPUPlace()
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = Executor(place)
exe.run(framework.default_startup_program())
......@@ -185,6 +197,7 @@ def main():
word_data = to_lodtensor(map(lambda x: x[0], data), place)
trg_word = to_lodtensor(map(lambda x: x[1], data), place)
trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
outs = exe.run(framework.default_main_program(),
feed={
'source_sequence': word_data,
......@@ -192,13 +205,86 @@ def main():
'label_sequence': trg_word_next
},
fetch_list=[avg_cost])
avg_cost_val = np.array(outs[0])
print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
" avg_cost=" + str(avg_cost_val))
if math.isnan(float(avg_cost_val[0])):
sys.exit("got NaN loss, training failed.")
if batch_id > 3:
exit(0)
if save_dirname is not None:
fluid.io.save_inference_model(
save_dirname, ['source_sequence',
'target_sequence'], [prediction], exe)
return
batch_id += 1
def infer(use_cuda, save_dirname=None):
if save_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# Use fluid.io.load_inference_model to obtain the inference program desc,
# the feed_target_names (the names of variables that will be fed
# data using feed operators), and the fetch_targets (variables that
# we want to obtain data from using fetch operators).
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
lod = [0, 4, 10]
word_data = create_random_lodtensor(lod, place, low=0, high=1)
trg_word = create_random_lodtensor(lod, place, low=0, high=1)
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
assert feed_target_names[0] == 'source_sequence'
assert feed_target_names[1] == 'target_sequence'
results = exe.run(inference_program,
feed={
feed_target_names[0]: word_data,
feed_target_names[1]: trg_word,
},
fetch_list=fetch_targets,
return_numpy=False)
print(results[0].lod())
np_data = np.array(results[0])
print("Inference shape: ", np_data.shape)
print("Inference results: ", np_data)
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
# Directory for saving the trained model
save_dirname = "rnn_encoder_decoder.inference.model"
train(use_cuda, save_dirname)
infer(use_cuda, save_dirname)
class TestRnnEncoderDecoder(unittest.TestCase):
def test_cuda(self):
with self.scope_prog_guard():
main(use_cuda=True)
def test_cpu(self):
with self.scope_prog_guard():
main(use_cuda=False)
@contextlib.contextmanager
def scope_prog_guard(self):
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
yield
if __name__ == '__main__':
main()
unittest.main()
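For reference, the offset-style LoD `[0, 4, 10]` used in infer() above packs two sequences (lengths 4 and 6) into a single [10, 1] tensor of random token ids. A minimal standalone sketch with the helper from this file, assuming a CPU place:
# Sketch: two sequences of lengths 4 and 6, token ids drawn from {0, 1}.
place = fluid.CPUPlace()
src = create_random_lodtensor([0, 4, 10], place, low=0, high=1)
print(src.lod())            # [[0, 4, 10]]
print(np.array(src).shape)  # (10, 1)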
......@@ -16,6 +16,8 @@ import unittest
import paddle.v2.fluid as fluid
import paddle.v2 as paddle
import contextlib
import math
import sys
def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
......@@ -115,6 +117,8 @@ def main(word_dict, net_method, use_cuda):
print("cost=" + str(cost_val) + " acc=" + str(acc_val))
if cost_val < 0.4 and acc_val > 0.8:
return
if math.isnan(float(cost_val)):
sys.exit("got NaN loss, training failed.")
raise AssertionError("Cost is too large for {0}".format(
net_method.__name__))
......
......@@ -16,6 +16,8 @@ import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import unittest
import os
import math
import sys
def main(use_cuda, is_sparse, parallel):
......@@ -112,6 +114,9 @@ def main(use_cuda, is_sparse, parallel):
fetch_list=[avg_cost])
if avg_cost_np[0] < 5.0:
return
if math.isnan(float(avg_cost_np[0])):
sys.exit("got NaN loss, training failed.")
raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0]))
......
......@@ -15,6 +15,8 @@
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import math
import sys
# need to fix random seed and training data to compare the loss
# value accurately calculated by the default and the memory optimization
......@@ -63,4 +65,6 @@ for pass_id in range(PASS_NUM):
if avg_loss_value[0] < 10.0:
exit(0) # if avg cost less than 10.0, we think our code is good.
if math.isnan(float(avg_loss_value)):
sys.exit("got NaN loss, training failed.")
exit(1)
......@@ -18,6 +18,8 @@ import sys
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import math
import sys
# need to fix random seed and training data to compare the loss
# value accurately calculated by the default and the memory optimization
......@@ -152,7 +154,10 @@ for pass_id in range(PASS_NUM):
print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
pass_acc))
# this model is slow, so if we can train two mini-batches, we think it works properly.
if i > 2:
exit(0)
if math.isnan(float(loss)):
sys.exit("got NaN loss, training failed.")
i += 1
exit(1)
......@@ -19,6 +19,8 @@ import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor
import math
import sys
dict_size = 30000
source_dict_dim = target_dict_dim = dict_size
......@@ -137,6 +139,8 @@ def main():
" avg_cost=" + str(avg_cost_val))
if batch_id > 2:
exit(0)
if math.isnan(float(avg_cost_val)):
sys.exit("got NaN loss, training failed.")
batch_id += 1
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import numpy as np
prog = fluid.framework.Program()
block = prog.current_block()
random_reader = block.create_var(
type=fluid.core.VarDesc.VarType.READER, name="RandomDataGenerator")
random_reader.desc.set_lod_levels([0, 0])
create_random_data_generator_op = block.append_op(
type="create_random_data_generator",
outputs={"Out": random_reader},
attrs={
"shape_concat": [1, 2, 1, 1],
"ranks": [2, 2],
"min": 0.0,
"max": 1.0
})
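# With ranks=[2, 2], shape_concat=[1, 2, 1, 1] is read as two per-instance
# shapes, [1, 2] and [1, 1] (this interpretation matches the shape checks
# at the end of this script).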
shuffle_reader = block.create_var(
type=fluid.core.VarDesc.VarType.READER, name="ShuffleReader")
shuffle_reader.desc.set_lod_levels([0, 0])
create_shuffle_reader_op = block.append_op(
type="create_shuffle_reader",
inputs={"UnderlyingReader": random_reader},
outputs={"Out": shuffle_reader},
attrs={"buffer_size": 7})
batch_reader = block.create_var(
type=fluid.core.VarDesc.VarType.READER, name="BatchReader")
batch_reader.desc.set_lod_levels([1, 1])
create_batch_reader_op = block.append_op(
type="create_batch_reader",
inputs={"UnderlyingReader": shuffle_reader},
outputs={"Out": batch_reader},
attrs={"batch_size": 10})
out1 = block.create_var(type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out1")
out2 = block.create_var(type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out2")
read_op = block.append_op(
type="read", inputs={"Reader": batch_reader},
outputs={"Out": [out1, out2]})
place = fluid.CPUPlace()
exe = fluid.Executor(place)
[res1, res2] = exe.run(prog, fetch_list=[out1, out2], return_numpy=False)
test_pass = res1.lod() == [range(0, 11)] and res2.lod() == [
range(0, 11)
] and np.array(res1).shape == (10, 2) and np.array(res2).shape == (10, 1)
if not test_pass:
exit(1)
exit(0)
......@@ -31,6 +31,8 @@ def CTCAlign(input, lod, blank, merge_repeated):
result.append(token)
prev_token = token
result = np.array(result).reshape([len(result), 1]).astype("int32")
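# If every token is removed (e.g. the whole input is the blank label),
# fall back to a single -1 so the reference output is never empty.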
if len(result) == 0:
result = np.array([-1])
return result
......@@ -72,5 +74,14 @@ class TestCTCAlignOpCase1(TestCTCAlignOp):
[19, 1]).astype("int32")
class TestCTCAlignOpCase2(TestCTCAlignOp):
def config(self):
self.op_type = "ctc_align"
self.input_lod = [[0, 4]]
self.blank = 0
self.merge_repeated = True
self.input = np.array([0, 0, 0, 0]).reshape([4, 1]).astype("int32")
if __name__ == "__main__":
unittest.main()
......@@ -20,6 +20,8 @@ import paddle.v2.fluid.core as core
from paddle.v2.fluid.op import Operator
from paddle.v2.fluid.framework import grad_var_name
np.random.random(123)
def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
x_shape = x.shape
......@@ -62,9 +64,9 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
grad_x = dx_end + d_mean + d_std
grad_y.shape = x_shape
x.shape = x_shape
grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape
scale.shape = scale_shape
var.shape, mean.shape = [N, ], [N, ]
return grad_x, d_scale, d_bias
......@@ -112,10 +114,7 @@ def set_output_grad(scope, outputs, place, feed_dict=None):
class TestLayerNormdOp(OpTest):
def __assert_close(self, tensor, np_array, msg, atol=1e-4):
self.assertTrue(
np.allclose(
np.array(tensor).reshape(np_array.shape), np_array, atol=atol),
msg)
self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
def __assert_grad_close(self,
tensor,
......@@ -123,7 +122,7 @@ class TestLayerNormdOp(OpTest):
name,
place,
max_relative_error=0.02):
a = np.array(tensor).reshape(np_array.shape)
a = np.array(tensor)
b = np_array
abs_a = np.abs(a)
abs_a[abs_a < 1e-5] = 1
......@@ -151,7 +150,7 @@ class TestLayerNormdOp(OpTest):
x_shape = shape
D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
scale_shape = [D]
np.random.random(123)
x_val = np.random.random_sample(x_shape).astype(np.float32)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
bias_val = np.random.random_sample(scale_shape).astype(np.float32)
......
......@@ -15,6 +15,8 @@
import unittest
import math
import copy
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid as fluid
import paddle.v2.fluid.layers as layers
......@@ -54,21 +56,37 @@ def inverse_time_decay(learning_rate,
return learning_rate / (1 + decay_rate * temp)
class TestLearningRateDecay(unittest.TestCase):
def check_decay(self, python_decay_fn, fluid_decay_fn, staircase):
init_lr = 1.0
decay_steps = 5
decay_rate = 0.5
def polynomial_decay(learning_rate,
global_step,
decay_steps,
end_learning_rate=0.0001,
power=1.0,
cycle=False):
if cycle:
div = math.ceil(global_step / float(decay_steps))
if div == 0:
div = 1
decay_steps = decay_steps * div
else:
global_step = min(global_step, decay_steps)
return (learning_rate - end_learning_rate) * \
((1 - float(global_step) / float(decay_steps)) ** power) + end_learning_rate
def piecewise_decay(global_step, boundaries, values):
assert len(boundaries) + 1 == len(values)
for i in range(len(boundaries)):
if global_step < boundaries[i]:
return values[i]
return values[len(values) - 1]
class TestLearningRateDecay(unittest.TestCase):
def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
global_step = layers.create_global_var(
shape=[1], value=0.0, dtype='float32', persistable=True)
decayed_lr = fluid_decay_fn(
learning_rate=init_lr,
global_step=global_step,
decay_steps=decay_steps,
decay_rate=decay_rate,
staircase=staircase)
decayed_lr = fluid_decay_fn(global_step=global_step, **kwargs)
layers.increment(global_step, 1.0)
place = fluid.CPUPlace()
......@@ -79,31 +97,52 @@ class TestLearningRateDecay(unittest.TestCase):
step_val, lr_val = exe.run(fluid.default_main_program(),
feed=[],
fetch_list=[global_step, decayed_lr])
python_decayed_lr = python_decay_fn(
learning_rate=init_lr,
global_step=step,
decay_steps=decay_steps,
decay_rate=decay_rate,
staircase=staircase)
python_decayed_lr = python_decay_fn(global_step=step, **kwargs)
self.assertAlmostEqual(python_decayed_lr, lr_val[0])
def test_decay(self):
common_kwargs_true = {
"learning_rate": 1.0,
"decay_steps": 5,
"decay_rate": 0.5,
"staircase": True
}
common_kwargs_false = copy.deepcopy(common_kwargs_true)
common_kwargs_false["staircase"] = False
decay_fns = [
(exponential_decay, lr_decay.exponential_decay, True),
(exponential_decay, lr_decay.exponential_decay, False),
(natural_exp_decay, lr_decay.natural_exp_decay, True),
(natural_exp_decay, lr_decay.natural_exp_decay, False),
(inverse_time_decay, lr_decay.inverse_time_decay, True),
(inverse_time_decay, lr_decay.inverse_time_decay, False),
(exponential_decay, lr_decay.exponential_decay, common_kwargs_true),
(exponential_decay, lr_decay.exponential_decay,
common_kwargs_false),
(natural_exp_decay, lr_decay.natural_exp_decay, common_kwargs_true),
(natural_exp_decay, lr_decay.natural_exp_decay,
common_kwargs_false),
(inverse_time_decay, lr_decay.inverse_time_decay,
common_kwargs_true),
(inverse_time_decay, lr_decay.inverse_time_decay,
common_kwargs_false),
(polynomial_decay, lr_decay.polynomial_decay, {
"learning_rate": 1.0,
"decay_steps": 5,
"cycle": True
}),
(polynomial_decay, lr_decay.polynomial_decay, {
"learning_rate": 1.0,
"decay_steps": 5,
"cycle": False
}),
(piecewise_decay, lr_decay.piecewise_decay, {
"boundaries": [3, 6, 9],
"values": [0.1, 0.2, 0.3, 0.4]
}),
]
for py_decay_fn, fluid_decay_fn, staircase in decay_fns:
print("decay_fn=" + str(py_decay_fn) + " staircase=" + str(
staircase))
for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
print("decay_fn=" + py_decay_fn.__name__ + " kwargs=" + str(kwargs))
main_program = framework.Program()
startup_program = framework.Program()
with framework.program_guard(main_program, startup_program):
self.check_decay(py_decay_fn, fluid_decay_fn, staircase)
self.check_decay(py_decay_fn, fluid_decay_fn, kwargs)
if __name__ == '__main__':
......
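For orientation, the two new reference schedules added above are easy to evaluate by hand; a small sketch using the pure-Python functions defined in this test (learning_rate=1.0, decay_steps=5):
# Sketch: a few hand-checkable values of the reference schedules above.
for step in (0, 2, 5, 7):
    print(step,
          polynomial_decay(1.0, step, decay_steps=5, cycle=False),
          polynomial_decay(1.0, step, decay_steps=5, cycle=True))
# step 7 without cycle clamps global_step to decay_steps (lr == 1e-4), while
# cycle=True stretches decay_steps to 10 and gives roughly 0.30007.
print(piecewise_decay(7, [3, 6, 9], [0.1, 0.2, 0.3, 0.4]))  # 0.3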
......@@ -42,9 +42,9 @@ class TestOptimizer(unittest.TestCase):
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
opts, _ = sgd_optimizer.minimize(mean_out, init_program)
self.assertEqual(len(opts), 1)
sgd_op = opts[0]
self.assertEqual(sgd_op.type, "sgd")
self.assertEqual(len(opts), 3)
self.assertEqual([op.type for op in opts],
["fill_constant", "elementwise_mul", "sgd"])
def test_sgd_optimizer_with_global_step(self):
init_program = framework.Program()
......@@ -72,11 +72,10 @@ class TestOptimizer(unittest.TestCase):
sgd_optimizer = optimizer.SGDOptimizer(
learning_rate=learning_rate, global_step=global_step)
opts, _ = sgd_optimizer.minimize(mean_out, init_program)
self.assertEqual(len(opts), 2)
sgd_op = opts[0]
self.assertEqual(sgd_op.type, "sgd")
increment_op = opts[1]
self.assertEqual(increment_op.type, "increment")
self.assertEqual(len(opts), 4)
self.assertEqual(
[op.type for op in opts],
["fill_constant", "elementwise_mul", "sgd", "increment"])
# Check init_program
init_ops = init_program.global_block().ops
......@@ -121,9 +120,10 @@ class TestMomentumOptimizer(unittest.TestCase):
self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
opts = momentum_optimizer.create_optimization_pass(
params_grads, mul_out, init_program)
self.assertEqual(len(opts), 1)
sgd_op = opts[0]
self.assertEqual(sgd_op.type, "momentum")
self.assertEqual(len(opts), 3)
sgd_op = opts[-1]
self.assertEqual([op.type for op in opts],
["fill_constant", "elementwise_mul", "momentum"])
self.assertFalse(sgd_op.attr('use_nesterov'))
# Check accumulators
......@@ -170,9 +170,10 @@ class TestMomentumOptimizer(unittest.TestCase):
self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
opts = momentum_optimizer.create_optimization_pass(
params_grads, mul_out, init_program)
self.assertEqual(len(opts), 1)
sgd_op = opts[0]
self.assertEqual(sgd_op.type, "momentum")
self.assertEqual(len(opts), 3)
sgd_op = opts[-1]
self.assertEqual([op.type for op in opts],
["fill_constant", "elementwise_mul", "momentum"])
self.assertTrue(sgd_op.attr('use_nesterov'))
# Check accumulators
......@@ -228,9 +229,9 @@ class TestAdagradOptimizer(unittest.TestCase):
self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
init_program)
self.assertEqual(len(opts), 1)
adagrad_op = opts[0]
self.assertEqual(adagrad_op.type, "adagrad")
self.assertEqual(len(opts), 3)
self.assertEqual([op.type for op in opts],
["fill_constant", "elementwise_mul", "adagrad"])
# Check accumulators
accumulators = adagrad_optimizer.get_accumulators()
......@@ -288,9 +289,10 @@ class TestAdamOptimizer(unittest.TestCase):
self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
init_program)
self.assertEqual(len(opts), 3)
adam_op = opts[0]
self.assertEqual(adam_op.type, "adam")
self.assertEqual(len(opts), 5)
self.assertEqual(
[op.type for op in opts],
["fill_constant", "elementwise_mul", "adam", "scale", "scale"])
# Check accumulators
accumulators = adam_optimizer.get_accumulators()
......@@ -350,9 +352,10 @@ class TestAdamaxOptimizer(unittest.TestCase):
self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
init_program)
self.assertEqual(len(opts), 2)
adam_op = opts[0]
self.assertEqual(adam_op.type, "adamax")
self.assertEqual(len(opts), 4)
self.assertEqual(
[op.type for op in opts],
["fill_constant", "elementwise_mul", "adamax", "scale"])
# Check accumulators
accumulators = adamax_optimizer.get_accumulators()
......@@ -409,9 +412,10 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
opts = decayed_adagrad_optimizer.create_optimization_pass(
params_grads, mul_out, init_program)
self.assertEqual(len(opts), 1)
decayed_adagrad_op = opts[0]
self.assertEqual(decayed_adagrad_op.type, "decayed_adagrad")
self.assertEqual(len(opts), 3)
self.assertEqual(
[op.type for op in opts],
["fill_constant", "elementwise_mul", "decayed_adagrad"])
# Check accumulators
accumulators = decayed_adagrad_optimizer.get_accumulators()
......
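The updated expectations above (several ops instead of one) reflect that minimize() now also emits `fill_constant` and `elementwise_mul` ops, which appear to materialize and scale the learning rate before the optimizer op itself; when debugging a similar test, the op sequence can simply be printed (a sketch, reusing `mean_out` and `init_program` from the test setup):
# Sketch: inspect the ops appended by an optimization pass.
sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
opts, _ = sgd_optimizer.minimize(mean_out, init_program)
print([op.type for op in opts])  # e.g. ['fill_constant', 'elementwise_mul', 'sgd']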
......@@ -120,7 +120,6 @@ class TestVarDesc(unittest.TestCase):
block = program_desc.block(0)
var = block.var('my_reader')
var.set_type(core.VarDesc.VarType.READER)
var.set_tensor_num(3)
src_shapes = [[2, 3, 3], [4, 5], [6, 7, 8, 9]]
var.set_shapes(src_shapes)
res_shapes = var.shapes()
......@@ -141,7 +140,6 @@ class TestVarDesc(unittest.TestCase):
block = program_desc.block(0)
var = block.var('my_reader')
var.set_type(core.VarDesc.VarType.READER)
var.set_tensor_num(3)
src_types = [
core.DataType.INT32, core.DataType.FP64, core.DataType.FP32
]
......@@ -154,7 +152,6 @@ class TestVarDesc(unittest.TestCase):
block = program_desc.block(0)
var = block.var('my_reader')
var.set_type(core.VarDesc.VarType.READER)
var.set_tensor_num(3)
src_types = [3, 1, 2]
var.set_lod_levels(src_types)
self.assertEqual(src_types, var.lod_levels())
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.v2.fluid.core as core
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.framework as framework
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.framework import default_startup_program
class TestSwitch(unittest.TestCase):
def check_switch(self, value):
x = layers.fill_constant(shape=[1], dtype='float32', value=value)
zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
two_var = layers.fill_constant(shape=[1], dtype='float32', value=2.0)
three_var = layers.fill_constant(shape=[1], dtype='float32', value=3.0)
result = layers.create_global_var(
shape=[1], value=-1.0, dtype='float32', persistable=True)
with layers.Switch() as switch:
with switch.case(layers.less_than(x, zero_var)):
layers.assign(zero_var, result)
with switch.case(layers.less_than(x, one_var)):
layers.assign(one_var, result)
with switch.case(layers.less_than(x, two_var)):
layers.assign(two_var, result)
with switch.default():
layers.assign(three_var, result)
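# Cases are checked in order; the first case whose condition holds is taken,
# and the default branch runs only when none of the conditions match.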
cpu = core.CPUPlace()
exe = Executor(cpu)
exe.run(default_startup_program())
out = exe.run(feed={}, fetch_list=[result])[0][0]
return out
def test_switch(self):
test_data = {(-0.1, 0), (0.1, 1), (1.1, 2), (2.1, 3)}
for x, expected_result in test_data:
main_program = framework.Program()
startup_program = framework.Program()
with framework.program_guard(main_program, startup_program):
result = self.check_switch(x)
self.assertEqual(result, expected_result)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import random
from op_test import OpTest
def gen_match_and_neg_indices(num_prior, gt_lod, neg_lod):
if len(gt_lod) != len(neg_lod):
raise AssertionError("The input arguments are illegal.")
batch_size = len(gt_lod) - 1
match_indices = -1 * np.ones((batch_size, num_prior)).astype('int32')
neg_indices = np.zeros((neg_lod[-1], 1)).astype('int32')
for n in range(batch_size):
gt_num = gt_lod[n + 1] - gt_lod[n]
ids = random.sample([i for i in range(num_prior)], gt_num)
match_indices[n, ids] = [i for i in range(gt_num)]
ret_ids = set([i for i in range(num_prior)]) - set(ids)
s = neg_lod[n]
e = neg_lod[n + 1]
l = e - s
neg_ids = random.sample(ret_ids, l)
neg_indices[s:e, :] = np.array(neg_ids).astype('int32').reshape(l, 1)
return match_indices, neg_indices
def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod,
neg_lod, background_label):
batch_size, num_prior = match_indices.shape
# init target bbox
trg_box = np.zeros((batch_size, num_prior, 4)).astype('float32')
# init weight for target bbox
trg_box_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
# init target label
trg_label = np.ones((batch_size, num_prior, 1)).astype('int32')
trg_label = trg_label * background_label
# init weight for target label
trg_label_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
for i in range(batch_size):
cur_indices = match_indices[i]
col_ids = np.where(cur_indices > -1)
col_val = cur_indices[col_ids]
gt_start = gt_lod[i]
# target bbox
for v, c in zip(col_val + gt_start, col_ids[0].tolist()):
trg_box[i][c][:] = encoded_box[v][c][:]
# weight for target bbox
trg_box_wt[i][col_ids] = 1.0
trg_label[i][col_ids] = gt_label[col_val + gt_start]
trg_label_wt[i][col_ids] = 1.0
# set target label weight to 1.0 for the negative samples
neg_ids = neg_indices[neg_lod[i]:neg_lod[i + 1]]
trg_label_wt[i][neg_ids] = 1.0
return trg_box, trg_box_wt, trg_label, trg_label_wt
class TestTargetAssginOp(OpTest):
def setUp(self):
self.op_type = "target_assign"
num_prior = 120
num_class = 21
gt_lod = [0, 5, 11, 23]
neg_lod = [0, 4, 7, 13]
batch_size = len(gt_lod) - 1
num_gt = gt_lod[-1]
background_label = 0
encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
gt_label = np.random.randint(
num_class, size=(num_gt, 1)).astype('int32')
match_indices, neg_indices = gen_match_and_neg_indices(num_prior,
gt_lod, neg_lod)
trg_box, trg_box_wt, trg_label, trg_label_wt = target_assign(
encoded_box, gt_label, match_indices, neg_indices, gt_lod, neg_lod,
background_label)
self.inputs = {
'EncodedGTBBox': (encoded_box, [gt_lod]),
'GTScoreLabel': (gt_label, [gt_lod]),
'MatchIndices': (match_indices),
'NegIndices': (neg_indices, [neg_lod]),
}
self.attrs = {'background_label': background_label}
self.outputs = {
'PredBBoxLabel': (trg_box),
'PredBBoxWeight': (trg_box_wt),
'PredScoreLabel': (trg_label),
'PredScoreWeight': (trg_label_wt),
}
def test_check_output(self):
self.check_output()
if __name__ == '__main__':
unittest.main()