提交 cf87f39b 编写于 作者: F fengjiayi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into remove_evaluator

......@@ -4,13 +4,17 @@ cache:
- $HOME/.ccache
- $HOME/.cache/pip
- $TRAVIS_BUILD_DIR/build/third_party
- $TRAVIS_BUILD_DIR/build_android/third_party
sudo: required
dist: trusty
services:
- docker
os:
- linux
env:
- JOB=build_doc
- JOB=check_style
- JOB=build_android
addons:
apt:
packages:
......@@ -41,8 +45,10 @@ before_install:
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
- |
timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
# 43min timeout
if [[ "$JOB" == "build_android" ]]; then timeout 2580 docker run -it --rm -v "$TRAVIS_BUILD_DIR:/paddle" paddlepaddle/paddle:latest-dev-android;
else timeout 2580 paddle/scripts/travis/${JOB}.sh; fi;
RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
- |
if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
......
| Github account | name |
|---|---|
| abhinavarora | Abhinav Arora |
| backyes | Yan-Fei Wang |
| beckett1124 | Bin Qi |
| JiayiFeng | Jia-Yi Feng |
......
......@@ -166,11 +166,11 @@ include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
set(EXTERNAL_LIBS
${GFLAGS_LIBRARIES}
${GLOG_LIBRARIES}
gflags
glog
${CBLAS_LIBRARIES}
${PROTOBUF_LIBRARY}
${ZLIB_LIBRARIES}
protobuf
zlib
${PYTHON_LIBRARIES}
)
......
......@@ -106,10 +106,10 @@ def vgg16_bn_drop(input):
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=512, act=None)
fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu')
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
return fc2
......
......@@ -59,7 +59,6 @@ endif(NOT WITH_GOLANG)
if(NOT WITH_GPU)
add_definitions(-DHPPL_STUB_FUNC)
add_definitions("-DCUPTI_LIB_PATH=\"\"")
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
else()
......@@ -77,9 +76,7 @@ else()
if(CUPTI_FOUND)
include_directories(${CUPTI_INCLUDE_DIR})
add_definitions(-DPADDLE_WITH_CUPTI)
add_definitions("-DCUPTI_LIB_PATH=\"${CUPTI_LIBRARY_PATH}\"")
else()
add_definitions("-DCUPTI_LIB_PATH=\"\"")
message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
endif()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
......
......@@ -28,7 +28,7 @@ ENDIF(WIN32)
INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
ExternalProject_Add(
zlib
extern_zlib
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/madler/zlib.git"
GIT_TAG "v1.2.8"
......@@ -49,9 +49,11 @@ ExternalProject_Add(
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
ADD_DEPENDENCIES(zlib extern_zlib)
LIST(APPEND external_project_dependencies zlib)
ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
IF(WITH_C_API)
INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)
......
......@@ -104,7 +104,9 @@ function(merge_static_libs TARGET_NAME)
foreach(lib ${libs})
list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
endforeach()
if(libs_deps)
list(REMOVE_DUPLICATES libs_deps)
endif()
# To produce a library we need at least one source file.
# It is created by add_custom_command below and will helps
......@@ -191,10 +193,13 @@ function(cc_library TARGET_NAME)
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
# WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
else()
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
endif()
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
endif()
......
......@@ -72,7 +72,7 @@ copy(inference_lib DEPENDS paddle_fluid_shared
)
set(module "platform")
copy(platform_lib
copy(platform_lib DEPS profiler_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
)
......
......@@ -7,10 +7,6 @@ it proposes some high-level concepts such as `Layers <http://www.paddlepaddle.or
A model is composed of the computation described by a group of `Layers`, with `Evaluator` to define the error, `Optimizer` to update the parameters and `Data Reader` to feed in the data.
We also provide the `interface for Training and Inference <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html>`_ to help control the training and inference phrase,
it has several easy to use methods
it has several easy to use methods to better expose the internal running details, different `events <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html#event>`_ are available to users by writing some callbacks.
- `paddle.train`
- `paddle.test`
- `paddle.infer`
to better expose the internal running details, different `events <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html#event>`_ are available to users by writing some callbacks.
All in all, the V2 API gives a higher abstraction and make PaddlePaddle programs require fiew lines of code.
......@@ -9,7 +9,7 @@
为了编译PaddlePaddle,我们需要
1. 一台电脑,可以装的是 Linux, Windows 或者 MacOS 操作系统
1. Docker
2. Docker
不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要,因为我们会把所有编译工具都安装进一个 Docker 镜像里。
......
......@@ -9,7 +9,7 @@ Requirements
To build PaddlePaddle, you need
1. A computer -- Linux, Windows, MacOS.
1. Docker.
2. Docker.
Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image.
We run all the tools by running this image.
......
Install Using pip
Install using pip
================================
You can use current widely used Python package management
......@@ -8,7 +8,7 @@ most of current Linux systems or MacOS.
.. _pip_install:
Install Using pip
Install using pip
------------------------------
Run the following command to install PaddlePaddle on the current
......
......@@ -39,15 +39,16 @@ In the backward pass
This implementation allows to write mixed device program like this
```python
# get embedding feature on CPU
feature = some_cpu_only_op(data)
W1 = fluid.tensor(size=[100,20], parameter=true)
W2 = fluid.tensor(size=[20,15], parameter=true)
gpu_places = get_place(use_gpu=True)
data = layers.data()
gpu_places = layers.get_place(use_gpu=True)
# parallel processing on multiple GPUs
pd = ParallelDo(gpu_places)
with pd.do():
read_input(feature)
prediction = my_net(feature)
with pd.do(input=data):
prediction = softmax(fc(fc(data, W1), W2))
write_output(prediction)
prediction = pd()
loss = cross_entropy(prediction, label)
......@@ -66,20 +67,20 @@ start_program
main_program
{
block0 {
vars: data, places, w1, w2
vars: data, places, w1, w2, w1_grad, w2_grad,
ops: data, get_place, parallel_do(block1),
parallel_do_grad(block2),
sgd(w2, w2_grad),
sgd(w1, w1_grad)
}
block1 {
block1 { # the forward pass
parent_block: 0
vars: data, h1, h2, loss
ops: fc, fc, softmax
}
block2 {
block2 { # the backward pass
parent_block: 1
vars: data_grad, h1_grad, h2_grad, loss_gard, w1_grad, w2_grad
vars: data_grad, h1_grad, h2_grad, loss_gard, local_w1_grad, local_w2_grad
ops: softmax_grad,
fc_grad
fc_grad
......
......@@ -222,6 +222,7 @@ upstream
## 提交代码的一些约定
为了使评审人在评审代码时更好地专注于代码本身,请您每次提交代码时,遵守以下约定:
1. 请保证Travis-CI 中单元测试能顺利通过。如果没过,说明提交的代码存在问题,评审人一般不做评审。
2. 提交PUll Request前:
- 请注意commit的数量:
......@@ -231,6 +232,7 @@ upstream
3. 如果解决了某个Issue的问题,请在该PUll Request的**第一个**评论框中加上:`fix #issue_number`,这样当该PUll Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)
此外,在回复评审人意见时,请您遵守以下约定:
1. 评审人的每个意见都必须回复(这是开源社区的基本礼貌,别人帮了忙,应该说谢谢):
- 对评审意见同意且按其修改完的,给个简单的`Done`即可;
- 对评审意见不同意的,请给出您自己的反驳理由。
......
......@@ -4,6 +4,5 @@ Development
.. toctree::
:maxdepth: 1
new_layer_en.rst
contribute_to_paddle_en.md
write_docs_en.rst
############################
Install, Build and Unit test
############################
TBD
###############################
Cluster Training and Prediction
###############################
TBD
FAQ
====
.. toctree::
:maxdepth: 1
build_and_install/index_en.rst
model/index_en.rst
parameter/index_en.rst
local/index_en.rst
cluster/index_en.rst
#############################
Local Training and Prediction
#############################
TBD
###################
Model Configuration
###################
TBD
#################
Parameter Setting
#################
TBD
......@@ -5,3 +5,4 @@ GET STARTED
:maxdepth: 1
quickstart_en.rst
concepts/use_concepts_en.rst
C-API Prediction Library
========================
.. toctree::
:maxdepth: 1
compile_paddle_lib_en.md
organization_of_the_inputs_en.md
workflow_of_capi_en.md
......@@ -4,12 +4,15 @@
### 输入/输出数据类型
在C-API中,按照基本数据类型在PaddlePaddle内部的定义和实现,输入数据可分为:
1. 一维整型数组
1. 二维浮点型矩阵
- 稠密矩阵
- 稀疏矩阵
说明:
1. 一维数组**仅支持整型值**
- 常用于自然语言处理任务,例如:表示词语在词典中的序号;
- 分类任务中类别标签;
......@@ -274,6 +277,7 @@ PaddlePaddle中一个计算层的输出数据组织方式和输入数据组织
如果是一个序列输入/输出由 `sequence start positions` 来记录输入/输出的序列信息。
于是,在组织神经网络输入时,需要思考完成以下工作:
1. 为每一个输入/输出创建`argument`
- C-API 中操作`argument`的接口请查看[argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h)
1. 为每一个`argument`创建`paddle_matrix`或者`paddle_ivector`来存储数据。
......
## Input/Output Data Organization
TBD
......@@ -11,6 +11,7 @@
</p>
- 准备预测模型
1. 只将神经网络结构进行序列化。
- 只对神经网络结构进行序列化,加载模型需同时指定:网络结构的序列化结果和模型参数存储目录。
1. 将网络结构定义和训练结束存储下来的模型参数文件(多个)合并入一个文件。
......@@ -18,6 +19,7 @@
- 预测时只需加载一个文件便于发布。
- **注意**:以上两种方式只需选择其一即可。
- 调用 C-API 开发预测序
1. 初始化PaddlePaddle运行环境。
1. 加载预测模型。
1. 创建神经网络输入,组织输入数据。
......@@ -90,6 +92,7 @@
1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L88)接口,与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时,通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/multi_thread/main.c)
- 注意事项
1. 使用PaddlePaddle V2 API训练,模型中所有可学习参数会被存为一个压缩文件,需要手动进行解压,将它们放在同一目录中,C-API不会直接加载 V2 API 存储的压缩文件。
1. 如果使用`merge model`方式将神经网络结构和训练好的参数序列化到一个文件,请参考此[示例](https://github.com/PaddlePaddle/Mobile/blob/develop/Demo/linux/paddle_image_recognizer.cpp#L59)。
1. 通过灵活使用以上两个接口,加载模型可其它多种方式,例如也可在程序运行过程中再加载另外一个模型。
......@@ -106,6 +109,7 @@ C-API支持的所有输入数据类型和他们的组织方式,请参考“输
这篇文档的之后部分会使用`argument`来特指PaddlePaddle C-API中神经网络的一个输入/输出,使用`paddle_matrix`**特指**`argument`中用于存储数据的`Matrix`类的对象。
在组织神经网络输入,获取输出时,需要思考完成以下工作:
1. 为每一个输入/输出创建`argument`
1. 为每一个`argument`创建`paddle_matrix`来存储数据;
......
## 启动参数说明
# 启动参数说明
下面以`doc/howto/cluster/src/word2vec`中的代码作为实例,介绍使用PaddlePaddle v2 API完成分布式训练。
### 启动参数服务器
## 启动参数服务器
执行以下的命令启动一个参数服务器并等待和计算节点的数据交互
```bash
$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
```
如果希望可以在后台运行pserver程序,并保存输出到一个日志文件,可以运行:
```bash
$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
```
......@@ -20,8 +23,10 @@ $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num
- ports_num_for_sparse:**必选,默认0**,用于稀疏类型参数通信的端口个数
- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数
### 启动计算节点
## 启动计算节点
执行以下命令启动使用python编写的trainer程序(文件名为任意文件名,如train.py)
```bash
$ python train.py
```
......@@ -67,7 +72,7 @@ paddle.init(
- pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开
### 准备数据集
## 准备数据集
参考样例数据准备脚本[prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py),准备训练数据和验证数据集,我们使用paddle.dataset.imikolov数据集,并根据分布式训练并发数(trainer节点个数),在`prepare.py`开头部分指定`SPLIT_COUNT`将数据切分成多份。
......@@ -84,7 +89,8 @@ for f in flist:
```
示例程序`prepare.py`会把训练集和测试集分别分割成多个文件(例子中为3个,后缀为`-00000``-00001``-00002`):
```
```bash
train.txt
train.txt-00000
train.txt-00001
......@@ -99,12 +105,13 @@ test.txt-00002
对于不同的训练任务,训练数据格式和训练程序的`reader()`会大不相同,所以开发者需要根据自己训练任务的实际场景完成训练数据的分割和`reader()`的编写。
### 准备训练程序
## 准备训练程序
我们会对每个训练任务都会在每个节点上创建一个工作空间(workspace),其中包含了用户的训练程序、程序依赖、挂载或下载的训练数据分片。
最后,工作空间应如下所示:
```
```bash
.
|-- my_lib.py
|-- word_dict.pickle
......@@ -133,3 +140,21 @@ test.txt-00002
- `train_data_dir`:包含训练数据的目录,可以是从分布式存储挂载过来的,也可以是在任务启动前下载到本地的。
- `test_data_dir`:包含测试数据集的目录。
## 异步 SGD 更新
我们可以通过设置 `optimize` 的参数使之支持异步SGD更新。
例如,设置 `AdaGrad` optimize 的 `is_async``async_lagged_grad_discard_ratio` 参数:
```python
adagrad = paddle.optimizer.AdaGrad(
is_async=True,
async_lagged_grad_discard_ratio=1.6,
learning_rate=3e-3,
regularization=paddle.optimizer.L2Regularization(8e-4))
```
- `is_async`: 是否为异步SGD更新模式。
- `async_lagged_grad_discard_ratio`: 异步SGD更新的步长控制,接收到足够的gradient(
`async_lagged_grad_discard_ratio * num_gradient_servers`)之后,后面的gradient
将会被抛弃。
## Command-line arguments
# Command-line arguments
We'll take `doc/howto/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
### Starting parameter server
## Starting parameter server
Type the below command to start a parameter server which will wait for trainers to connect:
```bash
$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 --nics=eth0
```
If you wish to run parameter servers in background, and save a log file, you can type:
```bash
$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 --nics=eth0 &> pserver.log &
```
Parameter Description
......@@ -21,8 +22,10 @@ Parameter Description
- ports_num: **required, default 1**, total number of ports will listen on.
- ports_num_for_sparse: **required, default 0**, number of ports which serves sparse parameter update.
- num_gradient_servers: **required, default 1**, total number of gradient servers.
- nics: **optional, default xgbe0,xgbe1**, network device name which paramter server will listen on.
## Starting trainer
### Starting trainer
Type the command below to start the trainer(name the file whatever you want, like "train.py")
```bash
......@@ -70,7 +73,7 @@ Parameter Description
- trainer_id: **required, default 0**, ID for every trainer, start from 0.
- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
### Prepare Training Dataset
## Prepare Training Dataset
Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), it will download public `imikolov` dataset and split it into multiple files according to job parallelism(trainers count). Modify `SPLIT_COUNT` at the begining of `prepare.py` to change the count of output files.
......@@ -88,7 +91,7 @@ for f in flist:
Example code `prepare.py` will split training data and testing data into 3 files with digital suffix like `-00000`, `-00001` and`-00002`:
```
```bash
train.txt
train.txt-00000
train.txt-00001
......@@ -103,13 +106,13 @@ When job started, every trainer needs to get it's own part of data. In some dist
Different training jobs may have different data format and `reader()` function, developers may need to write different data prepare scripts and `reader()` functions for their job.
### Prepare Training program
## Prepare Training program
We'll create a *workspace* directory on each node, storing your training program, dependencies, mounted or downloaded dataset directory.
Your workspace may looks like:
```
```bash
.
|-- my_lib.py
|-- word_dict.pickle
......@@ -138,3 +141,21 @@ Your workspace may looks like:
- `train_data_dir`: containing training data. Mount from storage service or copy trainning data to here.
- `test_data_dir`: containing testing data.
## Async SGD Update
We can set some parameters of the optimizer to make it support async SGD update.
For example, we can set the `is_async` and `async_lagged_grad_discard_ratio` of the `AdaGrad` optimizer:
```python
adagrad = paddle.optimizer.AdaGrad(
is_async=True,
async_lagged_grad_discard_ratio=1.6,
learning_rate=3e-3,
regularization=paddle.optimizer.L2Regularization(8e-4))
```
- `is_async`: Is Async-SGD or not.
- `async_lagged_grad_discard_ratio`: For async SGD gradient commit control.
when `async_lagged_grad_discard_ratio * num_gradient_servers` commit passed,
current async gradient will be discard silently.
# Cluster Training Using Fabric
# Fabric
## Prepare a Linux cluster
......
# Distributed PaddlePaddle Training on AWS with Kubernetes
# Kubernetes on AWS
We will show you step by step on how to run distributed PaddlePaddle training on AWS cluster with Kubernetes. Let's start from core concepts.
......
# PaddlePaddle On Kubernetes
# Kubernetes
In this article, we will introduce how to run PaddlePaddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run PaddlePaddle training job on distributed cluster.
......
# 在OpenMPI集群中提交训练作业
# 在OpenMPI集群中启动训练
## 准备OpenMPI集群
......
# Cluster Training Using OpenMPI
# OpenMPI
## Prepare an OpenMPI cluster
......
......@@ -2,10 +2,25 @@
命令行参数设置
===============
深度学习算法的实现有着多样化的特点,运行环境、运行阶段、模型结构、训练策略等等这些都是常见的变化因素。PaddlePaddle支持用户灵活地设置各种命令行参数,以实现对模型训练或预测流程的控制。
在这一部分,首先以几个实际场景为例,展示了部分命令行参数的使用:
.. toctree::
:maxdepth: 1
use_case_cn.md
接着对所有参数的使用场合进行概述和分类:
.. toctree::
:maxdepth: 1
arguments_cn.md
最后给出细节描述,详细解释这些参数的属性和意义:
.. toctree::
:maxdepth: 1
detail_introduction_cn.md
Layers supporting hierarchical sequence as input
================================================
TBD
API comparision between RNN and hierarchical RNN
================================================
TBD
......@@ -5,3 +5,6 @@ RNN Models
:maxdepth: 1
rnn_config_en.rst
recurrent_group_en.md
hierarchical_layer_en.rst
hrnn_rnn_api_compare_en.rst
# Recurrent Group Tutorial
TBD
......@@ -8,3 +8,4 @@ PaddlePaddle Documentation
build_and_install/index_en.rst
howto/index_en.rst
dev/index_en.rst
faq/index_en.rst
......@@ -96,3 +96,6 @@ cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_contex
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
channel_send_op channel_recv_op sum_op elementwise_add_op executor proto_desc)
......@@ -91,6 +91,11 @@ class ChannelHolder {
inline bool IsInitialized() const { return holder_ != nullptr; }
inline const std::type_index Type() {
PADDLE_ENFORCE_EQ(IsInitialized(), true);
return holder_->Type();
}
private:
/**
* @note Placeholder hides type T, so it doesn't appear as a template
......
......@@ -542,3 +542,341 @@ TEST(ChannelHolder, ChannelHolderUnBufferedSendReceiveTest) {
ChannelHolderSendReceive(ch);
delete ch;
}
TEST(ChannelHolder, ChannelUninitializedTest) {
ChannelHolder *ch = new ChannelHolder();
EXPECT_EQ(ch->IsInitialized(), false);
int i = 10;
EXPECT_EQ(ch->Send(&i), false);
EXPECT_EQ(ch->Receive(&i), false);
bool is_exception = false;
try {
ch->Type();
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
delete ch;
}
TEST(ChannelHolder, ChannelInitializedTest) {
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(2);
EXPECT_EQ(ch->IsInitialized(), true);
// Channel should remain intialized even after close
ch->close();
EXPECT_EQ(ch->IsInitialized(), true);
delete ch;
}
TEST(ChannelHolder, TypeMismatchSendTest) {
// Test with unbuffered channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(0);
bool is_exception = false;
bool boolean_data = true;
try {
ch->Send(&boolean_data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
delete ch;
// Test with Buffered Channel
ch = new ChannelHolder();
ch->Reset<float>(10);
is_exception = false;
int int_data = 23;
try {
ch->Send(&int_data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
delete ch;
}
TEST(ChannelHolder, TypeMismatchReceiveTest) {
// Test with unbuffered channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(0);
bool is_exception = false;
bool float_data;
try {
ch->Receive(&float_data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
delete ch;
// Test with Buffered Channel
ch = new ChannelHolder();
ch->Reset<float>(10);
is_exception = false;
int int_data = 23;
try {
ch->Receive(&int_data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
delete ch;
}
void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the channel
// This should unblock all receivers
ch->close();
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
}
void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
using paddle::framework::details::Buffered;
using paddle::framework::details::UnBuffered;
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
bool send_success[num_threads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
*success = ch->Send(&data);
*ended = true;
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
if (isBuffered) {
// If ch is Buffered, atleast 4 threads must be blocked.
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (!thread_ended[i]) ct++;
}
EXPECT_GE(ct, 4);
} else {
// If ch is UnBuffered, all the threads should be blocked.
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
// Explicitly close the thread
// This should unblock all senders
ch->close();
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
if (isBuffered) {
// Verify that only 1 send was successful
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
}
// This tests that closing a channelholder unblocks
// any receivers waiting on the channel
TEST(ChannelHolder, ChannelHolderCloseUnblocksReceiversTest) {
// Check for buffered channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(1);
ChannelHolderCloseUnblocksReceiversTest(ch);
delete ch;
// Check for unbuffered channel
ch = new ChannelHolder();
ch->Reset<int>(0);
ChannelHolderCloseUnblocksReceiversTest(ch);
delete ch;
}
// This tests that closing a channelholder unblocks
// any senders waiting for channel to have write space
TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
// Check for buffered channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(1);
ChannelHolderCloseUnblocksSendersTest(ch, true);
delete ch;
// Check for unbuffered channel
ch = new ChannelHolder();
ch->Reset<int>(0);
ChannelHolderCloseUnblocksSendersTest(ch, false);
delete ch;
}
// This tests that destroying a channelholder unblocks
// any senders waiting for channel
void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
bool send_success[num_threads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
*success = ch->Send(&data);
*ended = true;
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
if (isBuffered) {
// If channel is buffered, verify that atleast 4 threads are blocked
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (thread_ended[i] == false) ct++;
}
// Atleast 4 threads must be blocked
EXPECT_GE(ct, 4);
} else {
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
// Explicitly destroy the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Count number of successfuld sends
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (send_success[i]) ct++;
}
if (isBuffered) {
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
} else {
// In unbuffered channel, no send should be successful
EXPECT_EQ(ct, 0);
}
// Join all threads
for (size_t i = 0; i < num_threads; i++) t[i].join();
}
// This tests that destroying a channelholder also unblocks
// any receivers waiting on the channel
void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
// All reads should return false
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// delete the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
}
TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
// Check for Buffered Channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(1);
ChannelHolderDestroyUnblockReceivers(ch);
// ch is already deleted already deleted in
// ChannelHolderDestroyUnblockReceivers
// Check for Unbuffered channel
ch = new ChannelHolder();
ch->Reset<int>(0);
ChannelHolderDestroyUnblockReceivers(ch);
}
TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
// Check for Buffered Channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(1);
ChannelHolderDestroyUnblockSenders(ch, true);
// ch is already deleted already deleted in
// ChannelHolderDestroyUnblockReceivers
// Check for Unbuffered channel
ch = new ChannelHolder();
ch->Reset<int>(0);
ChannelHolderDestroyUnblockSenders(ch, false);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thread>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
USE_NO_KERNEL_OP(go);
USE_NO_KERNEL_OP(channel_close);
USE_NO_KERNEL_OP(channel_create);
USE_NO_KERNEL_OP(channel_recv);
USE_NO_KERNEL_OP(channel_send);
USE_NO_KERNEL_OP(elementwise_add);
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace paddle {
namespace framework {
template <typename T>
void CreateIntVariable(Scope &scope, p::CPUPlace &place, std::string name,
T value) {
// Create LoDTensor<int> of dim [1,1]
auto var = scope.Var(name);
auto tensor = var->GetMutable<LoDTensor>();
tensor->Resize({1, 1});
T *expect = tensor->mutable_data<T>(place);
expect[0] = value;
}
void InitTensorsInScope(Scope &scope, p::CPUPlace &place) {
p::CPUDeviceContext ctx(place);
// Create channel variable
scope.Var("Channel");
// Create Variables, x0 will be put into channel,
// result will be pulled from channel
CreateIntVariable(scope, place, "Status", false);
CreateIntVariable(scope, place, "x0", 99);
CreateIntVariable(scope, place, "result", 0);
}
void AddOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, AttributeMap attrs,
BlockDesc *block) {
// insert op
auto op = block->AppendOp();
op->SetType(type);
for (auto &kv : inputs) {
op->SetInput(kv.first, kv.second);
}
for (auto &kv : outputs) {
op->SetOutput(kv.first, kv.second);
}
op->SetAttrMap(attrs);
}
TEST(Concurrency, Go_Op) {
Scope scope;
p::CPUPlace place;
// Initialize scope variables
InitTensorsInScope(scope, place);
framework::Executor executor(place);
ProgramDesc program;
BlockDesc *block = program.MutableBlock(0);
// Create channel OP
AddOp("channel_create", {}, {{"Out", {"Channel"}}},
{{"capacity", 10}, {"data_type", f::proto::VarType::LOD_TENSOR}},
block);
// Create Go Op routine
BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
AddOp("channel_send", {{"Channel", {"Channel"}}, {"X", {"x0"}}},
{{"Status", {"Status"}}}, {}, goOpBlock);
// Create Go Op
AddOp("go", {{"X", {"Channel", "x0"}}}, {}, {{"sub_block", goOpBlock}},
block);
// Create Channel Receive Op
AddOp("channel_recv", {{"Channel", {"Channel"}}},
{{"Status", {"Status"}}, {"Out", {"result"}}}, {}, block);
// Create Channel Close Op
AddOp("channel_close", {{"Channel", {"Channel"}}}, {}, {}, block);
// Check the result tensor to make sure it is set to 0
const LoDTensor &tensor = (scope.FindVar("result"))->Get<LoDTensor>();
auto *initialData = tensor.data<int>();
EXPECT_EQ(initialData[0], 0);
executor.Run(program, &scope, 0, true, true);
// After we call executor.run, the Go operator should do a channel_send to set
// the
// "result" variable to 99
auto *finalData = tensor.data<int>();
EXPECT_EQ(finalData[0], 99);
}
} // namespace framework
} // namespace paddle
......@@ -26,12 +26,15 @@ Dim<i> make_dim(const int64_t* d) {
}
template <>
Dim<1> make_dim<1>(const int64_t* d) {
return Dim<1>(*d);
Dim<0> make_dim<0>(const int64_t* d) {
return Dim<0>(*d);
}
void make_ddim(DDim& ddim, const int64_t* dims, int n) {
switch (n) {
case 0:
ddim = make_dim<0>(dims);
break;
case 1:
ddim = make_dim<1>(dims);
break;
......@@ -190,7 +193,7 @@ struct VectorizeVisitor : public boost::static_visitor<> {
this->operator()(t.tail);
}
void operator()(const Dim<1>& t) { vector.push_back(t.head); }
void operator()(const Dim<0>& t) {}
};
/// @endcond
......@@ -247,9 +250,8 @@ struct SliceVectorizeVisitor : public boost::static_visitor<> {
}
}
void operator()(const Dim<1>& dim) {
PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound.");
vector.push_back(dim.head);
void operator()(const Dim<0>& dim) {
PADDLE_ENFORCE(end == 0, "End index in ddim slice is out of bound.");
}
};
......
......@@ -30,8 +30,8 @@ namespace framework {
* The number of dimensions must be between [1, 9].
*/
struct DDim {
typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
Dim<8>, Dim<9>>
typedef boost::variant<Dim<0>, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>,
Dim<7>, Dim<8>, Dim<9>>
DDimVar;
DDimVar var;
......
......@@ -72,38 +72,36 @@ struct Dim {
// Base case specialization
template <>
struct Dim<1> {
static constexpr int dimensions = 1;
struct Dim<0> {
static constexpr int dimensions = 0;
HOSTDEVICE
Dim(int64_t _head) : head(_head) {}
Dim(int64_t _head) {}
HOSTDEVICE
Dim() : head(0) {}
Dim() {}
HOSTDEVICE
Dim(int idx, const Dim<1>& size) : head(idx) {
Dim(int idx, const Dim<0>& size) {
#ifndef __CUDA_ARCH__
if (idx >= size.head) {
if (idx > 0) {
throw std::invalid_argument("Index out of range.");
}
#else
PADDLE_ASSERT(idx < size.head);
PADDLE_ASSERT(idx == 0);
#endif
}
HOSTDEVICE
bool operator==(const Dim<1>& o) const { return (head == o.head); }
bool operator==(const Dim<0>& o) const { return true; }
HOSTDEVICE
bool operator!=(const Dim<1>& o) const { return !(*this == o); }
bool operator!=(const Dim<0>& o) const { return false; }
HOSTDEVICE
int64_t& operator[](int idx);
HOSTDEVICE
int64_t operator[](int idx) const;
int64_t head;
};
namespace {
......@@ -154,15 +152,20 @@ HOSTDEVICE int64_t& indexer(Dim<D>& dim, int idx) {
}
template <>
HOSTDEVICE int64_t& indexer<1>(Dim<1>& dim, int idx) {
HOSTDEVICE int64_t& indexer<0>(Dim<0>& dim, int idx) {
#ifndef __CUDA_ARCH__
if (idx != 0) {
throw std::invalid_argument("Invalid index");
}
#else
PADDLE_ASSERT(idx == 0);
PADDLE_ASSERT(false);
#endif
return dim.head;
#if (defined __CUDA_ARCH__) && (CUDA_VERSION < 8000)
// On CUDA versions previous to 8.0, only __shared__ variables
// could be declared as static in the device code.
int64_t head = 0;
#else
static int64_t head = 0;
#endif
return head;
}
template <int D>
......@@ -181,15 +184,20 @@ HOSTDEVICE int64_t indexer(const Dim<D>& dim, int idx) {
}
template <>
HOSTDEVICE int64_t indexer<1>(const Dim<1>& dim, int idx) {
HOSTDEVICE int64_t indexer<0>(const Dim<0>& dim, int idx) {
#ifndef __CUDA_ARCH__
if (idx != 0) {
throw std::invalid_argument("Invalid index");
}
#else
PADDLE_ASSERT(idx == 0);
PADDLE_ASSERT(false);
#endif
return dim.head;
#if (defined __CUDA_ARCH__) && (CUDA_VERSION < 8000)
// On CUDA versions previous to 8.0, only __shared__ variables
// could be declared as static in the device code.
int64_t head = 0;
#else
static int64_t head = 0;
#endif
return head;
}
} // namespace
......@@ -218,12 +226,12 @@ HOSTDEVICE int64_t& Dim<l>::operator[](int i) {
}
// Dynamic access to constant Dim
inline HOSTDEVICE int64_t Dim<1>::operator[](int i) const {
inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const {
return indexer(*this, i);
}
// Dynamic access to mutable Dim
inline HOSTDEVICE int64_t& Dim<1>::operator[](int i) {
inline HOSTDEVICE int64_t& Dim<0>::operator[](int i) {
return indexer(*this, i);
}
......@@ -251,8 +259,8 @@ HOSTDEVICE int64_t linearize(const Dim<i>& a, const Dim<i>& b) {
// Base case dot product of two Dims
// Notice it is inline because it is no longer a template
template <>
HOSTDEVICE inline int64_t linearize(const Dim<1>& a, const Dim<1>& b) {
return a.head * b.head;
HOSTDEVICE inline int64_t linearize(const Dim<0>& a, const Dim<0>& b) {
return 0;
}
// Product of a Dim
......@@ -264,8 +272,8 @@ HOSTDEVICE int64_t product(const Dim<i>& a, int prod = 1) {
// Base case product of a Dim
// Notice it is inline because it is no longer a template
template <>
HOSTDEVICE inline int64_t product(const Dim<1>& a, int prod) {
return prod * a.head;
HOSTDEVICE inline int64_t product(const Dim<0>& a, int prod) {
return prod;
}
// Is 0 <= idx_i < size_i for all i?
......@@ -278,8 +286,8 @@ HOSTDEVICE bool contained(const Dim<i>& idx, const Dim<i>& size) {
// Base case of is 0 <= idx_i < size_i ?
// Notice it is inline because it is no longer a template
template <>
HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) {
return ((0 <= idx.head) && (idx.head < size.head));
HOSTDEVICE inline bool contained(const Dim<0>& idx, const Dim<0>& size) {
return true;
}
/**
......@@ -294,8 +302,8 @@ HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i>& src, int mul = 1) {
// Base case of ex_prefix_mul
// Notice it is inline because it is no longer a template
template <>
HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) {
return Dim<1>(mul);
HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0>& src, int mul) {
return Dim<0>();
}
///\endcond
......@@ -309,8 +317,8 @@ HOSTDEVICE Dim<i> dim_plus(const Dim<i>& a, const Dim<i>& b) {
// Base case
template <>
HOSTDEVICE inline Dim<1> dim_plus(const Dim<1>& a, const Dim<1>& b) {
return Dim<1>(a.head + b.head);
HOSTDEVICE inline Dim<0> dim_plus(const Dim<0>& a, const Dim<0>& b) {
return Dim<0>();
}
template <int i>
......@@ -328,8 +336,8 @@ HOSTDEVICE Dim<i> dim_mult(const Dim<i>& a, const Dim<i>& b) {
// Base case
template <>
HOSTDEVICE inline Dim<1> dim_mult(const Dim<1>& a, const Dim<1>& b) {
return Dim<1>(a.head * b.head);
HOSTDEVICE inline Dim<0> dim_mult(const Dim<0>& a, const Dim<0>& b) {
return Dim<0>();
}
template <int i>
......@@ -356,10 +364,9 @@ HOSTDEVICE Dim<i> normalize_strides(const Dim<i>& size, const Dim<i>& stride) {
///\cond HIDDEN
template <>
HOSTDEVICE inline Dim<1> normalize_strides(const Dim<1>& size,
const Dim<1>& stride) {
int norm_stride = size.head == 1 ? 0 : stride.head;
return Dim<1>(norm_stride);
HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size,
const Dim<0>& stride) {
return Dim<0>();
}
///\endcond
......@@ -394,6 +401,10 @@ typename std::enable_if<(i == 1), std::ostream&>::type operator<<(
return os;
}
inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) {
return os;
}
template <int i>
HOST std::string Dim<i>::to_string() const {
std::stringstream stream;
......
......@@ -117,6 +117,7 @@ message VarType {
// raw variables should manage their own allocations
// in operators like nccl_op
RAW = 17;
TUPLE = 18;
}
required Type type = 1;
......@@ -148,6 +149,9 @@ message VarType {
required int64 capacity = 2;
}
optional ChannelDesc channel = 6;
message Tuple { repeated Type element_type = 1; }
optional Tuple tuple = 7;
}
message VarDesc {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdexcept>
#include <string>
#include <vector>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
namespace framework {
typedef boost::variant<int, int64_t, float, double, std::string, Tensor,
LoDTensor /*, ChannelHolder*/>
ElementVar;
class Tuple {
public:
using ElementVars = std::vector<ElementVar>;
Tuple(std::vector<ElementVar>& var, std::vector<VarDesc>& var_desc)
: var_(var), var_desc_(var_desc) {}
Tuple(std::vector<ElementVar>& var) : var_(var) {}
ElementVar get(int idx) const { return var_[idx]; };
ElementVar& get(int idx) { return var_[idx]; };
bool isSameType(Tuple& t) const;
size_t getSize() const { return var_.size(); };
private:
ElementVars var_;
std::vector<VarDesc> var_desc_;
};
bool Tuple::isSameType(Tuple& t) const {
size_t tuple_size = getSize();
if (tuple_size != t.getSize()) {
return false;
}
for (size_t j = 0; j < tuple_size; ++j) {
auto type1 = get(j).which();
auto type2 = t.get(j).which();
if (type1 != type2) return false;
}
return true;
}
Tuple* make_tuple(std::vector<ElementVar> tuple) { return new Tuple(tuple); }
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <sstream>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tuple.h"
TEST(Tuple, Make) {
std::vector<paddle::framework::ElementVar> element_type;
element_type.push_back(12);
element_type.push_back(12.0f);
element_type.push_back("ElementVar");
paddle::framework::Tuple* tuple = paddle::framework::make_tuple(element_type);
EXPECT_EQ(boost::get<int>(tuple->get(0)), 12);
EXPECT_EQ(boost::get<float>(tuple->get(1)), 12.0f);
EXPECT_EQ(boost::get<std::string>(tuple->get(2)), "ElementVar");
delete tuple;
}
TEST(Tuple, IsTheSameType) {
std::vector<paddle::framework::ElementVar> element_type1;
std::vector<paddle::framework::ElementVar> element_type2;
std::vector<paddle::framework::ElementVar> element_type3;
element_type1.push_back(12);
element_type1.push_back(12.0f);
element_type1.push_back("Tuple1");
element_type2.push_back(13);
element_type2.push_back(13.0f);
element_type2.push_back("Tuple2");
element_type3.push_back(14.0f);
element_type3.push_back(14);
element_type3.push_back("Tuple3");
paddle::framework::Tuple* tuple1 =
paddle::framework::make_tuple(element_type1);
paddle::framework::Tuple* tuple2 =
paddle::framework::make_tuple(element_type2);
paddle::framework::Tuple* tuple3 =
paddle::framework::make_tuple(element_type3);
EXPECT_TRUE(tuple1->isSameType(*tuple2));
EXPECT_FALSE(tuple1->isSameType(*tuple3));
delete tuple1;
delete tuple2;
delete tuple3;
}
......@@ -130,7 +130,7 @@ endif()
if(WITH_DISTRIBUTE)
add_subdirectory(detail)
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(send_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
......
......@@ -41,6 +41,14 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("ColToRowMatchIndices", dims);
ctx->SetOutputDim("ColToRowMatchDist", dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<LoDTensor>("DistMat")->type()),
platform::CPUPlace());
}
};
template <typename T>
......
......@@ -37,12 +37,19 @@ class BoxCoderOp : public framework::OperatorWithKernel {
"The rank of Input of PriorBoxVar must be 2");
PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
auto code_type = GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
if (code_type == BoxCodeType::kEncodeCenterSize) {
PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
"The rank of Input of TargetBox must be 2");
PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
"The shape of TargetBox is [M, 4]");
GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
} else if (code_type == BoxCodeType::kDecodeCenterSize) {
PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
"The rank of Input of TargetBox must be 3");
PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
}
ctx->SetOutputDim(
"OutputBox",
......@@ -70,25 +77,28 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
"of variance.");
AddInput(
"TargetBox",
"(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
"[N, 4], each box is represented as [xmin, ymin, xmax, ymax], "
"[xmin, ymin] is the left top coordinate of the box if the input "
"is image feature map, they are close to the origin of the coordinate "
"system. [xmax, ymax] is the right bottom coordinate of the box. "
"This tensor can contain LoD information to represent a batch "
"of inputs. One instance of this batch can contain different "
"numbers of entities.");
"(LoDTensor or Tensor) This input can be a 2-D LoDTensor with shape "
"[N, 4] when code_type is 'encode_center_size'. This input also can "
"be a 3-D Tensor with shape [N, M, 4] when code_type is "
"'decode_center_size'. [N, 4], each box is represented as "
"[xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate "
"of the box if the input is image feature map, they are close to "
"the origin of the coordinate system. [xmax, ymax] is the right "
"bottom coordinate of the box. This tensor can contain LoD "
"information to represent a batch of inputs. One instance of this "
"batch can contain different numbers of entities.");
AddAttr<std::string>("code_type",
"(string, default encode_center_size) "
"the code type used with the target box")
.SetDefault("encode_center_size")
.InEnum({"encode_center_size", "decode_center_size"});
AddOutput(
"OutputBox",
AddOutput("OutputBox",
"(LoDTensor or Tensor) "
"(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] "
"representing the result of N target boxes encoded/decoded with "
"M Prior boxes and variances.");
"When code_type is 'encode_center_size', the output tensor of "
"box_coder_op with shape [N, M, 4] representing the result of N "
"target boxes encoded with M Prior boxes and variances. When "
"code_type is 'decode_center_size', N represents the batch size "
"and M represents the number of deocded boxes.");
AddComment(R"DOC(
Bounding Box Coder Operator.
......
......@@ -66,7 +66,6 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data,
T* output) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < row * col) {
const int row_idx = idx / col;
const int col_idx = idx % col;
T prior_box_width =
prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
......@@ -79,17 +78,16 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data,
2;
T target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
target_box_data[row_idx * len + 2]) *
target_box_data[idx * len + 2]) *
prior_box_width;
T target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
target_box_data[row_idx * len + 3]) *
target_box_data[idx * len + 3]) *
prior_box_height;
T target_box_center_x = prior_box_var_data[col_idx * len] *
target_box_data[row_idx * len] *
prior_box_width +
target_box_data[idx * len] * prior_box_width +
prior_box_center_x;
T target_box_center_y = prior_box_var_data[col_idx * len + 1] *
target_box_data[row_idx * len + 1] *
target_box_data[idx * len + 1] *
prior_box_height +
prior_box_center_y;
......
......@@ -89,6 +89,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
size_t offset = i * col * len + j * len;
T prior_box_width =
prior_box_data[j * len + 2] - prior_box_data[j * len];
T prior_box_height =
......@@ -99,20 +100,19 @@ class BoxCoderKernel : public framework::OpKernel<T> {
(prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
T target_box_center_x = prior_box_var_data[j * len] *
target_box_data[i * len] * prior_box_width +
target_box_data[offset] * prior_box_width +
prior_box_center_x;
T target_box_center_y = prior_box_var_data[j * len + 1] *
target_box_data[i * len + 1] *
target_box_data[offset + 1] *
prior_box_height +
prior_box_center_y;
T target_box_width = std::exp(prior_box_var_data[j * len + 2] *
target_box_data[i * len + 2]) *
target_box_data[offset + 2]) *
prior_box_width;
T target_box_height = std::exp(prior_box_var_data[j * len + 3] *
target_box_data[i * len + 3]) *
target_box_data[offset + 3]) *
prior_box_height;
size_t offset = i * col * len + j * len;
output[offset] = target_box_center_x - target_box_width / 2;
output[offset + 1] = target_box_center_y - target_box_height / 2;
output[offset + 2] = target_box_center_x + target_box_width / 2;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/op_registry.h"
namespace pf = paddle::framework;
static constexpr char kChannel[] = "Channel";
namespace paddle {
namespace operators {
class ChannelCloseOp : public framework::OperatorBase {
public:
ChannelCloseOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto &inp = *scope.FindVar(Input(kChannel));
// Get the mutable version of the channel variable and closes it.
pf::ChannelHolder *ch = inp.GetMutable<framework::ChannelHolder>();
ch->close();
}
};
class ChannelCloseOpOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("Channel"),
"The input of ChannelClose op must be set");
}
};
class ChannelCloseOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ChannelCloseOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(kChannel,
"The Channel Variable that should be closed by"
" the ChannelClose Op.");
AddComment(R"DOC(
Channel Close Operator.
This operator closes an open channel.
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(channel_close, paddle::operators::ChannelCloseOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::ChannelCloseOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
namespace pf = paddle::framework;
static constexpr char kOutput[] = "Out";
namespace paddle {
namespace operators {
class ChannelCreateOp : public framework::OperatorBase {
public:
ChannelCreateOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto &out = *scope.FindVar(Output(kOutput));
// Determine the datatype and capacity of the channel to be created
// from the attributes provided.
auto dtype =
static_cast<framework::proto::VarType::Type>(Attr<int>("data_type"));
auto capacity = Attr<int>("capacity");
// Based on the datatype, create a new channel holder initialized with
// the given capacity. When capacity is 0, an unbuffered channel is
// created.
pf::ChannelHolder *ch = out.GetMutable<framework::ChannelHolder>();
if (dtype == framework::proto::VarType::LOD_TENSOR) {
ch->Reset<pf::LoDTensor>(capacity);
} else if (dtype == framework::proto::VarType::SELECTED_ROWS) {
ch->Reset<pf::SelectedRows>(capacity);
} else if (dtype == framework::proto::VarType::LOD_RANK_TABLE) {
ch->Reset<pf::LoDRankTable>(capacity);
} else if (dtype == framework::proto::VarType::LOD_TENSOR_ARRAY) {
ch->Reset<pf::LoDTensorArray>(capacity);
} else if (dtype == framework::proto::VarType::READER) {
ch->Reset<pf::ReaderHolder>(capacity);
} else if (dtype == framework::proto::VarType::CHANNEL) {
ch->Reset<pf::ChannelHolder>(capacity);
} else if (dtype == framework::proto::VarType::BOOL) {
ch->Reset<bool>(capacity);
} else if (dtype == framework::proto::VarType::INT32) {
ch->Reset<int>(capacity);
} else if (dtype == framework::proto::VarType::INT64) {
ch->Reset<int64_t>(capacity);
} else if (dtype == framework::proto::VarType::FP32) {
ch->Reset<float>(capacity);
} else if (dtype == framework::proto::VarType::FP64) {
ch->Reset<double>(capacity);
} else {
PADDLE_THROW(
"Data type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, LOD_RANK_TABLE, LOD_TENSOR_ARRAY, "
"READER, CHANNEL, BOOL, INT32, INT64, FP32, FP64]",
dtype);
}
}
};
class ChannelCreateOpOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasOutput(kOutput),
"The output of ChannelCreate op must be set");
context->SetOutputDim(kOutput, {1});
}
};
class ChannelCreateOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ChannelCreateOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddOutput(kOutput,
"The object of a Channel type created by ChannelCreate Op.");
AddAttr<int>("capacity", "The size of the buffer of Channel.")
.SetDefault(0);
AddAttr<int>("data_type", "The data type of elements inside the Channel.");
AddComment(R"DOC(
Channel Create Operator.
This operator creates an object of the VarType Channel and returns it.
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(channel_create, paddle::operators::ChannelCreateOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::ChannelCreateOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/channel.h"
#include <paddle/fluid/framework/lod_rank_table.h>
#include <paddle/fluid/framework/lod_tensor_array.h>
#include <paddle/fluid/framework/reader.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/math/math_function.h"
static constexpr char Channel[] = "Channel";
static constexpr char Status[] = "Status";
static constexpr char Out[] = "Out";
namespace paddle {
namespace operators {
void SetReceiveStatus(const platform::Place &dev_place,
framework::Variable &status_var, bool status) {
auto cpu = platform::CPUPlace();
auto status_tensor =
status_var.GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
cpu);
status_tensor[0] = status;
}
bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var) {
// Get type of channel and use that to call mutable data for Variable
auto type = framework::ToVarType(ch->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR)
return ch->Receive(var->GetMutable<framework::LoDTensor>());
else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
return ch->Receive(var->GetMutable<framework::LoDRankTable>());
else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
return ch->Receive(var->GetMutable<framework::LoDTensorArray>());
else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
return ch->Receive(var->GetMutable<framework::SelectedRows>());
else if (type == framework::proto::VarType_Type_READER)
return ch->Receive(var->GetMutable<framework::ReaderHolder>());
else if (type == framework::proto::VarType_Type_CHANNEL)
return ch->Receive(var->GetMutable<framework::ChannelHolder>());
else
PADDLE_THROW("ChannelReceive:Unsupported type");
}
class ChannelRecvOp : public framework::OperatorBase {
public:
ChannelRecvOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const {
PADDLE_ENFORCE(ctx->HasInput(Channel),
"Input(Channel) of ChannelRecvOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(Out),
"Input(Channel) of ChannelRecvOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(Status),
"Output(Status) of ChannelRecvOp should not be null.");
ctx->SetOutputDim("Status", {1});
}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
// Get the channel holder created by channel_create op, passed as input.
framework::ChannelHolder *ch =
scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
auto output_var = scope.FindVar(Output(Out));
// Receive the data from the channel.
bool ok = ChannelReceive(ch, output_var);
// Set the status output of the `ChannelReceive` call.
SetReceiveStatus(dev_place, *scope.FindVar(Output(Status)), ok);
}
};
class ChannelRecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ChannelRecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(Channel,
"(Channel) A variable which \"receives\" the a value sent"
"to it by a channel_send op.")
.AsDuplicable();
AddOutput(Out,
"(Variable) Output Variable that will hold the data received"
" from the Channel")
.AsDuplicable();
AddOutput(Status,
"(Tensor) An LoD Tensor that returns a boolean status of the"
"result of the receive operation.")
.AsDuplicable();
AddComment(R"DOC(
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(channel_recv, paddle::operators::ChannelRecvOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::ChannelRecvOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/channel.h"
#include <paddle/fluid/framework/lod_rank_table.h>
#include <paddle/fluid/framework/lod_tensor_array.h>
#include <paddle/fluid/framework/reader.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/math/math_function.h"
static constexpr char Channel[] = "Channel";
static constexpr char X[] = "X";
static constexpr char Status[] = "Status";
static constexpr char copy[] = "copy";
namespace paddle {
namespace operators {
void SetSendStatus(const platform::Place &dev_place,
framework::Variable &status_var, bool status) {
auto cpu = platform::CPUPlace();
auto status_tensor =
status_var.GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
cpu);
status_tensor[0] = status;
}
bool ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
auto type = framework::ToVarType(var->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR)
return ch->Send(var->GetMutable<framework::LoDTensor>());
else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
return ch->Send(var->GetMutable<framework::LoDRankTable>());
else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
return ch->Send(var->GetMutable<framework::LoDTensorArray>());
else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
return ch->Send(var->GetMutable<framework::SelectedRows>());
else if (type == framework::proto::VarType_Type_READER)
return ch->Send(var->GetMutable<framework::ReaderHolder>());
else if (type == framework::proto::VarType_Type_CHANNEL)
return ch->Send(var->GetMutable<framework::ChannelHolder>());
else
PADDLE_THROW("ChannelSend:Unsupported type");
}
class ChannelSendOp : public framework::OperatorBase {
public:
ChannelSendOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const {
PADDLE_ENFORCE(ctx->HasInput(Channel),
"Input(Channel) of ChannelSendOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput(X),
"Input(X) of ChannelSendOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(Status),
"Output(Status) of ChannelSendOp should not be null.");
ctx->SetOutputDim("Status", {1});
}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
// Get the channel holder created by channel_create op, passed as input.
framework::ChannelHolder *ch =
scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
auto input_var = scope.FindVar(Input(X));
// Send the input data through the channel.
bool ok = ChannelSend(ch, input_var);
// Set the status output of the `ChannelSend` call.
SetSendStatus(dev_place, *scope.FindVar(Output(Status)), ok);
}
};
class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ChannelSendOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(Channel,
"(Channel) A variable which \"sends\" the passed in value to "
"a listening receiver.")
.AsDuplicable();
AddInput(X, "(Variable) The value which gets sent by the channel.")
.AsDuplicable();
AddOutput(Status,
"(Tensor) An LoD Tensor that returns a boolean status of the"
"result of the send operation.")
.AsDuplicable();
AddAttr<bool>(copy, "(bool, default false) Should copy before send")
.SetDefault(false);
AddComment(R"DOC(
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(channel_send, paddle::operators::ChannelSendOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::ChannelSendOpMaker);
......@@ -24,6 +24,29 @@ namespace detail {
template <typename T, int Rank>
struct StridedMemcpyFunctor;
template <typename T>
struct StridedMemcpyFunctor<T, 0> {
void operator()(const platform::DeviceContext& dev_ctx, const T* src,
framework::Dim<0> src_stride, framework::Dim<0> dst_dim,
framework::Dim<0> dst_stride, T* dst) const {
auto place = dev_ctx.GetPlace();
if (platform::is_cpu_place(place)) {
auto& cpu_place = boost::get<platform::CPUPlace>(place);
memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
} else {
#ifdef PADDLE_WITH_CUDA
auto& gpu_place = boost::get<platform::CUDAPlace>(place);
auto& cuda_ctx =
reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T),
cuda_ctx.stream());
#else
PADDLE_THROW("Paddle is not compiled with GPU");
#endif
}
}
};
template <typename T>
struct StridedMemcpyFunctor<T, 1> {
void operator()(const platform::DeviceContext& dev_ctx, const T* src,
......
......@@ -65,12 +65,17 @@ smaller than or equal to the dimensions of $X$.
There are two cases for this operator:
1. The shape of $Y$ is same with $X$;
2. The shape of $Y$ is a subset of $X$.
2. The shape of $Y$ is a congiguous subsequencet of $X$. The trailing dimensions
of size 1 for $Y$ will be ignored for the consideration of subsequence.
For case 2:
$Y$ will be broadcasted to match the shape of $X$ and axis should be
set to index of the start dimension to broadcast $Y$ onto $X$.
If axis is -1, it is treated as axis=rank(X)-rank(Y).
For example
.. code-block:: python
......@@ -79,6 +84,7 @@ For example
shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details)
information. However, the output only shares the LoD information with input $X$.
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#ifdef __NVCC__
#include <thrust/iterator/iterator_adaptor.h>
#include "paddle/fluid/platform/cuda_helper.h"
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
#endif
......@@ -61,6 +62,19 @@ inline void get_mid_dims(const framework::DDim& x_dims,
}
}
inline void trim_trailing_singular_dims(framework::DDim& dims) {
// Remove trailing dimensions of size 1 for y
auto actual_dims_size = dims.size();
for (; actual_dims_size != 0; --actual_dims_size) {
if (dims[actual_dims_size - 1] != 1) break;
}
if (actual_dims_size != dims.size()) {
auto actual_dims = framework::vectorize(dims);
actual_dims.resize(actual_dims_size);
dims = framework::make_ddim(actual_dims);
}
}
template <typename T, typename DeviceContext>
class RowwiseTransformIterator;
template <typename T, typename DeviceContext>
......@@ -263,44 +277,6 @@ class TransformFunctor {
} \
}
template <class functor, typename DeviceContext, typename T>
void ElementwiseCompute(const framework::ExecutionContext& ctx) {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
auto x_dims = x->dims();
auto y_dims = y->dims();
PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
"Rank of first input must >= rank of second input.");
if (x_dims == y_dims) {
functor f;
f.template Run<DeviceContext, T>(x, y, z, ctx);
return;
}
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)");
int pre, n, post;
get_mid_dims(x_dims, y_dims, axis, pre, n, post);
if (post == 1) {
functor f;
f.template RunBroadCast<DeviceContext, T>(x, y, z, ctx, pre, n);
return;
} else {
functor f;
f.template RunBroadCast2<DeviceContext, T>(x, y, z, ctx, pre, n, post);
return;
}
}
#define EIGEN_ADD(x, y) ((x) + (y))
EIGEN_FUNCTOR(Add, EIGEN_ADD);
......@@ -361,13 +337,10 @@ template <typename T, typename DX_OP, typename DY_OP>
static __global__ void ElemwiseGradBroadcast1CUDAKernel(
const T* x, const T* y, const T* out, const T* dout, int h, int w,
DX_OP dx_op, DY_OP dy_op, T* dx, T* dy) {
extern __shared__ char shm_buffer[];
T* shm = reinterpret_cast<T*>(shm_buffer);
int j = blockIdx.x;
int i = threadIdx.x;
int tid = threadIdx.x;
shm[tid] = 0;
T val = 0;
do {
int x_offset = i * w + j;
......@@ -375,22 +348,16 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
}
if (dy) {
shm[tid] += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
}
i += ELEMWISE_MAX_BLOCK_DIM;
} while (i < h);
if (dy) {
__syncthreads();
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
// Sum, could be optimized
val = platform::reduceSum(val, tid, h);
if (threadIdx.x == 0) {
for (int k = 1; k < h; ++k) {
shm[0] += shm[k];
}
dy[j] = shm[0];
dy[j] = val;
}
}
}
......@@ -402,10 +369,8 @@ static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T* x,
T* dx, T* dy) {
int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
int gird_size = w;
int shared_mem_size = block_size * sizeof(T);
ElemwiseGradBroadcast1CUDAKernel<<<gird_size, block_size, shared_mem_size,
stream>>>(x, y, out, dout, h, w, dx_op,
dy_op, dx, dy);
ElemwiseGradBroadcast1CUDAKernel<<<gird_size, block_size, 0, stream>>>(
x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
}
#endif
......@@ -436,7 +401,6 @@ static void ElemwiseGradBroadcast2CPU(const T* x, const T* y, const T* out,
}
#ifdef __NVCC__
template <typename T, typename DX_OP, typename DY_OP>
static __global__ void ElemwiseGradBroadcast2CUDAKernel(
const T* x, const T* y, const T* out, const T* dout, int pre, int n,
......@@ -444,9 +408,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
int tid = threadIdx.x;
int j = blockIdx.x;
extern __shared__ char shm_buffer[];
T* shm = reinterpret_cast<T*>(shm_buffer);
shm[tid] = 0;
T val = 0;
int ttid = tid;
while (true) {
......@@ -461,23 +423,18 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
}
if (dy != nullptr) {
shm[tid] += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
}
ttid += ELEMWISE_MAX_BLOCK_DIM;
}
if (dy) {
__syncthreads();
int h = pre * post;
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
// Sum, could be optimized
if (tid == 0) {
for (int i = 1; i < h; ++i) {
shm[0] += shm[i];
}
dy[j] = shm[0];
val = platform::reduceSum(val, tid, h);
if (threadIdx.x == 0) {
dy[j] = val;
}
}
}
......@@ -489,10 +446,8 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x,
DY_OP dy_op, T* dx, T* dy) {
int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post);
int gird_size = n;
int shared_mem_size = block_size * sizeof(T);
ElemwiseGradBroadcast2CUDAKernel<<<gird_size, block_size, shared_mem_size,
stream>>>(x, y, out, dout, pre, n, post,
dx_op, dy_op, dx, dy);
ElemwiseGradBroadcast2CUDAKernel<<<gird_size, block_size, 0, stream>>>(
x, y, out, dout, pre, n, post, dx_op, dy_op, dx, dy);
}
#endif
......@@ -516,14 +471,10 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
auto x_dim = x.dims();
auto y_dim = y.dims();
if (y_dim.size() == 1 && y_dim[0] == 1) {
// y is a scalar
auto extended_dims = framework::vectorize(x_dim);
extended_dims.push_back(1);
x_dim = framework::make_ddim(extended_dims);
}
axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
trim_trailing_singular_dims(y_dim);
axis = (y_dim.size() == 0) ? x_dim.size() : axis;
int pre, n, post;
get_mid_dims(x_dim, y_dim, axis, pre, n, post);
if (post == 1) {
......@@ -591,14 +542,9 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
return;
}
if (y_dims.size() == 1 && y_dims[0] == 1) {
// y is a scalar
auto extended_dims = framework::vectorize(x_dims);
extended_dims.push_back(1);
x_dims = framework::make_ddim(extended_dims);
}
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
trim_trailing_singular_dims(y_dims);
axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post;
get_mid_dims(x_dims, y_dims, axis, pre, n, post);
......@@ -633,16 +579,11 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
return;
}
if (y_dims.size() == 1 && y_dims[0] == 1) {
// y is a scalar
auto extended_dims = framework::vectorize(x_dims);
extended_dims.push_back(1);
x_dims = framework::make_ddim(extended_dims);
}
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)");
trim_trailing_singular_dims(y_dims);
axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post;
get_mid_dims(x_dims, y_dims, axis, pre, n, post);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thread>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using StepScopeVar = std::vector<framework::Scope *>;
static constexpr char kBlock[] = "sub_block";
static constexpr char kX[] = "X";
class GoOp : public framework::OperatorBase {
public:
GoOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
private:
void ExecuteOnThread(framework::Executor *executor,
framework::BlockDesc *block,
framework::Scope *scope) const {
framework::ProgramDesc *program = block->Program();
executor->Run(*program, scope, block->ID(), false /*create_local_scope*/);
}
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
/*
* Determine the global scope. Create a new child scope.
* Within the child scope, add all the local variables relevant
* to that scope.
*
* Now go through all the inputs to the op to ensure that
* all of them are in the newly created scope. This is important
* to ensure that they don't get destroyed when the parent scope
* is deleted.
* */
// TODO(varunarora): Consider moving this root scope lookup to scope.h.
const framework::Scope *root_scope = &scope;
const framework::Scope *parent_scope = &(root_scope->parent());
while (parent_scope != nullptr) {
root_scope = parent_scope;
parent_scope = &(parent_scope->parent());
}
framework::BlockDesc *block = Attr<framework::BlockDesc *>(kBlock);
framework::Executor executor(dev_place);
framework::Scope &new_scope = root_scope->NewScope();
for (auto &var : block->AllVars()) {
new_scope.Var(var->Name());
}
auto &inputs = Inputs(kX);
for (size_t i = 0; i < inputs.size(); i++) {
PADDLE_ENFORCE_NOT_NULL(new_scope.FindVar(inputs.at(i)),
"All variables used in the go block "
"should be created in the global scope");
}
// Now execute the go op with the newly created scope.
std::thread go_thread([dev_place, block, &new_scope, this]() {
framework::Executor executor(dev_place);
ExecuteOnThread(&executor, block, &new_scope);
});
go_thread.detach();
}
};
class GoOpMaker : public framework::OpProtoAndCheckerMaker {
public:
GoOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(kX,
"A set of variables, which are required by operators inside the "
"block of Go Op.")
.AsDuplicable();
AddAttr<framework::BlockDesc *>(kBlock, "The block inside GoOp");
AddComment(R"DOC(
)DOC");
}
};
// TODO(thuan): Look into Gradient Operator for GO_OP
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(go, paddle::operators::GoOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::GoOpMaker);
......@@ -247,7 +247,7 @@ class MineHardExamplesOp : public framework::OperatorWithKernel {
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("ClsLoss")->type()),
ctx.device_context());
platform::CPUPlace());
}
};
......
......@@ -62,7 +62,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>("Scores")->type()),
ctx.device_context());
platform::CPUPlace());
}
};
......
......@@ -67,6 +67,14 @@ class PriorBoxOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
platform::CPUPlace());
}
};
class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
......
......@@ -59,6 +59,16 @@ class SplitSelectedRowsOp : public framework::OperatorWithKernel {
}
};
class SplitSelectedRowsOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
for (auto &out_var : op_desc.Output("Out")) {
block->Var(out_var)->SetType(framework::proto::VarType::SELECTED_ROWS);
}
}
};
class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
......@@ -80,7 +90,8 @@ class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators;
REGISTER_OPERATOR(split_selected_rows, ops::SplitSelectedRowsOp,
ops::SplitSelectedRowsOpMaker,
ops::SplitSelectedRowsGradMaker);
ops::SplitSelectedRowsGradMaker,
ops::SplitSelectedRowsOpInferVarType);
REGISTER_OP_CPU_KERNEL(
split_selected_rows,
ops::SplitSelectedRowsOpKernel<paddle::platform::CPUPlace, float>);
proto_library(profiler_proto SRCS profiler.proto)
py_proto_compile(profiler_py_proto SRCS profiler.proto)
add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(profiler_py_proto profiler_py_proto_init)
add_custom_command(TARGET profiler_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler
COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler
COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if(WITH_GPU)
cc_library(enforce SRCS enforce.cc DEPS)
......
......@@ -62,5 +62,53 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
}
#endif
// __shfl_down has been deprecated as of CUDA 9.0.
#if CUDA_VERSION < 9000
template <typename T>
__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
return __shfl_down(val, delta);
}
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
template <typename T>
__device__ T reduceSum(T val, int tid, int len) {
// TODO(zcd): The warp size should be taken from the
// parameters of the GPU but not specified as 32 simply.
// To make the reduceSum more efficiently,
// I use Warp-Level Parallelism and assume the Warp size
// is 32 which may be different for different GPU,
// but most card's warp size is 32.
__shared__ T shm[32];
const int warpSize = 32;
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, tid < len);
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down_sync(mask, val, offset);
if (tid < warpSize) shm[tid] = 0;
__syncthreads();
if (tid % warpSize == 0) {
shm[tid / warpSize] = val;
}
CREATE_SHFL_MASK(mask, tid < warpSize);
if (tid < warpSize) {
val = shm[tid];
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down_sync(mask, val, offset);
}
return val;
}
} // namespace platform
} // namespace paddle
......@@ -13,8 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h"
#include <google/protobuf/text_format.h>
#include <fstream>
#include <map>
#include <mutex>
#include <numeric>
#include <thread>
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/string/printf.h"
......@@ -51,6 +55,36 @@ uint64_t kAlignSize = 8;
} \
} while (0)
std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
switch (kind) {
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
return "MEMCPY_HtoD";
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
return "MEMCPY_DtoH";
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
return "MEMCPY_HtoA";
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
return "MEMCPY_AtoH";
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
return "MEMCPY_AtoA";
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
return "MEMCPY_AtoD";
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
return "MEMCPY_DtoA";
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
return "MEMCPY_DtoD";
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
return "MEMCPY_HtoH";
case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
return "MEMCPY_PtoP";
case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
return "MEMCPY_FORCE_INT";
default:
break;
}
return "MEMCPY";
}
void EnableActivity() {
// Device activity record is created when CUDA initializes, so we
// want to enable it before cuInit() or any CUDA runtime call.
......@@ -107,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
kernel->correlationId);
break;
}
case CUPTI_ACTIVITY_KIND_MEMCPY: {
auto *memcpy =
reinterpret_cast<const CUpti_ActivityMemcpy *>(record);
tracer->AddMemRecords(
MemcpyKind(
static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
memcpy->correlationId, memcpy->bytes);
break;
}
case CUPTI_ACTIVITY_KIND_MEMCPY2: {
auto *memcpy =
reinterpret_cast<const CUpti_ActivityMemcpy2 *>(record);
tracer->AddMemRecords(
MemcpyKind(
static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
memcpy->correlationId, memcpy->bytes);
break;
}
default: { break; }
}
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
......@@ -137,6 +191,20 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_[id] = anno;
}
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.push_back(
CPURecord{anno, start_ns, end_ns,
std::hash<std::thread::id>{}(std::this_thread::get_id())});
}
void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
uint32_t correlation_id, uint64_t bytes) {
mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
stream_id, correlation_id, bytes});
}
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {
std::lock_guard<std::mutex> l(trace_mu_);
......@@ -172,17 +240,15 @@ class DeviceTracerImpl : public DeviceTracer {
CUPTI_CALL(
dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
enabled_ = true;
}
proto::Profile GenProfile() {
proto::Profile GenProfile(const std::string &profile_path) {
std::lock_guard<std::mutex> l(trace_mu_);
proto::Profile profile_pb;
profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_);
std::map<std::string, std::vector<uint64_t>> event_times;
for (const KernelRecord &r : kernel_records_) {
if (correlations_.find(r.correlation_id) == correlations_.end()) {
fprintf(stderr, "cannot relate a kernel activity\n");
......@@ -194,15 +260,31 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id);
event->set_device_id(r.device_id);
event_times[event->name()].push_back(r.end_ns - r.start_ns);
}
for (const auto &et : event_times) {
fprintf(
stderr, "%s: total: %fms invoked cuda kernels: %lu\n",
et.first.c_str(),
std::accumulate(et.second.begin(), et.second.end(), 0) / 1000000.0,
et.second.size());
for (const CPURecord &r : cpu_records_) {
auto *event = profile_pb.add_events();
event->set_name(r.name);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_stream_id(r.thread_id);
event->set_device_id(-1);
}
for (const MemRecord &r : mem_records_) {
auto *event = profile_pb.add_events();
event->set_name(r.name);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id);
event->set_device_id(r.device_id);
event->mutable_memcopy()->set_bytes(r.bytes);
}
std::string profile_str;
google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
std::ofstream profile_f;
profile_f.open(profile_path, std::ios::out | std::ios::trunc);
profile_f << profile_str;
profile_f.close();
return profile_pb;
}
......@@ -240,6 +322,8 @@ class DeviceTracerImpl : public DeviceTracer {
uint64_t start_ns_;
uint64_t end_ns_;
std::vector<KernelRecord> kernel_records_;
std::vector<MemRecord> mem_records_;
std::vector<CPURecord> cpu_records_;
std::unordered_map<uint32_t, std::string> correlations_;
CUpti_SubscriberHandle subscriber_;
};
......@@ -252,6 +336,12 @@ class DeviceTracerDummy : public DeviceTracer {
void AddAnnotation(uint64_t id, const std::string &anno) {}
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}
void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
uint32_t correlation_id, uint64_t bytes) {}
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {}
......@@ -259,7 +349,9 @@ class DeviceTracerDummy : public DeviceTracer {
void Enable() {}
proto::Profile GenProfile() { return proto::Profile(); }
proto::Profile GenProfile(const std::string &profile_path) {
return proto::Profile();
}
void Disable() {}
};
......@@ -281,5 +373,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
void ClearCurAnnotation() { cur_annotation = nullptr; }
const char *CurAnnotation() { return cur_annotation; }
} // namespace platform
} // namespace paddle
......@@ -36,6 +36,21 @@ class DeviceTracer {
uint32_t stream_id;
uint32_t correlation_id;
};
struct CPURecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
uint64_t thread_id;
};
struct MemRecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
uint32_t device_id;
uint32_t stream_id;
uint32_t correlation_id;
uint64_t bytes;
};
virtual ~DeviceTracer() {}
// Needs to be called once before use.
......@@ -48,6 +63,14 @@ class DeviceTracer {
// human-readable annotations.
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id,
uint64_t bytes) = 0;
virtual void AddCPURecords(const char* anno, uint64_t start_ns,
uint64_t end_ns) = 0;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability.
virtual void AddKernelRecords(uint64_t start, uint64_t end,
......@@ -55,7 +78,7 @@ class DeviceTracer {
uint32_t correlation_id) = 0;
// Generate a proto after done (Disabled).
virtual proto::Profile GenProfile() = 0;
virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
virtual bool IsEnabled() = 0;
};
......@@ -67,6 +90,7 @@ DeviceTracer* GetDeviceTracer();
void SetCurAnnotation(const char* anno);
// Clear the name after the operation is done.
void ClearCurAnnotation();
// Current name of the operation being run in the thread.
const char* CurAnnotation();
} // namespace platform
} // namespace paddle
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND)
......
......@@ -74,7 +74,8 @@ extern void *cupti_dso_handle;
__macro(cuptiFinalize); \
__macro(cuptiSubscribe); \
__macro(cuptiUnsubscribe); \
__macro(cuptiEnableCallback);
__macro(cuptiEnableCallback); \
__macro(cuptiEnableDomain);
CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define CUPTI_LIB_PATH "@CUPTI_LIBRARY_PATH@"
......@@ -19,6 +19,7 @@ limitations under the License. */
#include <string>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/platform/dynload/cupti_lib_path.h"
#include "paddle/fluid/platform/enforce.h"
DEFINE_string(cudnn_dir, "",
......@@ -45,8 +46,7 @@ DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
namespace paddle {
namespace platform {
namespace dynload {
static const char* cupti_lib_path = CUPTI_LIB_PATH;
static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;
static inline std::string join(const std::string& part1,
const std::string& part2) {
......
......@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <sys/time.h>
#include <time.h>
#include <iomanip>
#include <map>
#ifdef PADDLE_WITH_CUDA
......@@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() {
.count();
}
inline uint64_t PosixInNsec() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}
Event::Event(EventKind kind, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx)
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
......@@ -132,8 +140,8 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
}
RecordEvent::RecordEvent(const std::string& name,
const DeviceContext* dev_ctx) {
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
: start_ns_(PosixInNsec()) {
if (g_state == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx;
name_ = name;
......@@ -144,6 +152,10 @@ RecordEvent::RecordEvent(const std::string& name,
RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled) return;
DeviceTracer* tracer = GetDeviceTracer();
if (tracer) {
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec());
}
ClearCurAnnotation();
PopEvent(name_, dev_ctx_);
}
......@@ -199,22 +211,22 @@ std::vector<std::vector<Event>> GetAllEvents() {
return result;
}
void DisableProfiler(EventSortingKey sorted_key) {
void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path) {
PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
"Can't disable profiling, since it's not starting.");
// Mark the profiling stop.
Mark("_stop_profiler_", nullptr);
g_state = ProfilerState::kDisabled;
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key);
ResetProfiler();
DeviceTracer* tracer = GetDeviceTracer();
if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
tracer->Disable();
tracer->GenProfile();
tracer->GenProfile(profile_path);
}
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key);
ResetProfiler();
}
void ParseEvents(std::vector<std::vector<Event>>& events,
......
......@@ -108,6 +108,7 @@ struct RecordEvent {
~RecordEvent();
uint64_t start_ns_;
// The device context is used by Event to get the current cuda stream.
const DeviceContext* dev_ctx_;
// Event name
......@@ -140,7 +141,8 @@ void EnableProfiler(ProfilerState state);
// Clear the g_all_event_lists, which is total event lists of all threads.
void ResetProfiler();
void DisableProfiler(EventSortingKey sorted_key);
void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path);
// Parse the event list and output the profiling report
void ParseEvents(std::vector<std::vector<Event>>&,
......
......@@ -15,12 +15,17 @@ limitations under the License. */
syntax = "proto2";
package paddle.platform.proto;
message MemCopy { optional uint64 bytes = 3; }
message Event {
optional string name = 1;
optional uint64 start_ns = 2;
optional uint64 end_ns = 3;
optional uint32 device_id = 5;
// When positive, it represents gpu id. When -1, it represents CPU.
optional int64 device_id = 5;
optional uint32 stream_id = 6;
optional MemCopy memcopy = 7;
}
message Profile {
......
......@@ -125,5 +125,5 @@ TEST(RecordEvent, RecordEvent) {
EXPECT_EQ(start_profiler_count, 1);
// Will remove parsing-related code from test later
DisableProfiler(EventSortingKey::kTotal);
DisableProfiler(EventSortingKey::kTotal, "/tmp/profiler");
}
......@@ -31,7 +31,6 @@ ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh \
BUILD_ROOT=/paddle/build_android
DEST_ROOT=/paddle/install_android
rm -rf $BUILD_ROOT 2>/dev/null || true
mkdir -p $BUILD_ROOT
cd $BUILD_ROOT
......
#!/bin/bash
set -e
ANDROID_STANDALONE_TOOLCHAIN=$HOME/android-toolchain-gcc
TMP_DIR=$HOME/$JOB/tmp
mkdir -p $TMP_DIR
cd $TMP_DIR
wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
unzip -q android-ndk-r14b-linux-x86_64.zip
chmod +x $TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh
$TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh --force --arch=arm --platform=android-21 --install-dir=$ANDROID_STANDALONE_TOOLCHAIN
cd $HOME
rm -rf $TMP_DIR
# Create the build directory for CMake.
mkdir -p $TRAVIS_BUILD_DIR/build_android
cd $TRAVIS_BUILD_DIR/build_android
# Compile paddle binaries
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=armeabi-v7a \
-DANDROID_ARM_NEON=ON \
-DANDROID_ARM_MODE=ON \
-DUSE_EIGEN_FOR_BLAS=ON \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \
..
make -j `nproc`
......@@ -57,7 +57,7 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model ${MKL_DEPENDS})
if(WITH_SWIG_PY)
......
......@@ -11,27 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
class TestCSPFramework(unittest.TestCase):
def daisy_chain(self):
n = 10000
leftmost = fluid.make_channel(dtype=int)
right = leftmost
left = leftmost
with fluid.While(steps=n):
right = fluid.make_channel(dtype=int)
with fluid.go():
fluid.send(left, 1 + fluid.recv(right))
left = right
with fluid.go():
fluid.send(right, 1)
fluid.Print(fluid.recv(leftmost))
if __name__ == '__main__':
unittest.main()
......@@ -220,7 +220,10 @@ def _callback_lookup_(op):
:return: callback function
"""
if op.type == 'parallel_do' and op.attr('use_nccl'):
all_vars = op.block.vars
param_names = set(op.input('parameters'))
param_names = filter(lambda name: all_vars[name].stop_gradient is False,
param_names)
param_grad_names = [n + "@GRAD" for n in param_names]
class ParallelDoCallBack(object):
......
......@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: Variables: make_channel
# TODO: Operators: send, close_channel, recv, go, select
from layers.control_flow import BlockGuard
from layer_helper import LayerHelper
from layer_helper import LayerHelper, unique_name
from layers import fill_constant
import core
__all__ = [
'Go',
......@@ -46,41 +46,200 @@ class Go(BlockGuard):
parent_block = main_program.block(main_program.current_block()
.parent_idx)
inner_outputs = set()
x_name_list = set()
out_vars = set()
for op in go_block.ops:
# Iterate over all operators, get all the inputs
# and add as input to the Go operator.
for iname in op.input_names:
for in_var_name in op.input(iname):
if in_var_name not in inner_outputs:
x_name_list.add(in_var_name)
for oname in op.output_names:
for out_var_name in op.output(oname):
inner_outputs.add(out_var_name)
# Iterate over all operators , get all the outputs
# add to the output list of Go operator only if
# they exist in the parent block.
for oname in op.output_names:
for out_var_name in op.output(oname):
if out_var_name in parent_block.vars:
out_vars.add(parent_block.var(out_var_name))
out_vars = []
for inner_out_name in inner_outputs:
if inner_out_name in parent_block.vars:
out_vars.append(parent_block.var(inner_out_name))
parent_block.append_op(
type='go',
inputs={'X': [parent_block.var(x_name) for x_name in x_name_list]},
outputs={'Out': out_vars},
inputs={
'X':
[parent_block.var_recursive(x_name) for x_name in x_name_list]
},
outputs={},
attrs={'sub_block': go_block})
def make_channel(dtype, size=0):
return True
def make_channel(dtype, capacity=0):
"""
Helps implementation of a concurrent program by creating a "channel" of
a defined data type. Channels allow for the passing of data in
concurrent scenarios - such as when using threads to divide computation.
Channels can be used to "send" and "receive" such data concurrently.
There are two kinds of channels: unbuffered and buffered. Unbuffered
channels have no capacity - and thus, block on send and only unblock only
once what they have sent has been received.
def channel_send(channel, value):
return True
On the other hand, buffered channels are initialized with a capacity -
and do not block on sends.
Use this method in combination with `channel_send`, `channel_recv`,
`channel_close`, and `Go` to design a concurrent Paddle program.
Args:
dtype (ParamAttr|string): Data type of the data sent in the channel.
This data type should be the string name of a numpy data type.
capacity (ParamAttr|int): Size of the channel. Defaults to 0 for
to create an unbuffered channel.
Returns:
Variable: The channel variable that can be used to send an receive data
of the defined dtype.
def channel_recv(channel):
return True
Examples:
.. code-block:: python
ch = fluid.make_channel(dtype='int32', capacity=10)
...
# Code to execute in a Go block, which receives the channel data.
fluid.channel_send(ch, 100)
fluid.channel_close(ch)
"""
helper = LayerHelper('channel_create', **locals())
main_program = helper.main_program
make_channel_block = main_program.current_block()
# Make a channel variable (using the channel data type) and make sure it
# persists into the global scope.
channel = helper.create_variable(
name=unique_name.generate('channel'),
type=core.VarDesc.VarType.CHANNEL,
persistable=True)
create_channel_op = make_channel_block.append_op(
type="channel_create",
outputs={"Out": channel},
attrs={"data_type": dtype,
"capacity": capacity})
return channel
def channel_send(channel, value):
"""
Sends a value through a channel variable. Used by an unbuffered or buffered
channel to pass data from within or to a concurrent Go block, where
`channel_recv` to used to get the passed value.
Args:
channel (Variable|Channel): Channel variable created using
`make_channel`.
value (Variable): Value to send to channel
Returns:
Variable: The boolean status on whether or not the channel
successfully sent the passed value.
Examples:
.. code-block:: python
ch = fluid.make_channel(dtype='int32', capacity=10)
...
# Code to execute in a Go block, which receives the channel data.
fluid.channel_send(ch, 100)
"""
helper = LayerHelper('channel_send', **locals())
main_program = helper.main_program
channel_send_block = main_program.current_block()
status = helper.create_variable(
name=unique_name.generate('status'),
type=core.VarDesc.VarType.LOD_TENSOR,
dtype=core.VarDesc.VarType.BOOL)
channel_send_op = channel_send_block.append_op(
type="channel_send",
inputs={
"Channel": channel,
"X": value,
},
outputs={"Status": status})
return status
def channel_recv(channel, return_value):
"""
Receives a value through a channel variable. Used by an unbuffered or
buffered channel within a concurrent Go block to get data from originally
sent using `channel_send`, or from outside such a block where
`channel_send` is used to send the value.
Args:
channel (Variable|Channel): Channel variable created using
`make_channel`.
return_value (Variable): Variable to set as a result of running channel_recv_op
Returns:
Variable: The received value from the channel.
Variable: The boolean status on whether or not the channel
successfully received the passed value.
Examples:
.. code-block:: python
ch = fluid.make_channel(dtype='int32', capacity=10)
with fluid.Go():
returned_value = fluid.channel_recv(ch, 'int32')
# Code to send data through the channel.
"""
helper = LayerHelper('channel_recv', **locals())
main_program = helper.main_program
channel_recv_block = main_program.current_block()
status = helper.create_variable(
name=unique_name.generate('status'),
type=core.VarDesc.VarType.LOD_TENSOR,
dtype=core.VarDesc.VarType.BOOL)
channel_recv_op = channel_recv_block.append_op(
type="channel_recv",
inputs={"Channel": channel},
outputs={"Out": return_value,
"Status": status})
return return_value, status
def channel_close(channel):
return True
"""
Closes a channel created using `make_channel`.
Args:
channel (Variable|Channel): Channel variable created using
`make_channel`.
Examples:
.. code-block:: python
ch = fluid.make_channel(dtype='int32', capacity=10)
...
# Code to receive and send data through a channel
...
fluid.channel_close(ch)
"""
helper = LayerHelper('channel_close', **locals())
main_program = helper.main_program
channel_close_block = main_program.current_block()
channel_close_op = channel_close_block.append_op(
type="channel_close", inputs={"Channel": channel})
......@@ -276,17 +276,17 @@ class DistributeTranspiler:
pserver_program.global_block().create_var(
name=orig_var_name,
persistable=True,
type=v.type,
dtype=v.dtype,
shape=v.shape)
print("create origin var: ", orig_var_name)
for trainer_id in xrange(self.trainers):
var = pserver_program.global_block().create_var(
name="%s.trainer_%d" % (orig_var_name, trainer_id),
persistable=False,
type=v.type,
dtype=v.dtype,
shape=v.shape)
recv_inputs.append(var)
print("create per trainer var: ", var.name)
# step3
optimize_block = pserver_program.create_block(0)
# step 4
......@@ -551,6 +551,7 @@ class DistributeTranspiler:
type="sum",
inputs={"X": vars2merge},
outputs={"Out": merged_var})
if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
optimize_block.append_op(
type="scale",
inputs={"X": merged_var},
......
......@@ -22,6 +22,7 @@ from layer_helper import LayerHelper
__all__ = [
'Accuracy',
'ChunkEvaluator',
'EditDistance',
]
......@@ -173,7 +174,7 @@ class ChunkEvaluator(Evaluator):
class EditDistance(Evaluator):
"""
Accumulate edit distance sum and sequence number from mini-batches and
compute the average edit_distance of all batches.
compute the average edit_distance and instance error of all batches.
Args:
input: the sequences predicted by network.
......@@ -189,14 +190,12 @@ class EditDistance(Evaluator):
for epoch in PASS_NUM:
distance_evaluator.reset(exe)
for data in batches:
loss, sum_distance = exe.run(fetch_list=[cost] + distance_evaluator.metrics)
avg_distance = distance_evaluator.eval(exe)
pass_distance = distance_evaluator.eval(exe)
loss = exe.run(fetch_list=[cost])
distance, instance_error = distance_evaluator.eval(exe)
In the above example:
'sum_distance' is the sum of the batch's edit distance.
'avg_distance' is the average of edit distance from the firt batch to the current batch.
'pass_distance' is the average of edit distance from all the pass.
'distance' is the average of the edit distance in a pass.
'instance_error' is the instance error rate in a pass.
"""
......@@ -206,25 +205,45 @@ class EditDistance(Evaluator):
if main_program.current_block().idx != 0:
raise ValueError("You can only invoke Evaluator in root block")
self.total_error = self.create_state(
dtype='float32', shape=[1], suffix='total_error')
self.total_distance = self.create_state(
dtype='float32', shape=[1], suffix='total_distance')
self.seq_num = self.create_state(
dtype='int64', shape=[1], suffix='seq_num')
error, seq_num = layers.edit_distance(
self.instance_error = self.create_state(
dtype='int64', shape=[1], suffix='instance_error')
distances, seq_num = layers.edit_distance(
input=input, label=label, ignored_tokens=ignored_tokens)
#error = layers.cast(x=error, dtype='float32')
sum_error = layers.reduce_sum(error)
layers.sums(input=[self.total_error, sum_error], out=self.total_error)
zero = layers.fill_constant(shape=[1], value=0.0, dtype='float32')
compare_result = layers.equal(distances, zero)
compare_result_int = layers.cast(x=compare_result, dtype='int')
seq_right_count = layers.reduce_sum(compare_result_int)
instance_error_count = layers.elementwise_sub(
x=seq_num, y=seq_right_count)
total_distance = layers.reduce_sum(distances)
layers.sums(
input=[self.total_distance, total_distance],
out=self.total_distance)
layers.sums(input=[self.seq_num, seq_num], out=self.seq_num)
self.metrics.append(sum_error)
layers.sums(
input=[self.instance_error, instance_error_count],
out=self.instance_error)
self.metrics.append(total_distance)
self.metrics.append(instance_error_count)
def eval(self, executor, eval_program=None):
if eval_program is None:
eval_program = Program()
block = eval_program.current_block()
with program_guard(main_program=eval_program):
total_error = _clone_var_(block, self.total_error)
total_distance = _clone_var_(block, self.total_distance)
seq_num = _clone_var_(block, self.seq_num)
instance_error = _clone_var_(block, self.instance_error)
seq_num = layers.cast(x=seq_num, dtype='float32')
out = layers.elementwise_div(x=total_error, y=seq_num)
return np.array(executor.run(eval_program, fetch_list=[out])[0])
instance_error = layers.cast(x=instance_error, dtype='float32')
avg_distance = layers.elementwise_div(x=total_distance, y=seq_num)
avg_instance_error = layers.elementwise_div(
x=instance_error, y=seq_num)
result = executor.run(
eval_program, fetch_list=[avg_distance, avg_instance_error])
return np.array(result[0]), np.array(result[1])
......@@ -14,7 +14,7 @@
import numpy as np
import contextlib
from framework import Program, default_main_program
from framework import Program, default_main_program, Variable
from . import core
__all__ = [
......@@ -281,6 +281,8 @@ class Executor(object):
if not has_fetch_operators(global_block, fetch_list, fetch_var_name):
for i, var in enumerate(fetch_list):
assert isinstance(var, Variable) or isinstance(var, str), (
"Wrong type for fetch_list[%s]: %s" % (i, type(var)))
global_block.append_op(
type='fetch',
inputs={'X': [var]},
......
......@@ -141,6 +141,8 @@ class Variable(object):
dtype(np.dtype|core.VarDesc.VarType|str): The data type of variable.
lod_level(int): The level of lod tensor. 0 means it is not a time
series data.
capacity(int): The capacity of Channel variable. Ignored
for other types.
persistable(bool): True if the variable should be saved as check point.
Defaults to False.
stop_gradient(bool): True if the variable will stop to calculate
......@@ -154,6 +156,7 @@ class Variable(object):
shape=None,
dtype=None,
lod_level=None,
capacity=None,
persistable=None,
error_clip=None,
stop_gradient=False,
......@@ -224,6 +227,14 @@ class Variable(object):
"persistable is {2}. They are not matched".format(
self.name, self.persistable, persistable))
if capacity is not None:
if is_new_var:
self.desc.set_capacity(capacity)
else:
# TODO(abhinavarora) : Compare with set capacity once,
# get_capacity is implemented
pass
self.block.vars[name] = self
self.op = None
self.stop_gradient = stop_gradient
......@@ -472,10 +483,11 @@ class Operator(object):
self.desc.check_attrs()
no_kernel_op_set = {
'feed', 'fetch', 'save', 'load', 'recurrent',
'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
'load_combine', 'ncclInit'
'load_combine', 'ncclInit', 'channel_create', 'channel_close',
'channel_send', 'channel_recv'
}
if type not in no_kernel_op_set:
self.desc.infer_var_type(self.block.desc)
......@@ -761,7 +773,7 @@ class Block(object):
stop_gradient = v.stop_gradient
else:
raise ValueError("unsupported var type: %s", type(v))
orig_var_type = v.type
self.desc.rename_var(name, new_name)
# NOTE: v is destroyed by C++ after calling rename_var.
d = self.desc.find_var(new_name)
......@@ -770,6 +782,7 @@ class Block(object):
self,
d.shape(),
d.dtype(),
type=orig_var_type,
name=new_name,
stop_gradient=stop_gradient,
trainable=trainable,
......@@ -780,7 +793,7 @@ class Block(object):
elif var_type == "Variable":
var = Variable(
self,
type=v.type,
type=orig_var_type,
name=new_name,
error_clip=error_clip,
stop_gradient=stop_gradient)
......
......@@ -330,9 +330,28 @@ class LayerHelper(object):
return self.main_program.current_block().create_var(*args, **kwargs)
def create_global_variable(self, persistable=False, *args, **kwargs):
"""
create global variable, note that there is no initializer for this global variable.
Args:
persistable(bool): True if it is a checkpoint value.
*args: See create_var's documentation
**kwargs: See create_var's documentation
Returns(Variable): the created variable.
"""
return self.main_program.global_block().create_var(
*args, persistable=persistable, **kwargs)
def create_or_get_global_variable(self, name, *args, **kwargs):
"""
Creates a global variable if not exists and returns the variable and
a boolean flag which is true when it is a new variable.
"""
if self.main_program.global_block().has_var(name):
return self.main_program.global_block().var(name), False
else:
return self.create_global_variable(name=name, *args, **kwargs), True
def set_variable_initializer(self, var, initializer):
assert isinstance(var, Variable)
self.startup_program.global_block().create_var(
......
......@@ -43,8 +43,8 @@ for _OP in set(__auto__):
globals()[_OP] = generate_layer_fn(_OP)
def detection_output(scores,
loc,
def detection_output(loc,
scores,
prior_box,
prior_box_var,
background_label=0,
......@@ -54,21 +54,27 @@ def detection_output(scores,
score_threshold=0.01,
nms_eta=1.0):
"""
**Detection Output Layer**
**Detection Output Layer for Single Shot Multibox Detector (SSD).**
This layer applies the NMS to the output of network and computes the
predict bounding box location. The output's shape of this layer could
be zero if there is no valid bounding box.
This operation is to get the detection results by performing following
two steps:
1. Decode input bounding box predictions according to the prior boxes.
2. Get the final detection results by applying multi-class non maximum
suppression (NMS).
Please note, this operation doesn't clip the final output bounding boxes
to the image window.
Args:
scores(Variable): A 3-D Tensor with shape [N, C, M] represents the
predicted confidence predictions. N is the batch size, C is the
class number, M is number of bounding boxes. For each category
there are total M scores which corresponding M bounding boxes.
loc(Variable): A 3-D Tensor with shape [N, M, 4] represents the
predicted locations of M bounding bboxes. N is the batch size,
and each bounding box has four coordinate values and the layout
is [xmin, ymin, xmax, ymax].
scores(Variable): A 3-D Tensor with shape [N, M, C] represents the
predicted confidence predictions. N is the batch size, C is the
class number, M is number of bounding boxes. For each category
there are total M scores which corresponding M bounding boxes.
prior_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes,
each box is represented as [xmin, ymin, xmax, ymax],
[xmin, ymin] is the left top coordinate of the anchor box,
......@@ -91,7 +97,15 @@ def detection_output(scores,
nms_eta(float): The parameter for adaptive NMS.
Returns:
The detected bounding boxes which are a Tensor.
Variable: The detection outputs is a LoDTensor with shape [No, 6].
Each row has six values: [label, confidence, xmin, ymin, xmax, ymax].
`No` is the total number of detections in this mini-batch. For each
instance, the offsets in first dimension are called LoD, the offset
number is N + 1, N is the batch size. The i-th image has
`LoD[i + 1] - LoD[i]` detected results, if it is 0, the i-th image
has no detected results. If all images have not detected results,
all the elements in LoD are 0, and output tensor only contains one
value, which is -1.
Examples:
.. code-block:: python
......@@ -100,7 +114,7 @@ def detection_output(scores,
append_batch_size=False, dtype='float32')
pbv = layers.data(name='prior_box_var', shape=[10, 4],
append_batch_size=False, dtype='float32')
loc = layers.data(name='target_box', shape=[21, 4],
loc = layers.data(name='target_box', shape=[2, 21, 4],
append_batch_size=False, dtype='float32')
scores = layers.data(name='scores', shape=[2, 21, 10],
append_batch_size=False, dtype='float32')
......@@ -109,7 +123,6 @@ def detection_output(scores,
prior_box=pb,
prior_box_var=pbv)
"""
helper = LayerHelper("detection_output", **locals())
decoded_box = box_coder(
prior_box=prior_box,
......@@ -118,6 +131,7 @@ def detection_output(scores,
code_type='decode_center_size')
nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
scores = nn.transpose(scores, perm=[0, 2, 1])
helper.append_op(
type="multiclass_nms",
inputs={'Scores': scores,
......@@ -595,12 +609,13 @@ def multi_box_head(inputs,
name(str): Name of the prior box layer. Default: None.
Returns:
mbox_loc(list): The predicted boxes' location of the inputs.
The layout of each element is [N, H, W, Priors]. Priors
is the number of predicted boxof each position of each input.
mbox_conf(list): The predicted boxes' confidence of the inputs.
The layout of each element is [N, H, W, Priors]. Priors
is the number of predicted box of each position of each input.
mbox_loc(Variable): The predicted boxes' location of the inputs.
The layout is [N, H*W*Priors, 4]. where Priors
is the number of predicted boxes each position of each input.
mbox_conf(Variable): The predicted boxes' confidence of the inputs.
The layout is [N, H*W*Priors, C]. where Priors
is the number of predicted boxes each position of each input
and C is the number of Classes.
boxes(Variable): the output prior boxes of PriorBox.
The layout is [num_priors, 4]. num_priors is the total
box count of each position of inputs.
......@@ -751,7 +766,7 @@ def multi_box_head(inputs,
num_boxes = box.shape[2]
# get box_loc
num_loc_output = num_boxes * num_classes * 4
num_loc_output = num_boxes * 4
mbox_loc = nn.conv2d(
input=input,
num_filters=num_loc_output,
......@@ -760,7 +775,12 @@ def multi_box_head(inputs,
stride=stride)
mbox_loc = nn.transpose(mbox_loc, perm=[0, 2, 3, 1])
mbox_locs.append(mbox_loc)
new_shape = [
mbox_loc.shape[0],
mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3] / 4, 4
]
mbox_loc_flatten = ops.reshape(mbox_loc, shape=new_shape)
mbox_locs.append(mbox_loc_flatten)
# get conf_loc
num_conf_output = num_boxes * num_classes
......@@ -771,11 +791,18 @@ def multi_box_head(inputs,
padding=pad,
stride=stride)
conf_loc = nn.transpose(conf_loc, perm=[0, 2, 3, 1])
mbox_confs.append(conf_loc)
new_shape = [
conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] *
conf_loc.shape[3] / num_classes, num_classes
]
conf_loc_flatten = ops.reshape(conf_loc, shape=new_shape)
mbox_confs.append(conf_loc_flatten)
if len(box_results) == 1:
box = box_results[0]
var = var_results[0]
mbox_locs_concat = mbox_locs[0]
mbox_confs_concat = mbox_confs[0]
else:
reshaped_boxes = []
reshaped_vars = []
......@@ -785,5 +812,7 @@ def multi_box_head(inputs,
box = tensor.concat(reshaped_boxes)
var = tensor.concat(reshaped_vars)
mbox_locs_concat = tensor.concat(mbox_locs, axis=1)
mbox_confs_concat = tensor.concat(mbox_confs, axis=1)
return mbox_locs, mbox_confs, box, var
return mbox_locs_concat, mbox_confs_concat, box, var
......@@ -53,12 +53,22 @@ def monkey_patch_variable():
value = float(value)
tmp_name = unique_tmp_name()
var = ref_var.block.create_var(name=tmp_name, dtype=dtype)
batch_dim = -1
for i, d in enumerate(ref_var.shape):
if d < 0:
batch_dim = i
break
assert batch_dim != -1
ref_var.block.append_op(
type='fill_constant_batch_size_like',
outputs={'Out': [var]},
inputs={'Input': [ref_var]},
attrs={'shape': ref_var.shape,
'value': value})
attrs={
'shape': ref_var.shape,
'value': value,
'input_dim_idx': batch_dim,
'output_dim_idx': batch_dim
})
return var
def astype(self, dtype):
......@@ -118,11 +128,20 @@ def monkey_patch_variable():
tmp_name = unique_tmp_name()
out = self.block.create_var(name=tmp_name, dtype=lhs_dtype)
axis = -1
if other_var.shape[0] == -1:
axis = 0
assert len(self.shape) >= len(other_var.shape), (
"The rank of the first argument of an binary operator cannot "
"be smaller than the rank of its second argument: %s vs %s" %
(len(self.shape), len(other_var.shape)))
self.block.append_op(
type=op_type,
inputs={'X': [self],
'Y': [other_var]},
outputs={'Out': out})
outputs={'Out': out},
attrs={'axis': axis})
return out
comment = OpProtoHolder.instance().get_op_proto(op_type).comment
......
......@@ -69,6 +69,7 @@ __all__ = [
'softmax_with_cross_entropy',
'smooth_l1',
'one_hot',
'autoincreased_step_counter',
]
......@@ -2443,10 +2444,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
return out
def edit_distance(input,
label,
normalized=False,
ignored_tokens=None,
def edit_distance(input, label, normalized=True, ignored_tokens=None,
name=None):
"""
EditDistance operator computes the edit distances between a batch of
......@@ -3201,3 +3199,34 @@ def one_hot(input, depth):
attrs={'depth': depth},
outputs={'Out': one_hot_out})
return one_hot_out
def autoincreased_step_counter(counter_name=None, begin=1, step=1):
"""
NOTE: The counter will be automatically increased by 1 every mini-batch
Return the run counter of the main program, which is started with 1.
Args:
counter_name(str): The counter name, default is '@STEP_COUNTER@'.
begin(int): The first value of this counter.
step(int): The increment step between each execution.
Returns(Variable): The global run counter.
"""
helper = LayerHelper('global_step_counter')
if counter_name is None:
counter_name = '@STEP_COUNTER@'
counter, is_new_var = helper.create_or_get_global_variable(
name=counter_name, dtype='int64', shape=[1], persistable=True)
if is_new_var:
helper.set_variable_initializer(
counter, initializer=Constant(
value=begin - 1, force_cpu=True))
helper.main_program.global_block().prepend_op(
type='increment',
inputs={'X': [counter]},
outputs={'Out': [counter]},
attrs={'step': float(step)})
counter.stop_gradient = True
return counter
......@@ -13,7 +13,6 @@
# limitations under the License.
import layers
from framework import Variable
from initializer import init_on_cpu
__all__ = [
......@@ -30,11 +29,15 @@ strategy according to this module.
"""
def exponential_decay(learning_rate,
global_step,
decay_steps,
decay_rate,
staircase=False):
def _decay_step_counter():
# the first global step is zero in learning rate decay
global_step = layers.autoincreased_step_counter(
counter_name='@LR_DECAY_COUNTER@', begin=0, step=1)
global_step = layers.cast(global_step, 'float32')
return global_step
def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
"""Applies exponential decay to the learning rate.
```python
......@@ -44,7 +47,6 @@ def exponential_decay(learning_rate,
Args:
learning_rate: A scalar float32 value or a Variable. This
will be the initial learning rate during training
global_step: A Variable that record the training step.
decay_steps: A Python `int32` number.
decay_rate: A Python `float` number.
staircase: Boolean. If set true, decay the learning rate every decay_steps.
......@@ -52,8 +54,7 @@ def exponential_decay(learning_rate,
Returns:
The decayed learning rate
"""
if not isinstance(global_step, Variable):
raise ValueError("global_step is required for exponential_decay.")
global_step = _decay_step_counter()
with init_on_cpu():
# update learning_rate
......@@ -65,23 +66,17 @@ def exponential_decay(learning_rate,
return decayed_lr
def natural_exp_decay(learning_rate,
global_step,
decay_steps,
decay_rate,
staircase=False):
def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
"""Applies natural exponential decay to the initial learning rate.
```python
if not staircase:
decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
else:
decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
```
>>> if not staircase:
>>> decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
>>> else:
>>> decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
Args:
learning_rate: A scalar float32 value or a Variable. This
will be the initial learning rate during training
global_step: A Variable that record the training step.
decay_steps: A Python `int32` number.
decay_rate: A Python `float` number.
staircase: Boolean. If set true, decay the learning rate every decay_steps.
......@@ -89,8 +84,7 @@ def natural_exp_decay(learning_rate,
Returns:
The decayed learning rate
"""
if not isinstance(global_step, Variable):
raise ValueError("global_step is required for natural_exp_decay.")
global_step = _decay_step_counter()
with init_on_cpu():
div_res = global_step / decay_steps
......@@ -101,23 +95,17 @@ def natural_exp_decay(learning_rate,
return decayed_lr
def inverse_time_decay(learning_rate,
global_step,
decay_steps,
decay_rate,
staircase=False):
def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
"""Applies inverse time decay to the initial learning rate.
```python
if staircase:
decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
else:
decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
```
>>> if staircase:
>>> decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
>>> else:
>>> decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
Args:
learning_rate: A scalar float32 value or a Variable. This
will be the initial learning rate during training
global_step: A Variable that record the training step.
will be the initial learning rate during training.
decay_steps: A Python `int32` number.
decay_rate: A Python `float` number.
staircase: Boolean. If set true, decay the learning rate every decay_steps.
......@@ -125,8 +113,7 @@ def inverse_time_decay(learning_rate,
Returns:
The decayed learning rate
"""
if not isinstance(global_step, Variable):
raise ValueError("global_step is required for inverse_time_decay.")
global_step = _decay_step_counter()
with init_on_cpu():
div_res = global_step / decay_steps
......@@ -139,26 +126,22 @@ def inverse_time_decay(learning_rate,
def polynomial_decay(learning_rate,
global_step,
decay_steps,
end_learning_rate=0.0001,
power=1.0,
cycle=False):
"""Applies polynomial decay to the initial learning rate.
```python
if cycle:
decay_steps = decay_steps * ceil(global_step / decay_steps)
else:
global_step = min(global_step, decay_steps)
decayed_learning_rate = (learning_rate - end_learning_rate) *
(1 - global_step / decay_steps) ^ power +
end_learning_rate
```
>>> if cycle:
>>> decay_steps = decay_steps * ceil(global_step / decay_steps)
>>> else:
>>> global_step = min(global_step, decay_steps)
>>> decayed_learning_rate = (learning_rate - end_learning_rate) *
>>> (1 - global_step / decay_steps) ^ power +
>>> end_learning_rate
Args:
learning_rate: A scalar float32 value or a Variable. This
will be the initial learning rate during training
global_step: A Variable that record the training step.
decay_steps: A Python `int32` number.
end_learning_rate: A Python `float` number.
power: A Python `float` number
......@@ -167,8 +150,7 @@ def polynomial_decay(learning_rate,
Returns:
The decayed learning rate
"""
if not isinstance(global_step, Variable):
raise ValueError("global_step is required for inverse_time_decay.")
global_step = _decay_step_counter()
with init_on_cpu():
if cycle:
......@@ -193,27 +175,24 @@ def polynomial_decay(learning_rate,
return decayed_lr
def piecewise_decay(global_step, boundaries, values):
def piecewise_decay(boundaries, values):
"""Applies piecewise decay to the initial learning rate.
```python
boundaries = [10000, 20000]
values = [1.0, 0.5, 0.1]
if step < 10000:
learning_rate = 1.0
elif step >= 10000 and step < 20000:
learning_rate = 0.5
else:
learning_rate = 0.1
```
>>> boundaries = [10000, 20000]
>>> values = [1.0, 0.5, 0.1]
>>>
>>> if step < 10000:
>>> learning_rate = 1.0
>>> elif 10000 <= step < 20000:
>>> learning_rate = 0.5
>>> else:
>>> learning_rate = 0.1
"""
if len(values) - len(boundaries) != 1:
raise ValueError("len(values) - len(boundaries) should be 1")
if not isinstance(global_step, Variable):
raise ValueError("global_step is required for piecewise_decay.")
global_step = _decay_step_counter()
with init_on_cpu():
lr = layers.create_global_var(
......
......@@ -29,6 +29,8 @@ dtype_to_size = {
core.VarDesc.VarType.BOOL: 1
}
sub_block_ops = ["while", "while_grad", "parallel_do", "parallel_do_grad"]
class ControlFlowGraph(object):
def __init__(self, Program, ops, forward_num, skip_opt):
......@@ -141,7 +143,7 @@ class ControlFlowGraph(object):
self.pool = []
for i in range(self.op_size):
op = self._ops[i]
if op.type() == "while" or op.type() == "while_grad":
if op.type() in sub_block_ops:
continue
block_desc = op.block()
is_forward = i < self._forward_num
......@@ -198,67 +200,75 @@ class ControlFlowGraph(object):
block_desc, var_name, is_forward).shape()))
def get_cfgs(input_program):
def _process_sub_block_pair(pdesc, sub_block_pair):
ops_list = []
pdesc = input_program.get_desc()
block_desc = pdesc.block(0)
op_size = block_desc.op_size()
# Get global block ops
ops_list.append(
([block_desc.op(i) for i in range(op_size)], op_size, set()))
while_sub_block_ids = []
while_grad_sub_block_ids = []
while_block_id_pair = []
while_op_dict = {}
for fwd_op, bwd_op in sub_block_pair:
sub_block_ids = []
grad_sub_block_ids = []
sub_block_id_pair = []
sub_op_dict = {}
for i in range(op_size):
op = block_desc.op(i)
if op.type() == "while":
while_sub_block_ids.append(op.attr("sub_block").id)
while_op_dict[op.attr("sub_block").id] = op
elif op.type() == "while_grad":
while_grad_sub_block_ids.append(op.attr("sub_block").id)
while_op_dict[op.attr("sub_block").id] = op
if op.type() == fwd_op:
sub_block_ids.append(op.attr("sub_block").id)
sub_op_dict[op.attr("sub_block").id] = op
elif op.type() == bwd_op:
grad_sub_block_ids.append(op.attr("sub_block").id)
sub_op_dict[op.attr("sub_block").id] = op
# Find fwd_op/bwd_op block pair
for grad_id in grad_sub_block_ids:
fwd_id = pdesc.block(grad_id).get_forward_block_idx()
if fwd_id in sub_block_ids:
sub_block_id_pair.append((fwd_id, grad_id))
sub_block_ids.remove(fwd_id)
# Find while/while_grad block pair
for grad_id in while_grad_sub_block_ids:
forward_id = pdesc.block(grad_id).get_forward_block_idx()
if forward_id in while_sub_block_ids:
while_block_id_pair.append((forward_id, grad_id))
while_sub_block_ids.remove(forward_id)
# Get fwd_op/bwd_op block ops
for fwd_id, grad_id in sub_block_id_pair:
sub_block_ops = []
sub_block = pdesc.block(fwd_id)
block_op_size = sub_block.op_size()
for i in range(block_op_size):
sub_block_ops.append(sub_block.op(i))
# Get while/while_grad block ops
for forward_id, grad_id in while_block_id_pair:
while_block_ops = []
while_block = pdesc.block(forward_id)
while_block_op_size = while_block.op_size()
for i in range(while_block_op_size):
while_block_ops.append(while_block.op(i))
grad_sub_block = pdesc.block(grad_id)
grad_sub_block_op_size = grad_sub_block.op_size()
for i in range(grad_sub_block_op_size):
sub_block_ops.append(grad_sub_block.op(i))
while_grad_block = pdesc.block(grad_id)
while_grad_block_op_size = while_grad_block.op_size()
for i in range(while_grad_block_op_size):
while_block_ops.append(while_grad_block.op(i))
sub_op_output = set()
sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
sub_op_output.update(sub_op_dict[grad_id].output_arg_names())
ops_list.append((sub_block_ops, block_op_size, sub_op_output))
while_op_output = set()
while_op_output.update(while_op_dict[forward_id].output_arg_names())
while_op_output.update(while_op_dict[grad_id].output_arg_names())
# Process rest fwd_op block ops
for fwd_id in sub_block_ids:
sub_block_ops = []
sub_block = pdesc.block(fwd_id)
sub_block_op_size = sub_block.op_size()
for i in range(sub_block_op_size):
sub_block_ops.append(sub_block.op(i))
sub_op_output = set()
sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
ops_list.append((sub_block_ops, sub_block_op_size, sub_op_output))
return ops_list
ops_list.append((while_block_ops, while_block_op_size, while_op_output))
# Process rest while block ops
for forward_id in while_sub_block_ids:
while_block_ops = []
while_block = pdesc.block(forward_id)
while_block_op_size = while_block.op_size()
for i in range(while_block_op_size):
while_block_ops.append(while_block.op(i))
def _get_cfgs(input_program):
ops_list = []
pdesc = input_program.get_desc()
block_desc = pdesc.block(0)
op_size = block_desc.op_size()
# Get global block ops
ops_list.append(
([block_desc.op(i) for i in range(op_size)], op_size, set()))
while_op_output = set()
while_op_output.update(while_op_dict[forward_id].output_arg_names())
sub_block_pair = [("while", "while_grad"), ("parallel_do",
"parallel_do_grad")]
ops_list.append((while_block_ops, while_block_op_size, while_op_output))
ops_list.extend(_process_sub_block_pair(pdesc, sub_block_pair))
cfgs = [
ControlFlowGraph(input_program, ops, forward_num, skip_opt)
......@@ -268,6 +278,6 @@ def get_cfgs(input_program):
def memory_optimize(input_program):
cfgs = get_cfgs(input_program)
cfgs = _get_cfgs(input_program)
for cfg in cfgs:
cfg.memory_optimize()
......@@ -35,11 +35,10 @@ class Optimizer(object):
but need to use one of it's implementation.
"""
def __init__(self, learning_rate, global_step=None, regularization=None):
def __init__(self, learning_rate, regularization=None):
if not isinstance(learning_rate, float) and \
not isinstance(learning_rate, framework.Variable):
raise TypeError("learning rate should be float or Variable")
self._global_step = global_step
self.regularization = regularization
self._learning_rate = learning_rate
# each program should have a independent learning rate
......@@ -159,26 +158,6 @@ class Optimizer(object):
format(name, param.name))
return self._accumulators[name][param.name]
def _increment_global_step(self, block):
"""Increment the global step by 1 after every iteration
Args:
block: the block in which the loss variable is present
Returns:
list with global_step increment op as its only element
"""
assert isinstance(block, framework.Block)
assert self._global_step is not None
# create the increment op
increment_op = block.append_op(
type="increment",
inputs={"X": self._global_step},
outputs={"Out": self._global_step},
attrs={"step": 1.0})
return increment_op
def create_optimization_pass(self,
parameters_and_grads,
loss,
......@@ -225,8 +204,6 @@ class Optimizer(object):
# FIXME: Need to fix this once we figure out how to handle dependencies
self._finish_update(loss.block)
if self._global_step is not None:
self._increment_global_step(loss.block)
end = len(global_block.ops)
return global_block.slice_ops(start, end)
......
......@@ -73,7 +73,7 @@ def reset_profiler():
@contextmanager
def profiler(state, sorted_key=None):
def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
"""The profiler interface.
Different from cuda_profiler, this profiler can be used to profile both CPU
and GPU program. By defalut, it records the CPU and GPU operator kernels,
......@@ -95,8 +95,9 @@ def profiler(state, sorted_key=None):
The `max` means sorting by the maximum execution time.
The `min` means sorting by the minimum execution time.
The `ave` means sorting by the average execution time.
profile_path (string) : If state == 'All', it will write a profile
proto output file.
"""
if state not in ['CPU', 'GPU', "All"]:
raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
if state == "GPU":
......@@ -122,4 +123,4 @@ def profiler(state, sorted_key=None):
}
# TODO(qingqing) : redirect C++ ostream to Python stream.
# with core.ostream_redirect(stdout=True, stderr=True):
core.disable_profiler(key_map[sorted_key])
core.disable_profiler(key_map[sorted_key], profile_path)
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册