Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
812c5f60
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
812c5f60
编写于
12月 29, 2017
作者:
C
chengduoZH
浏览文件
操作
浏览文件
下载
差异文件
remove conflict
上级
24cf2fcd
5bf37b91
变更
100
隐藏空白更改
内联
并排
Showing
100 changed file
with
1705 addition
and
547 deletion
+1705
-547
benchmark/paddle/image/run_openblas_infer.sh
benchmark/paddle/image/run_openblas_infer.sh
+10
-6
benchmark/paddle/image/run_openblas_train.sh
benchmark/paddle/image/run_openblas_train.sh
+1
-1
doc/api/v2/config/layer.rst
doc/api/v2/config/layer.rst
+5
-0
doc/api/v2/fluid/layers.rst
doc/api/v2/fluid/layers.rst
+0
-6
doc/design/optimizer.md
doc/design/optimizer.md
+1
-1
doc/getstarted/build_and_install/docker_install_cn.rst
doc/getstarted/build_and_install/docker_install_cn.rst
+4
-4
doc/getstarted/build_and_install/docker_install_en.rst
doc/getstarted/build_and_install/docker_install_en.rst
+3
-3
doc/getstarted/build_and_install/pip_install_cn.rst
doc/getstarted/build_and_install/pip_install_cn.rst
+2
-2
doc/getstarted/build_and_install/pip_install_en.rst
doc/getstarted/build_and_install/pip_install_en.rst
+2
-2
doc/getstarted/index_cn.rst
doc/getstarted/index_cn.rst
+2
-2
doc/getstarted/index_en.rst
doc/getstarted/index_en.rst
+2
-2
paddle/framework/CMakeLists.txt
paddle/framework/CMakeLists.txt
+10
-2
paddle/framework/data_transform.cc
paddle/framework/data_transform.cc
+1
-0
paddle/framework/data_transform.h
paddle/framework/data_transform.h
+8
-9
paddle/framework/data_transform_test.cc
paddle/framework/data_transform_test.cc
+48
-27
paddle/framework/executor.cc
paddle/framework/executor.cc
+28
-7
paddle/framework/init.cc
paddle/framework/init.cc
+1
-1
paddle/framework/lod_tensor.cc
paddle/framework/lod_tensor.cc
+19
-93
paddle/framework/lod_tensor_test.cc
paddle/framework/lod_tensor_test.cc
+14
-0
paddle/framework/op_desc.cc
paddle/framework/op_desc.cc
+8
-0
paddle/framework/op_desc.h
paddle/framework/op_desc.h
+2
-0
paddle/framework/op_kernel_type.h
paddle/framework/op_kernel_type.h
+8
-0
paddle/framework/op_kernel_type_test.cc
paddle/framework/op_kernel_type_test.cc
+1
-3
paddle/framework/operator.cc
paddle/framework/operator.cc
+55
-27
paddle/framework/selected_rows.cc
paddle/framework/selected_rows.cc
+54
-1
paddle/framework/selected_rows.h
paddle/framework/selected_rows.h
+9
-0
paddle/framework/selected_rows_test.cc
paddle/framework/selected_rows_test.cc
+14
-0
paddle/framework/tensor.h
paddle/framework/tensor.h
+1
-1
paddle/framework/tensor_test.cc
paddle/framework/tensor_test.cc
+60
-54
paddle/framework/tensor_util.cc
paddle/framework/tensor_util.cc
+119
-0
paddle/framework/tensor_util.cu
paddle/framework/tensor_util.cu
+1
-0
paddle/framework/tensor_util.h
paddle/framework/tensor_util.h
+108
-0
paddle/framework/tensor_util_test.cc
paddle/framework/tensor_util_test.cc
+74
-0
paddle/framework/tensor_util_test.cu
paddle/framework/tensor_util_test.cu
+57
-0
paddle/framework/threadpool.h
paddle/framework/threadpool.h
+13
-6
paddle/framework/threadpool_test.cc
paddle/framework/threadpool_test.cc
+12
-7
paddle/framework/var_desc.cc
paddle/framework/var_desc.cc
+1
-1
paddle/function/GemmConvOp.cpp
paddle/function/GemmConvOp.cpp
+158
-3
paddle/function/Im2Col.h
paddle/function/Im2Col.h
+50
-0
paddle/function/Im2ColTest.cpp
paddle/function/Im2ColTest.cpp
+82
-0
paddle/gserver/layers/MKLDNNLRNLayer.cpp
paddle/gserver/layers/MKLDNNLRNLayer.cpp
+1
-1
paddle/operators/CMakeLists.txt
paddle/operators/CMakeLists.txt
+28
-58
paddle/operators/array_operator.h
paddle/operators/array_operator.h
+2
-2
paddle/operators/array_to_lod_tensor_op.cc
paddle/operators/array_to_lod_tensor_op.cc
+3
-2
paddle/operators/assign_op.cc
paddle/operators/assign_op.cc
+2
-2
paddle/operators/beam_search_decode_op.cc
paddle/operators/beam_search_decode_op.cc
+2
-2
paddle/operators/cond_op.cc
paddle/operators/cond_op.cc
+2
-2
paddle/operators/conv_op.cc
paddle/operators/conv_op.cc
+4
-2
paddle/operators/cross_entropy_op.cc
paddle/operators/cross_entropy_op.cc
+3
-3
paddle/operators/feed_op.cc
paddle/operators/feed_op.cc
+2
-2
paddle/operators/fetch_op.cc
paddle/operators/fetch_op.cc
+2
-2
paddle/operators/fill_constant_op.cc
paddle/operators/fill_constant_op.cc
+2
-2
paddle/operators/fill_op.cc
paddle/operators/fill_op.cc
+3
-2
paddle/operators/load_op.cc
paddle/operators/load_op.cc
+3
-3
paddle/operators/lod_tensor_to_array_op.cc
paddle/operators/lod_tensor_to_array_op.cc
+3
-2
paddle/operators/math/CMakeLists.txt
paddle/operators/math/CMakeLists.txt
+4
-4
paddle/operators/math/math_function.cc
paddle/operators/math/math_function.cc
+21
-0
paddle/operators/math/math_function.cu
paddle/operators/math/math_function.cu
+29
-0
paddle/operators/math/math_function_impl.h
paddle/operators/math/math_function_impl.h
+0
-19
paddle/operators/merge_lod_tensor_op.cc
paddle/operators/merge_lod_tensor_op.cc
+2
-2
paddle/operators/nccl_op_test.cu.cc
paddle/operators/nccl_op_test.cu.cc
+1
-1
paddle/operators/recurrent_op.cc
paddle/operators/recurrent_op.cc
+5
-4
paddle/operators/reorder_lod_tensor_by_rank_op.cc
paddle/operators/reorder_lod_tensor_by_rank_op.cc
+2
-2
paddle/operators/save_op.cc
paddle/operators/save_op.cc
+2
-2
paddle/operators/send_op.cc
paddle/operators/send_op.cc
+1
-1
paddle/operators/shrink_rnn_memory_op.cc
paddle/operators/shrink_rnn_memory_op.cc
+2
-2
paddle/operators/split_lod_tensor_op.cc
paddle/operators/split_lod_tensor_op.cc
+2
-2
paddle/operators/tensor_array_read_write_op.cc
paddle/operators/tensor_array_read_write_op.cc
+6
-4
paddle/platform/device_context.cc
paddle/platform/device_context.cc
+1
-19
paddle/platform/device_context.h
paddle/platform/device_context.h
+22
-8
paddle/platform/device_context_test.cu
paddle/platform/device_context_test.cu
+7
-22
paddle/platform/for_range.h
paddle/platform/for_range.h
+1
-1
paddle/platform/nccl_test.cu
paddle/platform/nccl_test.cu
+1
-1
paddle/platform/place.h
paddle/platform/place.h
+27
-1
paddle/pybind/CMakeLists.txt
paddle/pybind/CMakeLists.txt
+1
-0
paddle/pybind/protobuf.cc
paddle/pybind/protobuf.cc
+22
-3
paddle/pybind/pybind.cc
paddle/pybind/pybind.cc
+18
-17
paddle/pybind/tensor_py.h
paddle/pybind/tensor_py.h
+5
-4
paddle/scripts/docker/build.sh
paddle/scripts/docker/build.sh
+1
-1
paddle/scripts/submit_local.sh.in
paddle/scripts/submit_local.sh.in
+14
-9
python/paddle/v2/dataset/flowers.py
python/paddle/v2/dataset/flowers.py
+1
-1
python/paddle/v2/fluid/__init__.py
python/paddle/v2/fluid/__init__.py
+1
-1
python/paddle/v2/fluid/backward.py
python/paddle/v2/fluid/backward.py
+224
-14
python/paddle/v2/fluid/data_feeder.py
python/paddle/v2/fluid/data_feeder.py
+6
-2
python/paddle/v2/fluid/framework.py
python/paddle/v2/fluid/framework.py
+4
-2
python/paddle/v2/fluid/io.py
python/paddle/v2/fluid/io.py
+1
-1
python/paddle/v2/fluid/layers/nn.py
python/paddle/v2/fluid/layers/nn.py
+130
-8
python/paddle/v2/fluid/optimizer.py
python/paddle/v2/fluid/optimizer.py
+3
-3
python/paddle/v2/fluid/tests/op_test.py
python/paddle/v2/fluid/tests/op_test.py
+2
-2
python/paddle/v2/fluid/tests/test_array_read_write_op.py
python/paddle/v2/fluid/tests/test_array_read_write_op.py
+2
-2
python/paddle/v2/fluid/tests/test_conditional_block.py
python/paddle/v2/fluid/tests/test_conditional_block.py
+2
-2
python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
+2
-2
python/paddle/v2/fluid/tests/test_optimizer.py
python/paddle/v2/fluid/tests/test_optimizer.py
+7
-7
python/paddle/v2/fluid/tests/test_recurrent_op.py
python/paddle/v2/fluid/tests/test_recurrent_op.py
+2
-2
python/paddle/v2/fluid/tests/test_regularizer.py
python/paddle/v2/fluid/tests/test_regularizer.py
+3
-3
python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
+1
-1
python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
+1
-1
python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
+2
-2
python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
...ddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
+2
-2
python/paddle/v2/fluid/tests/test_while_op.py
python/paddle/v2/fluid/tests/test_while_op.py
+2
-2
未找到文件。
benchmark/paddle/image/run_openblas_infer.sh
浏览文件 @
812c5f60
...
...
@@ -8,15 +8,19 @@ function clock_to_seconds() {
}
function
infer
()
{
unset
OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
topology
=
$1
layer_num
=
$2
bs
=
$3
t
hread
=
`
nproc
`
if
[
$t
hread
-gt
$bs
]
;
then
t
hread
=
$bs
t
rainers
=
`
nproc
`
if
[
$t
rainers
-gt
$bs
]
;
then
t
rainers
=
$bs
fi
log
=
"logs/infer-
${
topology
}
-
${
layer_num
}
-
${
thread
}
openblas-
${
bs
}
.log"
log
=
"logs/infer-
${
topology
}
-
${
layer_num
}
-
${
trainers
}
openblas-
${
bs
}
.log"
threads
=
$((
`
nproc
`
/
trainers
))
if
[
$threads
-eq
0
]
;
then
threads
=
1
fi
export
OPENBLAS_NUM_THREADS
=
$threads
models_in
=
"models/
${
topology
}
-
${
layer_num
}
/pass-00000/"
if
[
!
-d
$models_in
]
;
then
...
...
@@ -28,7 +32,7 @@ function infer() {
--config
=
"
${
topology
}
.py"
\
--use_mkldnn
=
False
\
--use_gpu
=
False
\
--trainer_count
=
$t
hread
\
--trainer_count
=
$t
rainers
\
--log_period
=
$log_period
\
--config_args
=
"batch_size=
${
bs
}
,layer_num=
${
layer_num
}
,is_infer=True,num_samples=256"
\
--init_model_path
=
$models_in
\
...
...
benchmark/paddle/image/run_openblas_train.sh
浏览文件 @
812c5f60
set
-e
function
train
()
{
unset
OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
export
OPENBLAS_NUM_THREADS
=
1
topology
=
$1
layer_num
=
$2
bs
=
$3
...
...
doc/api/v2/config/layer.rst
浏览文件 @
812c5f60
...
...
@@ -252,6 +252,11 @@ first_seq
.. autoclass:: paddle.v2.layer.first_seq
:noindex:
sub_seq
---------
.. autoclass:: paddle.v2.layer.sub_seq
:noindex:
concat
------
.. autoclass:: paddle.v2.layer.concat
...
...
doc/api/v2/fluid/layers.rst
浏览文件 @
812c5f60
...
...
@@ -68,12 +68,6 @@ scale
:noindex:
reshape
---------
.. autofunction:: paddle.v2.fluid.layers.reshape
:noindex:
transpose
---------
.. autofunction:: paddle.v2.fluid.layers.transpose
...
...
doc/design/optimizer.md
浏览文件 @
812c5f60
...
...
@@ -79,7 +79,7 @@ class Optimizer(object):
def
minimize
(
self
,
loss
,
parameter_list
):
"""Add operations to minimize `loss` by updating `parameter_list`.
This method combines interface `append_backward
_ops
()` and
This method combines interface `append_backward()` and
`create_optimization_pass()` into one.
"""
params_grads
=
self
.
create_backward_pass
(
loss
,
parameter_list
)
...
...
doc/getstarted/build_and_install/docker_install_cn.rst
浏览文件 @
812c5f60
...
...
@@ -15,7 +15,7 @@
获取PaddlePaddle的Docker镜像
------------------------------
执行下面的命令获取最新的PaddlePaddle Docker镜像
执行下面的命令获取最新的PaddlePaddle Docker镜像
,版本为cpu_avx_mkl:
.. code-block:: bash
...
...
@@ -27,7 +27,7 @@
docker pull docker.paddlepaddle.org/paddle
下载GPU版本的Docker镜像:
下载GPU版本
(cuda8.0_cudnn5_avx_mkl)
的Docker镜像:
.. code-block:: bash
...
...
@@ -54,7 +54,7 @@
.. _docker_run:
在Docker中执行PaddlePaddle训练程序
------------------------------
------------------------------
----
假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考
`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_
...
...
@@ -82,7 +82,7 @@
.. _docker_run_book:
使用Docker启动PaddlePaddle Book教程
------------------------------
------------------------------
-----
使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。
PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
...
...
doc/getstarted/build_and_install/docker_install_en.rst
浏览文件 @
812c5f60
...
...
@@ -16,7 +16,7 @@ After you've read above tutorials you may proceed the following steps.
Pull PaddlePaddle Docker Image
------------------------------
Run the following command to download the latest Docker images:
Run the following command to download the latest Docker images
, the version is cpu_avx_mkl
:
.. code-block:: bash
...
...
@@ -28,7 +28,7 @@ For users in China, we provide a faster mirror:
docker pull docker.paddlepaddle.org/paddle
Download GPU version images:
Download GPU version
(cuda8.0_cudnn5_avx_mkl)
images:
.. code-block:: bash
...
...
@@ -58,7 +58,7 @@ and run:
.. _docker_run:
Launch your training program in Docker
------------------------------
------------------------------
--------
Assume that you have already written a PaddlePaddle program
named :code:`train.py` under directory :code:`/home/work` (refer to
...
...
doc/getstarted/build_and_install/pip_install_cn.rst
浏览文件 @
812c5f60
...
...
@@ -11,14 +11,14 @@ PaddlePaddle可以使用常用的Python包管理工具
------------------------------
执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。
执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件
,版本为cpu_avx_openblas
。
.. code-block:: bash
pip install paddlepaddle
如果需要安装支持GPU的版本,需要执行:
如果需要安装支持GPU的版本
(cuda7.5_cudnn5_avx_openblas)
,需要执行:
.. code-block:: bash
...
...
doc/getstarted/build_and_install/pip_install_en.rst
浏览文件 @
812c5f60
...
...
@@ -12,14 +12,14 @@ Install Using pip
------------------------------
Run the following command to install PaddlePaddle on the current
machine, it will also download requirements.
machine, it will also download requirements
, the version is cpu_avx_openblas
.
.. code-block:: bash
pip install paddlepaddle
If you wish to install GPU version, just run:
If you wish to install GPU version
(cuda7.5_cudnn5_avx_openblas)
, just run:
.. code-block:: bash
...
...
doc/getstarted/index_cn.rst
浏览文件 @
812c5f60
...
...
@@ -7,13 +7,13 @@
++++++++
PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。
执行下面的命令完成快速安装:
执行下面的命令完成快速安装
,版本为cpu_avx_openblas
:
.. code-block:: bash
pip install paddlepaddle
如果需要安装支持GPU的版本,需要执行:
如果需要安装支持GPU的版本
(cuda7.5_cudnn5_avx_openblas)
,需要执行:
.. code-block:: bash
...
...
doc/getstarted/index_en.rst
浏览文件 @
812c5f60
...
...
@@ -8,13 +8,13 @@ Quick Install
You can use pip to install PaddlePaddle with a single command, supports
CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
Simply run the following command to install:
Simply run the following command to install
, the version is cpu_avx_openblas
:
.. code-block:: bash
pip install paddlepaddle
If you need to install GPU version, run:
If you need to install GPU version
(cuda7.5_cudnn5_avx_openblas)
, run:
.. code-block:: bash
...
...
paddle/framework/CMakeLists.txt
浏览文件 @
812c5f60
...
...
@@ -5,10 +5,18 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3)
cc_test
(
ddim_test SRCS ddim_test.cc DEPS ddim
)
nv_test
(
dim_test SRCS dim_test.cu DEPS ddim
)
cc_library
(
tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context
)
if
(
WITH_GPU
)
nv_library
(
tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto
)
else
()
cc_library
(
tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto
)
endif
()
cc_test
(
tensor_test SRCS tensor_test.cc DEPS tensor
)
cc_test
(
tensor_util_test SRCS tensor_util_test.cc DEPS tensor
)
if
(
WITH_GPU
)
nv_test
(
tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor
)
else
()
cc_test
(
tensor_util_test SRCS tensor_util_test.cc DEPS tensor
)
endif
()
cc_test
(
eigen_test SRCS eigen_test.cc DEPS tensor
)
...
...
paddle/framework/data_transform.cc
浏览文件 @
812c5f60
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/data_transform.h"
#include "paddle/framework/lod_tensor.h"
namespace
paddle
{
namespace
framework
{
...
...
paddle/framework/data_transform.h
浏览文件 @
812c5f60
...
...
@@ -27,9 +27,8 @@ limitations under the License. */
namespace
paddle
{
namespace
framework
{
using
DataTransformFN
=
std
::
function
<
void
(
const
std
::
vector
<
platform
::
DeviceContext
*>
ctx
,
const
Variable
&
in
,
Variable
*
out
)
>
;
using
DataTransformFn
=
std
::
function
<
void
(
const
platform
::
DeviceContext
*
ctx
,
const
Variable
&
in
,
Variable
*
out
)
>
;
using
KernelTypePair
=
std
::
pair
<
OpKernelType
,
OpKernelType
>
;
struct
KernelTypePairHash
{
...
...
@@ -47,7 +46,7 @@ struct KernelTypePairHash {
};
using
DataTransformMap
=
std
::
unordered_map
<
KernelTypePair
,
DataTransformF
N
,
KernelTypePairHash
>
;
std
::
unordered_map
<
KernelTypePair
,
DataTransformF
n
,
KernelTypePairHash
>
;
class
DataTransformFnMap
{
public:
...
...
@@ -58,25 +57,25 @@ class DataTransformFnMap {
}
void
Insert
(
const
OpKernelType
&
left
,
const
OpKernelType
&
right
,
const
DataTransformF
N
&
data_tranform_fn
)
{
const
DataTransformF
n
&
data_tranform_fn
)
{
Insert
(
std
::
make_pair
(
left
,
right
),
data_tranform_fn
);
}
void
Insert
(
const
KernelTypePair
&
kernel_type_pair
,
const
DataTransformF
N
&
data_tranform_fn
)
{
const
DataTransformF
n
&
data_tranform_fn
)
{
PADDLE_ENFORCE
(
!
Has
(
kernel_type_pair
),
"KernelTypePair %s has been registered"
,
""
);
map_
.
insert
({
kernel_type_pair
,
data_tranform_fn
});
}
const
DataTransformF
N
&
Get
(
const
KernelTypePair
&
key_pair
)
const
{
const
DataTransformF
n
&
Get
(
const
KernelTypePair
&
key_pair
)
const
{
auto
data_transformer
=
GetNullable
(
key_pair
);
PADDLE_ENFORCE_NOT_NULL
(
data_transformer
,
"DataTransformF
N
should not be NULL"
);
"DataTransformF
n
should not be NULL"
);
return
*
data_transformer
;
}
const
DataTransformF
N
*
GetNullable
(
const
KernelTypePair
&
key_pair
)
const
{
const
DataTransformF
n
*
GetNullable
(
const
KernelTypePair
&
key_pair
)
const
{
auto
it
=
map_
.
find
(
key_pair
);
if
(
it
==
map_
.
end
())
{
return
nullptr
;
...
...
paddle/framework/data_transform_test.cc
浏览文件 @
812c5f60
...
...
@@ -11,36 +11,61 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <array>
#include <vector>
#include "paddle/framework/data_transform.h"
#include <gtest/gtest.h>
#include "paddle/framework/data_transform.h"
namespace
paddle
{
namespace
framework
{
using
namespace
platform
;
/**
* @brief cross validation of different kernel type transform
* We use four bit map represent different combination.
* If the field has multiple possible value, only choose two of them.
* For DataType, only test the FP32(float), FP64(double).
* e.g. 0000 -> FP32, CPUPlace, kNHWC, kPlain
* 1111 -> FP64, GPUPlace, kNCHW, kMKLDNN
*/
std
::
array
<
proto
::
DataType
,
2
>
kDataType
=
{
{
proto
::
DataType
::
FP32
,
proto
::
DataType
::
FP64
}};
std
::
array
<
Place
,
2
>
kPlace
=
{{
CPUPlace
(),
CUDAPlace
(
0
)}};
std
::
array
<
DataLayout
,
2
>
kDataLayout
=
{
{
DataLayout
::
kNHWC
,
DataLayout
::
kNCHW
}};
std
::
array
<
LibraryType
,
2
>
kLibraryType
=
{
{
LibraryType
::
kPlain
,
LibraryType
::
kMKLDNN
}};
OpKernelType
GenFromBit
(
const
std
::
vector
<
bool
>
bits
)
{
return
OpKernelType
(
kDataType
[
bits
[
0
]],
kPlace
[
bits
[
1
]],
kDataLayout
[
bits
[
2
]],
kLibraryType
[
bits
[
3
]]);
}
int
test_value
=
0
;
OpKernelType
kernel_type_1
(
proto
::
DataType
::
FP32
,
CPUPlace
(),
DataLayout
::
kNCHW
,
LibraryType
::
kCUDNN
);
OpKernelType
kernel_type_2
(
proto
::
DataType
::
FP32
,
CUDAPlace
(
0
),
DataLayout
::
kNCHW
,
LibraryType
::
kCUDNN
);
OpKernelType
kernel_type_3
(
proto
::
DataType
::
FP16
,
CUDAPlace
(
0
),
DataLayout
::
kNCHW
,
LibraryType
::
kCUDNN
);
auto
kernel0
=
GenFromBit
({
0
,
0
,
0
,
0
});
auto
kernel1
=
GenFromBit
({
0
,
0
,
0
,
1
});
auto
kernel2
=
GenFromBit
({
0
,
0
,
1
,
0
});
auto
kernel3
=
GenFromBit
({
0
,
0
,
1
,
1
});
void
type1_to_type2
(
std
::
vector
<
platform
::
DeviceContext
*>
ctx
,
const
Variable
&
in
,
Variable
*
out
)
{
void
TransDataType_t
(
const
platform
::
DeviceContext
*
ctx
,
const
Variable
&
in
,
Variable
*
out
)
{
test_value
++
;
}
void
type2_to_type3
(
std
::
vector
<
platform
::
DeviceContext
*>
ctx
,
const
Variable
&
in
,
Variable
*
out
)
{
void
TransDataLayout_t
(
const
platform
::
DeviceContext
*
ctx
,
const
Variable
&
in
,
Variable
*
out
)
{
test_value
--
;
}
void
type1_to_type3
(
std
::
vector
<
platform
::
DeviceContext
*>
ctx
,
const
Variable
&
in
,
Variable
*
out
)
{
void
TransLibraryType_t
(
const
platform
::
DeviceContext
*
ctx
,
const
Variable
&
in
,
Variable
*
out
)
{
test_value
+=
2
;
}
...
...
@@ -49,12 +74,9 @@ void type1_to_type3(std::vector<platform::DeviceContext*> ctx,
namespace
frw
=
paddle
::
framework
;
REGISTER_DATA_TRANSFORM_FN
(
frw
::
kernel_type_1
,
frw
::
kernel_type_2
,
frw
::
type1_to_type2
);
REGISTER_DATA_TRANSFORM_FN
(
frw
::
kernel_type_2
,
frw
::
kernel_type_3
,
frw
::
type2_to_type3
);
REGISTER_DATA_TRANSFORM_FN
(
frw
::
kernel_type_1
,
frw
::
kernel_type_3
,
frw
::
type1_to_type3
);
REGISTER_DATA_TRANSFORM_FN
(
frw
::
kernel0
,
frw
::
kernel1
,
frw
::
TransDataType_t
);
REGISTER_DATA_TRANSFORM_FN
(
frw
::
kernel1
,
frw
::
kernel2
,
frw
::
TransDataLayout_t
);
REGISTER_DATA_TRANSFORM_FN
(
frw
::
kernel0
,
frw
::
kernel2
,
frw
::
TransLibraryType_t
);
TEST
(
DataTransform
,
Register
)
{
using
namespace
paddle
::
framework
;
...
...
@@ -62,17 +84,16 @@ TEST(DataTransform, Register) {
auto
&
instance
=
DataTransformFnMap
::
Instance
();
ASSERT_EQ
(
instance
.
Map
().
size
(),
3UL
);
std
::
vector
<
DeviceContext
*>
ctx
;
DeviceContext
*
ctx
=
nullptr
;
paddle
::
framework
::
Variable
in
;
paddle
::
framework
::
Variable
out
;
instance
.
Get
(
std
::
make_pair
(
frw
::
kernel_type_1
,
frw
::
kernel_type_2
))(
ctx
,
in
,
&
out
);
instance
.
Get
(
std
::
make_pair
(
frw
::
kernel0
,
frw
::
kernel1
))(
ctx
,
in
,
&
out
);
ASSERT_EQ
(
test_value
,
1
);
instance
.
Get
(
std
::
make_pair
(
frw
::
kernel_type_2
,
frw
::
kernel_type_3
))(
ctx
,
in
,
&
out
);
instance
.
Get
(
std
::
make_pair
(
frw
::
kernel1
,
frw
::
kernel2
))(
ctx
,
in
,
&
out
);
ASSERT_EQ
(
test_value
,
0
);
instance
.
Get
(
std
::
make_pair
(
frw
::
kernel_type_1
,
frw
::
kernel_type_3
))(
ctx
,
in
,
&
out
);
instance
.
Get
(
std
::
make_pair
(
frw
::
kernel0
,
frw
::
kernel2
))(
ctx
,
in
,
&
out
);
ASSERT_EQ
(
test_value
,
2
);
}
paddle/framework/executor.cc
浏览文件 @
812c5f60
...
...
@@ -14,18 +14,17 @@ limitations under the License. */
#include "paddle/framework/executor.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <set>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/scope.h"
DEFINE_bool
(
check_nan_inf
,
false
,
"Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely."
);
namespace
paddle
{
namespace
framework
{
...
...
@@ -58,6 +57,19 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
}
}
static
void
CheckTensorNANOrInf
(
const
std
::
string
&
name
,
const
framework
::
Tensor
&
tensor
)
{
if
(
tensor
.
memory_size
()
==
0
)
{
return
;
}
if
(
tensor
.
type
().
hash_code
()
!=
typeid
(
float
).
hash_code
()
&&
tensor
.
type
().
hash_code
()
!=
typeid
(
double
).
hash_code
())
{
return
;
}
PADDLE_ENFORCE
(
!
framework
::
HasInf
(
tensor
),
"Tensor %s has Inf"
,
name
);
PADDLE_ENFORCE
(
!
framework
::
HasNAN
(
tensor
),
"Tensor %s has NAN"
,
name
);
}
void
Executor
::
Run
(
const
ProgramDesc
&
pdesc
,
Scope
*
scope
,
int
block_id
,
bool
create_local_scope
,
bool
create_vars
)
{
// TODO(tonyyang-svail):
...
...
@@ -101,8 +113,17 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
auto
op
=
paddle
::
framework
::
OpRegistry
::
CreateOp
(
*
op_desc
);
VLOG
(
3
)
<<
op
->
DebugString
();
op
->
Run
(
*
local_scope
,
place_
);
if
(
FLAGS_check_nan_inf
)
{
for
(
auto
&
vname
:
op
->
OutputVars
(
true
))
{
auto
*
var
=
local_scope
->
FindVar
(
vname
);
if
(
var
==
nullptr
)
continue
;
if
(
var
->
IsType
<
framework
::
LoDTensor
>
())
{
CheckTensorNANOrInf
(
vname
,
var
->
Get
<
framework
::
LoDTensor
>
());
}
}
}
}
if
(
create_local_scope
)
{
if
(
create_
vars
&&
create_
local_scope
)
{
scope
->
DeleteScope
(
local_scope
);
}
}
...
...
paddle/framework/init.cc
浏览文件 @
812c5f60
...
...
@@ -71,7 +71,7 @@ bool InitDevices(const std::vector<std::string> &devices) {
places
.
emplace_back
(
platform
::
CPUPlace
());
LOG
(
WARNING
)
<<
"Not specified CPU device, create CPU by Default."
;
}
platform
::
DeviceContextPool
::
Create
(
places
);
platform
::
DeviceContextPool
::
Init
(
places
);
return
true
;
}
...
...
paddle/framework/lod_tensor.cc
浏览文件 @
812c5f60
...
...
@@ -189,62 +189,16 @@ void AppendLoD(LoD *lod, const LoD &lod_length) {
void
SerializeToStream
(
std
::
ostream
&
os
,
const
LoDTensor
&
tensor
,
const
platform
::
DeviceContext
&
dev_ctx
)
{
// TODO(typhoonzero): serialize to ostream
{
// the 1st field, uint32_t version
{
// the 1st field, uint32_t version for LoDTensor
constexpr
uint32_t
version
=
0
;
os
.
write
(
reinterpret_cast
<
const
char
*>
(
&
version
),
sizeof
(
version
));
}
{
// the 2nd field, tensor description
// int32_t size
// void* protobuf message
proto
::
TensorDesc
desc
;
desc
.
set_data_type
(
framework
::
ToDataType
(
tensor
.
type
()));
auto
dims
=
framework
::
vectorize
(
tensor
.
dims
());
auto
*
pb_dims
=
desc
.
mutable_dims
();
pb_dims
->
Resize
(
static_cast
<
int
>
(
dims
.
size
()),
0
);
std
::
copy
(
dims
.
begin
(),
dims
.
end
(),
pb_dims
->
begin
());
int32_t
size
=
desc
.
ByteSize
();
os
.
write
(
reinterpret_cast
<
const
char
*>
(
&
size
),
sizeof
(
size
));
auto
out
=
desc
.
SerializeAsString
();
os
.
write
(
out
.
data
(),
size
);
}
{
// the 3rd field, tensor data
uint64_t
size
=
tensor
.
memory_size
();
auto
*
data_ptr
=
tensor
.
data
<
void
>
();
PADDLE_ENFORCE
(
size
<
std
::
numeric_limits
<
std
::
streamsize
>::
max
(),
"Index overflow when writing tensor"
);
if
(
platform
::
is_gpu_place
(
tensor
.
place
()))
{
#ifdef PADDLE_WITH_CUDA
constexpr
size_t
kBufSize
=
1024
*
1024
*
64
;
// 64MB
std
::
unique_ptr
<
char
[]
>
buf
(
new
char
[
kBufSize
]);
auto
&
gpu_dev_ctx
=
static_cast
<
const
platform
::
CUDADeviceContext
&>
(
dev_ctx
);
platform
::
CPUPlace
cpu
;
uintptr_t
data
=
reinterpret_cast
<
uintptr_t
>
(
data_ptr
);
while
(
size
!=
0
)
{
size_t
size_to_write
=
std
::
min
(
kBufSize
,
static_cast
<
size_t
>
(
size
));
memory
::
Copy
(
cpu
,
buf
.
get
(),
boost
::
get
<
platform
::
CUDAPlace
>
(
tensor
.
place
()),
reinterpret_cast
<
const
void
*>
(
data
),
size_to_write
,
gpu_dev_ctx
.
stream
());
gpu_dev_ctx
.
Wait
();
os
.
write
(
buf
.
get
(),
size_to_write
);
data
+=
size_to_write
;
size
-=
size_to_write
;
}
#else
PADDLE_THROW
(
"Unexpected branch"
);
#endif
}
else
{
os
.
write
(
static_cast
<
const
char
*>
(
data_ptr
),
static_cast
<
std
::
streamsize
>
(
size
));
}
}
{
// the 4th field, lod information
// uint64_t lod_level
// uint64_t lod_level_1 size in byte.
// int* lod_level_1 data
// ...
{
// the 2st field, LoD information
// uint64_t lod_level
// uint64_t lod_level_1 size in byte.
// int* lod_level_1 data
// ...
auto
lod
=
tensor
.
lod
();
uint64_t
size
=
lod
.
size
();
os
.
write
(
reinterpret_cast
<
const
char
*>
(
&
size
),
sizeof
(
size
));
...
...
@@ -256,49 +210,19 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
static_cast
<
std
::
streamsize
>
(
size
));
}
}
// the 3st field, Tensor
SerializeToStream
(
os
,
static_cast
<
Tensor
>
(
tensor
),
dev_ctx
);
}
void
DeserializeFromStream
(
std
::
istream
&
is
,
LoDTensor
*
tensor
)
{
uint32_t
version
;
is
.
read
(
reinterpret_cast
<
char
*>
(
&
version
),
sizeof
(
version
));
PADDLE_ENFORCE_EQ
(
version
,
0U
,
"Only version 0 is supported"
);
proto
::
TensorDesc
desc
;
{
// int32_t size
// proto buffer
int32_t
size
;
is
.
read
(
reinterpret_cast
<
char
*>
(
&
size
),
sizeof
(
size
));
std
::
unique_ptr
<
char
[]
>
buf
(
new
char
[
size
]);
is
.
read
(
reinterpret_cast
<
char
*>
(
buf
.
get
()),
size
);
PADDLE_ENFORCE
(
desc
.
ParseFromArray
(
buf
.
get
(),
size
),
"Cannot parse tensor desc"
);
}
{
// read tensor
std
::
vector
<
int64_t
>
dims
;
dims
.
reserve
(
static_cast
<
size_t
>
(
desc
.
dims
().
size
()));
std
::
copy
(
desc
.
dims
().
begin
(),
desc
.
dims
().
end
(),
std
::
back_inserter
(
dims
));
tensor
->
Resize
(
framework
::
make_ddim
(
dims
));
void
*
buf
;
platform
::
Place
cpu
=
platform
::
CPUPlace
();
switch
(
desc
.
data_type
())
{
case
proto
::
FP32
:
buf
=
tensor
->
mutable_data
<
float
>
(
cpu
);
break
;
case
proto
::
FP64
:
buf
=
tensor
->
mutable_data
<
double
>
(
cpu
);
break
;
case
proto
::
INT32
:
buf
=
tensor
->
mutable_data
<
int
>
(
cpu
);
break
;
case
proto
::
INT64
:
buf
=
tensor
->
mutable_data
<
int64_t
>
(
cpu
);
break
;
default:
PADDLE_THROW
(
"DataType %d not supported"
,
desc
.
data_type
());
}
is
.
read
(
static_cast
<
char
*>
(
buf
),
tensor
->
memory_size
());
}
{
// read lod
{
// the 1st field, unit32_t version for SelectedRows
uint32_t
version
;
is
.
read
(
reinterpret_cast
<
char
*>
(
&
version
),
sizeof
(
version
));
PADDLE_ENFORCE_EQ
(
version
,
0U
,
"Only version 0 is supported"
);
}
{
// the 2st field, LoD information
uint64_t
lod_level
;
is
.
read
(
reinterpret_cast
<
char
*>
(
&
lod_level
),
sizeof
(
lod_level
));
auto
&
lod
=
*
tensor
->
mutable_lod
();
...
...
@@ -312,6 +236,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
lod
[
i
]
=
tmp
;
}
}
// the 3st filed, Tensor
DeserializeFromStream
(
is
,
static_cast
<
Tensor
*>
(
tensor
));
}
}
// namespace framework
...
...
paddle/framework/lod_tensor_test.cc
浏览文件 @
812c5f60
...
...
@@ -126,6 +126,20 @@ TEST_F(LoDTensorTester, ShrinkInLevel) {
EXPECT_NE
(
t1
.
data
<
float
>
(),
lod_tensor_
.
data
<
float
>
());
}
TEST_F
(
LoDTensorTester
,
SerializeAndDeserialize
)
{
LoDTensor
dst_tensor
;
platform
::
CPUDeviceContext
cpu_ctx
((
platform
::
CPUPlace
()));
std
::
ostringstream
oss
;
SerializeToStream
(
oss
,
lod_tensor_
,
cpu_ctx
);
std
::
istringstream
iss
(
oss
.
str
());
DeserializeFromStream
(
iss
,
&
dst_tensor
);
float
*
dst_ptr
=
dst_tensor
.
mutable_data
<
float
>
(
platform
::
CPUPlace
());
for
(
int
i
=
0
;
i
<
kLodTensorSize
;
++
i
)
{
EXPECT_EQ
(
dst_ptr
[
i
],
i
);
}
EXPECT_EQ
(
dst_tensor
.
lod
(),
lod_tensor_
.
lod
());
}
TEST
(
LodExpand
,
test
)
{
LoD
lod
{{
0
,
2
}};
LoDTensor
tensor
;
...
...
paddle/framework/op_desc.cc
浏览文件 @
812c5f60
...
...
@@ -88,6 +88,14 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
need_update_
=
true
;
}
void
OpDesc
::
CopyFrom
(
const
OpDesc
&
op_desc
)
{
desc_
.
set_type
(
op_desc
.
Type
());
inputs_
=
op_desc
.
inputs_
;
outputs_
=
op_desc
.
outputs_
;
attrs_
=
op_desc
.
attrs_
;
need_update_
=
true
;
}
OpDesc
::
OpDesc
(
const
proto
::
OpDesc
&
desc
,
ProgramDesc
*
prog
)
:
desc_
(
desc
),
need_update_
(
false
)
{
// restore inputs_
...
...
paddle/framework/op_desc.h
浏览文件 @
812c5f60
...
...
@@ -35,6 +35,8 @@ class OpDesc {
OpDesc
(
const
proto
::
OpDesc
&
desc
,
ProgramDesc
*
prog
);
void
CopyFrom
(
const
OpDesc
&
op_desc
);
proto
::
OpDesc
*
Proto
();
std
::
string
Type
()
const
{
return
desc_
.
type
();
}
...
...
paddle/framework/op_kernel_type.h
浏览文件 @
812c5f60
...
...
@@ -68,6 +68,8 @@ struct OpKernelType {
data_type_
==
o
.
data_type_
&&
data_layout_
==
o
.
data_layout_
&&
library_type_
==
o
.
library_type_
;
}
bool
operator
!=
(
const
OpKernelType
&
o
)
const
{
return
!
(
*
this
==
o
);
}
};
inline
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
...
...
@@ -78,5 +80,11 @@ inline std::ostream& operator<<(std::ostream& os,
return
os
;
}
inline
std
::
string
KernelTypeToString
(
const
OpKernelType
&
kernel_key
)
{
std
::
ostringstream
stream
;
stream
<<
kernel_key
;
return
stream
.
str
();
}
}
// namespace framework
}
// namespace paddle
paddle/framework/op_kernel_type_test.cc
浏览文件 @
812c5f60
...
...
@@ -26,10 +26,8 @@ TEST(OpKernelType, ToString) {
OpKernelType
op_kernel_type
(
DataType
::
FP32
,
CPUPlace
(),
DataLayout
::
kNCHW
,
LibraryType
::
kCUDNN
);
std
::
ostringstream
stream
;
stream
<<
op_kernel_type
;
ASSERT_EQ
(
stream
.
str
(
),
paddle
::
framework
::
KernelTypeToString
(
op_kernel_type
),
"data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]"
);
}
...
...
paddle/framework/operator.cc
浏览文件 @
812c5f60
...
...
@@ -384,12 +384,30 @@ class RuntimeInferShapeContext : public InferShapeContext {
const
Scope
&
scope_
;
};
const
platform
::
DeviceContext
*
GetDeviceContext
(
framework
::
KernelTypePair
&
kernel_pair
)
{
auto
&
actual_kernel_key
=
kernel_pair
.
first
;
auto
&
expected_kernel_key
=
kernel_pair
.
second
;
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
if
(
platform
::
is_gpu_place
(
actual_kernel_key
.
place_
)
&&
platform
::
is_cpu_place
(
expected_kernel_key
.
place_
))
{
return
pool
.
Get
(
actual_kernel_key
.
place_
);
}
else
if
(
platform
::
is_cpu_place
(
actual_kernel_key
.
place_
)
&&
platform
::
is_gpu_place
(
expected_kernel_key
.
place_
))
{
return
pool
.
Get
(
expected_kernel_key
.
place_
);
}
else
{
PADDLE_THROW
(
"Currently, model parallelism is only supported between CPU and CUDA"
);
}
}
void
OperatorWithKernel
::
Run
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
{
RuntimeInferShapeContext
infer_shape_ctx
(
*
this
,
scope
);
this
->
InferShape
(
&
infer_shape_ctx
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
dev_ctx
=
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
dev_ctx
=
pool
.
Get
(
place
);
// check if op[type] has kernel registered.
auto
&
all_op_kernels
=
AllOpKernels
();
...
...
@@ -413,37 +431,47 @@ void OperatorWithKernel::Run(const Scope& scope,
}
if
(
actual_kernel_key
==
expected_kernel_key
)
{
kernel_iter
->
second
->
Compute
(
ctx
);
PADDLE_ENFORCE_EQ
(
actual_kernel_key
.
place_
,
expected_kernel_key
.
place_
,
"Currently, model parallelism is only supported between "
"CPU and other devices. For example, multi-GPU model "
"parallelism will failed."
);
}
else
{
Scope
&
op_scope
=
scope
.
NewScope
();
auto
input_vars
=
this
->
InputVars
();
for
(
auto
var_name
:
input_vars
)
{
op_scope
.
Var
(
var_name
);
}
// TODO(qijun) get appropriate DeviceContext from DeviceContext pool
platform
::
DeviceContext
*
trans_dev_ctx
=
nullptr
;
std
::
vector
<
platform
::
DeviceContext
*>
trans_dev_ctx_vec
{
trans_dev_ctx
};
auto
kernel_pair
=
std
::
make_pair
(
actual_kernel_key
,
expected_kernel_key
);
const
DataTransformFn
*
trans_fun
=
DataTransformFnMap
::
Instance
().
GetNullable
(
kernel_pair
);
if
(
trans_fun
)
{
auto
input_vars
=
this
->
InputVars
();
// TODO(qijun) filter the input vars that do not need to be transformed
// filter vars that has been transformed
std
::
vector
<
std
::
string
>
need_trans
;
for
(
auto
var_name
:
input_vars
)
{
auto
var_name_trans
=
var_name
+
framework
::
KernelTypeToString
(
expected_kernel_key
);
if
(
!
scope
.
FindVar
(
var_name_trans
))
{
const_cast
<
Scope
&>
(
scope
).
Var
(
var_name_trans
);
need_trans
.
push_back
(
var_name
);
}
}
// TODO(qijun) get appropriate DataTransformFN from global map
framework
::
DataTransformFN
trans_fun
=
nullptr
;
if
(
!
need_trans
.
empty
())
{
auto
trans_dev_ctx
=
GetDeviceContext
(
kernel_pair
)
;
// Wait for transform starting
dev_ctx
->
Wait
();
// Wait for transform starting
dev_ctx
->
Wait
();
for
(
auto
var_name
:
input_vars
)
{
trans_fun
(
trans_dev_ctx_vec
,
*
(
scope
.
FindVar
(
var_name
)),
op_scope
.
FindVar
(
var_name
));
}
// Wait for data transform finishing
for
(
auto
ctx
:
trans_dev_ctx_vec
)
{
ctx
->
Wait
();
for
(
auto
var_name
:
need_trans
)
{
(
*
trans_fun
)(
trans_dev_ctx
,
*
(
scope
.
FindVar
(
var_name
)),
scope
.
FindVar
(
var_name
+
framework
::
KernelTypeToString
(
expected_kernel_key
)));
}
// Wait for data transform finishing
trans_dev_ctx
->
Wait
();
}
}
// Create a new ExecutionContext
ExecutionContext
op_ctx
(
*
this
,
op_scope
,
*
dev_ctx
);
kernel_iter
->
second
->
Compute
(
op_ctx
);
}
kernel_iter
->
second
->
Compute
(
ctx
);
}
OpKernelType
OperatorWithKernel
::
GetActualKernelType
(
...
...
paddle/framework/selected_rows.cc
浏览文件 @
812c5f60
...
...
@@ -12,5 +12,58 @@ limitations under the License. */
#include "paddle/framework/selected_rows.h"
namespace
paddle
{
namespace
framework
{}
// namespace framework
namespace
framework
{
void
SerializeToStream
(
std
::
ostream
&
os
,
const
SelectedRows
&
selected_rows
,
const
platform
::
DeviceContext
&
dev_ctx
)
{
{
// the 1st field, uint32_t version
constexpr
uint32_t
version
=
0
;
os
.
write
(
reinterpret_cast
<
const
char
*>
(
&
version
),
sizeof
(
version
));
}
{
// the 2st field, rows information
auto
&
rows
=
selected_rows
.
rows
();
uint64_t
size
=
rows
.
size
();
os
.
write
(
reinterpret_cast
<
const
char
*>
(
&
size
),
sizeof
(
size
));
for
(
uint64_t
i
=
0
;
i
<
size
;
++
i
)
{
os
.
write
(
reinterpret_cast
<
const
char
*>
(
&
rows
[
i
]),
sizeof
(
rows
[
i
]));
}
}
{
// the 3st field, the height of SelectedRows
int64_t
height
=
selected_rows
.
height
();
os
.
write
(
reinterpret_cast
<
const
char
*>
(
&
height
),
sizeof
(
height
));
}
// the 4st field, Tensor data
SerializeToStream
(
os
,
selected_rows
.
value
(),
dev_ctx
);
}
void
DeserializeFromStream
(
std
::
istream
&
is
,
SelectedRows
*
selected_rows
)
{
auto
tensor
=
*
selected_rows
->
mutable_value
();
{
// the 1st field, unit32_t version for SelectedRows
uint32_t
version
;
is
.
read
(
reinterpret_cast
<
char
*>
(
&
version
),
sizeof
(
version
));
PADDLE_ENFORCE_EQ
(
version
,
0U
,
"Only version 0 is supported"
);
}
{
// the 2st field, rows information
uint64_t
size
;
is
.
read
(
reinterpret_cast
<
char
*>
(
&
size
),
sizeof
(
size
));
auto
&
rows
=
*
selected_rows
->
mutable_rows
();
rows
.
resize
(
size
);
for
(
uint64_t
i
=
0
;
i
<
size
;
++
i
)
{
is
.
read
(
reinterpret_cast
<
char
*>
(
&
rows
[
i
]),
sizeof
(
int64_t
));
}
}
{
// the 3st field, the height of the SelectedRows
int64_t
height
;
is
.
read
(
reinterpret_cast
<
char
*>
(
&
height
),
sizeof
(
int64_t
));
selected_rows
->
set_height
(
height
);
}
// the 4st field, tensor which contains the data
DeserializeFromStream
(
is
,
&
tensor
);
}
}
// namespace framework
}
// namespace paddle
paddle/framework/selected_rows.h
浏览文件 @
812c5f60
...
...
@@ -59,5 +59,14 @@ class SelectedRows {
int64_t
height_
;
};
/*
* Serialize/Desiralize SelectedRows to std::ostream
* You can pass ofstream or ostringstream to serilize to file
* or to a in memory string. GPU tensor will be copied to CPU.
*/
void
SerializeToStream
(
std
::
ostream
&
os
,
const
SelectedRows
&
selected_rows
,
const
platform
::
DeviceContext
&
dev_ctx
);
void
DeserializeFromStream
(
std
::
istream
&
is
,
SelectedRows
*
selected_rows
);
}
// namespace framework
}
// namespace paddle
paddle/framework/selected_rows_test.cc
浏览文件 @
812c5f60
...
...
@@ -43,5 +43,19 @@ TEST_F(SelectedRowsTester, complete_dims) {
ASSERT_EQ
(
selected_rows_
->
GetCompleteDims
(),
make_ddim
({
10
,
100
}));
}
TEST_F
(
SelectedRowsTester
,
SerializeAndDeseralize
)
{
SelectedRows
dst_tensor
;
platform
::
CPUDeviceContext
cpu_ctx
(
place_
);
std
::
ostringstream
oss
;
SerializeToStream
(
oss
,
*
selected_rows_
,
cpu_ctx
);
std
::
istringstream
iss
(
oss
.
str
());
DeserializeFromStream
(
iss
,
&
dst_tensor
);
ASSERT_EQ
(
selected_rows_
->
rows
(),
dst_tensor
.
rows
());
ASSERT_EQ
(
selected_rows_
->
height
(),
dst_tensor
.
height
());
}
}
// namespace framework
}
// namespace paddle
paddle/framework/tensor.h
浏览文件 @
812c5f60
...
...
@@ -178,7 +178,7 @@ class Tensor {
DDim
dims_
;
/**
* @brief the layout of memory block, default is N
CHW
.
* @brief the layout of memory block, default is N
HWC
.
*
* @note the memory allocation order, describe how weight/data is stored
* For example, in 4-D Tensor(rank=4), there are three commonly
...
...
paddle/framework/tensor_test.cc
浏览文件 @
812c5f60
...
...
@@ -15,12 +15,13 @@
#include <gtest/gtest.h>
#include <string>
namespace
framework
=
paddle
::
framework
;
namespace
platform
=
paddle
::
platform
;
TEST
(
Tensor
,
Dims
)
{
using
namespace
paddle
::
framework
;
using
namespace
paddle
::
platform
;
Tensor
tt
;
framework
::
Tensor
tt
;
tt
.
Resize
({
2
,
3
,
4
});
DDim
dims
=
tt
.
dims
();
framework
::
DDim
dims
=
tt
.
dims
();
ASSERT_EQ
(
arity
(
dims
),
3
);
for
(
int
i
=
0
;
i
<
3
;
++
i
)
{
EXPECT_EQ
(
i
+
2
,
dims
[
i
]);
...
...
@@ -28,12 +29,12 @@ TEST(Tensor, Dims) {
}
TEST
(
Tensor
,
DataAssert
)
{
paddle
::
framework
::
Tensor
src_tensor
;
framework
::
Tensor
src_tensor
;
bool
caught
=
false
;
try
{
src_tensor
.
data
<
double
>
();
}
catch
(
p
addle
::
p
latform
::
EnforceNotMet
err
)
{
}
catch
(
platform
::
EnforceNotMet
err
)
{
caught
=
true
;
std
::
string
msg
=
"holder_ should not be null
\n
Tensor holds no memory. Call "
...
...
@@ -50,61 +51,65 @@ TEST(Tensor, DataAssert) {
because Memory::Alloc() and Memory::Free() have not been ready.
*/
TEST
(
Tensor
,
MutableData
)
{
using
namespace
paddle
::
framework
;
using
namespace
paddle
::
platform
;
{
Tensor
src_tensor
;
framework
::
Tensor
src_tensor
;
float
*
p1
=
nullptr
;
float
*
p2
=
nullptr
;
// initialization
p1
=
src_tensor
.
mutable_data
<
float
>
(
make_ddim
({
1
,
2
,
3
}),
CPUPlace
());
p1
=
src_tensor
.
mutable_data
<
float
>
(
framework
::
make_ddim
({
1
,
2
,
3
}),
platform
::
CPUPlace
());
EXPECT_NE
(
p1
,
nullptr
);
// set src_tensor a new dim with large size
// momery is supposed to be re-allocated
p2
=
src_tensor
.
mutable_data
<
float
>
(
make_ddim
({
3
,
4
}),
CPUPlace
());
p2
=
src_tensor
.
mutable_data
<
float
>
(
framework
::
make_ddim
({
3
,
4
}),
platform
::
CPUPlace
());
EXPECT_NE
(
p2
,
nullptr
);
EXPECT_NE
(
p1
,
p2
);
// set src_tensor a new dim with same size
// momery block is supposed to be unchanged
p1
=
src_tensor
.
mutable_data
<
float
>
(
make_ddim
({
2
,
2
,
3
}),
CPUPlace
());
p1
=
src_tensor
.
mutable_data
<
float
>
(
framework
::
make_ddim
({
2
,
2
,
3
}),
platform
::
CPUPlace
());
EXPECT_EQ
(
p1
,
p2
);
// set src_tensor a new dim with smaller size
// momery block is supposed to be unchanged
p2
=
src_tensor
.
mutable_data
<
float
>
(
make_ddim
({
2
,
2
}),
CPUPlace
());
p2
=
src_tensor
.
mutable_data
<
float
>
(
framework
::
make_ddim
({
2
,
2
}),
platform
::
CPUPlace
());
EXPECT_EQ
(
p1
,
p2
);
}
#ifdef PADDLE_WITH_CUDA
{
Tensor
src_tensor
;
framework
::
Tensor
src_tensor
;
float
*
p1
=
nullptr
;
float
*
p2
=
nullptr
;
// initialization
p1
=
src_tensor
.
mutable_data
<
float
>
(
make_ddim
({
1
,
2
,
3
}),
CUDAPlace
());
p1
=
src_tensor
.
mutable_data
<
float
>
(
framework
::
make_ddim
({
1
,
2
,
3
}),
platform
::
CUDAPlace
());
EXPECT_NE
(
p1
,
nullptr
);
// set src_tensor a new dim with large size
// momery is supposed to be re-allocated
p2
=
src_tensor
.
mutable_data
<
float
>
(
make_ddim
({
3
,
4
}),
CUDAPlace
());
p2
=
src_tensor
.
mutable_data
<
float
>
(
framework
::
make_ddim
({
3
,
4
}),
platform
::
CUDAPlace
());
EXPECT_NE
(
p2
,
nullptr
);
EXPECT_NE
(
p1
,
p2
);
// set src_tensor a new dim with same size
// momery block is supposed to be unchanged
p1
=
src_tensor
.
mutable_data
<
float
>
(
make_ddim
({
2
,
2
,
3
}),
CUDAPlace
());
p1
=
src_tensor
.
mutable_data
<
float
>
(
framework
::
make_ddim
({
2
,
2
,
3
}),
platform
::
CUDAPlace
());
EXPECT_EQ
(
p1
,
p2
);
// set src_tensor a new dim with smaller size
// momery block is supposed to be unchanged
p2
=
src_tensor
.
mutable_data
<
float
>
(
make_ddim
({
2
,
2
}),
CUDAPlace
());
p2
=
src_tensor
.
mutable_data
<
float
>
(
framework
::
make_ddim
({
2
,
2
}),
platform
::
CUDAPlace
());
EXPECT_EQ
(
p1
,
p2
);
}
#endif
}
TEST
(
Tensor
,
ShareDataWith
)
{
using
namespace
paddle
::
framework
;
using
namespace
paddle
::
platform
;
{
Tensor
src_tensor
;
Tensor
dst_tensor
;
framework
::
Tensor
src_tensor
;
framework
::
Tensor
dst_tensor
;
// Try to share data form uninitialized tensor
bool
caught
=
false
;
try
{
...
...
@@ -121,16 +126,18 @@ TEST(Tensor, ShareDataWith) {
}
ASSERT_TRUE
(
caught
);
src_tensor
.
mutable_data
<
int
>
(
make_ddim
({
2
,
3
,
4
}),
CPUPlace
());
src_tensor
.
mutable_data
<
int
>
(
framework
::
make_ddim
({
2
,
3
,
4
}),
platform
::
CPUPlace
());
dst_tensor
.
ShareDataWith
(
src_tensor
);
ASSERT_EQ
(
src_tensor
.
data
<
int
>
(),
dst_tensor
.
data
<
int
>
());
}
#ifdef PADDLE_WITH_CUDA
{
Tensor
src_tensor
;
Tensor
dst_tensor
;
src_tensor
.
mutable_data
<
int
>
(
make_ddim
({
2
,
3
,
4
}),
CUDAPlace
());
framework
::
Tensor
src_tensor
;
framework
::
Tensor
dst_tensor
;
src_tensor
.
mutable_data
<
int
>
(
framework
::
make_ddim
({
2
,
3
,
4
}),
platform
::
CUDAPlace
());
dst_tensor
.
ShareDataWith
(
src_tensor
);
ASSERT_EQ
(
src_tensor
.
data
<
int
>
(),
dst_tensor
.
data
<
int
>
());
}
...
...
@@ -138,13 +145,12 @@ TEST(Tensor, ShareDataWith) {
}
TEST
(
Tensor
,
Slice
)
{
using
namespace
paddle
::
framework
;
using
namespace
paddle
::
platform
;
{
Tensor
src_tensor
;
src_tensor
.
mutable_data
<
int
>
(
make_ddim
({
5
,
3
,
4
}),
CPUPlace
());
Tensor
slice_tensor
=
src_tensor
.
Slice
(
1
,
3
);
DDim
slice_dims
=
slice_tensor
.
dims
();
framework
::
Tensor
src_tensor
;
src_tensor
.
mutable_data
<
int
>
(
framework
::
make_ddim
({
5
,
3
,
4
}),
platform
::
CPUPlace
());
framework
::
Tensor
slice_tensor
=
src_tensor
.
Slice
(
1
,
3
);
framework
::
DDim
slice_dims
=
slice_tensor
.
dims
();
ASSERT_EQ
(
arity
(
slice_dims
),
3
);
EXPECT_EQ
(
slice_dims
[
0
],
2
);
EXPECT_EQ
(
slice_dims
[
1
],
3
);
...
...
@@ -153,11 +159,12 @@ TEST(Tensor, Slice) {
uintptr_t
src_data_address
=
reinterpret_cast
<
uintptr_t
>
(
src_tensor
.
data
<
int
>
());
uintptr_t
src_mutable_data_address
=
reinterpret_cast
<
uintptr_t
>
(
src_tensor
.
mutable_data
<
int
>
(
src_tensor
.
dims
(),
CPUPlace
()));
src_tensor
.
mutable_data
<
int
>
(
src_tensor
.
dims
(),
platform
::
CPUPlace
()));
uintptr_t
slice_data_address
=
reinterpret_cast
<
uintptr_t
>
(
slice_tensor
.
data
<
int
>
());
uintptr_t
slice_mutable_data_address
=
reinterpret_cast
<
uintptr_t
>
(
slice_tensor
.
mutable_data
<
int
>
(
slice_tensor
.
dims
(),
CPUPlace
()));
uintptr_t
slice_mutable_data_address
=
reinterpret_cast
<
uintptr_t
>
(
slice_tensor
.
mutable_data
<
int
>
(
slice_tensor
.
dims
(),
platform
::
CPUPlace
()));
EXPECT_EQ
(
src_data_address
,
src_mutable_data_address
);
EXPECT_EQ
(
slice_data_address
,
slice_mutable_data_address
);
EXPECT_EQ
(
src_data_address
+
3
*
4
*
1
*
sizeof
(
int
),
slice_data_address
);
...
...
@@ -165,22 +172,25 @@ TEST(Tensor, Slice) {
#ifdef PADDLE_WITH_CUDA
{
Tensor
src_tensor
;
src_tensor
.
mutable_data
<
double
>
(
make_ddim
({
6
,
9
}),
CUDAPlace
());
Tensor
slice_tensor
=
src_tensor
.
Slice
(
2
,
6
);
DDim
slice_dims
=
slice_tensor
.
dims
();
framework
::
Tensor
src_tensor
;
src_tensor
.
mutable_data
<
double
>
(
framework
::
make_ddim
({
6
,
9
}),
platform
::
CUDAPlace
());
framework
::
Tensor
slice_tensor
=
src_tensor
.
Slice
(
2
,
6
);
framework
::
DDim
slice_dims
=
slice_tensor
.
dims
();
ASSERT_EQ
(
arity
(
slice_dims
),
2
);
EXPECT_EQ
(
slice_dims
[
0
],
4
);
EXPECT_EQ
(
slice_dims
[
1
],
9
);
uintptr_t
src_data_address
=
reinterpret_cast
<
uintptr_t
>
(
src_tensor
.
data
<
double
>
());
uintptr_t
src_mutable_data_address
=
reinterpret_cast
<
uintptr_t
>
(
src_tensor
.
mutable_data
<
double
>
(
src_tensor
.
dims
(),
CUDAPlace
()));
uintptr_t
src_mutable_data_address
=
reinterpret_cast
<
uintptr_t
>
(
src_tensor
.
mutable_data
<
double
>
(
src_tensor
.
dims
(),
platform
::
CUDAPlace
()));
uintptr_t
slice_data_address
=
reinterpret_cast
<
uintptr_t
>
(
slice_tensor
.
data
<
double
>
());
uintptr_t
slice_mutable_data_address
=
reinterpret_cast
<
uintptr_t
>
(
slice_tensor
.
mutable_data
<
double
>
(
slice_tensor
.
dims
(),
CUDAPlace
()));
uintptr_t
slice_mutable_data_address
=
reinterpret_cast
<
uintptr_t
>
(
slice_tensor
.
mutable_data
<
double
>
(
slice_tensor
.
dims
(),
platform
::
CUDAPlace
()));
EXPECT_EQ
(
src_data_address
,
src_mutable_data_address
);
EXPECT_EQ
(
slice_data_address
,
slice_mutable_data_address
);
EXPECT_EQ
(
src_data_address
+
9
*
2
*
sizeof
(
double
),
slice_data_address
);
...
...
@@ -189,23 +199,19 @@ TEST(Tensor, Slice) {
}
TEST
(
Tensor
,
ReshapeToMatrix
)
{
using
namespace
paddle
::
framework
;
using
namespace
paddle
::
platform
;
Tensor
src
;
int
*
src_ptr
=
src
.
mutable_data
<
int
>
({
2
,
3
,
4
,
9
},
CPUPlace
());
framework
::
Tensor
src
;
int
*
src_ptr
=
src
.
mutable_data
<
int
>
({
2
,
3
,
4
,
9
},
platform
::
CPUPlace
());
for
(
int
i
=
0
;
i
<
2
*
3
*
4
*
9
;
++
i
)
{
src_ptr
[
i
]
=
i
;
}
Tensor
res
=
ReshapeToMatrix
(
src
,
2
);
framework
::
Tensor
res
=
framework
::
ReshapeToMatrix
(
src
,
2
);
ASSERT_EQ
(
res
.
dims
()[
0
],
2
*
3
);
ASSERT_EQ
(
res
.
dims
()[
1
],
4
*
9
);
}
TEST
(
Tensor
,
Layout
)
{
using
namespace
paddle
::
framework
;
using
namespace
paddle
::
platform
;
Tensor
src
;
ASSERT_EQ
(
src
.
layout
(),
DataLayout
::
kNHWC
);
src
.
set_layout
(
DataLayout
::
kAnyLayout
);
ASSERT_EQ
(
src
.
layout
(),
DataLayout
::
kAnyLayout
);
framework
::
Tensor
src
;
ASSERT_EQ
(
src
.
layout
(),
framework
::
DataLayout
::
kNHWC
);
src
.
set_layout
(
framework
::
DataLayout
::
kAnyLayout
);
ASSERT_EQ
(
src
.
layout
(),
framework
::
DataLayout
::
kAnyLayout
);
}
paddle/framework/tensor_util.cc
0 → 100644
浏览文件 @
812c5f60
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/tensor_util.h"
namespace
paddle
{
namespace
framework
{
template
<
typename
Predicate
,
typename
DevCtx
>
struct
AnyDTypeVisitor
{
Predicate
predicate_
;
const
Tensor
&
tensor_
;
const
DevCtx
&
ctx_
;
Tensor
*
out_
;
AnyDTypeVisitor
(
Predicate
predicate
,
const
Tensor
&
tensor
,
const
DevCtx
&
ctx
,
Tensor
*
out
)
:
predicate_
(
predicate
),
tensor_
(
tensor
),
ctx_
(
ctx
),
out_
(
out
)
{}
template
<
typename
T
>
void
operator
()()
const
{
auto
t
=
EigenVector
<
T
>::
Flatten
(
tensor_
);
auto
o
=
EigenScalar
<
bool
>::
From
(
*
out_
);
// return any of predicate_(t) is true.
o
.
device
(
*
ctx_
.
eigen_device
())
=
predicate_
(
t
).
any
();
}
};
template
<
typename
Predicate
,
typename
DevCtx
>
inline
void
AnyImpl
(
Predicate
predicate
,
const
framework
::
Tensor
&
tensor
,
const
DevCtx
&
ctx
,
framework
::
Tensor
*
out
)
{
VisitDataType
(
ToDataType
(
tensor
.
type
()),
AnyDTypeVisitor
<
Predicate
,
DevCtx
>
(
predicate
,
tensor
,
ctx
,
out
));
}
template
<
typename
Predicate
>
struct
AnyVisitor
:
public
boost
::
static_visitor
<
bool
>
{
const
framework
::
Tensor
&
tensor_
;
Predicate
predicate_
;
AnyVisitor
(
const
framework
::
Tensor
&
tensor
,
Predicate
predicate
)
:
tensor_
(
tensor
),
predicate_
(
std
::
move
(
predicate
))
{}
template
<
typename
Place
>
bool
operator
()(
const
Place
&
place
)
const
{
framework
::
Tensor
out
;
out
.
Resize
({
1
});
out
.
mutable_data
<
bool
>
(
place
);
auto
*
ctx
=
platform
::
DeviceContextPool
::
Instance
().
GetByPlace
(
place
);
AnyImpl
(
predicate_
,
tensor_
,
*
ctx
,
&
out
);
return
this
->
GetResult
(
out
,
place
);
}
bool
GetResult
(
const
framework
::
Tensor
&
out
,
const
platform
::
CUDAPlace
&
gpu
)
const
{
platform
::
CPUPlace
cpu
;
framework
::
Tensor
tmp
;
tmp
.
Resize
({
1
});
tmp
.
mutable_data
<
bool
>
(
cpu
);
auto
gpuctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
gpu
);
gpuctx
->
Wait
();
CopyFrom
(
out
,
cpu
,
*
gpuctx
,
&
tmp
);
gpuctx
->
Wait
();
return
GetResult
(
tmp
,
cpu
);
}
bool
GetResult
(
const
framework
::
Tensor
&
out
,
const
platform
::
CPUPlace
&
cpu
)
const
{
return
*
out
.
data
<
bool
>
();
}
};
template
<
typename
Predicate
>
inline
bool
Any
(
const
framework
::
Tensor
&
tensor
,
Predicate
predicate
)
{
AnyVisitor
<
Predicate
>
visitor
(
tensor
,
predicate
);
auto
place
=
tensor
.
place
();
return
platform
::
VisitPlace
(
place
,
visitor
);
}
struct
HasNANPredicate
{
template
<
typename
T
>
auto
operator
()(
const
T
&
eigen_vec
)
const
->
decltype
(
std
::
declval
<
T
>
().
isnan
())
{
// Cast eigen_vector to vector of bool. true if is inf.
return
eigen_vec
.
isnan
();
}
};
bool
HasNAN
(
const
framework
::
Tensor
&
tensor
)
{
HasNANPredicate
predicate
;
return
Any
(
tensor
,
predicate
);
}
struct
HasInfPredicate
{
template
<
typename
T
>
auto
operator
()(
const
T
&
eigen_vec
)
const
->
decltype
(
std
::
declval
<
T
>
().
isinf
())
{
// Cast eigen_vector to vector of bool. true if is inf.
return
eigen_vec
.
isinf
();
}
};
bool
HasInf
(
const
framework
::
Tensor
&
tensor
)
{
HasInfPredicate
predicate
;
return
Any
(
tensor
,
predicate
);
}
}
// namespace framework
}
// namespace paddle
paddle/framework/tensor_util.cu
0 → 120000
浏览文件 @
812c5f60
.
/
tensor_util
.
cc
\ No newline at end of file
paddle/framework/tensor_util.h
浏览文件 @
812c5f60
...
...
@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/data_type.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"
namespace
paddle
{
namespace
framework
{
...
...
@@ -205,5 +209,109 @@ inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
src_ptr
,
size
);
}
// Returns true if a tensor contains NAN, i.e., Not A Number.
bool
HasNAN
(
const
framework
::
Tensor
&
tensor
);
// Returns true if a tensor contains Inf, i.e., Infinity.
bool
HasInf
(
const
framework
::
Tensor
&
tensor
);
inline
void
SerializeToStream
(
std
::
ostream
&
os
,
const
Tensor
&
tensor
,
const
platform
::
DeviceContext
&
dev_ctx
)
{
// TODO(typhoonzero): serialize to ostream
{
// the 1st field, uint32_t version
constexpr
uint32_t
version
=
0
;
os
.
write
(
reinterpret_cast
<
const
char
*>
(
&
version
),
sizeof
(
version
));
}
{
// the 2nd field, tensor description
// int32_t size
// void* protobuf message
proto
::
TensorDesc
desc
;
desc
.
set_data_type
(
framework
::
ToDataType
(
tensor
.
type
()));
auto
dims
=
framework
::
vectorize
(
tensor
.
dims
());
auto
*
pb_dims
=
desc
.
mutable_dims
();
pb_dims
->
Resize
(
static_cast
<
int
>
(
dims
.
size
()),
0
);
std
::
copy
(
dims
.
begin
(),
dims
.
end
(),
pb_dims
->
begin
());
int32_t
size
=
desc
.
ByteSize
();
os
.
write
(
reinterpret_cast
<
const
char
*>
(
&
size
),
sizeof
(
size
));
auto
out
=
desc
.
SerializeAsString
();
os
.
write
(
out
.
data
(),
size
);
}
{
// the 3rd field, tensor data
uint64_t
size
=
tensor
.
memory_size
();
auto
*
data_ptr
=
tensor
.
data
<
void
>
();
PADDLE_ENFORCE
(
size
<
std
::
numeric_limits
<
std
::
streamsize
>::
max
(),
"Index overflow when writing tensor"
);
if
(
platform
::
is_gpu_place
(
tensor
.
place
()))
{
#ifdef PADDLE_WITH_CUDA
constexpr
size_t
kBufSize
=
1024
*
1024
*
64
;
// 64MB
std
::
unique_ptr
<
char
[]
>
buf
(
new
char
[
kBufSize
]);
auto
&
gpu_dev_ctx
=
static_cast
<
const
platform
::
CUDADeviceContext
&>
(
dev_ctx
);
platform
::
CPUPlace
cpu
;
uintptr_t
data
=
reinterpret_cast
<
uintptr_t
>
(
data_ptr
);
while
(
size
!=
0
)
{
size_t
size_to_write
=
std
::
min
(
kBufSize
,
static_cast
<
size_t
>
(
size
));
memory
::
Copy
(
cpu
,
buf
.
get
(),
boost
::
get
<
platform
::
CUDAPlace
>
(
tensor
.
place
()),
reinterpret_cast
<
const
void
*>
(
data
),
size_to_write
,
gpu_dev_ctx
.
stream
());
gpu_dev_ctx
.
Wait
();
os
.
write
(
buf
.
get
(),
size_to_write
);
data
+=
size_to_write
;
size
-=
size_to_write
;
}
#else
PADDLE_THROW
(
"Unexpected branch"
);
#endif
}
else
{
os
.
write
(
static_cast
<
const
char
*>
(
data_ptr
),
static_cast
<
std
::
streamsize
>
(
size
));
}
}
}
inline
void
DeserializeFromStream
(
std
::
istream
&
is
,
Tensor
*
tensor
)
{
uint32_t
version
;
is
.
read
(
reinterpret_cast
<
char
*>
(
&
version
),
sizeof
(
version
));
PADDLE_ENFORCE_EQ
(
version
,
0U
,
"Only version 0 is supported"
);
proto
::
TensorDesc
desc
;
{
// int32_t size
// proto buffer
int32_t
size
;
is
.
read
(
reinterpret_cast
<
char
*>
(
&
size
),
sizeof
(
size
));
std
::
unique_ptr
<
char
[]
>
buf
(
new
char
[
size
]);
is
.
read
(
reinterpret_cast
<
char
*>
(
buf
.
get
()),
size
);
PADDLE_ENFORCE
(
desc
.
ParseFromArray
(
buf
.
get
(),
size
),
"Cannot parse tensor desc"
);
}
{
// read tensor
std
::
vector
<
int64_t
>
dims
;
dims
.
reserve
(
static_cast
<
size_t
>
(
desc
.
dims
().
size
()));
std
::
copy
(
desc
.
dims
().
begin
(),
desc
.
dims
().
end
(),
std
::
back_inserter
(
dims
));
tensor
->
Resize
(
framework
::
make_ddim
(
dims
));
void
*
buf
;
platform
::
Place
cpu
=
platform
::
CPUPlace
();
// TODO(Yancey1989): use VisiterDataType instead of DataType switch
switch
(
desc
.
data_type
())
{
case
proto
::
FP32
:
buf
=
tensor
->
mutable_data
<
float
>
(
cpu
);
break
;
case
proto
::
FP64
:
buf
=
tensor
->
mutable_data
<
double
>
(
cpu
);
break
;
case
proto
::
INT32
:
buf
=
tensor
->
mutable_data
<
int
>
(
cpu
);
break
;
case
proto
::
INT64
:
buf
=
tensor
->
mutable_data
<
int64_t
>
(
cpu
);
break
;
default:
PADDLE_THROW
(
"DataType %d not supported"
,
desc
.
data_type
());
}
is
.
read
(
static_cast
<
char
*>
(
buf
),
tensor
->
memory_size
());
}
}
}
// namespace framework
}
// namespace paddle
paddle/framework/tensor_util_test.cc
浏览文件 @
812c5f60
...
...
@@ -13,6 +13,7 @@
#include "paddle/framework/tensor_util.h"
#include <gtest/gtest.h>
#include <cmath>
#include <string>
namespace
paddle
{
...
...
@@ -230,5 +231,78 @@ TEST(CopyToVector, Tensor) {
#endif
}
TEST
(
HasNAN
,
CPU
)
{
using
namespace
paddle
::
framework
;
using
namespace
paddle
::
platform
;
Tensor
src
;
float
*
buf
=
src
.
mutable_data
<
float
>
({
3
},
CPUPlace
());
buf
[
0
]
=
0.0
;
buf
[
1
]
=
NAN
;
buf
[
2
]
=
0.0
;
ASSERT_TRUE
(
HasNAN
(
src
));
}
TEST
(
HasInf
,
CPU
)
{
using
namespace
paddle
::
framework
;
using
namespace
paddle
::
platform
;
Tensor
src
;
double
*
buf
=
src
.
mutable_data
<
double
>
({
3
},
CPUPlace
());
buf
[
0
]
=
1.0
;
buf
[
1
]
=
INFINITY
;
buf
[
2
]
=
0.0
;
ASSERT_TRUE
(
HasInf
(
src
));
}
TEST
(
Tensor
,
SerializeAndDeserialize
)
{
framework
::
Tensor
src_tensor
;
int
array
[
6
]
=
{
1
,
2
,
3
,
4
,
5
,
6
};
src_tensor
.
Resize
({
2
,
3
});
int
*
src_ptr
=
src_tensor
.
mutable_data
<
int
>
(
platform
::
CPUPlace
());
for
(
int
i
=
0
;
i
<
6
;
++
i
)
{
src_ptr
[
i
]
=
array
[
i
];
}
{
framework
::
Tensor
dst_tensor
;
auto
place
=
new
platform
::
CPUPlace
();
platform
::
CPUDeviceContext
cpu_ctx
(
*
place
);
std
::
ostringstream
oss
;
SerializeToStream
(
oss
,
src_tensor
,
cpu_ctx
);
std
::
istringstream
iss
(
oss
.
str
());
DeserializeFromStream
(
iss
,
&
dst_tensor
);
int
*
dst_ptr
=
dst_tensor
.
mutable_data
<
int
>
(
platform
::
CPUPlace
());
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
ASSERT_EQ
(
dst_ptr
[
i
],
array
[
i
]);
}
delete
place
;
}
#ifdef PADDLE_WITH_CUDA
{
Tensor
gpu_tensor
;
gpu_tensor
.
Resize
({
2
,
3
});
Tensor
dst_tensor
;
auto
gpu_place
=
new
platform
::
CUDAPlace
();
platform
::
CUDADeviceContext
gpu_ctx
(
*
gpu_place
);
CopyFrom
(
src_tensor
,
*
gpu_place
,
gpu_ctx
,
&
gpu_tensor
);
std
::
ostringstream
oss
;
SerializeToStream
(
oss
,
gpu_tensor
,
gpu_ctx
);
std
::
istringstream
iss
(
oss
.
str
());
DeserializeFromStream
(
iss
,
&
dst_tensor
);
int
*
dst_ptr
=
dst_tensor
.
mutable_data
<
int
>
(
platform
::
CPUPlace
());
for
(
int
i
=
0
;
i
<
6
;
++
i
)
{
ASSERT_EQ
(
dst_ptr
[
i
],
array
[
i
]);
}
delete
gpu_place
;
}
#endif
}
}
// namespace framework
}
// namespace paddle
paddle/framework/tensor_util_test.cu
0 → 100644
浏览文件 @
812c5f60
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/framework/tensor_util.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/place.h"
namespace
paddle
{
namespace
framework
{
static
__global__
void
FillNAN
(
float
*
buf
)
{
buf
[
0
]
=
0.0
;
buf
[
1
]
=
0.1
;
buf
[
2
]
=
NAN
;
}
static
__global__
void
FillInf
(
float
*
buf
)
{
buf
[
0
]
=
0.0
;
buf
[
1
]
=
INFINITY
;
buf
[
2
]
=
0.5
;
}
TEST
(
HasNAN
,
GPU
)
{
Tensor
tensor
;
platform
::
CUDAPlace
gpu
(
0
);
auto
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
cuda_ctx
=
pool
.
GetByPlace
(
gpu
);
float
*
buf
=
tensor
.
mutable_data
<
float
>
({
3
},
gpu
);
FillNAN
<<<
1
,
1
,
0
,
cuda_ctx
->
stream
()
>>>
(
buf
);
cuda_ctx
->
Wait
();
ASSERT_TRUE
(
HasNAN
(
tensor
));
}
TEST
(
HasInf
,
GPU
)
{
Tensor
tensor
;
platform
::
CUDAPlace
gpu
(
0
);
auto
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
cuda_ctx
=
pool
.
GetByPlace
(
gpu
);
float
*
buf
=
tensor
.
mutable_data
<
float
>
({
3
},
gpu
);
FillInf
<<<
1
,
1
,
0
,
cuda_ctx
->
stream
()
>>>
(
buf
);
cuda_ctx
->
Wait
();
ASSERT_TRUE
(
HasInf
(
tensor
));
}
}
// namespace framework
}
// namespace paddle
paddle/framework/threadpool.h
浏览文件 @
812c5f60
...
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include <condition_variable>
#include <functional>
#include <future>
#include <mutex>
#include <queue>
#include <thread>
...
...
@@ -25,10 +26,11 @@ limitations under the License. */
namespace
paddle
{
namespace
framework
{
typedef
std
::
function
<
void
()
>
Task
;
class
ThreadPool
{
public:
typedef
std
::
packaged_task
<
void
()
>
Task
;
typedef
std
::
function
<
void
()
>
Fun
;
/**
* @brief Get a instance of threadpool, the thread number will
* be specified as the number of hardware thread contexts
...
...
@@ -61,13 +63,18 @@ class ThreadPool {
/**
* @brief Push a function to the queue, and will be scheduled and
* executed if a thread is available.
* @param[in] Task will be pushed to the task queue.
* @param[in] Task, will be pushed to the task queue.
* @return std::future<void>, we could wait for the task finished by
* f.wait().
*/
void
Run
(
const
Task
&
fn
)
{
std
::
future
<
void
>
Run
(
const
Fun
&
fn
)
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
tasks_
.
push
(
fn
);
Task
task
(
std
::
bind
(
fn
));
std
::
future
<
void
>
f
=
task
.
get_future
();
tasks_
.
push
(
std
::
move
(
task
));
lock
.
unlock
();
scheduled_
.
notify_one
();
return
f
;
}
/**
...
...
@@ -110,7 +117,7 @@ class ThreadPool {
break
;
}
// pop a task from the task queue
auto
task
=
tasks_
.
front
(
);
auto
task
=
std
::
move
(
tasks_
.
front
()
);
tasks_
.
pop
();
--
available_
;
...
...
paddle/framework/threadpool_test.cc
浏览文件 @
812c5f60
...
...
@@ -20,16 +20,21 @@ limitations under the License. */
namespace
framework
=
paddle
::
framework
;
void
do_sum
(
framework
::
ThreadPool
*
pool
,
std
::
atomic
<
int
>&
sum
,
int
cnt
)
{
std
::
vector
<
std
::
future
<
void
>>
fs
;
for
(
int
i
=
0
;
i
<
cnt
;
++
i
)
{
pool
->
Run
([
&
sum
]()
{
sum
.
fetch_add
(
1
);
});
auto
f
=
pool
->
Run
([
&
sum
]()
{
sum
.
fetch_add
(
1
);
});
fs
.
push_back
(
std
::
move
(
f
));
}
for
(
auto
&
f
:
fs
)
{
f
.
wait
();
}
}
TEST
(
ThreadPool
,
ConcurrentInit
)
{
framework
::
ThreadPool
*
pool
;
int
concurrent_cnt
=
50
;
int
n
=
50
;
std
::
vector
<
std
::
thread
>
threads
;
for
(
int
i
=
0
;
i
<
concurrent_cnt
;
++
i
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
std
::
thread
t
([
&
pool
]()
{
pool
=
framework
::
ThreadPool
::
GetInstance
();
});
threads
.
push_back
(
std
::
move
(
t
));
}
...
...
@@ -38,13 +43,13 @@ TEST(ThreadPool, ConcurrentInit) {
}
}
TEST
(
ThreadPool
,
Concurrent
Start
)
{
TEST
(
ThreadPool
,
Concurrent
Run
)
{
framework
::
ThreadPool
*
pool
=
framework
::
ThreadPool
::
GetInstance
();
std
::
atomic
<
int
>
sum
(
0
);
std
::
vector
<
std
::
thread
>
threads
;
int
concurrent_cnt
=
50
;
int
n
=
50
;
// sum = (n * (n + 1)) / 2
for
(
int
i
=
1
;
i
<=
concurrent_cnt
;
++
i
)
{
for
(
int
i
=
1
;
i
<=
n
;
++
i
)
{
std
::
thread
t
(
do_sum
,
pool
,
std
::
ref
(
sum
),
i
);
threads
.
push_back
(
std
::
move
(
t
));
}
...
...
@@ -52,5 +57,5 @@ TEST(ThreadPool, ConcurrentStart) {
t
.
join
();
}
pool
->
Wait
();
EXPECT_EQ
(
sum
,
((
concurrent_cnt
+
1
)
*
concurrent_cnt
)
/
2
);
EXPECT_EQ
(
sum
,
((
n
+
1
)
*
n
)
/
2
);
}
paddle/framework/var_desc.cc
浏览文件 @
812c5f60
...
...
@@ -74,7 +74,7 @@ const proto::TensorDesc &VarDesc::tensor_desc() const {
case
proto
::
VarDesc
::
LOD_TENSOR_ARRAY
:
return
desc_
.
tensor_array
().
tensor
();
default:
PADDLE_THROW
(
"
Unexpected branch
."
);
PADDLE_THROW
(
"
The type of var '"
,
this
->
Name
(),
"' is unsupported
."
);
}
}
...
...
paddle/function/GemmConvOp.cpp
浏览文件 @
812c5f60
...
...
@@ -126,14 +126,165 @@ public:
inputData
+=
inputChannels
*
inputHeight
*
inputWidth
;
outputData
+=
outputChannels
*
outputHeight
*
outputWidth
;
}
}
};
#ifdef PADDLE_MOBILE_INFERENCE
if
(
Device
==
DEVICE_TYPE_CPU
)
{
memory_
.
reset
();
/*
* \brief Forward calculation of convolution, optimized for mobile.
*/
template
<
DeviceType
Device
>
class
GemmConvMobileFunction
:
public
ConvFunctionBase
{
public:
void
init
(
const
FuncConfig
&
config
)
override
{
ConvFunctionBase
::
init
(
config
);
}
void
check
(
const
BufferArgs
&
inputs
,
const
BufferArgs
&
outputs
)
override
{
const
TensorShape
&
input
=
inputs
[
0
].
shape
();
const
TensorShape
&
filter
=
inputs
[
1
].
shape
();
const
TensorShape
&
output
=
outputs
[
0
].
shape
();
checkShape
(
input
,
filter
,
output
);
}
void
calc
(
const
BufferArgs
&
inputs
,
const
BufferArgs
&
outputs
)
override
{
CHECK_EQ
(
numInputs_
,
inputs
.
size
());
CHECK_EQ
(
numOutputs_
,
outputs
.
size
());
check
(
inputs
,
outputs
);
// TODO(hedaoyuan): Need to define some index macros,
// to avoid useing 0 and 1.
const
TensorShape
&
input
=
inputs
[
0
].
shape
();
const
TensorShape
&
filter
=
inputs
[
1
].
shape
();
const
TensorShape
&
output
=
outputs
[
0
].
shape
();
real
beta
;
if
(
outputs
[
0
].
getArgType
()
==
ADD_TO
)
{
beta
=
1.0
;
}
else
{
beta
=
0.0
;
}
#endif
size_t
batchSize
=
input
[
0
];
size_t
inputChannels
=
input
[
1
];
size_t
inputHeight
=
input
[
2
];
size_t
inputWidth
=
input
[
3
];
size_t
filterHeight
=
getFilterHeight
(
filter
);
size_t
filterWidth
=
getFilterWidth
(
filter
);
size_t
outputChannels
=
output
[
1
];
size_t
outputHeight
=
output
[
2
];
size_t
outputWidth
=
output
[
3
];
real
*
inputData
=
inputs
[
0
].
data
<
real
>
();
real
*
filterData
=
inputs
[
1
].
data
<
real
>
();
real
*
outputData
=
outputs
[
0
].
data
<
real
>
();
bool
needIm2col
=
isNeedIm2col
(
filter
);
TensorShape
imShape
=
TensorShape
({
inputChannels
/
groups_
,
inputHeight
,
inputWidth
});
TensorShape
colShape
;
real
*
colData
=
NULL
;
size_t
colHeight
=
inputChannels
/
groups_
*
filterHeight
*
filterWidth
;
size_t
colWidth
=
outputHeight
*
outputWidth
;
// Max col matrix height 256, Max col matrix width 1024
size_t
stepColHeight
=
std
::
min
(
colHeight
,
static_cast
<
size_t
>
(
256
));
size_t
stepColWidth
=
std
::
min
(
colWidth
,
static_cast
<
size_t
>
(
2048
));
if
(
needIm2col
)
{
colShape
=
TensorShape
({
inputChannels
/
groups_
,
filterHeight
,
filterWidth
,
outputHeight
,
outputWidth
});
resizeBuffer
<
Device
>
(
stepColHeight
*
stepColWidth
*
sizeof
(
real
));
colData
=
reinterpret_cast
<
real
*>
(
memory_
->
getBuf
());
}
Im2ColMobileFunctor
<
real
>
im2col
;
size_t
inputOffset
=
imShape
.
getElements
();
size_t
outputOffset
=
(
outputChannels
/
groups_
)
*
outputHeight
*
outputWidth
;
size_t
filterOffset
=
filter
.
getElements
()
/
groups_
;
int
nStride
=
colWidth
;
int
kStride
=
colHeight
;
for
(
size_t
i
=
0
;
i
<
batchSize
;
i
++
)
{
for
(
size_t
g
=
0
;
g
<
groups_
;
g
++
)
{
if
(
needIm2col
)
{
real
beta_
=
beta
;
for
(
size_t
colHeightStart
=
0
;
colHeightStart
<
colHeight
;
colHeightStart
+=
stepColHeight
)
{
for
(
size_t
colWidthStart
=
0
;
colWidthStart
<
colWidth
;
colWidthStart
+=
stepColWidth
)
{
int
N
=
std
::
min
(
colWidth
-
colWidthStart
,
stepColWidth
);
int
K
=
std
::
min
(
colHeight
-
colHeightStart
,
stepColHeight
);
// im2col
im2col
(
inputData
+
g
*
inputOffset
,
imShape
,
colData
,
colShape
,
strideH
(),
strideW
(),
paddingH
(),
paddingW
(),
dilationH
(),
dilationW
(),
colHeightStart
,
K
,
colWidthStart
,
N
);
// gemm
int
M
=
outputChannels
/
groups_
;
BlasGemm
<
Device
,
real
>::
compute
(
false
,
false
,
M
,
N
,
K
,
1.0
f
,
filterData
+
g
*
filterOffset
+
colHeightStart
,
kStride
,
colData
,
N
,
beta_
,
outputData
+
g
*
outputOffset
+
colWidthStart
,
nStride
);
}
beta_
=
1.0
;
}
}
else
{
int
M
=
outputChannels
/
groups_
;
int
N
=
outputHeight
*
outputWidth
;
int
K
=
inputChannels
/
groups_
*
filterHeight
*
filterWidth
;
BlasGemm
<
Device
,
real
>::
compute
(
false
,
false
,
M
,
N
,
K
,
1.0
f
,
filterData
+
g
*
filterOffset
,
K
,
inputData
+
g
*
inputOffset
,
N
,
beta
,
outputData
+
g
*
outputOffset
,
N
);
}
}
inputData
+=
inputChannels
*
inputHeight
*
inputWidth
;
outputData
+=
outputChannels
*
outputHeight
*
outputWidth
;
}
memory_
.
reset
();
}
};
#endif
/*
* \brief Backward input calculation of convolution.
*/
...
...
@@ -348,7 +499,11 @@ public:
}
};
#ifdef PADDLE_MOBILE_INFERENCE
REGISTER_TYPED_FUNC
(
GemmConv
,
CPU
,
GemmConvMobileFunction
);
#else
REGISTER_TYPED_FUNC
(
GemmConv
,
CPU
,
GemmConvFunction
);
#endif
REGISTER_TYPED_FUNC
(
GemmConvGradInput
,
CPU
,
GemmConvGradInputFunction
);
REGISTER_TYPED_FUNC
(
GemmConvGradFilter
,
CPU
,
GemmConvGradFilterFunction
);
#ifdef PADDLE_WITH_CUDA
...
...
paddle/function/Im2Col.h
浏览文件 @
812c5f60
...
...
@@ -98,4 +98,54 @@ public:
int
dilationWidth
=
1
);
};
template
<
class
T
>
class
Im2ColMobileFunctor
{
public:
void
operator
()(
const
T
*
imData
,
const
TensorShape
&
imShape
,
T
*
colData
,
const
TensorShape
&
colShape
,
int
strideHeight
,
int
strideWidth
,
int
paddingHeight
,
int
paddingWidth
,
int
dilationHeight
,
int
dilationWidth
,
int
colHeightStart
,
int
colHeightSize
,
int
colWidthStart
,
int
colWidthSize
)
{
int
inputHeight
=
imShape
[
1
];
int
inputWidth
=
imShape
[
2
];
int
filterHeight
=
colShape
[
1
];
int
filterWidth
=
colShape
[
2
];
int
outputWidth
=
colShape
[
4
];
for
(
int
colh
=
0
;
colh
<
colHeightSize
;
colh
++
)
{
int
wOffset
=
(
colHeightStart
+
colh
)
%
filterWidth
;
int
hOffset
=
((
colHeightStart
+
colh
)
/
filterWidth
)
%
filterHeight
;
int
c_im
=
(
colHeightStart
+
colh
)
/
filterWidth
/
filterHeight
;
for
(
int
colw
=
0
;
colw
<
colWidthSize
;
colw
++
)
{
int
h
=
(
colWidthStart
+
colw
)
/
outputWidth
;
int
w
=
(
colWidthStart
+
colw
)
%
outputWidth
;
int
imRowIdx
=
h
*
strideHeight
+
hOffset
*
dilationHeight
;
int
imColIdx
=
w
*
strideWidth
+
wOffset
*
dilationWidth
;
if
((
imRowIdx
-
paddingHeight
)
<
0
||
(
imRowIdx
-
paddingHeight
)
>=
inputHeight
||
(
imColIdx
-
paddingWidth
)
<
0
||
(
imColIdx
-
paddingWidth
)
>=
inputWidth
)
{
colData
[
colh
*
colWidthSize
+
colw
]
=
static_cast
<
T
>
(
0
);
}
else
{
imRowIdx
+=
c_im
*
inputHeight
-
paddingHeight
;
imColIdx
-=
paddingWidth
;
colData
[
colh
*
colWidthSize
+
colw
]
=
imData
[
imRowIdx
*
inputWidth
+
imColIdx
];
}
}
}
}
};
}
// namespace paddle
paddle/function/Im2ColTest.cpp
浏览文件 @
812c5f60
...
...
@@ -138,4 +138,86 @@ TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
#endif
template
<
class
T
>
void
TestIm2ColMobileFunctor
()
{
for
(
size_t
channels
:
{
32
})
{
for
(
size_t
inputHeight
:
{
33
,
100
})
{
for
(
size_t
inputWidth
:
{
32
,
96
})
{
for
(
size_t
filterHeight
:
{
5
})
{
for
(
size_t
filterWidth
:
{
7
})
{
for
(
size_t
stride
:
{
2
})
{
for
(
size_t
padding
:
{
1
})
{
for
(
size_t
dilation
:
{
1
,
3
})
{
size_t
filterSizeH
=
(
filterHeight
-
1
)
*
dilation
+
1
;
size_t
filterSizeW
=
(
filterWidth
-
1
)
*
dilation
+
1
;
if
(
inputHeight
+
2
*
padding
<
filterSizeH
||
inputWidth
+
2
*
padding
<
filterSizeW
)
break
;
if
(
padding
>=
filterSizeH
||
padding
>=
filterSizeW
)
break
;
size_t
outputHeight
=
(
inputHeight
-
filterSizeH
+
2
*
padding
)
/
stride
+
1
;
size_t
outputWidth
=
(
inputWidth
-
filterSizeW
+
2
*
padding
)
/
stride
+
1
;
TensorShape
imShape
=
TensorShape
({
channels
,
inputHeight
,
inputWidth
});
TensorShape
colShape1
=
TensorShape
({
channels
,
filterHeight
,
filterWidth
,
outputHeight
,
outputWidth
});
size_t
height
=
channels
*
filterHeight
*
filterWidth
;
size_t
width
=
outputHeight
*
outputWidth
;
VectorPtr
input1
=
Vector
::
create
(
imShape
.
getElements
(),
false
);
VectorPtr
input2
=
Vector
::
create
(
imShape
.
getElements
(),
false
);
MatrixPtr
output1
=
Matrix
::
create
(
height
,
width
,
false
,
false
);
MatrixPtr
output2
=
Matrix
::
create
(
height
,
width
,
false
,
false
);
input1
->
uniform
(
0.001
,
1
);
input2
->
copyFrom
(
*
input1
);
Im2ColFunctor
<
kCFO
,
DEVICE_TYPE_CPU
,
T
>
im2Col1
;
Im2ColMobileFunctor
<
T
>
im2Col2
;
im2Col1
(
input1
->
getData
(),
imShape
,
output1
->
getData
(),
colShape1
,
stride
,
stride
,
padding
,
padding
,
dilation
,
dilation
);
im2Col2
(
input2
->
getData
(),
imShape
,
output2
->
getData
(),
colShape1
,
stride
,
stride
,
padding
,
padding
,
dilation
,
dilation
,
0
,
height
,
0
,
width
);
autotest
::
TensorCheckEqual
(
*
output1
,
*
output2
);
}
}
}
}
}
}
}
}
}
TEST
(
Im2ColFunctor
,
Mobile
)
{
TestIm2ColMobileFunctor
<
float
>
();
}
}
// namespace paddle
paddle/gserver/layers/MKLDNNLRNLayer.cpp
浏览文件 @
812c5f60
...
...
@@ -29,7 +29,7 @@ bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
}
/* the size of inputs for norm-layer is 1 */
CHECK_EQ
(
config_
.
inputs_size
(),
1
UL
);
CHECK_EQ
(
config_
.
inputs_size
(),
1
);
const
NormConfig
&
conf
=
config_
.
inputs
(
0
).
norm_conf
();
localSize_
=
conf
.
size
();
alpha_
=
conf
.
scale
();
...
...
paddle/operators/CMakeLists.txt
浏览文件 @
812c5f60
file
(
GLOB GENERAL_OPS RELATIVE
"
${
CMAKE_CURRENT_SOURCE_DIR
}
"
"*_op.cc"
)
string
(
REPLACE
".cc"
""
GENERAL_OPS
"
${
GENERAL_OPS
}
"
)
set
(
DEPS_OPS
""
)
set
(
pybind_file
${
PADDLE_SOURCE_DIR
}
/paddle/pybind/pybind.h
)
file
(
WRITE
${
pybind_file
}
"// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!
\n\n
"
)
function
(
op_library TARGET
)
...
...
@@ -48,6 +49,11 @@ function(op_library TARGET)
message
(
FATAL_ERROR
"The op library
${
TARGET
}
should contains at least one .cc file"
)
endif
()
list
(
LENGTH op_library_DEPS op_library_DEPS_len
)
if
(
${
op_library_DEPS_len
}
GREATER 0
)
set
(
DEPS_OPS
${
TARGET
}
${
DEPS_OPS
}
PARENT_SCOPE
)
endif
()
if
(
WITH_GPU
)
nv_library
(
${
TARGET
}
SRCS
${
cc_srcs
}
${
cu_cc_srcs
}
${
cu_srcs
}
DEPS
${
op_library_DEPS
}
${
op_common_deps
}
)
...
...
@@ -181,56 +187,26 @@ endfunction()
add_subdirectory
(
math
)
add_subdirectory
(
nccl
)
set
(
DEPS_OPS
cond_op
cross_entropy_op
recurrent_op
softmax_with_cross_entropy_op
softmax_op
sequence_softmax_op
sum_op
pool_op
maxout_op
unpool_op
pool_with_index_op
conv_op
conv_transpose_op
nccl_op
sequence_conv_op
sequence_pool_op
lod_rank_table_op
lod_tensor_to_array_op
array_to_lod_tensor_op
max_sequence_len_op
lstm_op
tensor_array_read_write_op
gru_op
adagrad_op
sgd_op
save_op
load_op
send_op
recv_op
cos_sim_op
)
if
(
WITH_GPU
)
op_library
(
nccl_op DEPS nccl_common
)
else
()
set
(
DEPS_OPS
${
DEPS_OPS
}
nccl_op
)
endif
()
if
(
WITH_DISTRIBUTE
)
add_subdirectory
(
detail
)
op_library
(
send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf
)
set_source_files_properties
(
send_op.cc
PROPERTIES
COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
op_library
(
recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf
)
set_source_files_properties
(
recv_op.cc
PROPERTIES
COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
cc_test
(
test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor
)
add_subdirectory
(
detail
)
set
(
DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf
)
set
(
DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
op_library
(
send_op DEPS
${
DISTRIBUTE_DEPS
}
)
set_source_files_properties
(
send_op.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
op_library
(
recv_op DEPS
${
DISTRIBUTE_DEPS
}
)
set_source_files_properties
(
recv_op.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
cc_test
(
test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor
)
else
()
set
(
DEPS_OPS
${
DEPS_OPS
}
send_op recv_op
)
endif
()
op_library
(
cond_op
SRCS cond_op.cc DEPS framework_proto tensor operat
or net_op
)
op_library
(
cond_op
DEPS framework_proto tens
or net_op
)
op_library
(
cross_entropy_op DEPS cross_entropy
)
op_library
(
softmax_with_cross_entropy_op DEPS cross_entropy softmax
)
op_library
(
softmax_op DEPS softmax
)
...
...
@@ -243,22 +219,17 @@ op_library(pool_op DEPS pooling)
op_library
(
maxout_op DEPS maxouting
)
op_library
(
unpool_op DEPS unpooling
)
op_library
(
pool_with_index_op DEPS pooling
)
op_library
(
lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table
)
op_library
(
lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op
)
op_library
(
array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op
)
op_library
(
max_sequence_len_op SRCS max_sequence_len_op.cc DEPS lod_rank_table
)
op_library
(
tensor_array_read_write_op SRCS tensor_array_read_write_op.cc
)
if
(
WITH_GPU
)
op_library
(
nccl_op DEPS nccl_common
)
endif
()
op_library
(
lod_rank_table_op DEPS lod_rank_table
)
op_library
(
lod_tensor_to_array_op DEPS lod_rank_table_op
)
op_library
(
array_to_lod_tensor_op DEPS lod_rank_table_op
)
op_library
(
max_sequence_len_op DEPS lod_rank_table
)
op_library
(
sequence_conv_op DEPS context_project
)
op_library
(
sequence_pool_op DEPS sequence_pooling
)
op_library
(
lstm_op DEPS sequence2batch lstm_compute
)
op_library
(
conv_transpose_op DEPS vol2col
)
op_library
(
gru_op DEPS sequence2batch gru_compute
)
op_library
(
recurrent_op
SRCS recurrent_op.cc
DEPS executor
)
op_library
(
recurrent_op DEPS executor
)
op_library
(
cos_sim_op DEPS cos_sim_functor
)
# FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library
(
save_op DEPS lod_tensor
)
op_library
(
load_op DEPS lod_tensor
)
...
...
@@ -271,13 +242,12 @@ endforeach()
set
(
GLOB_OP_LIB
${
OP_LIBRARY
}
CACHE INTERNAL
"Global OP library"
)
cc_test
(
gather_test SRCS gather_test.cc DEPS tensor
)
cc_test
(
net_op_test SRCS net_op_test.cc DEPS net_op
)
cc_test
(
scatter_test SRCS scatter_test.cc DEPS tensor
)
cc_test
(
beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor
)
cc_test
(
strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory
)
if
(
WITH_GPU
)
cc_test
(
nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context
)
cc_test
(
nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context
)
endif
()
cc_test
(
save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op
)
paddle/operators/array_operator.h
浏览文件 @
812c5f60
...
...
@@ -35,8 +35,8 @@ class ArrayOp : public framework::OperatorBase {
PADDLE_ENFORCE_EQ
(
i_tensor
.
numel
(),
1
);
// get device context from pool
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
size_t
offset
;
if
(
platform
::
is_gpu_place
(
i_tensor
.
place
()))
{
...
...
paddle/operators/array_to_lod_tensor_op.cc
浏览文件 @
812c5f60
...
...
@@ -106,8 +106,9 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
}
auto
slice
=
out
->
Slice
(
out_offset
,
out_offset
+
len
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
framework
::
CopyFrom
(
x
[
x_idx
].
Slice
(
start_offset
,
end_offset
),
place
,
dev_ctx
,
&
slice
);
...
...
paddle/operators/assign_op.cc
浏览文件 @
812c5f60
...
...
@@ -82,8 +82,8 @@ class AssignOp : public framework::OperatorBase {
out
!=
nullptr
,
"The Output(Out) should not be null if the Input(X) is set."
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
framework
::
VisitVarType
(
*
x
,
AssignFunctor
(
out
,
dev_ctx
));
}
...
...
paddle/operators/beam_search_decode_op.cc
浏览文件 @
812c5f60
...
...
@@ -57,8 +57,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
:
OperatorBase
(
type
,
inputs
,
outputs
,
attrs
)
{}
void
Run
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
)
const
override
{
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
dev_place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
dev_place
);
framework
::
ExecutionContext
ctx
(
*
this
,
scope
,
dev_ctx
);
...
...
paddle/operators/cond_op.cc
浏览文件 @
812c5f60
...
...
@@ -195,8 +195,8 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
void
CondOp
::
Run
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
{
// get device context from pool
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
PrepareDataForSubnet
(
scope
,
dev_ctx
);
std
::
vector
<
framework
::
Scope
*>&
sub_scopes
=
GetSubScopes
(
scope
);
...
...
paddle/operators/conv_op.cc
浏览文件 @
812c5f60
...
...
@@ -31,8 +31,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
std
::
vector
<
int
>
paddings
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"paddings"
);
int
groups
=
ctx
->
Attrs
().
Get
<
int
>
(
"groups"
);
std
::
vector
<
int
>
dilations
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"dilations"
);
int
input_channels
=
in_dims
[
1
];
int
output_channels
=
filter_dims
[
0
];
PADDLE_ENFORCE
(
in_dims
.
size
()
==
4
||
in_dims
.
size
()
==
5
,
"Conv intput should be 4-D or 5-D tensor."
);
...
...
@@ -45,9 +43,13 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ
(
paddings
.
size
(),
strides
.
size
(),
"Conv paddings dimension and Conv strides dimension should be the same."
);
int
input_channels
=
in_dims
[
1
];
PADDLE_ENFORCE_EQ
(
input_channels
,
filter_dims
[
1
]
*
groups
,
"The number of input channels should be equal to filter "
"channels * groups."
);
int
output_channels
=
filter_dims
[
0
];
PADDLE_ENFORCE_EQ
(
output_channels
%
groups
,
0
,
"The number of output channels should be divided by groups."
);
...
...
paddle/operators/cross_entropy_op.cc
浏览文件 @
812c5f60
...
...
@@ -114,15 +114,15 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
CrossEntropyOpMaker
(
OpProto
*
proto
,
OpAttrChecker
*
op_checker
)
:
OpProtoAndCheckerMaker
(
proto
,
op_checker
)
{
AddInput
(
"X"
,
"(Tensor, default Tensor<float>), a 2-D tensor with shape
N x D,
"
"where N is the batch size and D is the number of classes. "
"(Tensor, default Tensor<float>), a 2-D tensor with shape
[N x D],
"
"
where N is the batch size and D is the number of classes. "
"This input is a probability computed by the previous operator, "
"which is almost always the result of a softmax operator."
);
AddInput
(
"Label"
,
"(Tensor), the ground truth which is a 2-D tensor. When "
"soft_label is set to false, Label is a Tensor<int64> with shape "
"[N x 1]. When soft_label is set to true, Label is a "
"Tensor<float/double> with shape [N x
K
]."
);
"Tensor<float/double> with shape [N x
D
]."
);
AddOutput
(
"Y"
,
"(Tensor, default Tensor<float>), a 2-D tensor with shape "
"[N x 1]. The cross entropy loss."
);
...
...
paddle/operators/feed_op.cc
浏览文件 @
812c5f60
...
...
@@ -49,8 +49,8 @@ class FeedOp : public framework::OperatorBase {
auto
*
out_item
=
out_var
->
GetMutable
<
framework
::
FeedFetchType
>
();
// get device context from pool
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
framework
::
CopyFrom
(
feed_item
,
place
,
dev_ctx
,
out_item
);
out_item
->
set_lod
(
feed_item
.
lod
());
...
...
paddle/operators/fetch_op.cc
浏览文件 @
812c5f60
...
...
@@ -52,8 +52,8 @@ class FetchOp : public framework::OperatorBase {
// FIXME(yuyang18): Should we assume the fetch operator always generate
// CPU outputs?
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
CopyFrom
(
src_item
,
platform
::
CPUPlace
(),
dev_ctx
,
&
dst_item
);
dev_ctx
.
Wait
();
...
...
paddle/operators/fill_constant_op.cc
浏览文件 @
812c5f60
...
...
@@ -49,8 +49,8 @@ class FillConstantOp : public framework::OperatorBase {
out
.
mutable_data
(
dev_place
,
framework
::
ToTypeIndex
(
data_type
));
}
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
dev_place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
dev_place
);
math
::
set_constant
(
dev_ctx
,
&
out
,
value
);
}
};
...
...
paddle/operators/fill_op.cc
浏览文件 @
812c5f60
...
...
@@ -69,8 +69,9 @@ class FillOp : public framework::OperatorBase {
if
(
!
force_cpu
&&
platform
::
is_gpu_place
(
place
))
{
// Copy tensor to out
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
framework
::
CopyFrom
(
tensor
,
place
,
dev_ctx
,
&
out
);
}
}
...
...
paddle/operators/load_op.cc
浏览文件 @
812c5f60
...
...
@@ -38,10 +38,10 @@ class LoadOp : public framework::OperatorBase {
out_var_name
);
auto
*
tensor
=
out_var
->
GetMutable
<
framework
::
LoDTensor
>
();
framework
::
DeserializeFromStream
(
fin
,
tensor
);
DeserializeFromStream
(
fin
,
tensor
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
if
(
platform
::
is_gpu_place
(
place
))
{
// copy CPU to GPU
...
...
paddle/operators/lod_tensor_to_array_op.cc
浏览文件 @
812c5f60
...
...
@@ -88,8 +88,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
auto
slice
=
out
[
i
].
Slice
(
static_cast
<
int
>
(
offset
),
static_cast
<
int
>
(
offset
+
len
));
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
framework
::
CopyFrom
(
x
.
Slice
(
static_cast
<
int
>
(
each_range
.
begin
),
static_cast
<
int
>
(
each_range
.
end
)),
...
...
paddle/operators/math/CMakeLists.txt
浏览文件 @
812c5f60
...
...
@@ -9,9 +9,9 @@ if(WITH_GPU)
nv_library
(
cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context
)
nv_library
(
pooling SRCS pooling.cc pooling.cu DEPS device_context
)
nv_library
(
sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function
)
nv_library
(
vol2col SRCS vol2col.cc vol2col.cu DEPS device_context
)
nv_library
(
vol2col SRCS vol2col.cc vol2col.cu DEPS device_context
tensor
)
nv_library
(
context_project SRCS context_project.cc context_project.cu DEPS device_context math_function
)
nv_library
(
sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context
)
nv_library
(
sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context
tensor
)
nv_library
(
lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions
)
nv_library
(
maxouting SRCS maxouting.cc maxouting.cu DEPS device_context
)
nv_library
(
unpooling SRCS unpooling.cc unpooling.cu DEPS device_context
)
...
...
@@ -24,9 +24,9 @@ else()
cc_library
(
cross_entropy SRCS cross_entropy.cc DEPS device_context
)
cc_library
(
pooling SRCS pooling.cc DEPS device_context
)
cc_library
(
sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function
)
cc_library
(
vol2col SRCS vol2col.cc DEPS device_context
)
cc_library
(
vol2col SRCS vol2col.cc DEPS device_context
tensor
)
cc_library
(
context_project SRCS context_project.cc DEPS device_context math_function
)
cc_library
(
sequence2batch SRCS sequence2batch.cc DEPS device_context
)
cc_library
(
sequence2batch SRCS sequence2batch.cc DEPS device_context
tensor
)
cc_library
(
lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions
)
cc_library
(
maxouting SRCS maxouting.cc DEPS device_context
)
cc_library
(
unpooling SRCS unpooling.cc DEPS device_context
)
...
...
paddle/operators/math/math_function.cc
浏览文件 @
812c5f60
...
...
@@ -302,8 +302,29 @@ void set_constant(const platform::DeviceContext& context,
#endif
}
template
<
typename
T
>
struct
RowwiseAdd
<
platform
::
CPUDeviceContext
,
T
>
{
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
framework
::
Tensor
&
vector
,
framework
::
Tensor
*
output
)
{
auto
in_dims
=
input
.
dims
();
auto
size
=
input
.
numel
()
/
in_dims
[
0
];
PADDLE_ENFORCE_EQ
(
vector
.
numel
(),
size
);
PADDLE_ENFORCE_EQ
(
output
->
dims
(),
in_dims
);
auto
in
=
framework
::
EigenMatrix
<
T
>::
From
(
input
);
auto
vec
=
framework
::
EigenVector
<
T
>::
Flatten
(
vector
);
auto
out
=
framework
::
EigenMatrix
<
T
>::
From
(
*
output
);
for
(
int64_t
i
=
0
;
i
<
in_dims
[
0
];
++
i
)
{
out
.
chip
(
i
,
0
)
=
in
.
chip
(
i
,
0
)
+
vec
;
}
}
};
template
struct
RowwiseAdd
<
platform
::
CPUDeviceContext
,
float
>;
template
struct
RowwiseAdd
<
platform
::
CPUDeviceContext
,
double
>;
template
struct
ColwiseSum
<
platform
::
CPUDeviceContext
,
float
>;
template
struct
ColwiseSum
<
platform
::
CPUDeviceContext
,
double
>;
...
...
paddle/operators/math/math_function.cu
浏览文件 @
812c5f60
...
...
@@ -273,6 +273,35 @@ void set_constant_with_place<platform::CUDAPlace>(
TensorSetConstantGPU
(
context
,
tensor
,
value
));
}
template
<
typename
T
>
__global__
void
RowwiseAddKernel
(
const
T
*
a
,
const
T
*
b
,
T
*
c
,
int
width
,
int
num
)
{
T
tmp
=
1.0
/
width
;
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
num
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
h
=
i
*
tmp
;
int
w
=
i
-
h
*
width
;
c
[
i
]
=
a
[
i
]
+
b
[
w
];
}
}
template
<
typename
T
>
struct
RowwiseAdd
<
platform
::
CUDADeviceContext
,
T
>
{
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
framework
::
Tensor
&
vector
,
framework
::
Tensor
*
output
)
{
auto
in_dims
=
input
.
dims
();
auto
size
=
input
.
numel
()
/
in_dims
[
0
];
PADDLE_ENFORCE_EQ
(
vector
.
numel
(),
size
);
PADDLE_ENFORCE_EQ
(
output
->
dims
(),
in_dims
);
int
blocks
=
512
;
int
grids
=
(
input
.
numel
()
+
blocks
-
1
)
/
blocks
;
RowwiseAddKernel
<
T
><<<
grids
,
blocks
,
0
,
context
.
stream
()
>>>
(
input
.
data
<
T
>
(),
vector
.
data
<
T
>
(),
output
->
data
<
T
>
(),
static_cast
<
int
>
(
in_dims
[
1
]),
static_cast
<
int
>
(
input
.
numel
()));
}
};
template
struct
RowwiseAdd
<
platform
::
CUDADeviceContext
,
float
>;
template
struct
RowwiseAdd
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
ColwiseSum
<
platform
::
CUDADeviceContext
,
float
>;
...
...
paddle/operators/math/math_function_impl.h
浏览文件 @
812c5f60
...
...
@@ -45,25 +45,6 @@ void Transpose<DeviceContext, T, Rank>::operator()(
eigen_out
.
device
(
*
dev
)
=
eigen_in
.
shuffle
(
permute
);
}
template
<
typename
DeviceContext
,
typename
T
>
void
RowwiseAdd
<
DeviceContext
,
T
>::
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
framework
::
Tensor
&
vector
,
framework
::
Tensor
*
output
)
{
auto
in_dims
=
input
.
dims
();
auto
size
=
input
.
numel
()
/
in_dims
[
0
];
PADDLE_ENFORCE_EQ
(
vector
.
numel
(),
size
);
PADDLE_ENFORCE_EQ
(
output
->
dims
(),
in_dims
);
auto
in
=
framework
::
EigenMatrix
<
T
>::
From
(
input
);
auto
vec
=
framework
::
EigenMatrix
<
T
>::
From
(
vector
);
auto
out
=
framework
::
EigenMatrix
<
T
>::
From
(
*
output
);
Eigen
::
array
<
int
,
2
>
shape
({{
1
,
static_cast
<
int
>
(
size
)}});
Eigen
::
array
<
int
,
2
>
bcast
({{
static_cast
<
int
>
(
in_dims
[
0
]),
1
}});
out
.
device
(
*
context
.
eigen_device
())
=
in
+
vec
.
reshape
(
shape
).
broadcast
(
bcast
);
}
template
<
typename
DeviceContext
,
typename
T
>
void
ColwiseSum
<
DeviceContext
,
T
>::
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
...
...
paddle/operators/merge_lod_tensor_op.cc
浏览文件 @
812c5f60
...
...
@@ -30,8 +30,8 @@ class MergeLoDTensorOp : public framework::OperatorBase {
void
Run
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
)
const
override
{
// get device context from pool
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
dev_place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
dev_place
);
auto
&
x
=
scope
.
FindVar
(
Input
(
"X"
))
->
Get
<
framework
::
LoDTensor
>
();
auto
&
mask
=
scope
.
FindVar
(
Input
(
"Mask"
))
->
Get
<
framework
::
LoDTensor
>
();
...
...
paddle/operators/nccl_op_test.cu.cc
浏览文件 @
812c5f60
...
...
@@ -305,7 +305,7 @@ int main(int argc, char **argv) {
}
VLOG
(
0
)
<<
" DeviceCount "
<<
count
;
paddle
::
platform
::
DeviceContextPool
::
Create
(
places
);
paddle
::
platform
::
DeviceContextPool
::
Init
(
places
);
testing
::
InitGoogleTest
(
&
argc
,
argv
);
...
...
paddle/operators/recurrent_op.cc
浏览文件 @
812c5f60
...
...
@@ -272,8 +272,9 @@ class RecurrentOp : public RecurrentBase {
false
/*create_local_scope*/
);
// get device context from pool
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
// Copy inside::output -> outside::output
// outside::output[seq_offset: seq_offset + 1] = inside::output
...
...
@@ -326,8 +327,8 @@ class RecurrentGradOp : public RecurrentBase {
auto
*
program
=
block
->
Program
();
// get device context from pool
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
for
(
size_t
step_id
=
0
;
step_id
<
seq_len
;
++
step_id
)
{
size_t
seq_offset
=
reverse
?
step_id
:
seq_len
-
step_id
-
1
;
...
...
paddle/operators/reorder_lod_tensor_by_rank_op.cc
浏览文件 @
812c5f60
...
...
@@ -131,8 +131,8 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
auto
x_sliced
=
x
.
Slice
(
x_offset
,
x_offset
+
len
);
auto
out_sliced
=
out
->
Slice
(
out_offset
,
out_offset
+
len
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
framework
::
CopyFrom
(
x_sliced
,
out_sliced
.
place
(),
dev_ctx
,
&
out_sliced
);
out_offset
+=
len
;
return
out_offset
;
...
...
paddle/operators/save_op.cc
浏览文件 @
812c5f60
...
...
@@ -91,8 +91,8 @@ class SaveOp : public framework::OperatorBase {
auto
&
tensor
=
var
->
Get
<
framework
::
LoDTensor
>
();
// get device context from pool
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
framework
::
SerializeToStream
(
fout
,
tensor
,
dev_ctx
);
}
...
...
paddle/operators/send_op.cc
浏览文件 @
812c5f60
...
...
@@ -79,7 +79,7 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment
(
R"DOC(
Recv operator
This operator will
recv tensor from send_op
This operator will
send tensor to recv_op.
)DOC"
);
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"endpoints"
,
"(string vector, default 127.0.0.1:6164)"
...
...
paddle/operators/shrink_rnn_memory_op.cc
浏览文件 @
812c5f60
...
...
@@ -106,8 +106,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
dx_tensor
.
mutable_data
(
x_tensor
.
place
(),
x_tensor
.
type
());
// get device context from pool
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
if
(
dout_var
==
nullptr
)
{
// dx_tensor fill zero
math
::
set_constant
(
dev_ctx
,
&
dx_tensor
,
0.0
f
);
...
...
paddle/operators/split_lod_tensor_op.cc
浏览文件 @
812c5f60
...
...
@@ -45,8 +45,8 @@ class SplitLoDTensorOp : public framework::OperatorBase {
auto
&
x_lod
=
x
.
lod
();
auto
&
mask_dim
=
mask
.
dims
();
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
dev_place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
dev_place
);
std
::
unique_ptr
<
framework
::
LoDTensor
>
cpu_mask
{
new
framework
::
LoDTensor
()};
if
(
platform
::
is_cpu_place
(
mask
.
place
()))
{
...
...
paddle/operators/tensor_array_read_write_op.cc
浏览文件 @
812c5f60
...
...
@@ -40,8 +40,9 @@ class WriteToArrayOp : public ArrayOp {
if
(
x_tensor
.
memory_size
()
>
0
)
{
auto
*
out_tensor
=
&
out
->
at
(
offset
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
CopyFrom
(
x_tensor
,
place
,
dev_ctx
,
out_tensor
);
out_tensor
->
set_lod
(
x_tensor
.
lod
());
...
...
@@ -132,8 +133,9 @@ class ReadFromArrayOp : public ArrayOp {
auto
*
out_tensor
=
out
->
GetMutable
<
framework
::
LoDTensor
>
();
size_t
offset
=
GetOffset
(
scope
,
place
);
if
(
offset
<
x_array
.
size
())
{
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
framework
::
CopyFrom
(
x_array
[
offset
],
place
,
dev_ctx
,
out_tensor
);
out_tensor
->
set_lod
(
x_array
[
offset
].
lod
());
}
else
{
...
...
paddle/platform/device_context.cc
浏览文件 @
812c5f60
...
...
@@ -17,7 +17,7 @@ namespace platform {
DeviceContextPool
*
DeviceContextPool
::
pool
=
nullptr
;
const
platform
::
DeviceContext
*
DeviceContextPool
::
Borrow
(
const
platform
::
DeviceContext
*
DeviceContextPool
::
Get
(
const
platform
::
Place
&
place
)
{
auto
it
=
device_contexts_
.
find
(
place
);
if
(
it
==
device_contexts_
.
end
())
{
...
...
@@ -28,24 +28,6 @@ const platform::DeviceContext* DeviceContextPool::Borrow(
return
it
->
second
;
}
std
::
vector
<
const
platform
::
DeviceContext
*>
DeviceContextPool
::
Borrow
(
const
std
::
vector
<
platform
::
Place
>&
places
)
{
PADDLE_ENFORCE_GT
(
places
.
size
(),
0
);
PADDLE_ENFORCE_LE
(
places
.
size
(),
device_contexts_
.
size
());
std
::
vector
<
const
platform
::
DeviceContext
*>
borrowed_contexts
;
for
(
auto
&
place
:
places
)
{
auto
it
=
device_contexts_
.
find
(
place
);
if
(
it
!=
device_contexts_
.
end
())
{
borrowed_contexts
.
emplace_back
(
it
->
second
);
}
else
{
PADDLE_THROW
(
"'Place' is not supported, Please re-compile with WITH_GPU "
"option"
);
}
}
return
borrowed_contexts
;
}
DeviceContextPool
::
DeviceContextPool
(
const
std
::
vector
<
platform
::
Place
>&
places
)
{
PADDLE_ENFORCE_GT
(
places
.
size
(),
0
);
...
...
paddle/platform/device_context.h
浏览文件 @
812c5f60
...
...
@@ -52,6 +52,14 @@ class CPUDeviceContext : public DeviceContext {
std
::
unique_ptr
<
Eigen
::
DefaultDevice
>
eigen_device_
;
};
template
<
typename
Place
>
struct
DefaultDeviceContextType
;
template
<
>
struct
DefaultDeviceContextType
<
platform
::
CPUPlace
>
{
using
TYPE
=
CPUDeviceContext
;
};
#ifdef PADDLE_WITH_CUDA
class
EigenCudaStreamDevice
;
...
...
@@ -90,6 +98,11 @@ class CUDADeviceContext : public DeviceContext {
cublasHandle_t
cublas_handle_
;
};
template
<
>
struct
DefaultDeviceContextType
<
platform
::
CUDAPlace
>
{
using
TYPE
=
CUDADeviceContext
;
};
class
CUDNNDeviceContext
:
public
CUDADeviceContext
{
public:
explicit
CUDNNDeviceContext
(
CUDAPlace
place
);
...
...
@@ -109,13 +122,13 @@ class DeviceContextPool {
public:
explicit
DeviceContextPool
(
const
std
::
vector
<
platform
::
Place
>&
places
);
static
DeviceContextPool
&
Get
()
{
static
DeviceContextPool
&
Instance
()
{
PADDLE_ENFORCE_NOT_NULL
(
pool
,
"Need to Create DeviceContextPool first!"
);
return
*
pool
;
}
/*! \brief Create should only called by Init function */
static
DeviceContextPool
&
Create
(
const
std
::
vector
<
platform
::
Place
>&
places
)
{
static
DeviceContextPool
&
Init
(
const
std
::
vector
<
platform
::
Place
>&
places
)
{
if
(
pool
==
nullptr
)
{
pool
=
new
DeviceContextPool
(
places
);
}
...
...
@@ -123,13 +136,14 @@ class DeviceContextPool {
}
/*! \brief Return handle of single device context. */
const
platform
::
DeviceContext
*
Borrow
(
const
platform
::
Place
&
place
);
/*! \brief Return handle of multi-device context. */
std
::
vector
<
const
platform
::
DeviceContext
*>
Borrow
(
const
std
::
vector
<
platform
::
Place
>&
places
);
const
platform
::
DeviceContext
*
Get
(
const
platform
::
Place
&
place
);
~
DeviceContextPool
()
{}
template
<
typename
Place
>
const
typename
DefaultDeviceContextType
<
Place
>::
TYPE
*
GetByPlace
(
const
Place
&
place
)
{
return
reinterpret_cast
<
const
typename
DefaultDeviceContextType
<
Place
>::
TYPE
*>
(
Get
(
place
));
}
private:
static
DeviceContextPool
*
pool
;
...
...
paddle/platform/device_context_test.cu
浏览文件 @
812c5f60
...
...
@@ -71,35 +71,20 @@ TEST(Device, DeviceContextPool) {
using
paddle
::
platform
::
CPUPlace
;
using
paddle
::
platform
::
CUDAPlace
;
DeviceContextPool
&
pool
=
DeviceContextPool
::
Get
();
auto
cpu_dev_ctx1
=
pool
.
Borrow
(
CPUPlace
());
auto
cpu_dev_ctx2
=
pool
.
Borrow
(
CPUPlace
());
EXPECT_TRUE
(
cpu_dev_ctx2
==
cpu_dev_ctx1
);
DeviceContextPool
&
pool
=
DeviceContextPool
::
Instance
();
auto
cpu_dev_ctx1
=
pool
.
Get
(
CPUPlace
());
auto
cpu_dev_ctx2
=
pool
.
Get
(
CPUPlace
());
ASSERT_EQ
(
cpu_dev_ctx2
,
cpu_dev_ctx1
);
std
::
vector
<
Place
>
gpu_places
;
int
count
=
paddle
::
platform
::
GetCUDADeviceCount
();
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
gpu_places
.
emplace_back
(
CUDAPlace
(
i
));
}
auto
dev_ctxs
=
pool
.
Borrow
(
gpu_places
);
for
(
size_t
i
=
0
;
i
<
dev_ctxs
.
size
();
++
i
)
{
auto
*
dev_ctx
=
static_cast
<
const
CUDADeviceContext
*>
(
dev_ctxs
[
i
]);
// check same as CUDAPlace(i)
CUDAPlace
place
=
boost
::
get
<
CUDAPlace
>
(
dev_ctx
->
GetPlace
());
EXPECT_EQ
(
place
.
GetDeviceId
(),
static_cast
<
int
>
(
i
));
auto
dev_ctx
=
pool
.
Get
(
CUDAPlace
(
i
));
ASSERT_NE
(
dev_ctx
,
nullptr
);
}
}
int
main
(
int
argc
,
char
**
argv
)
{
int
dev_count
=
paddle
::
platform
::
GetCUDADeviceCount
();
if
(
dev_count
<=
1
)
{
LOG
(
WARNING
)
<<
"Cannot test multi-gpu DeviceContextPool, because the CUDA "
"device count is "
<<
dev_count
;
return
0
;
}
std
::
vector
<
paddle
::
platform
::
Place
>
places
;
places
.
emplace_back
(
paddle
::
platform
::
CPUPlace
());
...
...
@@ -109,7 +94,7 @@ int main(int argc, char** argv) {
}
VLOG
(
0
)
<<
" DeviceCount "
<<
count
;
paddle
::
platform
::
DeviceContextPool
::
Create
(
places
);
paddle
::
platform
::
DeviceContextPool
::
Init
(
places
);
testing
::
InitGoogleTest
(
&
argc
,
argv
);
return
RUN_ALL_TESTS
();
...
...
paddle/platform/for_range.h
浏览文件 @
812c5f60
...
...
@@ -62,7 +62,7 @@ struct ForRange<CUDADeviceContext> {
template
<
typename
Function
>
inline
void
operator
()(
Function
func
)
const
{
constexpr
size_
t
num_threads
=
1024
;
constexpr
in
t
num_threads
=
1024
;
int
block_size
=
limit_
<=
num_threads
?
limit_
:
num_threads
;
int
grid_size
=
(
limit_
+
num_threads
-
1
)
/
num_threads
;
...
...
paddle/platform/nccl_test.cu
浏览文件 @
812c5f60
...
...
@@ -144,7 +144,7 @@ int main(int argc, char** argv) {
}
VLOG
(
0
)
<<
" DeviceCount "
<<
count
;
paddle
::
platform
::
DeviceContextPool
::
Create
(
places
);
paddle
::
platform
::
DeviceContextPool
::
Init
(
places
);
testing
::
InitGoogleTest
(
&
argc
,
argv
);
return
RUN_ALL_TESTS
();
...
...
paddle/platform/place.h
浏览文件 @
812c5f60
...
...
@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <iostream>
#include "paddle/platform/enforce.h"
#include "paddle/platform/variant.h"
namespace
paddle
{
...
...
@@ -64,5 +64,31 @@ bool places_are_same_class(const Place &, const Place &);
std
::
ostream
&
operator
<<
(
std
::
ostream
&
,
const
Place
&
);
template
<
typename
Visitor
>
struct
PlaceVisitorWrapper
:
public
boost
::
static_visitor
<
typename
Visitor
::
result_type
>
{
const
Visitor
&
visitor_
;
explicit
PlaceVisitorWrapper
(
const
Visitor
&
visitor
)
:
visitor_
(
visitor
)
{}
typename
Visitor
::
result_type
operator
()(
const
CPUPlace
&
cpu
)
const
{
return
visitor_
(
cpu
);
}
typename
Visitor
::
result_type
operator
()(
const
CUDAPlace
&
cuda
)
const
{
#ifdef PADDLE_WITH_CUDA
return
visitor_
(
cuda
);
#else
PADDLE_THROW
(
"Paddle is not compiled with CUDA. Cannot visit cuda device"
);
return
typename
Visitor
::
result_type
();
#endif
}
};
template
<
typename
Visitor
>
typename
Visitor
::
result_type
VisitPlace
(
const
Place
&
place
,
const
Visitor
&
visitor
)
{
return
boost
::
apply_visitor
(
PlaceVisitorWrapper
<
Visitor
>
(
visitor
),
place
);
}
}
// namespace platform
}
// namespace paddle
paddle/pybind/CMakeLists.txt
浏览文件 @
812c5f60
...
...
@@ -3,6 +3,7 @@ if(WITH_PYTHON)
SRCS pybind.cc exception.cc protobuf.cc const_value.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init
${
GLOB_OP_LIB
}
)
target_link_libraries
(
paddle_pybind rt
)
endif
(
WITH_PYTHON
)
if
(
WITH_DOC
)
...
...
paddle/pybind/protobuf.cc
浏览文件 @
812c5f60
...
...
@@ -171,12 +171,23 @@ void BindBlockDesc(py::module &m) {
std
::
string
name
=
byte_name
;
return
self
.
HasVar
(
name
);
})
.
def
(
"has_var_recursive"
,
[](
BlockDesc
&
self
,
py
::
bytes
byte_name
)
{
std
::
string
name
=
byte_name
;
return
self
.
HasVarRecursive
(
name
);
})
.
def
(
"find_var"
,
[](
BlockDesc
&
self
,
py
::
bytes
byte_name
)
{
std
::
string
name
=
byte_name
;
return
self
.
FindVar
(
name
);
},
py
::
return_value_policy
::
reference
)
.
def
(
"find_var_recursive"
,
[](
BlockDesc
&
self
,
py
::
bytes
byte_name
)
{
std
::
string
name
=
byte_name
;
return
self
.
FindVarRecursive
(
name
);
},
py
::
return_value_policy
::
reference
)
.
def
(
"all_vars"
,
&
BlockDesc
::
AllVars
,
py
::
return_value_policy
::
reference
)
.
def
(
"op_size"
,
&
BlockDesc
::
OpSize
)
.
def
(
"op"
,
&
BlockDesc
::
Op
,
py
::
return_value_policy
::
reference
)
...
...
@@ -204,7 +215,7 @@ void BindVarDsec(py::module &m) {
.
def
(
"set_shape"
,
&
VarDesc
::
SetShape
)
.
def
(
"set_dtype"
,
&
VarDesc
::
SetDataType
)
.
def
(
"shape"
,
&
VarDesc
::
Shape
,
py
::
return_value_policy
::
reference
)
.
def
(
"dtype"
,
&
VarDesc
::
GetDataType
)
.
def
(
"dtype"
,
&
VarDesc
::
GetDataType
,
py
::
return_value_policy
::
reference
)
.
def
(
"lod_level"
,
&
VarDesc
::
GetLodLevel
)
.
def
(
"set_lod_level"
,
&
VarDesc
::
SetLoDLevel
)
.
def
(
"type"
,
&
VarDesc
::
GetType
)
...
...
@@ -236,14 +247,22 @@ void BindOpDesc(py::module &m) {
.
value
(
"BLOCK"
,
proto
::
AttrType
::
BLOCK
);
py
::
class_
<
OpDesc
>
op_desc
(
m
,
"OpDesc"
,
""
);
op_desc
.
def
(
"type"
,
&
OpDesc
::
Type
)
op_desc
.
def
(
"__init__"
,
[](
OpDesc
&
self
)
{
new
(
&
self
)
OpDesc
();
},
py
::
return_value_policy
::
reference
)
.
def
(
"copy_from"
,
&
OpDesc
::
CopyFrom
)
.
def
(
"type"
,
&
OpDesc
::
Type
)
.
def
(
"set_type"
,
&
OpDesc
::
SetType
)
.
def
(
"input"
,
&
OpDesc
::
Input
)
.
def
(
"input_names"
,
&
OpDesc
::
InputNames
)
.
def
(
"set_input"
,
&
OpDesc
::
SetInput
)
.
def
(
"output"
,
&
OpDesc
::
Output
)
.
def
(
"output_names"
,
&
OpDesc
::
OutputNames
)
.
def
(
"set_input"
,
&
OpDesc
::
SetInput
)
.
def
(
"set_output"
,
&
OpDesc
::
SetOutput
)
.
def
(
"input_arg_names"
,
&
OpDesc
::
InputArgumentNames
)
.
def
(
"output_arg_names"
,
&
OpDesc
::
OutputArgumentNames
)
.
def
(
"rename_input"
,
&
OpDesc
::
RenameInput
)
.
def
(
"rename_output"
,
&
OpDesc
::
RenameOutput
)
.
def
(
"has_attr"
,
&
OpDesc
::
HasAttr
)
.
def
(
"attr_type"
,
&
OpDesc
::
GetAttrType
)
.
def
(
"attr_names"
,
&
OpDesc
::
AttrNames
)
...
...
paddle/pybind/pybind.cc
浏览文件 @
812c5f60
...
...
@@ -269,23 +269,22 @@ All parameter, weight, gradient are variables in Paddle.
}
return
ret_values
;
});
m
.
def
(
"get_grad_op_descs"
,
[](
const
OpDesc
&
op_desc
,
const
std
::
unordered_set
<
std
::
string
>
&
no_grad_set
,
std
::
unordered_map
<
std
::
string
,
std
::
string
>
&
grad_to_var
,
const
std
::
vector
<
BlockDesc
*>
&
grad_sub_block
)
{
std
::
vector
<
std
::
unique_ptr
<
OpDesc
>>
grad_op_descs
=
framework
::
OpInfoMap
::
Instance
()
.
Get
(
op_desc
.
Type
())
.
GradOpMaker
()(
op_desc
,
no_grad_set
,
&
grad_to_var
,
grad_sub_block
);
std
::
vector
<
OpDesc
*>
grad_op_desc_ptrs
(
grad_op_descs
.
size
());
std
::
transform
(
grad_op_descs
.
begin
(),
grad_op_descs
.
end
(),
grad_op_desc_ptrs
.
begin
(),
[](
std
::
unique_ptr
<
OpDesc
>
&
p
)
{
return
p
.
release
();
});
return
grad_op_desc_ptrs
;
});
m
.
def
(
"get_grad_op_desc"
,
[](
const
OpDesc
&
op_desc
,
const
std
::
unordered_set
<
std
::
string
>
&
no_grad_set
,
const
std
::
vector
<
BlockDesc
*>
&
grad_sub_block
)
{
std
::
unordered_map
<
std
::
string
,
std
::
string
>
grad_to_var
;
std
::
vector
<
std
::
unique_ptr
<
OpDesc
>>
grad_op_descs
=
framework
::
OpInfoMap
::
Instance
()
.
Get
(
op_desc
.
Type
())
.
GradOpMaker
()(
op_desc
,
no_grad_set
,
&
grad_to_var
,
grad_sub_block
);
std
::
vector
<
OpDesc
*>
grad_op_desc_ptrs
(
grad_op_descs
.
size
());
std
::
transform
(
grad_op_descs
.
begin
(),
grad_op_descs
.
end
(),
grad_op_desc_ptrs
.
begin
(),
[](
std
::
unique_ptr
<
OpDesc
>
&
p
)
{
return
p
.
release
();
});
return
std
::
make_pair
(
grad_op_desc_ptrs
,
grad_to_var
);
});
m
.
def
(
"prune"
,
[](
const
ProgramDesc
&
origin
,
const
std
::
vector
<
std
::
array
<
size_t
,
2
>>
&
targets
)
{
ProgramDesc
prog_with_targets
(
origin
);
...
...
@@ -301,6 +300,8 @@ All parameter, weight, gradient are variables in Paddle.
InferenceOptimize
(
*
(
origin
.
Proto
()),
&
pruned_desc
);
return
new
ProgramDesc
(
pruned_desc
);
});
m
.
def
(
"empty_var_name"
,
[]()
{
return
framework
::
kEmptyVarName
;
});
m
.
def
(
"grad_var_suffix"
,
[]()
{
return
framework
::
kGradVarSuffix
;
});
m
.
def_submodule
(
"var_names"
,
"The module will return special predefined variable name in Paddle"
)
...
...
paddle/pybind/tensor_py.h
浏览文件 @
812c5f60
...
...
@@ -63,9 +63,10 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
auto
*
dst_ptr
=
static_cast
<
void
*>
(
dst_tensor
.
mutable_data
<
CUR_TYPE
>
(
tensor
.
dims
(),
platform
::
CPUPlace
()));
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
dev_ctx
=
static_cast
<
const
platform
::
CUDADeviceContext
*>
(
pool
.
Borrow
(
tensor
.
place
()));
pool
.
Get
(
tensor
.
place
()));
paddle
::
platform
::
GpuMemcpyAsync
(
dst_ptr
,
src_ptr
,
sizeof
(
CUR_TYPE
)
*
tensor
.
numel
(),
...
...
@@ -137,9 +138,9 @@ void PyCUDATensorSetFromArray(
self
.
Resize
(
framework
::
make_ddim
(
dims
));
auto
*
dst
=
self
.
mutable_data
<
T
>
(
place
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
dev_ctx
=
static_cast
<
const
platform
::
CUDADeviceContext
*>
(
pool
.
Borrow
(
place
));
static_cast
<
const
platform
::
CUDADeviceContext
*>
(
pool
.
Get
(
place
));
paddle
::
platform
::
GpuMemcpyAsync
(
dst
,
array
.
data
(),
sizeof
(
T
)
*
array
.
size
(),
cudaMemcpyHostToDevice
,
dev_ctx
->
stream
());
}
...
...
paddle/scripts/docker/build.sh
浏览文件 @
812c5f60
...
...
@@ -178,7 +178,7 @@ EOF
# run paddle version to install python packages first
RUN apt-get update &&
\
${
NCCL_DEPS
}
\
apt-get install -y wget python-pip dmidecode && pip install -U pip &&
\
apt-get install -y wget python-pip dmidecode
python-tk
&& pip install -U pip &&
\
pip install /*.whl; apt-get install -f -y &&
\
apt-get clean -y &&
\
rm -f /*.whl &&
\
...
...
paddle/scripts/submit_local.sh.in
浏览文件 @
812c5f60
...
...
@@ -71,9 +71,7 @@ function threads_config() {
# auto set OMP_NUM_THREADS and MKL_NUM_THREADS
# according to trainer_count and total processors
# only when MKL enabled
if
[
"@WITH_MKL@"
==
"OFF"
]
;
then
return
0
fi
# auto set OPENBLAS_NUM_THREADS when do not use MKL
processors
=
`
grep
"processor"
/proc/cpuinfo|sort
-u
|wc
-l
`
trainers
=
`
grep
-Eo
'trainer_count.[0-9]+'
<<<
"
$@
"
|grep
-Eo
'[0-9]+'
|xargs
`
if
[
-z
$trainers
]
;
then
...
...
@@ -83,12 +81,19 @@ function threads_config() {
if
[
$threads
-eq
0
]
;
then
threads
=
1
fi
if
[
-z
"
$OMP_NUM_THREADS
"
]
;
then
export
OMP_NUM_THREADS
=
$threads
fi
if
[
-z
"
$MKL_NUM_THREADS
"
]
;
then
export
MKL_NUM_THREADS
=
$threads
if
[
"@WITH_MKL@"
==
"ON"
]
;
then
if
[
-z
"
$OMP_NUM_THREADS
"
]
;
then
export
OMP_NUM_THREADS
=
$threads
fi
if
[
-z
"
$MKL_NUM_THREADS
"
]
;
then
export
MKL_NUM_THREADS
=
$threads
fi
else
if
[
-z
"
$OPENBLAS_NUM_THREADS
"
]
;
then
export
OPENBLAS_NUM_THREADS
=
$threads
fi
fi
}
PADDLE_CONF_HOME
=
"
$HOME
/.config/paddle"
...
...
@@ -150,7 +155,7 @@ fi
case
"
$1
"
in
"train"
)
threads_config
$@
# echo $OMP_NUM_THREADS $MKL_NUM_THREADS
# echo $OMP_NUM_THREADS $MKL_NUM_THREADS
$OPENBLAS_NUM_THREADS
${
DEBUGGER
}
$PADDLE_BIN_PATH
/paddle_trainer
${
@
:2
}
;;
"merge_model"
)
...
...
python/paddle/v2/dataset/flowers.py
浏览文件 @
812c5f60
...
...
@@ -44,7 +44,7 @@ __all__ = ['train', 'test', 'valid']
DATA_URL
=
'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
LABEL_URL
=
'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
SETID_URL
=
'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
DATA_MD5
=
'
52808999861908f626f3c1f4e79d11fa
'
DATA_MD5
=
'
33bfc11892f1e405ca193ae9a9f2a118
'
LABEL_MD5
=
'e0620be6f572b9609742df49c70aed4d'
SETID_MD5
=
'a5357ecc9cb78c4bef273ce3793fc85c'
# In official 'readme', tstid is the flag of test data
...
...
python/paddle/v2/fluid/__init__.py
浏览文件 @
812c5f60
...
...
@@ -36,7 +36,7 @@ def __read_gflags_from_env__():
"""
import
sys
import
core
read_env_flags
=
[
'use_pinned_memory'
]
read_env_flags
=
[
'use_pinned_memory'
,
'check_nan_inf'
]
if
core
.
is_compile_gpu
():
read_env_flags
.
append
(
'fraction_of_gpu_memory_to_use'
)
core
.
init_gflags
([
sys
.
argv
[
0
]]
+
...
...
python/paddle/v2/fluid/backward.py
浏览文件 @
812c5f60
from
paddle.v2.fluid
import
framework
as
framework
from
.
import
core
import
collections
__all__
=
[
'append_backward
_ops
'
]
__all__
=
[
'append_backward'
]
def
append_backward_ops
(
loss
,
parameter_list
=
None
,
no_grad_set
=
None
):
def
_rename_arg_
(
op_desc_list
,
old_name
,
new_name
,
begin_idx
=
None
,
end_idx
=
None
):
if
begin_idx
is
None
:
begin_idx
=
0
if
end_idx
is
None
:
end_idx
=
len
(
op_desc_list
)
for
i
in
range
(
begin_idx
,
end_idx
):
op_desc
=
op_desc_list
[
i
]
if
isinstance
(
op_desc
,
tuple
):
op_desc
=
op_desc
[
0
]
op_desc
.
rename_input
(
old_name
,
new_name
)
op_desc
.
rename_output
(
old_name
,
new_name
)
def
_create_op_desc_
(
op_type
,
inputs
,
outputs
,
attrs
):
op_desc
=
core
.
OpDesc
()
op_desc
.
set_type
(
op_type
)
for
para
,
args
in
inputs
.
iteritems
():
op_desc
.
set_input
(
para
,
args
)
for
para
,
args
in
outputs
.
iteritems
():
op_desc
.
set_output
(
para
,
args
)
for
name
,
val
in
attrs
.
iteritems
():
if
isinstance
(
val
,
framework
.
Block
):
op_desc
.
set_block_attr
(
name
,
val
.
desc
)
else
:
op_desc
.
set_attr
(
name
,
val
)
return
op_desc
def
_infer_var_data_type_
(
var_name
,
block
):
grad_var
=
block
.
desc
.
find_var
(
var_name
.
encode
(
"ascii"
))
fwd_name
=
_strip_grad_suffix_
(
var_name
.
encode
(
"ascii"
))
if
block
.
desc
.
has_var_recursive
(
fwd_name
):
fwd_var
=
block
.
desc
.
find_var_recursive
(
fwd_name
.
encode
(
"ascii"
))
grad_var
.
set_dtype
(
fwd_var
.
dtype
())
else
:
grad_var
.
set_dtype
(
core
.
DataType
.
FP32
)
def
_all_in_set_
(
cands
,
s
):
for
c
in
cands
:
if
not
c
in
s
:
return
False
return
True
def
_strip_grad_suffix_
(
name
):
pos
=
name
.
find
(
core
.
grad_var_suffix
())
return
name
[:
pos
]
if
pos
!=
-
1
else
name
def
_append_grad_suffix_
(
name
):
return
name
+
core
.
grad_var_suffix
()
def
_addup_repetitive_outputs_
(
op_descs
):
# In backward part, an variable my be the output of more than one ops.
# In this case, the variable should be the accumulation of all the outputs.
# We adopt adding `sum_op`s to implement the accumulate.
pending_sum_ops
=
[]
var_rename_count
=
collections
.
defaultdict
(
int
)
renamed_vars
=
collections
.
defaultdict
(
list
)
for
idx
,
op_desc
in
enumerate
(
op_descs
):
for
var_name
in
op_desc
.
input_arg_names
():
if
len
(
renamed_vars
[
var_name
])
>
1
:
pending_sum_ops
.
append
(
(
_create_op_desc_
(
"sum"
,
{
"X"
:
renamed_vars
[
var_name
]},
{
"Out"
:
[
var_name
]},
{}),
idx
))
renamed_vars
[
var_name
]
=
[
var_name
]
for
var_name
in
op_desc
.
output_arg_names
():
if
var_name
==
core
.
empty_var_name
(
)
or
var_name
in
op_desc
.
input_arg_names
():
# empty variable or inplace op
continue
if
len
(
renamed_vars
[
var_name
])
==
0
:
# it's the first time we get the variable
renamed_vars
[
var_name
]
=
[
var_name
]
else
:
if
len
(
renamed_vars
[
var_name
])
==
1
:
new_name
=
var_name
+
"@RENAME@"
+
\
str
(
var_rename_count
[
var_name
])
var_rename_count
[
var_name
]
+=
1
# rename original var_name
renamed_vars
[
var_name
][
0
]
=
new_name
_rename_arg_
(
op_descs
,
var_name
,
new_name
,
0
,
idx
)
_rename_arg_
(
pending_sum_ops
,
var_name
,
new_name
)
new_name
=
var_name
+
"@RENAME@"
+
\
str
(
var_rename_count
[
var_name
])
var_rename_count
[
var_name
]
+=
1
op_desc
.
rename_output
(
var_name
,
new_name
)
renamed_vars
[
var_name
].
append
(
new_name
)
for
var_name
,
inputs
in
renamed_vars
.
iteritems
():
if
len
(
inputs
)
>
1
:
pending_sum_ops
.
append
((
_create_op_desc_
(
"sum"
,
{
"X"
:
inputs
},
{
"Out"
:
[
var_name
]},
{}),
len
(
op_descs
)))
# sum_op descs are sorted according to their insert position
for
p
in
reversed
(
pending_sum_ops
):
op_descs
.
insert
(
p
[
1
],
p
[
0
])
return
op_descs
def
_remove_no_grad_branch_
(
op_descs
,
no_grad_set
):
# Remove ops whose outputs are all in no_grad_dict
op_descs
=
filter
(
lambda
op_desc
:
not
_all_in_set_
(
op_desc
.
output_arg_names
(),
no_grad_set
),
op_descs
)
# Insert fill_zeros_like_op
to_insert
=
[]
for
idx
,
op_desc
in
enumerate
(
op_descs
):
for
arg
in
op_desc
.
input_arg_names
():
if
core
.
grad_var_suffix
()
in
arg
and
arg
in
no_grad_set
:
to_insert
.
append
((
_create_op_desc_
(
"fill_zeros_like"
,
{
"X"
:
[
_strip_grad_suffix_
(
arg
)]
},
{
"Y"
:
[
arg
]},
{}),
idx
))
map
(
lambda
p
:
op_descs
.
insert
(
p
[
1
],
p
[
0
]),
reversed
(
to_insert
))
return
op_descs
def
_append_backward_ops_
(
target
,
block
,
target_block
,
no_grad_dict
,
grad_to_var
,
callback
=
None
):
grad_op_descs
=
[]
program
=
block
.
program
for
op
in
reversed
(
block
.
ops
):
grad_sub_block_list
=
[]
# If the op has its own sub-block, deal with the sub-block first
if
op
.
has_attr
(
"sub_block"
):
sub_block
=
program
.
block
(
op
.
block_attr
(
"sub_block"
))
grad_sub_block
=
program
.
create_block
(
parent_idx
=
sub_block
.
idx
)
_append_backward_ops_
(
target
,
sub_block
,
grad_sub_block
,
no_grad_dict
,
grad_to_var
,
callback
)
grad_sub_block_list
.
append
(
grad_sub_block
.
desc
)
grad_op_desc
,
op_grad_to_var
=
core
.
get_grad_op_desc
(
op
.
desc
,
no_grad_dict
[
block
.
idx
],
grad_sub_block_list
)
grad_op_descs
.
extend
(
grad_op_desc
)
grad_to_var
.
update
(
op_grad_to_var
)
grad_op_descs
=
_addup_repetitive_outputs_
(
grad_op_descs
)
grad_op_descs
=
_remove_no_grad_branch_
(
grad_op_descs
,
no_grad_dict
[
block
.
idx
])
if
target_block
.
idx
==
0
:
grad_op_descs
.
insert
(
0
,
_create_op_desc_
(
"fill_constant"
,
{},
{
"Out"
:
[
_append_grad_suffix_
(
target
.
name
)]
},
{
"shape"
:
[
1
],
"value"
:
1.0
,
"dtype"
:
target
.
dtype
}))
# append op_desc in grad_op_descs to target_block
for
op_desc
in
grad_op_descs
:
new_op_desc
=
target_block
.
desc
.
append_op
()
new_op_desc
.
copy_from
(
op_desc
)
def
_append_backward_vars_
(
block
,
start_op_idx
,
grad_to_var
,
grad_info_map
):
for
op_idx
in
range
(
start_op_idx
,
block
.
desc
.
op_size
()):
op_desc
=
block
.
desc
.
op
(
op_idx
)
if
op_desc
.
has_attr
(
"sub_block"
):
sub_block
=
block
.
program
.
block
(
op_desc
.
block_attr
(
"sub_block"
))
_append_backward_vars_
(
sub_block
,
0
,
grad_to_var
,
grad_info_map
)
new_vars
=
set
()
# create new gradient variables
for
grad_var_name
in
op_desc
.
output_arg_names
():
grad_var_name
=
grad_var_name
.
encode
(
"ascii"
)
if
block
.
desc
.
has_var_recursive
(
grad_var_name
)
or
grad_var_name
==
core
.
empty_var_name
():
continue
block
.
desc
.
var
(
grad_var_name
)
new_vars
.
add
(
grad_var_name
)
if
not
grad_to_var
.
has_key
(
grad_var_name
):
continue
grad_info_map
[
grad_to_var
[
grad_var_name
]]
=
(
grad_var_name
,
block
)
# infer_shape and infer_type
op_desc
.
infer_var_type
(
block
.
desc
)
op_desc
.
infer_shape
(
block
.
desc
)
for
arg
in
op_desc
.
output_arg_names
():
if
arg
in
new_vars
:
_infer_var_data_type_
(
arg
,
block
)
def
append_backward
(
loss
,
parameter_list
=
None
,
no_grad_set
=
None
):
"""
Create and add gradient Operators in BlockDesc to compute
gradients of `loss` for parameters in parameter_list
:param loss: an variable generated by cost function.
:type loss: Variable
:param no_grad_
se
t: variable that should not create gradient
:type no_grad_
se
t: set
:param no_grad_
dic
t: variable that should not create gradient
:type no_grad_
dic
t: set
:param parameter_list: parameters that need to compute gradient and
update to optimize the lost.
:type: list
...
...
@@ -20,35 +212,53 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None):
"""
assert
isinstance
(
loss
,
framework
.
Variable
)
program
=
loss
.
block
.
program
no_grad_dict
=
dict
()
if
no_grad_set
is
None
:
program
=
loss
.
block
.
program
assert
isinstance
(
program
,
framework
.
Program
)
no_grad_set
=
list
()
for
block
in
program
.
blocks
:
assert
isinstance
(
block
,
framework
.
Block
)
block_no_grad_set
=
set
()
for
var
in
block
.
vars
.
itervalues
():
assert
isinstance
(
var
,
framework
.
Variable
)
if
var
.
stop_gradient
:
no_grad_set
.
append
(
var
.
name
)
no_grad_set
=
set
(
no_grad_set
)
block_no_grad_set
.
add
(
_append_grad_suffix_
(
var
.
name
))
no_grad_dict
[
block
.
idx
]
=
block_no_grad_set
elif
isinstance
(
no_grad_set
,
set
):
no_grad_dict
=
{
0
:
no_grad_set
}
else
:
raise
ValueError
(
"'no_grad_set' should be a set or None."
)
grad_info_map
=
dict
()
root_block
=
program
.
block
(
0
)
fwd_op_num
=
root_block
.
desc
.
op_size
()
current_block_idx
=
program
.
current_block_idx
grad_to_var
=
dict
()
_append_backward_ops_
(
loss
,
root_block
,
root_block
,
no_grad_dict
,
grad_to_var
)
_append_backward_vars_
(
root_block
,
fwd_op_num
,
grad_to_var
,
grad_info_map
)
program
.
current_block_idx
=
current_block_idx
program
.
sync_with_cpp
()
param_grad_map
=
loss
.
block
.
program
.
append_backward
(
loss
,
no_grad_set
)
if
parameter_list
is
not
None
:
parameters
=
parameter_list
else
:
params
=
loss
.
block
.
program
.
global_block
().
all_parameters
()
params
=
program
.
global_block
().
all_parameters
()
parameters
=
[
param
.
name
for
param
in
params
]
params_and_grads
=
[]
for
param
in
parameters
:
if
param
not
in
param_grad
_map
:
if
param
not
in
grad_info
_map
:
raise
ValueError
(
"param %s is not in map"
%
param
)
grad_info
=
param_grad
_map
[
param
]
grad_block
=
loss
.
block
.
program
.
block
(
grad_info
[
1
])
grad_info
=
grad_info
_map
[
param
]
grad_block
=
grad_info
[
1
]
if
not
grad_block
.
has_var
(
grad_info
[
0
]):
raise
ValueError
(
"grad block[{0}] did not have grad var {1}"
.
format
(
grad_info
[
1
],
grad_info
[
0
]))
# Get the param var from the global block
param_var
=
loss
.
block
.
program
.
global_block
().
var
(
param
)
param_var
=
program
.
global_block
().
var
(
param
)
grad_var
=
grad_block
.
var
(
grad_info
[
0
])
if
loss
.
block
.
has_var
(
grad_info
[
0
]):
params_and_grads
.
append
((
param_var
,
grad_var
))
...
...
python/paddle/v2/fluid/data_feeder.py
浏览文件 @
812c5f60
...
...
@@ -3,7 +3,7 @@ import core
import
numpy
import
six.moves
as
six
from
framework
import
Variable
from
framework
import
Variable
,
default_main_program
__all__
=
[
'DataFeeder'
]
...
...
@@ -53,12 +53,16 @@ class DataToLoDTensorConverter(object):
class
DataFeeder
(
object
):
def
__init__
(
self
,
feed_list
,
place
):
def
__init__
(
self
,
feed_list
,
place
,
program
=
None
):
self
.
feed_dtypes
=
[]
self
.
feed_names
=
[]
self
.
feed_shapes
=
[]
self
.
feed_lod_level
=
[]
if
program
is
None
:
program
=
default_main_program
()
for
each_var
in
feed_list
:
if
isinstance
(
each_var
,
basestring
):
each_var
=
program
.
block
(
0
).
var
(
each_var
)
if
not
isinstance
(
each_var
,
Variable
):
raise
TypeError
(
"Feed list should contain a list of variable"
)
self
.
feed_dtypes
.
append
(
each_var
.
dtype
)
...
...
python/paddle/v2/fluid/framework.py
浏览文件 @
812c5f60
...
...
@@ -846,9 +846,11 @@ class Program(object):
self
.
sync_with_cpp
()
return
param_to_grad_info
def
create_block
(
self
):
def
create_block
(
self
,
parent_idx
=
None
):
new_block_idx
=
len
(
self
.
blocks
)
self
.
desc
.
append_block
(
self
.
current_block
().
desc
)
parent
=
self
.
current_block
()
if
parent_idx
is
None
else
self
.
block
(
parent_idx
)
self
.
desc
.
append_block
(
parent
.
desc
)
self
.
current_block_idx
=
new_block_idx
self
.
blocks
.
append
(
Block
(
self
,
self
.
current_block_idx
))
return
self
.
current_block
()
...
...
python/paddle/v2/fluid/io.py
浏览文件 @
812c5f60
...
...
@@ -188,7 +188,7 @@ def save_inference_model(dirname,
raise
ValueError
(
"'feed_var_names' should be a list of str."
)
if
isinstance
(
target_vars
,
Variable
):
feeded_var_names
=
[
feeded_var_name
s
]
target_vars
=
[
target_var
s
]
else
:
if
not
(
bool
(
target_vars
)
and
all
(
isinstance
(
var
,
Variable
)
for
var
in
target_vars
)):
...
...
python/paddle/v2/fluid/layers/nn.py
浏览文件 @
812c5f60
...
...
@@ -270,6 +270,7 @@ def gru_unit(input,
attr
=
helper
.
param_attr
,
shape
=
[
size
,
3
*
size
],
dtype
=
dtype
)
# create bias
if
bias
is
None
:
bias_size
=
[
1
,
3
*
size
]
bias
=
helper
.
create_parameter
(
...
...
@@ -358,7 +359,59 @@ def cos_sim(X, Y, **kwargs):
def
cross_entropy
(
input
,
label
,
**
kwargs
):
"""
This function computes cross_entropy using the input and label.
**Cross Entropy Layer**
This layer computes the cross entropy between `input` and `label`. It supports
both standard cross-entropy and soft-label cross-entropy loss computation.
1) One-hot cross-entropy:
`soft_label = False`, `Label[i, 0]` indicates the class index for sample i:
.. math::
Y[i] = -\log(X[i, Label[i]])
2) Soft-label cross-entropy:
`soft_label = True`, `Label[i, j]` indicates the soft label of class j
for sample i:
.. math::
Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
Please make sure that in this case the summation of each row of `label`
equals one.
3) One-hot cross-entropy with vecterized `label`:
As a special case of 2), when each row of 'label' has only one
non-zero element which is equal to 1, soft-label cross-entropy degenerates
to a one-hot cross-entropy with one-hot label representation.
Args:
input (Variable|list): a 2-D tensor with shape [N x D], where N is the
batch size and D is the number of classes. This input is a probability
computed by the previous operator, which is almost always the result
of a softmax operator.
label (Variable|list): the ground truth which is a 2-D tensor. When
`soft_label` is set to `False`, `label` is a tensor<int64> with shape
[N x 1]. When `soft_label` is set to `True`, `label` is a
tensor<float/double> with shape [N x D].
soft_label (bool, via `**kwargs`): a flag indicating whether to interpretate
the given labels as soft labels, default `False`.
Returns:
A 2-D tensor with shape [N x 1], the cross entropy loss.
Raises:
`ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \
`soft_label == True`, and the 2nd dimension of `input` and `label` are not
\
equal; 3) when `soft_label == False`, and the 2nd dimension of `label` is not 1.
Examples:
.. code-block:: python
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
"""
helper
=
LayerHelper
(
'cross_entropy'
,
**
kwargs
)
out
=
helper
.
create_tmp_variable
(
dtype
=
input
.
dtype
)
...
...
@@ -514,14 +567,83 @@ def conv2d(input,
groups
=
None
,
param_attr
=
None
,
bias_attr
=
None
,
act
=
None
,
name
=
None
):
act
=
None
):
"""
This function creates the op for a 2-dimensional Convolution.
This is performed using the parameters of filters(size, dimensionality etc)
, stride and other configurations for a Convolution operation.
This funciton can also append an activation on top of the
conv-2d output, if mentioned in the input parameters.
**Convlution2D Layer**
The convolution2D layer calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. Input(Input) and Output(Output)
are in NCHW format. Where N is batch size, C is the number of channels, H is the height
of the feature, and W is the width of the feature.
The details of convolution layer, please refer UFLDL's `convolution,
<http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ .
If bias attribution and activation type are provided, bias is added to the output of the convolution,
and the corresponding activation function is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W
\\
ast X + b)
In the above equation:
* :math:`X`: Input value, a tensor with NCHW format.
* :math:`W`: Filter value, a tensor with MCHW format.
* :math:`
\\
ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`
\\
sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
Input:
Input shape: $(N, C_{in}, H_{in}, W_{in})$
Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
Output:
Output shape: $(N, C_{out}, H_{out}, W_{out})$
Where
.. math::
H_{out}&=
\\
frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1
\\\\
W_{out}&=
\\
frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
Args:
input(Variable): The input image with [N, C, H, W] format.
num_filters(int): The number of filter. It is as same as the output
image channel.
filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
it must contain two integers, (filter_size_H, filter_size_W).
Otherwise, the filter will be a square.
stride(int|tuple): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride. Default: stride = 1.
padding(int|tuple): The padding size. If padding is a tuple, it must
contain two integers, (padding_H, padding_W). Otherwise, the
padding_H = padding_W = padding. Default: padding = 0.
groups(int): The groups number of the Conv2d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only
connected to the second half of the input channels. Default: groups=1
param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None
bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
act(str): Activation type. Default: None
Returns:
Variable: The tensor variable storing the convolution and
\
non-linearity activation result.
Raises:
ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch.
Examples:
.. code-block:: python
data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
"""
if
stride
is
None
:
...
...
python/paddle/v2/fluid/optimizer.py
浏览文件 @
812c5f60
from
collections
import
defaultdict
import
framework
from
backward
import
append_backward
_ops
from
backward
import
append_backward
from
framework
import
unique_name
,
program_guard
from
initializer
import
Constant
from
layer_helper
import
LayerHelper
...
...
@@ -194,10 +194,10 @@ class Optimizer(object):
no_grad_set
=
None
):
"""Add operations to minimize `loss` by updating `parameter_list`.
This method combines interface `append_backward
_ops
()` and
This method combines interface `append_backward()` and
`create_optimization_pass()` into one.
"""
params_grads
=
append_backward
_ops
(
loss
,
parameter_list
,
no_grad_set
)
params_grads
=
append_backward
(
loss
,
parameter_list
,
no_grad_set
)
params_grads
=
append_gradient_clip_ops
(
params_grads
)
...
...
python/paddle/v2/fluid/tests/op_test.py
浏览文件 @
812c5f60
...
...
@@ -4,7 +4,7 @@ import random
import
itertools
import
paddle.v2.fluid.core
as
core
import
collections
from
paddle.v2.fluid.backward
import
append_backward
_ops
from
paddle.v2.fluid.backward
import
append_backward
from
paddle.v2.fluid.op
import
Operator
from
paddle.v2.fluid.executor
import
Executor
from
paddle.v2.fluid.framework
import
Program
,
OpProtoHolder
...
...
@@ -491,7 +491,7 @@ class OpTest(unittest.TestCase):
op_loss
.
desc
.
infer_var_type
(
block
.
desc
)
op_loss
.
desc
.
infer_shape
(
block
.
desc
)
param_grad_list
=
append_backward
_ops
(
param_grad_list
=
append_backward
(
loss
=
loss
,
parameter_list
=
input_to_check
,
no_grad_set
=
no_grad_set
)
feed_dict
=
{
...
...
python/paddle/v2/fluid/tests/test_array_read_write_op.py
浏览文件 @
812c5f60
...
...
@@ -2,7 +2,7 @@ import unittest
import
paddle.v2.fluid.core
as
core
import
paddle.v2.fluid.layers
as
layers
from
paddle.v2.fluid.executor
import
Executor
from
paddle.v2.fluid.backward
import
append_backward
_ops
from
paddle.v2.fluid.backward
import
append_backward
from
paddle.v2.fluid.framework
import
default_main_program
import
numpy
...
...
@@ -64,7 +64,7 @@ class TestArrayReadWrite(unittest.TestCase):
total_sum
=
layers
.
sums
(
input
=
[
a_sum
,
x_sum
])
total_sum_scaled
=
layers
.
scale
(
x
=
total_sum
,
scale
=
1
/
6.0
)
append_backward
_ops
(
total_sum_scaled
)
append_backward
(
total_sum_scaled
)
g_vars
=
map
(
default_main_program
().
global_block
().
var
,
[
each_x
.
name
+
"@GRAD"
for
each_x
in
x
])
...
...
python/paddle/v2/fluid/tests/test_conditional_block.py
浏览文件 @
812c5f60
...
...
@@ -3,7 +3,7 @@ import paddle.v2.fluid.layers as layers
import
paddle.v2.fluid.core
as
core
from
paddle.v2.fluid.framework
import
default_startup_program
,
default_main_program
from
paddle.v2.fluid.executor
import
Executor
from
paddle.v2.fluid.backward
import
append_backward
_ops
from
paddle.v2.fluid.backward
import
append_backward
import
numpy
...
...
@@ -26,7 +26,7 @@ class ConditionalBlock(unittest.TestCase):
outs
=
exe
.
run
(
feed
=
{
'X'
:
x
},
fetch_list
=
[
out
])[
0
]
print
outs
loss
=
layers
.
mean
(
x
=
out
)
append_backward
_ops
(
loss
=
loss
)
append_backward
(
loss
=
loss
)
outs
=
exe
.
run
(
feed
=
{
'X'
:
x
},
fetch_list
=
[
...
...
python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
浏览文件 @
812c5f60
...
...
@@ -4,7 +4,7 @@ import numpy
import
paddle.v2.fluid.layers
as
layers
from
paddle.v2.fluid.framework
import
Program
,
program_guard
from
paddle.v2.fluid.executor
import
Executor
from
paddle.v2.fluid.backward
import
append_backward
_ops
from
paddle.v2.fluid.backward
import
append_backward
class
TestCPULoDTensorArrayOps
(
unittest
.
TestCase
):
...
...
@@ -170,7 +170,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
mean
=
layers
.
mean
(
x
=
result
)
append_backward
_ops
(
mean
)
append_backward
(
mean
)
tensor
=
core
.
LoDTensor
()
tensor
.
set
(
numpy
.
arange
(
10
).
reshape
(
10
,
1
).
astype
(
'float32'
),
place
)
...
...
python/paddle/v2/fluid/tests/test_optimizer.py
浏览文件 @
812c5f60
...
...
@@ -2,7 +2,7 @@ import unittest
import
paddle.v2.fluid.framework
as
framework
import
paddle.v2.fluid.optimizer
as
optimizer
from
paddle.v2.fluid.backward
import
append_backward
_ops
from
paddle.v2.fluid.backward
import
append_backward
class
TestOptimizer
(
unittest
.
TestCase
):
...
...
@@ -102,7 +102,7 @@ class TestMomentumOptimizer(unittest.TestCase):
dtype
=
"float32"
,
shape
=
[
1
],
lod_level
=
0
,
name
=
"mean.out"
)
block
.
append_op
(
type
=
"mean"
,
inputs
=
{
"X"
:
mul_out
},
outputs
=
{
"Out"
:
mean_out
})
params_grads
=
append_backward
_ops
(
mean_out
)
params_grads
=
append_backward
(
mean_out
)
self
.
assertEqual
(
len
(
params_grads
),
1
)
self
.
assertEqual
(
len
(
momentum_optimizer
.
get_accumulators
()),
0
)
opts
=
momentum_optimizer
.
create_optimization_pass
(
...
...
@@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase):
learning_rate
=
0.01
momentum_optimizer
=
self
.
MockMomentum
(
learning_rate
=
learning_rate
,
momentum
=
0.2
,
use_nesterov
=
True
)
params_grads
=
append_backward
_ops
(
mean_out
)
params_grads
=
append_backward
(
mean_out
)
self
.
assertEqual
(
len
(
params_grads
),
1
)
self
.
assertEqual
(
len
(
momentum_optimizer
.
get_accumulators
()),
0
)
opts
=
momentum_optimizer
.
create_optimization_pass
(
...
...
@@ -209,7 +209,7 @@ class TestAdagradOptimizer(unittest.TestCase):
learning_rate
=
0.01
adagrad_optimizer
=
self
.
MockAdagrad
(
learning_rate
=
learning_rate
,
epsilon
=
1.0e-6
)
params_grads
=
append_backward
_ops
(
mean_out
)
params_grads
=
append_backward
(
mean_out
)
self
.
assertEqual
(
len
(
params_grads
),
1
)
self
.
assertEqual
(
len
(
adagrad_optimizer
.
get_accumulators
()),
0
)
opts
=
adagrad_optimizer
.
create_optimization_pass
(
params_grads
,
mul_out
,
...
...
@@ -269,7 +269,7 @@ class TestAdamOptimizer(unittest.TestCase):
learning_rate
=
0.01
adam_optimizer
=
self
.
MockAdam
(
learning_rate
=
learning_rate
,
beta1
=
0.9
,
beta2
=
0.999
)
params_grads
=
append_backward
_ops
(
mean_out
)
params_grads
=
append_backward
(
mean_out
)
self
.
assertEqual
(
len
(
params_grads
),
1
)
self
.
assertEqual
(
len
(
adam_optimizer
.
get_accumulators
()),
0
)
opts
=
adam_optimizer
.
create_optimization_pass
(
params_grads
,
mul_out
,
...
...
@@ -331,7 +331,7 @@ class TestAdamaxOptimizer(unittest.TestCase):
learning_rate
=
0.01
adamax_optimizer
=
self
.
MockAdamax
(
learning_rate
=
learning_rate
,
beta1
=
0.9
,
beta2
=
0.999
)
params_grads
=
append_backward
_ops
(
mean_out
)
params_grads
=
append_backward
(
mean_out
)
self
.
assertEqual
(
len
(
params_grads
),
1
)
self
.
assertEqual
(
len
(
adamax_optimizer
.
get_accumulators
()),
0
)
opts
=
adamax_optimizer
.
create_optimization_pass
(
params_grads
,
mul_out
,
...
...
@@ -390,7 +390,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
learning_rate
=
0.01
decayed_adagrad_optimizer
=
self
.
MockDecayedAdagrad
(
learning_rate
=
learning_rate
,
decay
=
0.95
,
epsilon
=
1.0e-6
)
params_grads
=
append_backward
_ops
(
mean_out
)
params_grads
=
append_backward
(
mean_out
)
self
.
assertEqual
(
len
(
params_grads
),
1
)
self
.
assertEqual
(
len
(
decayed_adagrad_optimizer
.
get_accumulators
()),
0
)
opts
=
decayed_adagrad_optimizer
.
create_optimization_pass
(
...
...
python/paddle/v2/fluid/tests/test_recurrent_op.py
浏览文件 @
812c5f60
...
...
@@ -3,7 +3,7 @@ import unittest
import
paddle.v2.fluid.layers
as
layers
from
paddle.v2.fluid.framework
import
Program
,
grad_var_name
from
paddle.v2.fluid.executor
import
Executor
from
paddle.v2.fluid.backward
import
append_backward
_ops
from
paddle.v2.fluid.backward
import
append_backward
import
numpy
as
np
import
paddle.v2.fluid.core
as
core
...
...
@@ -177,7 +177,7 @@ class RecurrentOpTest1(unittest.TestCase):
def
test_backward
(
self
):
self
.
check_forward
()
append_backward
_ops
(
self
.
output
)
append_backward
(
self
.
output
)
ana_grad
=
[
np
.
array
(
x
)
for
x
in
self
.
backward
()]
...
...
python/paddle/v2/fluid/tests/test_regularizer.py
浏览文件 @
812c5f60
...
...
@@ -3,7 +3,7 @@ import unittest
import
paddle.v2.fluid.framework
as
framework
import
paddle.v2.fluid.optimizer
as
optimizer
import
paddle.v2.fluid.regularizer
as
regularizer
from
paddle.v2.fluid.backward
import
append_backward
_ops
from
paddle.v2.fluid.backward
import
append_backward
class
TestL2DecayRegularizer
(
unittest
.
TestCase
):
...
...
@@ -33,7 +33,7 @@ class TestL2DecayRegularizer(unittest.TestCase):
dtype
=
"float32"
,
shape
=
[
1
],
lod_level
=
0
,
name
=
"mean.out"
)
block
.
append_op
(
type
=
"mean"
,
inputs
=
{
"X"
:
mul_out
},
outputs
=
{
"Out"
:
mean_out
})
params_grads
=
append_backward
_ops
(
mean_out
)
params_grads
=
append_backward
(
mean_out
)
self
.
assertEqual
(
len
(
params_grads
),
1
)
count_ops
=
len
(
block
.
ops
)
params_grads
=
optimizer
.
append_regularization_ops
(
params_grads
)
...
...
@@ -70,7 +70,7 @@ class TestL1DecayRegularizer(unittest.TestCase):
dtype
=
"float32"
,
shape
=
[
1
],
lod_level
=
0
,
name
=
"mean.out"
)
block
.
append_op
(
type
=
"mean"
,
inputs
=
{
"X"
:
mul_out
},
outputs
=
{
"Out"
:
mean_out
})
params_grads
=
append_backward
_ops
(
mean_out
)
params_grads
=
append_backward
(
mean_out
)
self
.
assertEqual
(
len
(
params_grads
),
1
)
count_ops
=
len
(
block
.
ops
)
params_grads
=
optimizer
.
append_regularization_ops
(
params_grads
)
...
...
python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
浏览文件 @
812c5f60
...
...
@@ -12,7 +12,7 @@ class TestReorderLoDTensor(unittest.TestCase):
new_dat
=
fluid
.
layers
.
reorder_lod_tensor_by_rank
(
x
=
dat
,
rank_table
=
table
)
loss
=
fluid
.
layers
.
mean
(
x
=
new_dat
)
fluid
.
backward
.
append_backward
_ops
(
loss
=
loss
)
fluid
.
backward
.
append_backward
(
loss
=
loss
)
cpu
=
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
cpu
)
...
...
python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
浏览文件 @
812c5f60
...
...
@@ -2,7 +2,7 @@ import unittest
from
paddle.v2.fluid.framework
import
Program
from
paddle.v2.fluid.executor
import
Executor
from
paddle.v2.fluid.backward
import
append_backward
_ops
from
paddle.v2.fluid.backward
import
append_backward
import
numpy
as
np
import
paddle.v2.fluid.core
as
core
...
...
python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
浏览文件 @
812c5f60
...
...
@@ -2,7 +2,7 @@ import unittest
import
paddle.v2.fluid.core
as
core
from
paddle.v2.fluid.executor
import
Executor
import
paddle.v2.fluid.layers
as
layers
from
paddle.v2.fluid.backward
import
append_backward
_ops
from
paddle.v2.fluid.backward
import
append_backward
from
paddle.v2.fluid.framework
import
default_main_program
import
numpy
...
...
@@ -35,7 +35,7 @@ class TestShrinkRNNMemory(unittest.TestCase):
self
.
assertTrue
(
numpy
.
allclose
(
tensor_np
[
0
:
1
],
outs
[
2
]))
mem3_mean
=
layers
.
mean
(
x
=
mem3
)
append_backward
_ops
(
loss
=
mem3_mean
)
append_backward
(
loss
=
mem3_mean
)
x_grad
=
exe
.
run
(
feed
=
{
'x'
:
tensor
},
fetch_list
=
[
main_program
.
global_block
().
var
(
'x@GRAD'
)])[
0
]
...
...
python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
浏览文件 @
812c5f60
...
...
@@ -4,7 +4,7 @@ import numpy as np
import
paddle.v2.fluid.layers
as
layers
from
paddle.v2.fluid.framework
import
Program
,
program_guard
from
paddle.v2.fluid.executor
import
Executor
from
paddle.v2.fluid.backward
import
append_backward
_ops
from
paddle.v2.fluid.backward
import
append_backward
class
TestCPULoDTensorArrayOps
(
unittest
.
TestCase
):
...
...
@@ -133,7 +133,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
in_true
=
out_true
,
in_false
=
out_false
,
mask
=
y
,
x
=
x
,
level
=
level
)
mean
=
layers
.
mean
(
x
=
out
)
append_backward
_ops
(
mean
)
append_backward
(
mean
)
tensor
=
core
.
LoDTensor
()
tensor
.
set
(
np
.
arange
(
10
).
reshape
(
10
,
1
).
astype
(
'float32'
),
place
)
...
...
python/paddle/v2/fluid/tests/test_while_op.py
浏览文件 @
812c5f60
...
...
@@ -2,7 +2,7 @@ import unittest
import
paddle.v2.fluid.layers
as
layers
from
paddle.v2.fluid.executor
import
Executor
import
paddle.v2.fluid.core
as
core
from
paddle.v2.fluid.backward
import
append_backward
_ops
from
paddle.v2.fluid.backward
import
append_backward
import
numpy
...
...
@@ -46,7 +46,7 @@ class TestWhileOp(unittest.TestCase):
sum_result
=
layers
.
array_read
(
array
=
mem_array
,
i
=
i
)
loss
=
layers
.
mean
(
x
=
sum_result
)
append_backward
_ops
(
loss
)
append_backward
(
loss
)
cpu
=
core
.
CPUPlace
()
exe
=
Executor
(
cpu
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录