Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
459d4cc8
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
459d4cc8
编写于
9月 04, 2018
作者:
L
luotao1
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into multi-thread2
上级
f507e5c1
90769670
变更
68
隐藏空白更改
内联
并排
Showing
68 changed file
with
1900 addition
and
1104 deletion
+1900
-1104
Dockerfile
Dockerfile
+1
-1
README.md
README.md
+7
-14
cmake/cuda.cmake
cmake/cuda.cmake
+13
-1
cmake/external/grpc.cmake
cmake/external/grpc.cmake
+1
-1
cmake/inference_lib.cmake
cmake/inference_lib.cmake
+7
-9
doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
+0
-1
doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst
...cs/user_guides/howto/prepare_data/use_recordio_reader.rst
+0
-167
paddle/fluid/API.spec
paddle/fluid/API.spec
+1
-0
paddle/fluid/framework/.gitignore
paddle/fluid/framework/.gitignore
+2
-0
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+33
-2
paddle/fluid/framework/ir/CMakeLists.txt
paddle/fluid/framework/ir/CMakeLists.txt
+23
-8
paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+4
-8
paddle/fluid/framework/ir/fc_fuse_pass.cc
paddle/fluid/framework/ir/fc_fuse_pass.cc
+30
-88
paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+29
-13
paddle/fluid/framework/ir/graph_pattern_detector.cc
paddle/fluid/framework/ir/graph_pattern_detector.cc
+14
-7
paddle/fluid/framework/ir/graph_pattern_detector.h
paddle/fluid/framework/ir/graph_pattern_detector.h
+6
-0
paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
+3
-2
paddle/fluid/framework/ir/infer_clean_graph_pass.cc
paddle/fluid/framework/ir/infer_clean_graph_pass.cc
+11
-12
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+7
-11
paddle/fluid/inference/CMakeLists.txt
paddle/fluid/inference/CMakeLists.txt
+3
-2
paddle/fluid/inference/analysis/CMakeLists.txt
paddle/fluid/inference/analysis/CMakeLists.txt
+4
-16
paddle/fluid/inference/analysis/analyzer_tester.cc
paddle/fluid/inference/analysis/analyzer_tester.cc
+15
-10
paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
+1
-7
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+1
-4
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+1
-4
paddle/fluid/inference/api/helper.h
paddle/fluid/inference/api/helper.h
+1
-0
paddle/fluid/inference/paddle_fluid.map
paddle/fluid/inference/paddle_fluid.map
+1
-0
paddle/fluid/operators/activation_op.h
paddle/fluid/operators/activation_op.h
+2
-2
paddle/fluid/operators/attention_lstm_op.cc
paddle/fluid/operators/attention_lstm_op.cc
+0
-1
paddle/fluid/operators/detection/bbox_util.h
paddle/fluid/operators/detection/bbox_util.h
+66
-0
paddle/fluid/operators/detection/generate_proposal_labels_op.cc
.../fluid/operators/detection/generate_proposal_labels_op.cc
+8
-31
paddle/fluid/operators/detection/generate_proposals_op.cc
paddle/fluid/operators/detection/generate_proposals_op.cc
+2
-3
paddle/fluid/operators/detection/rpn_target_assign_op.cc
paddle/fluid/operators/detection/rpn_target_assign_op.cc
+179
-112
paddle/fluid/operators/elementwise_op_function.h
paddle/fluid/operators/elementwise_op_function.h
+14
-3
paddle/fluid/operators/fusion_gru_op.cc
paddle/fluid/operators/fusion_gru_op.cc
+263
-163
paddle/fluid/operators/fusion_lstm_op.cc
paddle/fluid/operators/fusion_lstm_op.cc
+198
-182
paddle/fluid/operators/gru_unit_op.h
paddle/fluid/operators/gru_unit_op.h
+8
-8
paddle/fluid/operators/label_smooth_op.h
paddle/fluid/operators/label_smooth_op.h
+2
-1
paddle/fluid/operators/math/cpu_vec.h
paddle/fluid/operators/math/cpu_vec.h
+115
-0
paddle/fluid/operators/math/matrix_bit_code.h
paddle/fluid/operators/math/matrix_bit_code.h
+31
-0
paddle/fluid/operators/math/maxouting.h
paddle/fluid/operators/math/maxouting.h
+1
-2
paddle/fluid/operators/math/pooling.h
paddle/fluid/operators/math/pooling.h
+1
-4
paddle/fluid/operators/math/sequence2batch.h
paddle/fluid/operators/math/sequence2batch.h
+5
-5
paddle/fluid/operators/roi_pool_op.cu
paddle/fluid/operators/roi_pool_op.cu
+6
-6
paddle/fluid/operators/roi_pool_op.h
paddle/fluid/operators/roi_pool_op.h
+2
-2
paddle/fluid/operators/save_combine_op.cc
paddle/fluid/operators/save_combine_op.cc
+1
-31
paddle/fluid/operators/save_op.cc
paddle/fluid/operators/save_op.cc
+1
-31
paddle/fluid/operators/sequence_enumerate_op.cc
paddle/fluid/operators/sequence_enumerate_op.cc
+97
-0
paddle/fluid/operators/sequence_enumerate_op.cu
paddle/fluid/operators/sequence_enumerate_op.cu
+84
-0
paddle/fluid/operators/sequence_enumerate_op.h
paddle/fluid/operators/sequence_enumerate_op.h
+56
-0
paddle/fluid/platform/macros.h
paddle/fluid/platform/macros.h
+5
-0
paddle/fluid/platform/port.h
paddle/fluid/platform/port.h
+124
-7
python/paddle/fluid/inferencer.py
python/paddle/fluid/inferencer.py
+3
-4
python/paddle/fluid/layers/detection.py
python/paddle/fluid/layers/detection.py
+18
-21
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+47
-0
python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
.../image_classification/test_image_classification_resnet.py
+28
-8
python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
...api/image_classification/test_image_classification_vgg.py
+27
-10
python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
...-level-api/recognize_digits/test_recognize_digits_conv.py
+24
-9
python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
...h-level-api/recognize_digits/test_recognize_digits_mlp.py
+28
-9
python/paddle/fluid/tests/test_detection.py
python/paddle/fluid/tests/test_detection.py
+8
-10
python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
+11
-9
python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
+3
-1
python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py
...le/fluid/tests/unittests/test_generate_proposal_labels.py
+2
-2
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+7
-0
python/paddle/fluid/tests/unittests/test_roi_pool_op.py
python/paddle/fluid/tests/unittests/test_roi_pool_op.py
+2
-2
python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
...paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+95
-36
python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py
...addle/fluid/tests/unittests/test_sequence_enumerate_op.py
+105
-0
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+2
-1
未找到文件。
Dockerfile
浏览文件 @
459d4cc8
...
@@ -53,7 +53,7 @@ RUN curl -s -q https://glide.sh/get | sh
...
@@ -53,7 +53,7 @@ RUN curl -s -q https://glide.sh/get | sh
# and its size is only one-third of the official one.
# and its size is only one-third of the official one.
# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
RUN
wget
-qO-
http://paddlepaddledeps.
bj
.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz |
\
RUN
wget
-qO-
http://paddlepaddledeps.
cdn
.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz |
\
tar
-xz
-C
/usr/local
&&
\
tar
-xz
-C
/usr/local
&&
\
cp
-rf
/usr/local/TensorRT/include /usr
&&
\
cp
-rf
/usr/local/TensorRT/include /usr
&&
\
cp
-rf
/usr/local/TensorRT/lib /usr
cp
-rf
/usr/local/TensorRT/lib /usr
...
...
README.md
浏览文件 @
459d4cc8
...
@@ -76,33 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85
...
@@ -76,33 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85
## Installation
## Installation
It is recommended to check out the
It is recommended to read
[
this doc
](
http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/install/install_doc.html
)
on our website.
[
Docker installation guide
](
http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/docker_install_en.html
)
before looking into the
[
build from source guide
](
http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html
)
.
## Documentation
## Documentation
We provide
[
English
](
http://
www.paddlepaddle.org/docs/develop/documentation/en
/getstarted/index_en.html
)
and
We provide
[
English
](
http://
paddlepaddle.org/documentation/docs/en/0.14.0
/getstarted/index_en.html
)
and
[
Chinese
](
http://
www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn
.html
)
documentation.
[
Chinese
](
http://
paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/index
.html
)
documentation.
-
[
Deep Learning 101
](
http
://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html
)
-
[
Deep Learning 101
](
http
s://github.com/PaddlePaddle/book
)
You might want to start from this online interactive book that can run in a Jupyter Notebook.
You might want to start from this online interactive book that can run in a Jupyter Notebook.
-
[
Distributed Training
](
http://
www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/index_en
.html
)
-
[
Distributed Training
](
http://
paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/user_guides/howto/training/cluster_howto
.html
)
You can run distributed training jobs on MPI clusters.
You can run distributed training jobs on MPI clusters.
-
[
Distributed Training on Kubernetes
](
http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/multi_cluster/k8s_en.html
)
-
[
Python API
](
http://paddlepaddle.org/documentation/api/zh/0.14.0/fluid.html
)
You can also run distributed training jobs on Kubernetes clusters.
-
[
Python API
](
http://www.paddlepaddle.org/docs/develop/api/en/overview.html
)
Our new API enables much shorter programs.
Our new API enables much shorter programs.
-
[
How to Contribute
](
http://
www.paddlepaddle.org/docs/develop/documentation/fluid/en/dev/contribute_to_paddle_en
.html
)
-
[
How to Contribute
](
http://
paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/advanced_usage/development/contribute_to_paddle
.html
)
We appreciate your contributions!
We appreciate your contributions!
...
...
cmake/cuda.cmake
浏览文件 @
459d4cc8
...
@@ -169,14 +169,19 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
...
@@ -169,14 +169,19 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
# So, don't set these flags here.
if
(
NOT WIN32
)
# windows msvc2015 support c++11 natively.
# -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake.
list
(
APPEND CUDA_NVCC_FLAGS
"-std=c++11"
)
list
(
APPEND CUDA_NVCC_FLAGS
"-std=c++11"
)
list
(
APPEND CUDA_NVCC_FLAGS
"--use_fast_math"
)
list
(
APPEND CUDA_NVCC_FLAGS
"-Xcompiler -fPIC"
)
list
(
APPEND CUDA_NVCC_FLAGS
"-Xcompiler -fPIC"
)
endif
(
NOT WIN32
)
list
(
APPEND CUDA_NVCC_FLAGS
"--use_fast_math"
)
# in cuda9, suppress cuda warning on eigen
# in cuda9, suppress cuda warning on eigen
list
(
APPEND CUDA_NVCC_FLAGS
"-w"
)
list
(
APPEND CUDA_NVCC_FLAGS
"-w"
)
# Set :expt-relaxed-constexpr to suppress Eigen warnings
# Set :expt-relaxed-constexpr to suppress Eigen warnings
list
(
APPEND CUDA_NVCC_FLAGS
"--expt-relaxed-constexpr"
)
list
(
APPEND CUDA_NVCC_FLAGS
"--expt-relaxed-constexpr"
)
if
(
NOT WIN32
)
if
(
CMAKE_BUILD_TYPE STREQUAL
"Debug"
)
if
(
CMAKE_BUILD_TYPE STREQUAL
"Debug"
)
list
(
APPEND CUDA_NVCC_FLAGS
${
CMAKE_CXX_FLAGS_DEBUG
}
)
list
(
APPEND CUDA_NVCC_FLAGS
${
CMAKE_CXX_FLAGS_DEBUG
}
)
elseif
(
CMAKE_BUILD_TYPE STREQUAL
"Release"
)
elseif
(
CMAKE_BUILD_TYPE STREQUAL
"Release"
)
...
@@ -187,6 +192,13 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
...
@@ -187,6 +192,13 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
# nvcc 9 does not support -Os. Use Release flags instead
# nvcc 9 does not support -Os. Use Release flags instead
list
(
APPEND CUDA_NVCC_FLAGS
${
CMAKE_CXX_FLAGS_RELEASE
}
)
list
(
APPEND CUDA_NVCC_FLAGS
${
CMAKE_CXX_FLAGS_RELEASE
}
)
endif
()
endif
()
else
(
NOT WIN32
)
if
(
CMAKE_BUILD_TYPE STREQUAL
"Release"
)
list
(
APPEND CUDA_NVCC_FLAGS
"-O3 -DNDEBUG"
)
else
()
message
(
FATAL
"Windows only support Release build now. Please set visual studio build type to Release, x64 build."
)
endif
()
endif
(
NOT WIN32
)
mark_as_advanced
(
CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD
)
mark_as_advanced
(
CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD
)
mark_as_advanced
(
CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION
)
mark_as_advanced
(
CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION
)
cmake/external/grpc.cmake
浏览文件 @
459d4cc8
...
@@ -44,7 +44,7 @@ ExternalProject_Add(
...
@@ -44,7 +44,7 @@ ExternalProject_Add(
# 3. keep only zlib, cares, protobuf, boringssl under "third_party",
# 3. keep only zlib, cares, protobuf, boringssl under "third_party",
# checkout and clean other dirs under third_party
# checkout and clean other dirs under third_party
# 4. remove .git, and package the directory.
# 4. remove .git, and package the directory.
URL
"http://paddlepaddledeps.
bj
.bcebos.com/grpc-v1.10.x.tar.gz"
URL
"http://paddlepaddledeps.
cdn
.bcebos.com/grpc-v1.10.x.tar.gz"
URL_MD5
"1f268a2aff6759839dccd256adcc91cf"
URL_MD5
"1f268a2aff6759839dccd256adcc91cf"
PREFIX
${
GRPC_SOURCES_DIR
}
PREFIX
${
GRPC_SOURCES_DIR
}
UPDATE_COMMAND
""
UPDATE_COMMAND
""
...
...
cmake/inference_lib.cmake
浏览文件 @
459d4cc8
...
@@ -128,16 +128,13 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
...
@@ -128,16 +128,13 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set
(
dst_dir
"
${
FLUID_INSTALL_DIR
}
/paddle/fluid"
)
set
(
dst_dir
"
${
FLUID_INSTALL_DIR
}
/paddle/fluid"
)
set
(
module
"framework"
)
set
(
module
"framework"
)
if
(
NOT WIN32
)
if
(
NOT WIN32
)
copy
(
framework_lib DEPS framework_py_proto
set
(
framework_lib_deps framework_py_proto
)
SRCS
${
src_dir
}
/
${
module
}
/*.h
${
src_dir
}
/
${
module
}
/details/*.h
${
PADDLE_BINARY_DIR
}
/paddle/fluid/framework/framework.pb.h
endif
(
NOT WIN32
)
DSTS
${
dst_dir
}
/
${
module
}
${
dst_dir
}
/
${
module
}
/details
${
dst_dir
}
/
${
module
}
copy
(
framework_lib DEPS
${
framework_lib_deps
}
)
else
()
copy
(
framework_lib
SRCS
${
src_dir
}
/
${
module
}
/*.h
${
src_dir
}
/
${
module
}
/details/*.h
${
PADDLE_BINARY_DIR
}
/paddle/fluid/framework/framework.pb.h
SRCS
${
src_dir
}
/
${
module
}
/*.h
${
src_dir
}
/
${
module
}
/details/*.h
${
PADDLE_BINARY_DIR
}
/paddle/fluid/framework/framework.pb.h
DSTS
${
dst_dir
}
/
${
module
}
${
dst_dir
}
/
${
module
}
/details
${
dst_dir
}
/
${
module
}
${
src_dir
}
/
${
module
}
/ir/*.h
DSTS
${
dst_dir
}
/
${
module
}
${
dst_dir
}
/
${
module
}
/details
${
dst_dir
}
/
${
module
}
${
dst_dir
}
/
${
module
}
/ir
)
)
endif
(
NOT WIN32
)
set
(
module
"memory"
)
set
(
module
"memory"
)
copy
(
memory_lib
copy
(
memory_lib
...
@@ -161,7 +158,8 @@ set(module "inference")
...
@@ -161,7 +158,8 @@ set(module "inference")
copy
(
inference_lib DEPS
${
inference_deps
}
copy
(
inference_lib DEPS
${
inference_deps
}
SRCS
${
src_dir
}
/
${
module
}
/*.h
${
PADDLE_BINARY_DIR
}
/paddle/fluid/inference/libpaddle_fluid.*
SRCS
${
src_dir
}
/
${
module
}
/*.h
${
PADDLE_BINARY_DIR
}
/paddle/fluid/inference/libpaddle_fluid.*
${
src_dir
}
/
${
module
}
/api/paddle_inference_api.h
${
src_dir
}
/
${
module
}
/api/demo_ci
${
src_dir
}
/
${
module
}
/api/paddle_inference_api.h
${
src_dir
}
/
${
module
}
/api/demo_ci
DSTS
${
dst_dir
}
/
${
module
}
${
dst_dir
}
/
${
module
}
${
dst_dir
}
/
${
module
}
${
dst_dir
}
/
${
module
}
${
PADDLE_BINARY_DIR
}
/paddle/fluid/inference/api/paddle_inference_pass.h
DSTS
${
dst_dir
}
/
${
module
}
${
dst_dir
}
/
${
module
}
${
dst_dir
}
/
${
module
}
${
dst_dir
}
/
${
module
}
${
dst_dir
}
/
${
module
}
)
)
set
(
module
"platform"
)
set
(
module
"platform"
)
...
...
doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
浏览文件 @
459d4cc8
...
@@ -38,7 +38,6 @@ PaddlePaddle Fluid支持两种传入数据的方式:
...
@@ -38,7 +38,6 @@ PaddlePaddle Fluid支持两种传入数据的方式:
:maxdepth: 2
:maxdepth: 2
feeding_data
feeding_data
use_recordio_reader
Python Reader
Python Reader
#############
#############
...
...
doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst
已删除
100644 → 0
浏览文件 @
f507e5c1
.. _user_guide_use_recordio_as_train_data:
############################
使用RecordIO文件作为训练数据
############################
相比于 :ref:`user_guide_use_numpy_array_as_train_data`,
:ref:`user_guide_use_recordio_as_train_data` 的性能更好;
但是用户需要先将训练数据集转换成RecordIO文件格式,再使用
:code:`fluid.layers.open_files()` 层在神经网络配置中导入 RecordIO 文件。
用户还可以使用 :code:`fluid.layers.double_buffer()` 加速数据从内存到显存的拷贝,
使用 :code:`fluid.layers.Preprocessor` 工具进行数据增强。
将训练数据转换成RecordIO文件格式
################################
:code:`fluid.recordio_writer` 中,每个记录都是一个
:code:`vector<LoDTensor>`, 即一个支持序列信息的Tensor数组。这个数组包括训练所需
的所有特征。例如对于图像分类来说,这个数组可以包含图片和分类标签。
用户可以使用 :code:`fluid.recordio_writer.convert_reader_to_recordio_file()` 可以将
:ref:`user_guide_reader` 转换成一个RecordIO文件。或者可以使用
:code:`fluid.recordio_writer.convert_reader_to_recordio_files()` 将一个
:ref:`user_guide_reader` 转换成多个RecordIO文件。
具体使用方法为:
.. code-block:: python
import paddle.fluid as fluid
import numpy
def reader_creator():
def __impl__():
for i in range(1000):
yield [
numpy.random.random(size=[3,224,224], dtype="float32"),
numpy.random.random(size=[1], dtype="int64")
]
return __impl__
img = fluid.layers.data(name="image", shape=[3, 224, 224])
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
BATCH_SIZE = 32
reader = paddle.batch(reader_creator(), batch_size=BATCH_SIZE)
fluid.recordio_writer.convert_reader_to_recordio_file(
"train.recordio", feeder=feeder, reader_creator=reader)
其中 :code:`reader_creator` 创建了一个 :code:`Reader`。
:ref:`_api_fluid_data_feeder_DataFeeder`
是将 :code:`Reader` 转换成 :code:`LoDTensor` 的工具。详细请参考
:ref:`user_guide_reader` 。
上述程序将 :code:`reader_creator` 的数据转换成了 :code:`train.recordio` 文件,
其中每一个record 含有 32 条样本。如果batch size会在训练过程中调整,
用户可以将每一个Record的样本数设置成1。并参考
:ref:`user_guide_use_recordio_as_train_data_use_op_create_batch`。
配置神经网络, 打开RecordIO文件
##############################
RecordIO文件转换好之后,用户可以使用 :code:`fluid.layers.open_files()`
打开文件,并使用 :code:`fluid.layers.read_file` 读取文件内容。
简单使用方法如下:
.. code-block:: python
import paddle.fluid as fluid
file_obj = fluid.layers.open_files(
filenames=["train.recordio"],
shape=[[3, 224, 224], [1]],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
pass_num=100
)
image, label = fluid.layers.read_file(file_obj)
其中如果设置了 :code:`pass_num` ,那么当所有数据读完后,会重新读取数据,
直到读取了 :code:`pass_num` 遍。
进阶使用
########
使用 :code:`fluid.layers.double_buffer()`
------------------------------------------
:code:`Double buffer` 使用双缓冲技术,将训练数据从内存中复制到显存中。配置双缓冲
需要使用 :code:`fluid.layers.double_buffer()` 修饰文件对象。 例如:
.. code-block:: python
import paddle.fliud as fluid
file_obj = fluid.layers.open_files(...)
file_obj = fluid.layers.double_buffer(file_obj)
image, label = fluid.layers.read_file(file_obj)
双缓冲技术可以参考
`Multiple buffering <https://en.wikipedia.org/wiki/Multiple_buffering>`_ 。
配置数据增强
------------
使用 :code:`fluid.layers.Preprocessor` 可以配置文件的数据增强方法。例如
.. code-block:: python
import paddle.fluid as fluid
file_obj = fluid.layers.open_files(...)
preprocessor = fluid.layers.Preprocessor(reader=data_file)
with preprocessor.block():
image, label = preprocessor.inputs()
image = image / 2
label = label + 1
preprocessor.outputs(image, label)
如上代码所示,使用 :code:`Preprocessor` 定义了一个数据增强模块,并在
:code:`with preprocessor.block()` 中定义了数据增强的具体操作。 用户通过配置
:code:`preprocessor.inputs()` 获得数据文件中的各个字段。 并用
:code:`preprocessor.outputs()` 标记预处理后的输出。
.. _user_guide_use_recordio_as_train_data_use_op_create_batch:
使用Op组batch
-------------
使用 :code:`fluid.layers.batch()` 可以在训练的过程中动态的组batch。例如
.. code-block:: python
import paddle.fluid as fluid
file_obj = fluid.layers.open_files(...)
file_obj = fluid.layers.batch(file_obj, batch_size=32)
img, label = fluid.layers.read_file(file_obj)
需要注意的是,如果数据集中的最后几个样本不能组成 :code:`batch_size` 大小的批量数据,
那么这几个样本直接组成一个批量数据进行训练。
读入数据的shuffle
-----------------
使用 :code:`fluid.layers.shuffle()` 可以在训练过程中动态重排训练数据。例如
.. code-block:: python
import paddle.fluid as fluid
file_obj = fluid.layers.open_files(...)
file_obj = fliud.layers.shuffle(file_obj, buffer_size=8192)
img, label = fliud.layers.read_file(file_obj)
需要注意的是:
1. :code:`shuffle` 实现方法是:
先读入 :code:`buffer_size` 条样本,再随机的选出样本进行训练。
2. :code:`shuffle` 中 :code:`buffer_size` 会占用训练内存,需要确定训练过程中内存
足够支持缓存 :code:`buffer_size` 条数据。
paddle/fluid/API.spec
浏览文件 @
459d4cc8
...
@@ -172,6 +172,7 @@ paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'],
...
@@ -172,6 +172,7 @@ paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'],
paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None))
paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None))
paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
...
...
paddle/fluid/framework/.gitignore
0 → 100644
浏览文件 @
459d4cc8
.tensor_util.cu
.data_type_transform.cu
\ No newline at end of file
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
459d4cc8
# windows treat symbolic file as a real file, which is different with unix
# We create a hidden file and compile it instead of origin source file.
function
(
windows_symbolic TARGET
)
set
(
oneValueArgs
""
)
set
(
multiValueArgs SRCS DEPS
)
cmake_parse_arguments
(
windows_symbolic
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
foreach
(
src
${
windows_symbolic_SRCS
}
)
get_filename_component
(
src
${
src
}
NAME_WE
)
if
(
NOT EXISTS
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
src
}
.cc OR NOT EXISTS
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
src
}
.cu
)
message
(
FATAL
"
${
src
}
.cc and
${
src
}
.cu must exsits, and
${
src
}
.cu must be symbolic file."
)
endif
()
add_custom_command
(
OUTPUT .
${
src
}
.cu
COMMAND
${
CMAKE_COMMAND
}
-E remove
${
CMAKE_CURRENT_SOURCE_DIR
}
/.
${
src
}
.cu
COMMAND
${
CMAKE_COMMAND
}
-E copy
"
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
src
}
.cc"
"
${
CMAKE_CURRENT_SOURCE_DIR
}
/.
${
src
}
.cu"
COMMENT
"create hidden file of
${
src
}
.cu"
)
add_custom_target
(
${
TARGET
}
ALL DEPENDS .
${
src
}
.cu
)
endforeach
()
endfunction
()
add_subdirectory
(
ir
)
add_subdirectory
(
ir
)
if
(
NOT WIN32
)
if
(
NOT WIN32
)
add_subdirectory
(
details
)
add_subdirectory
(
details
)
...
@@ -11,7 +30,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
...
@@ -11,7 +30,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_library
(
data_type SRCS data_type.cc DEPS framework_proto ddim device_context
)
cc_library
(
data_type SRCS data_type.cc DEPS framework_proto ddim device_context
)
cc_test
(
data_type_test SRCS data_type_test.cc DEPS data_type place tensor
)
cc_test
(
data_type_test SRCS data_type_test.cc DEPS data_type place tensor
)
if
(
WITH_GPU
)
if
(
WITH_GPU
)
nv_library
(
tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context
)
if
(
WIN32
)
windows_symbolic
(
tensor_util SRCS tensor_util.cu
)
nv_library
(
tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context
)
add_dependencies
(
tensor tensor_util
)
else
()
nv_library
(
tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context
)
endif
(
WIN32
)
else
()
else
()
cc_library
(
tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context
)
cc_library
(
tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context
)
endif
()
endif
()
...
@@ -55,7 +80,13 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
...
@@ -55,7 +80,13 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function
)
DEPS operator op_registry device_context math_function
)
if
(
WITH_GPU
)
if
(
WITH_GPU
)
nv_library
(
data_type_transform SRCS data_type_transform.cu DEPS tensor
)
if
(
WIN32
)
windows_symbolic
(
hidden_file SRCS data_type_transform.cu
)
nv_library
(
data_type_transform SRCS .data_type_transform.cu DEPS tensor
)
add_dependencies
(
data_type_transform hidden_file
)
else
()
nv_library
(
data_type_transform SRCS data_type_transform.cu DEPS tensor
)
endif
(
WIN32
)
nv_test
(
data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform
)
nv_test
(
data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform
)
else
()
else
()
cc_library
(
data_type_transform SRCS data_type_transform.cc DEPS tensor
)
cc_library
(
data_type_transform SRCS data_type_transform.cc DEPS tensor
)
...
...
paddle/fluid/framework/ir/CMakeLists.txt
浏览文件 @
459d4cc8
set
(
pass_file
${
PADDLE_BINARY_DIR
}
/paddle/fluid/inference/api/paddle_inference_pass.h
)
file
(
WRITE
${
pass_file
}
"// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!
\n\n
"
)
file
(
APPEND
${
pass_file
}
"
\#
include
\"
paddle/fluid/framework/ir/pass.h
\"\n
"
)
function
(
pass_library TARGET
)
set
(
options
""
)
set
(
oneValueArgs
""
)
set
(
multiValueArgs SRCS DEPS
)
cmake_parse_arguments
(
op_library
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
cc_library
(
${
TARGET
}
SRCS
${
TARGET
}
.cc DEPS graph_pattern_detector pass
)
file
(
APPEND
${
pass_file
}
"USE_PASS(
${
TARGET
}
);
\n
"
)
set
(
PASS_LIBRARY
${
TARGET
}
${
PASS_LIBRARY
}
PARENT_SCOPE
)
endfunction
()
cc_library
(
node SRCS node.cc DEPS proto_desc
)
cc_library
(
node SRCS node.cc DEPS proto_desc
)
cc_library
(
graph SRCS graph.cc DEPS node
)
cc_library
(
graph SRCS graph.cc DEPS node
)
cc_library
(
graph_helper SRCS graph_helper.cc DEPS graph
)
cc_library
(
graph_helper SRCS graph_helper.cc DEPS graph
)
cc_library
(
pass SRCS pass.cc DEPS graph node graph_helper
)
cc_library
(
pass SRCS pass.cc DEPS graph node graph_helper
)
cc_library
(
graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper
)
cc_library
(
graph_to_program_pass SRCS graph_to_program_pass.cc DEPS graph pass graph_helper
)
cc_library
(
graph_traits SRCS graph_traits.cc DEPS graph
)
cc_library
(
graph_traits SRCS graph_traits.cc DEPS graph
)
cc_library
(
graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits
)
cc_library
(
graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits
)
cc_library
(
fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detector
)
cc_library
(
attention_lstm_fuse_pass SRCS attention_lstm_fuse_pass.cc DEPS graph graph_pattern_detector
)
pass_library
(
graph_to_program_pass
)
cc_library
(
infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass
)
pass_library
(
graph_viz_pass
)
cc_library
(
fc_lstm_fuse_pass SRCS fc_lstm_fuse_pass.cc DEPS graph graph_pattern_detector
)
pass_library
(
fc_fuse_pass
)
cc_library
(
seq_concat_fc_fuse_pass SRCS seq_concat_fc_fuse_pass.cc DEPS graph graph_pattern_detector
)
pass_library
(
attention_lstm_fuse_pass
)
pass_library
(
infer_clean_graph_pass
)
pass_library
(
fc_lstm_fuse_pass
)
pass_library
(
seq_concat_fc_fuse_pass
)
set
(
GLOB_PASS_LIB
${
PASS_LIBRARY
}
CACHE INTERNAL
"Global PASS library"
)
cc_test
(
pass_test SRCS pass_test.cc DEPS graph pass graph_helper
)
cc_test
(
pass_test SRCS pass_test.cc DEPS graph pass graph_helper
)
cc_test
(
graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry
)
cc_test
(
graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry
)
cc_test
(
graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry
)
cc_test
(
graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry
)
cc_test
(
graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass
)
cc_test
(
graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass
)
cc_test
(
test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector
)
cc_test
(
test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector
)
cc_test
(
test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass
graph_pattern_detector graph pass graph_traits
framework_proto
)
cc_test
(
test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto
)
paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
浏览文件 @
459d4cc8
...
@@ -96,17 +96,13 @@ void FindWhileOp(Graph* graph) {
...
@@ -96,17 +96,13 @@ void FindWhileOp(Graph* graph) {
auto
*
cell_init
=
graph
->
RetriveNode
(
6
);
auto
*
cell_init
=
graph
->
RetriveNode
(
6
);
auto
*
hidden_init
=
graph
->
RetriveNode
(
8
);
auto
*
hidden_init
=
graph
->
RetriveNode
(
8
);
#define LINK_TO(node0, node1) \
node0->outputs.push_back(node1); \
node1->inputs.push_back(node0);
auto
*
lstm_op
=
graph
->
CreateOpNode
(
&
op_desc
);
auto
*
lstm_op
=
graph
->
CreateOpNode
(
&
op_desc
);
PrepareParameters
(
graph
,
param
);
PrepareParameters
(
graph
,
param
);
LINK_TO
(
X
,
lstm_op
);
IR_NODE_
LINK_TO
(
X
,
lstm_op
);
LINK_TO
(
cell_init
,
lstm_op
);
IR_NODE_
LINK_TO
(
cell_init
,
lstm_op
);
LINK_TO
(
hidden_init
,
lstm_op
);
IR_NODE_
LINK_TO
(
hidden_init
,
lstm_op
);
LINK_TO
(
lstm_op
,
LSTMOUT
);
IR_NODE_
LINK_TO
(
lstm_op
,
LSTMOUT
);
GraphSafeRemoveNodes
(
graph
,
marked_nodes
);
GraphSafeRemoveNodes
(
graph
,
marked_nodes
);
}
}
...
...
paddle/fluid/framework/ir/fc_fuse_pass.cc
浏览文件 @
459d4cc8
...
@@ -21,74 +21,26 @@ namespace paddle {
...
@@ -21,74 +21,26 @@ namespace paddle {
namespace
framework
{
namespace
framework
{
namespace
ir
{
namespace
ir
{
bool
VarOutLinksToOp
(
Node
*
node
,
const
std
::
string
&
op_type
)
{
for
(
auto
*
out
:
node
->
outputs
)
{
if
(
out
->
IsOp
()
&&
out
->
Op
()
->
Type
()
==
op_type
)
{
return
true
;
}
}
return
false
;
}
void
BuildFCPattern
(
PDPattern
*
pattern
)
{
// Create Operators
auto
*
mul_op
=
pattern
->
NewNode
(
"mul"
)
->
assert_is_op
(
"mul"
);
auto
*
elementwise_add_op
=
pattern
->
NewNode
(
"elementwise_add"
)
->
assert_is_op
(
"elementwise_add"
);
// Create variables
// w
auto
*
mul_weight_var
=
pattern
->
NewNode
(
"mul_weight"
)
->
AsInput
()
->
assert_is_op_nth_input
(
"mul"
,
"Y"
,
0
);
// x
auto
*
mul_tmp_var
=
pattern
->
NewNode
(
"mul_tmp_var"
)
->
AsInput
()
->
assert_is_op_nth_input
(
"mul"
,
"X"
,
0
);
// intermediate variable, will be removed in the IR after fuse.
auto
*
mul_out_var
=
pattern
->
NewNode
(
"mul_out"
)
->
AsIntermediate
()
->
assert_is_only_output_of_op
(
"mul"
)
->
assert_is_op_input
(
"elementwise_add"
);
// bias
auto
*
elementwise_add_tmp_var
=
pattern
->
NewNode
(
"elementwise_add_tmpvar"
)
->
assert_is_op_input
(
"elementwise_add"
)
->
AsInput
();
// output
auto
*
elementwise_add_out_var
=
pattern
->
NewNode
(
"elementwise_add_out"
)
->
AsOutput
()
->
assert_is_op_output
(
"elementwise_add"
);
mul_op
->
LinksFrom
({
mul_weight_var
,
mul_tmp_var
}).
LinksTo
({
mul_out_var
});
elementwise_add_op
->
LinksFrom
({
mul_out_var
,
elementwise_add_tmp_var
})
.
LinksTo
({
elementwise_add_out_var
});
}
// Replace the node `from` in the links to `to`
bool
LinksReplace
(
std
::
vector
<
Node
*>*
links
,
Node
*
from
,
Node
*
to
)
{
for
(
auto
*&
n
:
*
links
)
{
if
(
n
==
from
)
{
n
=
to
;
return
true
;
}
}
return
false
;
}
std
::
unique_ptr
<
ir
::
Graph
>
FCFusePass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
FCFusePass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
PADDLE_ENFORCE
(
graph
.
get
());
PADDLE_ENFORCE
(
graph
.
get
());
FusePassBase
::
Init
(
"fc"
,
graph
.
get
());
FusePassBase
::
Init
(
"fc
_fuse
"
,
graph
.
get
());
std
::
unordered_set
<
Node
*>
nodes2delete
;
std
::
unordered_set
<
Node
*>
nodes2delete
;
GraphPatternDetector
gpd
;
GraphPatternDetector
gpd
;
BuildFCPattern
(
gpd
.
mutable_pattern
());
// BuildFCPattern(gpd.mutable_pattern());
auto
*
x
=
gpd
.
mutable_pattern
()
#define GET_NODE(id) \
->
NewNode
(
"fc_fuse/x"
)
PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode(#id)), \
->
AsInput
()
"pattern has no Node called %s", #id); \
->
assert_is_op_input
(
"mul"
,
"X"
);
auto* id = subgraph.at(gpd.pattern().RetrieveNode(#id)); \
patterns
::
FC
(
gpd
.
mutable_pattern
(),
"fc_fuse"
,
x
,
true
/*with bias*/
);
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
#define GET_NODE(id) \
PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode("fc_fuse/" #id)), \
"pattern has no Node called %s", #id); \
auto* id = subgraph.at(gpd.pattern().RetrieveNode("fc_fuse/" #id)); \
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", "fc_fuse/" #id);
int
found_fc_count
=
0
;
int
found_fc_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
...
@@ -98,43 +50,33 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
...
@@ -98,43 +50,33 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
// scenerio.
// scenerio.
// FC's fusion is simple, just op fuse, no need to process the
// FC's fusion is simple, just op fuse, no need to process the
// parameters.
// parameters.
GET_NODE
(
mul_tmp_var
);
// x
GET_NODE
(
x
);
// x
GET_NODE
(
mul_weight
);
// Y
GET_NODE
(
w
);
// Y
GET_NODE
(
elementwise_add_tmpvar
);
// bias
GET_NODE
(
fc_bias
);
// bias
GET_NODE
(
elementwise_add_out
);
// Out
GET_NODE
(
fc_out
);
// Out
GET_NODE
(
mul
);
// MUL op
GET_NODE
(
mul
);
// MUL op
GET_NODE
(
elementwise_add
);
// ELEMENT_ADD op
GET_NODE
(
elementwise_add
);
// ELEMENT_ADD op
GET_NODE
(
mul_out
);
// tmp
GET_NODE
(
mul_out
);
// tmp
#undef GET_NODE
#undef GET_NODE
// Create an FC Node.
// Create an FC Node.
OpDesc
desc
;
OpDesc
desc
;
std
::
string
fc_x_in
=
mul_tmp_var
->
Name
();
std
::
string
fc_x_in
=
x
->
Name
();
std
::
string
fc_Y_in
=
mul_weight
->
Name
();
std
::
string
fc_Y_in
=
w
->
Name
();
std
::
string
fc_bias_in
=
elementwise_add_tmpvar
->
Name
();
std
::
string
fc_bias_in
=
fc_bias
->
Name
();
std
::
string
fc_out
=
elementwise_add
_out
->
Name
();
std
::
string
fc_out
_out
=
fc
_out
->
Name
();
desc
.
SetInput
(
"Input"
,
std
::
vector
<
std
::
string
>
({
fc_x_in
}));
desc
.
SetInput
(
"Input"
,
std
::
vector
<
std
::
string
>
({
fc_x_in
}));
desc
.
SetInput
(
"W"
,
std
::
vector
<
std
::
string
>
({
fc_Y_in
}));
desc
.
SetInput
(
"W"
,
std
::
vector
<
std
::
string
>
({
fc_Y_in
}));
desc
.
SetInput
(
"Bias"
,
std
::
vector
<
std
::
string
>
({
fc_bias_in
}));
desc
.
SetInput
(
"Bias"
,
std
::
vector
<
std
::
string
>
({
fc_bias_in
}));
desc
.
SetOutput
(
"Out"
,
std
::
vector
<
std
::
string
>
({
fc_out
}));
desc
.
SetOutput
(
"Out"
,
std
::
vector
<
std
::
string
>
({
fc_out
_out
}));
desc
.
SetType
(
"fc"
);
desc
.
SetType
(
"fc"
);
auto
fc_node
=
g
->
CreateOpNode
(
&
desc
);
// OpDesc will be copied.
auto
fc_node
=
g
->
CreateOpNode
(
&
desc
);
// OpDesc will be copied.
fc_node
->
inputs
=
GraphSafeRemoveNodes
(
graph
.
get
(),
{
mul
,
elementwise_add
,
mul_out
});
std
::
vector
<
Node
*>
({
mul_tmp_var
,
mul_weight
,
elementwise_add_tmpvar
});
fc_node
->
outputs
.
push_back
(
elementwise_add_out
);
// Update link relatons
PADDLE_ENFORCE
(
LinksReplace
(
&
mul_tmp_var
->
outputs
,
mul
,
fc_node
));
PADDLE_ENFORCE
(
LinksReplace
(
&
mul_weight
->
outputs
,
mul
,
fc_node
));
PADDLE_ENFORCE
(
LinksReplace
(
&
elementwise_add_tmpvar
->
outputs
,
elementwise_add
,
fc_node
));
PADDLE_ENFORCE
(
LinksReplace
(
&
elementwise_add_out
->
inputs
,
elementwise_add
,
fc_node
));
// Drop old nodes
IR_NODE_LINK_TO
(
x
,
fc_node
);
graph
->
RemoveNode
(
mul
);
IR_NODE_LINK_TO
(
w
,
fc_node
);
graph
->
RemoveNode
(
elementwise_add
);
IR_NODE_LINK_TO
(
fc_bias
,
fc_node
);
graph
->
RemoveNode
(
mul_out
);
// tmp variable
IR_NODE_LINK_TO
(
fc_node
,
fc_out
);
found_fc_count
++
;
found_fc_count
++
;
};
};
...
...
paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
浏览文件 @
459d4cc8
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -94,21 +95,37 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
...
@@ -94,21 +95,37 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
op_desc
.
SetOutput
(
"Hidden"
,
{
hidden_n
->
Name
()});
op_desc
.
SetOutput
(
"Hidden"
,
{
hidden_n
->
Name
()});
op_desc
.
SetOutput
(
"Cell"
,
{
cell_n
->
Name
()});
op_desc
.
SetOutput
(
"Cell"
,
{
cell_n
->
Name
()});
op_desc
.
SetOutput
(
"XX"
,
{
xx_n
->
Name
()});
op_desc
.
SetOutput
(
"XX"
,
{
xx_n
->
Name
()});
op_desc
.
SetOutput
(
"BatchedGate"
,
{
"blstm_0.tmp_2"
});
op_desc
.
SetOutput
(
"BatchedInput"
,
{
"blstm_0.tmp_2"
});
op_desc
.
SetOutput
(
"BatchCellPreAct"
,
{
"blstm_1.tmp_2"
});
op_desc
.
SetAttr
(
"is_reverse"
,
lstm_n
->
Op
()
->
GetAttr
(
"is_reverse"
));
op_desc
.
SetAttr
(
"is_reverse"
,
lstm_n
->
Op
()
->
GetAttr
(
"is_reverse"
));
op_desc
.
SetAttr
(
"use_peepholes"
,
lstm_n
->
Op
()
->
GetAttr
(
"use_peepholes"
));
op_desc
.
SetAttr
(
"use_peepholes"
,
lstm_n
->
Op
()
->
GetAttr
(
"use_peepholes"
));
auto
*
op
=
graph
->
CreateOpNode
(
&
op_desc
);
// TODO(TJ): get from attr
op_desc
.
SetAttr
(
"use_seq"
,
true
);
#define TMP_NAME(x) "at.new.tmp." #x
#define OP_SET_OUT(x) op_desc.SetOutput(#x, {TMP_NAME(x)})
OP_SET_OUT
(
BatchedCell
);
OP_SET_OUT
(
BatchedHidden
);
OP_SET_OUT
(
ReorderedH0
);
OP_SET_OUT
(
ReorderedC0
);
#undef OP_SET_OUT
#define LINK_TO(a, b) \
auto
*
op
=
graph
->
CreateOpNode
(
&
op_desc
);
a->outputs.push_back(b); \
PADDLE_ENFORCE
(
graph
->
Has
(
kParamScopeAttr
));
b->inputs.push_back(a);
auto
*
scope
=
graph
->
Get
<
Scope
*>
(
kParamScopeAttr
);
LINK_TO
(
input_n
,
op
);
LINK_TO
(
weight_x_n
,
op
);
#define TMP_NEW(x) scope->Var(TMP_NAME(x))->GetMutable<LoDTensor>()
LINK_TO
(
weight_h_n
,
op
);
TMP_NEW
(
BatchedCell
);
LINK_TO
(
bias_n
,
op
);
TMP_NEW
(
BatchedHidden
);
LINK_TO
(
op
,
hidden_n
);
TMP_NEW
(
ReorderedH0
);
#undef LINK_TO
TMP_NEW
(
ReorderedC0
);
#undef TMP_NEW
#undef TMP_NAME
IR_NODE_LINK_TO
(
input_n
,
op
);
IR_NODE_LINK_TO
(
weight_x_n
,
op
);
IR_NODE_LINK_TO
(
weight_h_n
,
op
);
IR_NODE_LINK_TO
(
bias_n
,
op
);
IR_NODE_LINK_TO
(
op
,
hidden_n
);
return
op
;
return
op
;
};
};
...
@@ -116,7 +133,6 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
...
@@ -116,7 +133,6 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
auto
fc_no_bias_handler
=
[
&
](
auto
fc_no_bias_handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
#define GET_NODE(name__) \
#define GET_NODE(name__) \
std::string name__##key = name_scope + "/" + #name__; \
std::string name__##key = name_scope + "/" + #name__; \
auto* name__##n = pattern->RetrieveNode(name__##key); \
auto* name__##n = pattern->RetrieveNode(name__##key); \
...
...
paddle/fluid/framework/ir/graph_pattern_detector.cc
浏览文件 @
459d4cc8
...
@@ -111,6 +111,11 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
...
@@ -111,6 +111,11 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
return
false
;
return
false
;
}
}
}
}
for
(
auto
&
item
:
pdnodes2nodes_
)
{
for
(
auto
&
n
:
item
.
second
)
{
GetMarkedNodes
(
const_cast
<
Graph
*>
(
&
graph
)).
insert
(
n
);
}
}
VLOG
(
3
)
<<
pdnodes2nodes_
.
size
()
<<
" nodes marked"
;
VLOG
(
3
)
<<
pdnodes2nodes_
.
size
()
<<
" nodes marked"
;
return
!
pdnodes2nodes_
.
empty
();
return
!
pdnodes2nodes_
.
empty
();
...
@@ -278,7 +283,7 @@ void GraphPatternDetector::RemoveOverlappedMatch(
...
@@ -278,7 +283,7 @@ void GraphPatternDetector::RemoveOverlappedMatch(
for
(
const
auto
&
subgraph
:
*
subgraphs
)
{
for
(
const
auto
&
subgraph
:
*
subgraphs
)
{
bool
valid
=
true
;
bool
valid
=
true
;
for
(
auto
&
item
:
subgraph
)
{
for
(
auto
&
item
:
subgraph
)
{
if
(
node_set
.
count
(
item
.
second
))
{
if
(
item
.
first
->
IsIntermediate
()
&&
node_set
.
count
(
item
.
second
))
{
valid
=
false
;
valid
=
false
;
break
;
break
;
}
}
...
@@ -334,22 +339,22 @@ PDNode& PDNode::LinksFrom(const std::vector<PDNode*>& others) {
...
@@ -334,22 +339,22 @@ PDNode& PDNode::LinksFrom(const std::vector<PDNode*>& others) {
}
}
PDNode
*
PDNode
::
assert_is_op
()
{
PDNode
*
PDNode
::
assert_is_op
()
{
asserts_
.
emplace_back
([
this
](
Node
*
x
)
{
return
x
&&
x
->
IsOp
();
});
asserts_
.
emplace_back
([](
Node
*
x
)
{
return
x
&&
x
->
IsOp
();
});
return
this
;
return
this
;
}
}
PDNode
*
PDNode
::
assert_is_op
(
const
std
::
string
&
op_type
)
{
PDNode
*
PDNode
::
assert_is_op
(
const
std
::
string
&
op_type
)
{
asserts_
.
emplace_back
([
this
,
op_type
](
Node
*
x
)
{
asserts_
.
emplace_back
([
op_type
](
Node
*
x
)
{
return
x
&&
x
->
IsOp
()
&&
x
->
Op
()
->
Type
()
==
op_type
;
return
x
&&
x
->
IsOp
()
&&
x
->
Op
()
->
Type
()
==
op_type
;
});
});
return
this
;
return
this
;
}
}
PDNode
*
PDNode
::
assert_is_var
()
{
PDNode
*
PDNode
::
assert_is_var
()
{
asserts_
.
emplace_back
([
this
](
Node
*
x
)
{
return
x
&&
x
->
IsVar
();
});
asserts_
.
emplace_back
([](
Node
*
x
)
{
return
x
&&
x
->
IsVar
();
});
return
this
;
return
this
;
}
}
PDNode
*
PDNode
::
assert_var_not_persistable
()
{
PDNode
*
PDNode
::
assert_var_not_persistable
()
{
assert_is_var
();
assert_is_var
();
asserts_
.
emplace_back
([
this
](
Node
*
x
)
{
return
!
x
->
Var
()
->
Persistable
();
});
asserts_
.
emplace_back
([](
Node
*
x
)
{
return
!
x
->
Var
()
->
Persistable
();
});
return
this
;
return
this
;
}
}
PDNode
*
PDNode
::
assert_is_persistable_var
()
{
PDNode
*
PDNode
::
assert_is_persistable_var
()
{
...
@@ -491,14 +496,16 @@ void GraphSafeRemoveNodes(Graph* graph,
...
@@ -491,14 +496,16 @@ void GraphSafeRemoveNodes(Graph* graph,
for
(
auto
it
=
node
->
inputs
.
begin
();
it
!=
node
->
inputs
.
end
();)
{
for
(
auto
it
=
node
->
inputs
.
begin
();
it
!=
node
->
inputs
.
end
();)
{
if
(
nodes
.
count
(
*
it
))
{
if
(
nodes
.
count
(
*
it
))
{
it
=
const_cast
<
Node
*>
(
node
)
->
inputs
.
erase
(
it
);
it
=
const_cast
<
Node
*>
(
node
)
->
inputs
.
erase
(
it
);
}
else
}
else
{
it
++
;
it
++
;
}
}
}
for
(
auto
it
=
node
->
outputs
.
begin
();
it
!=
node
->
outputs
.
end
();)
{
for
(
auto
it
=
node
->
outputs
.
begin
();
it
!=
node
->
outputs
.
end
();)
{
if
(
nodes
.
count
(
*
it
))
{
if
(
nodes
.
count
(
*
it
))
{
it
=
const_cast
<
Node
*>
(
node
)
->
outputs
.
erase
(
it
);
it
=
const_cast
<
Node
*>
(
node
)
->
outputs
.
erase
(
it
);
}
else
}
else
{
it
++
;
it
++
;
}
}
}
}
}
}
}
...
...
paddle/fluid/framework/ir/graph_pattern_detector.h
浏览文件 @
459d4cc8
...
@@ -245,6 +245,8 @@ class GraphPatternDetector {
...
@@ -245,6 +245,8 @@ class GraphPatternDetector {
void
UniquePatterns
(
std
::
vector
<
subgraph_t
>*
subgraphs
);
void
UniquePatterns
(
std
::
vector
<
subgraph_t
>*
subgraphs
);
// Remove overlapped match subgraphs, when overlapped, keep the previous one.
// Remove overlapped match subgraphs, when overlapped, keep the previous one.
// The intermediate PDNodes will be removed, so can't shared by multiple
// patterns.
void
RemoveOverlappedMatch
(
std
::
vector
<
subgraph_t
>*
subgraphs
);
void
RemoveOverlappedMatch
(
std
::
vector
<
subgraph_t
>*
subgraphs
);
// Validate whether the intermediate nodes are linked by external nodes.
// Validate whether the intermediate nodes are linked by external nodes.
...
@@ -295,6 +297,10 @@ PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x);
...
@@ -295,6 +297,10 @@ PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x);
}
// namespace patterns
}
// namespace patterns
#define IR_NODE_LINK_TO(a, b) \
a->outputs.push_back(b); \
b->inputs.push_back(a);
}
// namespace ir
}
// namespace ir
}
// namespace framework
}
// namespace framework
}
// namespace paddle
}
// namespace paddle
paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
浏览文件 @
459d4cc8
...
@@ -140,8 +140,9 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
...
@@ -140,8 +140,9 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
return
node
->
IsOp
()
&&
(
node
->
Name
()
==
"op2"
||
node
->
Name
()
==
"op3"
);
return
node
->
IsOp
()
&&
(
node
->
Name
()
==
"op2"
||
node
->
Name
()
==
"op3"
);
},
},
"OP0"
);
"OP0"
);
auto
*
any_var
=
x
.
mutable_pattern
()
->
NewNode
(
auto
*
any_var
=
x
.
mutable_pattern
()
[](
Node
*
node
)
{
return
node
->
IsVar
();
},
"VAR"
);
->
NewNode
([](
Node
*
node
)
{
return
node
->
IsVar
();
},
"VAR"
)
->
AsIntermediate
();
auto
*
any_op1
=
x
.
mutable_pattern
()
->
NewNode
(
auto
*
any_op1
=
x
.
mutable_pattern
()
->
NewNode
(
[](
Node
*
node
)
{
return
node
->
IsOp
();
},
"OP1"
);
[](
Node
*
node
)
{
return
node
->
IsOp
();
},
"OP1"
);
...
...
paddle/fluid/framework/ir/infer_clean_graph_pass.cc
浏览文件 @
459d4cc8
...
@@ -13,42 +13,41 @@
...
@@ -13,42 +13,41 @@
// limitations under the License.
// limitations under the License.
#include <algorithm>
#include <algorithm>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/
pass
.h"
#include "paddle/fluid/framework/ir/
graph_pattern_detector
.h"
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
namespace
ir
{
namespace
ir
{
class
InferCleanGraphPass
:
public
Pass
{
class
InferCleanGraphPass
:
public
FusePassBase
{
public:
public:
virtual
~
InferCleanGraphPass
()
{}
virtual
~
InferCleanGraphPass
()
{}
protected:
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
FusePassBase
::
Init
(
"original_graph"
,
graph
.
get
());
PADDLE_ENFORCE
(
graph
.
get
());
PADDLE_ENFORCE
(
graph
.
get
());
auto
is_valid_node
=
[](
Node
*
x
)
{
auto
is_valid_node
=
[](
Node
*
x
)
{
return
x
&&
IsControlDepVar
(
*
x
)
&&
x
->
IsVar
()
&&
!
x
->
Var
();
return
x
&&
IsControlDepVar
(
*
x
)
&&
x
->
IsVar
()
&&
!
x
->
Var
();
};
};
std
::
unordered_set
<
Node
*>
invalid_nodes
;
std
::
unordered_set
<
const
Node
*>
invalid_nodes
;
int
valid_op
=
0
;
for
(
auto
*
node
:
graph
->
Nodes
())
{
for
(
auto
*
node
:
graph
->
Nodes
())
{
if
(
is_valid_node
(
node
))
{
if
(
is_valid_node
(
node
))
{
invalid_nodes
.
insert
(
node
);
invalid_nodes
.
insert
(
node
);
}
else
if
(
node
->
IsOp
())
{
// Collect all the operators to help tracking number of operators.
++
valid_op
;
}
}
}
}
// remove nodes from the graph.
GraphSafeRemoveNodes
(
graph
.
get
(),
invalid_nodes
);
for
(
auto
*
node
:
invalid_nodes
)
{
graph
->
RemoveNode
(
node
);
}
// clean edges.
AddStatis
(
valid_op
);
for
(
auto
*
node
:
graph
->
Nodes
())
{
CleanEdges
(
&
node
->
inputs
,
invalid_nodes
);
CleanEdges
(
&
node
->
outputs
,
invalid_nodes
);
}
return
graph
;
return
graph
;
}
}
...
...
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
浏览文件 @
459d4cc8
...
@@ -219,16 +219,13 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
...
@@ -219,16 +219,13 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
op_desc
.
SetAttr
(
"fc_activation"
,
act
->
Op
()
->
Type
());
op_desc
.
SetAttr
(
"fc_activation"
,
act
->
Op
()
->
Type
());
auto
*
op_node
=
graph
->
CreateOpNode
(
&
op_desc
);
auto
*
op_node
=
graph
->
CreateOpNode
(
&
op_desc
);
// Add links
// Add links
#define NODE_LINKS(a, b) \
IR_NODE_LINK_TO
(
fc_w
,
op_node
);
a->outputs.push_back(b); \
IR_NODE_LINK_TO
(
fc_bias
,
op_node
);
b->inputs.push_back(a);
IR_NODE_LINK_TO
(
concat_in0
,
op_node
);
NODE_LINKS
(
fc_w
,
op_node
);
IR_NODE_LINK_TO
(
sequence_expand0_in
,
op_node
);
NODE_LINKS
(
fc_bias
,
op_node
);
IR_NODE_LINK_TO
(
sequence_expand1_in
,
op_node
);
NODE_LINKS
(
concat_in0
,
op_node
);
IR_NODE_LINK_TO
(
op_node
,
fc_out
);
NODE_LINKS
(
sequence_expand0_in
,
op_node
);
NODE_LINKS
(
sequence_expand1_in
,
op_node
);
NODE_LINKS
(
op_node
,
fc_out
);
// Clean nodes.
// Clean nodes.
std
::
unordered_set
<
const
Node
*>
marked_nodes
;
std
::
unordered_set
<
const
Node
*>
marked_nodes
;
...
@@ -241,7 +238,6 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
...
@@ -241,7 +238,6 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
marked_nodes
.
erase
(
sequence_expand0_in
);
marked_nodes
.
erase
(
sequence_expand0_in
);
marked_nodes
.
erase
(
sequence_expand1_in
);
marked_nodes
.
erase
(
sequence_expand1_in
);
marked_nodes
.
erase
(
fc_out
);
marked_nodes
.
erase
(
fc_out
);
GraphSafeRemoveNodes
(
graph
,
marked_nodes
);
GraphSafeRemoveNodes
(
graph
,
marked_nodes
);
});
});
...
...
paddle/fluid/inference/CMakeLists.txt
浏览文件 @
459d4cc8
...
@@ -10,7 +10,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
...
@@ -10,7 +10,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library
(
paddle_fluid_api
cc_library
(
paddle_fluid_api
SRCS io.cc
SRCS io.cc
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OP_LIB
}
graph_to_program_pass
)
DEPS
${
FLUID_CORE_MODULES
}
${
GLOB_OP_LIB
}
)
get_property
(
fluid_modules GLOBAL PROPERTY FLUID_MODULES
)
get_property
(
fluid_modules GLOBAL PROPERTY FLUID_MODULES
)
...
@@ -22,7 +22,7 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
...
@@ -22,7 +22,7 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
#endif()
#endif()
# Create static library
# Create static library
cc_library
(
paddle_fluid DEPS
${
fluid_modules
}
paddle_fluid_api paddle_inference_api
)
cc_library
(
paddle_fluid DEPS
${
fluid_modules
}
paddle_fluid_api paddle_inference_api
analysis_predictor
)
if
(
NOT APPLE
)
if
(
NOT APPLE
)
# TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
# TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
set
(
LINK_FLAGS
"-Wl,--retain-symbols-file
${
CMAKE_CURRENT_SOURCE_DIR
}
/paddle_fluid.sym"
)
set
(
LINK_FLAGS
"-Wl,--retain-symbols-file
${
CMAKE_CURRENT_SOURCE_DIR
}
/paddle_fluid.sym"
)
...
@@ -32,6 +32,7 @@ endif()
...
@@ -32,6 +32,7 @@ endif()
# Create shared library
# Create shared library
cc_library
(
paddle_fluid_shared SHARED
cc_library
(
paddle_fluid_shared SHARED
SRCS io.cc
${
CMAKE_CURRENT_SOURCE_DIR
}
/api/api.cc
${
CMAKE_CURRENT_SOURCE_DIR
}
/api/api_impl.cc
SRCS io.cc
${
CMAKE_CURRENT_SOURCE_DIR
}
/api/api.cc
${
CMAKE_CURRENT_SOURCE_DIR
}
/api/api_impl.cc
${
CMAKE_CURRENT_SOURCE_DIR
}
/api/analysis_predictor.cc
DEPS
${
fluid_modules
}
paddle_fluid_api
)
DEPS
${
fluid_modules
}
paddle_fluid_api
)
set_target_properties
(
paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid
)
set_target_properties
(
paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid
)
...
...
paddle/fluid/inference/analysis/CMakeLists.txt
浏览文件 @
459d4cc8
...
@@ -33,7 +33,7 @@ function (inference_analysis_test TARGET)
...
@@ -33,7 +33,7 @@ function (inference_analysis_test TARGET)
endif
()
endif
()
cc_test
(
${
TARGET
}
cc_test
(
${
TARGET
}
SRCS
"
${
analysis_test_SRCS
}
"
SRCS
"
${
analysis_test_SRCS
}
"
DEPS analysis
graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detector pass
${
analysis_test_EXTRA_DEPS
}
DEPS analysis
pass
${
GLOB_PASS_LIB
}
${
analysis_test_EXTRA_DEPS
}
ARGS --inference_model_dir=
${
PYTHON_TESTS_DIR
}
/book/word2vec.inference.model
${
mem_opt
}
${
analysis_test_ARGS
}
)
ARGS --inference_model_dir=
${
PYTHON_TESTS_DIR
}
/book/word2vec.inference.model
${
mem_opt
}
${
analysis_test_ARGS
}
)
set_tests_properties
(
${
TARGET
}
PROPERTIES DEPENDS test_word2vec
)
set_tests_properties
(
${
TARGET
}
PROPERTIES DEPENDS test_word2vec
)
endif
(
WITH_TESTING
)
endif
(
WITH_TESTING
)
...
@@ -56,25 +56,13 @@ if (NOT EXISTS ${DITU_INSTALL_DIR} AND WITH_TESTING)
...
@@ -56,25 +56,13 @@ if (NOT EXISTS ${DITU_INSTALL_DIR} AND WITH_TESTING)
endif
()
endif
()
inference_analysis_test
(
test_analyzer SRCS analyzer_tester.cc
inference_analysis_test
(
test_analyzer SRCS analyzer_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
analysis_predictor
# ir
fc_fuse_pass
fc_lstm_fuse_pass
seq_concat_fc_fuse_pass
graph_viz_pass
infer_clean_graph_pass
graph_pattern_detector
infer_clean_graph_pass
attention_lstm_fuse_pass
paddle_inference_api
pass
ARGS --infer_ditu_rnn_model=
${
DITU_INSTALL_DIR
}
/model
ARGS --infer_ditu_rnn_model=
${
DITU_INSTALL_DIR
}
/model
--infer_ditu_rnn_data=
${
DITU_INSTALL_DIR
}
/data.txt
)
--infer_ditu_rnn_data=
${
DITU_INSTALL_DIR
}
/data.txt
)
inference_analysis_test
(
test_data_flow_graph SRCS data_flow_graph_tester.cc
)
inference_analysis_test
(
test_data_flow_graph SRCS data_flow_graph_tester.cc
)
inference_analysis_test
(
test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc
EXTRA_DEPS paddle_inference_api
)
inference_analysis_test
(
test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc
)
inference_analysis_test
(
test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc
EXTRA_DEPS paddle_fluid
)
inference_analysis_test
(
test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc
)
inference_analysis_test
(
test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc
)
inference_analysis_test
(
test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc
)
inference_analysis_test
(
test_subgraph_splitter SRCS subgraph_splitter_tester.cc
)
inference_analysis_test
(
test_subgraph_splitter SRCS subgraph_splitter_tester.cc
)
inference_analysis_test
(
test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc
)
inference_analysis_test
(
test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc
)
...
...
paddle/fluid/inference/analysis/analyzer_tester.cc
浏览文件 @
459d4cc8
...
@@ -23,6 +23,7 @@
...
@@ -23,6 +23,7 @@
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/inference/utils/singleton.h"
DEFINE_string
(
infer_ditu_rnn_model
,
""
,
"model path for ditu RNN"
);
DEFINE_string
(
infer_ditu_rnn_model
,
""
,
"model path for ditu RNN"
);
...
@@ -329,9 +330,20 @@ void TestDituRNNPrediction(bool use_analysis_and_activate_ir = false,
...
@@ -329,9 +330,20 @@ void TestDituRNNPrediction(bool use_analysis_and_activate_ir = false,
LOG
(
INFO
)
<<
"fused "
<<
item
.
first
<<
" "
<<
item
.
second
;
LOG
(
INFO
)
<<
"fused "
<<
item
.
first
<<
" "
<<
item
.
second
;
}
}
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc"
));
int
num_ops
=
0
;
EXPECT_EQ
(
fuse_statis
.
at
(
"fc"
),
1
);
for
(
auto
&
node
:
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_nobias_lstm_fuse"
),
1
);
analysis_predictor
->
analysis_argument
().
main_dfg
->
nodes
.
nodes
())
{
if
(
node
->
IsFunction
())
{
++
num_ops
;
}
}
LOG
(
INFO
)
<<
"has num ops: "
<<
num_ops
;
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
1
);
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_nobias_lstm_fuse"
),
2
);
// bi-directional LSTM
EXPECT_EQ
(
num_ops
,
13
);
// After graph optimization, only 13 operators exists.
}
}
}
}
...
@@ -348,10 +360,3 @@ TEST(Analyzer, DituRNN_multi_thread) {
...
@@ -348,10 +360,3 @@ TEST(Analyzer, DituRNN_multi_thread) {
}
// namespace analysis
}
// namespace analysis
}
// namespace inference
}
// namespace inference
}
// namespace paddle
}
// namespace paddle
USE_PASS
(
fc_fuse_pass
);
USE_PASS
(
seq_concat_fc_fuse_pass
);
USE_PASS
(
fc_lstm_fuse_pass
);
USE_PASS
(
graph_viz_pass
);
USE_PASS
(
infer_clean_graph_pass
);
USE_PASS
(
attention_lstm_fuse_pass
);
paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
浏览文件 @
459d4cc8
...
@@ -16,6 +16,7 @@
...
@@ -16,6 +16,7 @@
#include <gtest/gtest.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
...
@@ -33,10 +34,3 @@ TEST(FluidToIrPass, Test) {
...
@@ -33,10 +34,3 @@ TEST(FluidToIrPass, Test) {
}
// namespace analysis
}
// namespace analysis
}
// namespace inference
}
// namespace inference
}
// namespace paddle
}
// namespace paddle
USE_PASS
(
graph_viz_pass
);
USE_PASS
(
infer_clean_graph_pass
);
USE_PASS
(
attention_lstm_fuse_pass
);
USE_PASS
(
fc_lstm_fuse_pass
);
USE_PASS
(
seq_concat_fc_fuse_pass
);
USE_PASS
(
fc_fuse_pass
);
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
459d4cc8
...
@@ -18,10 +18,7 @@ if(APPLE)
...
@@ -18,10 +18,7 @@ if(APPLE)
endif
(
APPLE
)
endif
(
APPLE
)
set
(
inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager
set
(
inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager
${
GLOB_PASS_LIB
}
)
graph_viz_pass fc_fuse_pass
infer_clean_graph_pass
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
set
(
inference_deps
${
inference_deps
}
paddle_inference_tensorrt_subgraph_engine
)
set
(
inference_deps
${
inference_deps
}
paddle_inference_tensorrt_subgraph_engine
)
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
459d4cc8
...
@@ -20,6 +20,7 @@
...
@@ -20,6 +20,7 @@
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/inference/utils/singleton.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -132,7 +133,3 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
...
@@ -132,7 +133,3 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
}
}
}
// namespace paddle
}
// namespace paddle
USE_PASS
(
fc_fuse_pass
);
USE_PASS
(
graph_viz_pass
);
USE_PASS
(
infer_clean_graph_pass
);
paddle/fluid/inference/api/helper.h
浏览文件 @
459d4cc8
...
@@ -17,6 +17,7 @@
...
@@ -17,6 +17,7 @@
#include <glog/logging.h>
#include <glog/logging.h>
#include <sys/time.h>
#include <sys/time.h>
#include <algorithm>
#include <algorithm>
#include <numeric>
#include <sstream>
#include <sstream>
#include <string>
#include <string>
#include <vector>
#include <vector>
...
...
paddle/fluid/inference/paddle_fluid.map
浏览文件 @
459d4cc8
{
{
global:
global:
*paddle*;
*paddle*;
*Pass*;
local:
local:
*;
*;
};
};
paddle/fluid/operators/activation_op.h
浏览文件 @
459d4cc8
...
@@ -865,8 +865,8 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
...
@@ -865,8 +865,8 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
auto
temp1
=
static_cast
<
T
>
(
1
)
/
auto
temp1
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
(
static_cast
<
T
>
(
-
beta
)
*
x
).
exp
());
(
static_cast
<
T
>
(
1
)
+
(
static_cast
<
T
>
(
-
beta
)
*
x
).
exp
());
auto
temp2
=
temp1
*
(
static_cast
<
T
>
(
1
)
-
(
beta
*
out
));
auto
temp2
=
temp1
*
(
static_cast
<
T
>
(
1
)
-
(
static_cast
<
T
>
(
beta
)
*
out
));
dx
.
device
(
d
)
=
dout
*
((
beta
*
out
)
+
temp2
);
dx
.
device
(
d
)
=
dout
*
((
static_cast
<
T
>
(
beta
)
*
out
)
+
temp2
);
}
}
};
};
...
...
paddle/fluid/operators/attention_lstm_op.cc
浏览文件 @
459d4cc8
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/attention_lstm_op.h"
#include "paddle/fluid/operators/attention_lstm_op.h"
#include <sys/time.h>
#include <string>
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
...
...
paddle/fluid/operators/detection/bbox_util.h
0 → 100644
浏览文件 @
459d4cc8
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
namespace
paddle
{
namespace
operators
{
/*
* transform that computes target bounding-box regression deltas
* given proposal boxes and ground-truth boxes.
*/
template
<
typename
T
>
inline
void
BoxToDelta
(
const
int
box_num
,
const
framework
::
Tensor
&
ex_boxes
,
const
framework
::
Tensor
&
gt_boxes
,
const
T
*
weights
,
const
bool
normalized
,
framework
::
Tensor
*
box_delta
)
{
auto
ex_boxes_et
=
framework
::
EigenTensor
<
T
,
2
>::
From
(
ex_boxes
);
auto
gt_boxes_et
=
framework
::
EigenTensor
<
T
,
2
>::
From
(
gt_boxes
);
auto
trg
=
framework
::
EigenTensor
<
T
,
2
>::
From
(
*
box_delta
);
T
ex_w
,
ex_h
,
ex_ctr_x
,
ex_ctr_y
,
gt_w
,
gt_h
,
gt_ctr_x
,
gt_ctr_y
;
for
(
int64_t
i
=
0
;
i
<
box_num
;
++
i
)
{
ex_w
=
ex_boxes_et
(
i
,
2
)
-
ex_boxes_et
(
i
,
0
)
+
(
normalized
==
false
);
ex_h
=
ex_boxes_et
(
i
,
3
)
-
ex_boxes_et
(
i
,
1
)
+
(
normalized
==
false
);
ex_ctr_x
=
ex_boxes_et
(
i
,
0
)
+
0.5
*
ex_w
;
ex_ctr_y
=
ex_boxes_et
(
i
,
1
)
+
0.5
*
ex_h
;
gt_w
=
gt_boxes_et
(
i
,
2
)
-
gt_boxes_et
(
i
,
0
)
+
(
normalized
==
false
);
gt_h
=
gt_boxes_et
(
i
,
3
)
-
gt_boxes_et
(
i
,
1
)
+
(
normalized
==
false
);
gt_ctr_x
=
gt_boxes_et
(
i
,
0
)
+
0.5
*
gt_w
;
gt_ctr_y
=
gt_boxes_et
(
i
,
1
)
+
0.5
*
gt_h
;
trg
(
i
,
0
)
=
(
gt_ctr_x
-
ex_ctr_x
)
/
ex_w
;
trg
(
i
,
1
)
=
(
gt_ctr_y
-
ex_ctr_y
)
/
ex_h
;
trg
(
i
,
2
)
=
std
::
log
(
gt_w
/
ex_w
);
trg
(
i
,
3
)
=
std
::
log
(
gt_h
/
ex_h
);
if
(
weights
)
{
trg
(
i
,
0
)
=
trg
(
i
,
0
)
/
weights
[
0
];
trg
(
i
,
1
)
=
trg
(
i
,
1
)
/
weights
[
1
];
trg
(
i
,
2
)
=
trg
(
i
,
2
)
/
weights
[
2
];
trg
(
i
,
3
)
=
trg
(
i
,
3
)
/
weights
[
3
];
}
}
}
template
<
typename
T
>
void
Gather
(
const
T
*
in
,
const
int
in_stride
,
const
int
*
index
,
const
int
num
,
T
*
out
)
{
const
int
stride_bytes
=
in_stride
*
sizeof
(
T
);
for
(
int
i
=
0
;
i
<
num
;
++
i
)
{
int
id
=
index
[
i
];
memcpy
(
out
+
i
*
in_stride
,
in
+
id
*
in_stride
,
stride_bytes
);
}
}
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/detection/generate_proposal_labels_op.cc
浏览文件 @
459d4cc8
...
@@ -14,6 +14,7 @@ limitations under the License. */
...
@@ -14,6 +14,7 @@ limitations under the License. */
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/bbox_util.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function.h"
...
@@ -133,31 +134,6 @@ void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes,
...
@@ -133,31 +134,6 @@ void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes,
}
}
}
}
template
<
typename
T
>
void
BoxToDelta
(
int
box_num
,
const
Tensor
&
ex_boxes
,
const
Tensor
&
gt_boxes
,
const
std
::
vector
<
float
>&
weights
,
Tensor
*
box_delta
)
{
auto
ex_boxes_et
=
framework
::
EigenTensor
<
T
,
2
>::
From
(
ex_boxes
);
auto
gt_boxes_et
=
framework
::
EigenTensor
<
T
,
2
>::
From
(
gt_boxes
);
auto
box_delta_et
=
framework
::
EigenTensor
<
T
,
2
>::
From
(
*
box_delta
);
T
ex_w
,
ex_h
,
ex_ctr_x
,
ex_ctr_y
,
gt_w
,
gt_h
,
gt_ctr_x
,
gt_ctr_y
;
for
(
int64_t
i
=
0
;
i
<
box_num
;
++
i
)
{
ex_w
=
ex_boxes_et
(
i
,
2
)
-
ex_boxes_et
(
i
,
0
)
+
1
;
ex_h
=
ex_boxes_et
(
i
,
3
)
-
ex_boxes_et
(
i
,
1
)
+
1
;
ex_ctr_x
=
ex_boxes_et
(
i
,
0
)
+
0.5
*
ex_w
;
ex_ctr_y
=
ex_boxes_et
(
i
,
1
)
+
0.5
*
ex_h
;
gt_w
=
gt_boxes_et
(
i
,
2
)
-
gt_boxes_et
(
i
,
0
)
+
1
;
gt_h
=
gt_boxes_et
(
i
,
3
)
-
gt_boxes_et
(
i
,
1
)
+
1
;
gt_ctr_x
=
gt_boxes_et
(
i
,
0
)
+
0.5
*
gt_w
;
gt_ctr_y
=
gt_boxes_et
(
i
,
1
)
+
0.5
*
gt_h
;
box_delta_et
(
i
,
0
)
=
(
gt_ctr_x
-
ex_ctr_x
)
/
ex_w
/
weights
[
0
];
box_delta_et
(
i
,
1
)
=
(
gt_ctr_y
-
ex_ctr_y
)
/
ex_h
/
weights
[
1
];
box_delta_et
(
i
,
2
)
=
log
(
gt_w
/
ex_w
)
/
ex_w
/
weights
[
2
];
box_delta_et
(
i
,
3
)
=
log
(
gt_h
/
ex_h
)
/
ex_h
/
weights
[
3
];
}
}
template
<
typename
T
>
template
<
typename
T
>
std
::
vector
<
std
::
vector
<
int
>>
SampleFgBgGt
(
std
::
vector
<
std
::
vector
<
int
>>
SampleFgBgGt
(
const
platform
::
CPUDeviceContext
&
context
,
Tensor
*
iou
,
const
platform
::
CPUDeviceContext
&
context
,
Tensor
*
iou
,
...
@@ -243,12 +219,11 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
...
@@ -243,12 +219,11 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
Tensor
*
sampled_labels
,
Tensor
*
sampled_gts
)
{
Tensor
*
sampled_labels
,
Tensor
*
sampled_gts
)
{
int
fg_num
=
fg_inds
.
size
();
int
fg_num
=
fg_inds
.
size
();
int
bg_num
=
bg_inds
.
size
();
int
bg_num
=
bg_inds
.
size
();
int
gt_num
=
fg_num
+
bg_num
;
Tensor
fg_inds_t
,
bg_inds_t
,
gt_box_inds_t
,
gt_label_inds_t
;
Tensor
fg_inds_t
,
bg_inds_t
,
gt_box_inds_t
,
gt_label_inds_t
;
int
*
fg_inds_data
=
fg_inds_t
.
mutable_data
<
int
>
({
fg_num
},
context
.
GetPlace
());
int
*
fg_inds_data
=
fg_inds_t
.
mutable_data
<
int
>
({
fg_num
},
context
.
GetPlace
());
int
*
bg_inds_data
=
bg_inds_t
.
mutable_data
<
int
>
({
bg_num
},
context
.
GetPlace
());
int
*
bg_inds_data
=
bg_inds_t
.
mutable_data
<
int
>
({
bg_num
},
context
.
GetPlace
());
int
*
gt_box_inds_data
=
int
*
gt_box_inds_data
=
gt_box_inds_t
.
mutable_data
<
int
>
({
gt
_num
},
context
.
GetPlace
());
gt_box_inds_t
.
mutable_data
<
int
>
({
fg
_num
},
context
.
GetPlace
());
int
*
gt_label_inds_data
=
int
*
gt_label_inds_data
=
gt_label_inds_t
.
mutable_data
<
int
>
({
fg_num
},
context
.
GetPlace
());
gt_label_inds_t
.
mutable_data
<
int
>
({
fg_num
},
context
.
GetPlace
());
std
::
copy
(
fg_inds
.
begin
(),
fg_inds
.
end
(),
fg_inds_data
);
std
::
copy
(
fg_inds
.
begin
(),
fg_inds
.
end
(),
fg_inds_data
);
...
@@ -303,18 +278,20 @@ std::vector<Tensor> SampleRoisForOneImage(
...
@@ -303,18 +278,20 @@ std::vector<Tensor> SampleRoisForOneImage(
// Gather boxes and labels
// Gather boxes and labels
Tensor
sampled_boxes
,
sampled_labels
,
sampled_gts
;
Tensor
sampled_boxes
,
sampled_labels
,
sampled_gts
;
int
boxes_num
=
fg_inds
.
size
()
+
bg_inds
.
size
();
int
fg_num
=
fg_inds
.
size
();
int
bg_num
=
bg_inds
.
size
();
int
boxes_num
=
fg_num
+
bg_num
;
framework
::
DDim
bbox_dim
({
boxes_num
,
kBoxDim
});
framework
::
DDim
bbox_dim
({
boxes_num
,
kBoxDim
});
sampled_boxes
.
mutable_data
<
T
>
(
bbox_dim
,
context
.
GetPlace
());
sampled_boxes
.
mutable_data
<
T
>
(
bbox_dim
,
context
.
GetPlace
());
sampled_labels
.
mutable_data
<
int
>
({
boxes_num
},
context
.
GetPlace
());
sampled_labels
.
mutable_data
<
int
>
({
boxes_num
},
context
.
GetPlace
());
sampled_gts
.
mutable_data
<
T
>
(
bbox_dim
,
context
.
GetPlace
());
sampled_gts
.
mutable_data
<
T
>
(
{
fg_num
,
kBoxDim
}
,
context
.
GetPlace
());
GatherBoxesLabels
<
T
>
(
context
,
boxes
,
*
gt_boxes
,
*
gt_classes
,
fg_inds
,
bg_inds
,
GatherBoxesLabels
<
T
>
(
context
,
boxes
,
*
gt_boxes
,
*
gt_classes
,
fg_inds
,
bg_inds
,
gt_inds
,
&
sampled_boxes
,
&
sampled_labels
,
&
sampled_gts
);
gt_inds
,
&
sampled_boxes
,
&
sampled_labels
,
&
sampled_gts
);
// Compute targets
// Compute targets
Tensor
bbox_targets_single
;
Tensor
bbox_targets_single
;
bbox_targets_single
.
mutable_data
<
T
>
(
bbox_dim
,
context
.
GetPlace
());
bbox_targets_single
.
mutable_data
<
T
>
(
bbox_dim
,
context
.
GetPlace
());
BoxToDelta
<
T
>
(
boxes_num
,
sampled_boxes
,
sampled_gts
,
bbox_reg_weights
,
BoxToDelta
<
T
>
(
fg_num
,
sampled_boxes
,
sampled_gts
,
nullptr
,
false
,
&
bbox_targets_single
);
&
bbox_targets_single
);
// Scale rois
// Scale rois
...
@@ -427,7 +404,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
...
@@ -427,7 +404,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
auto
rpn_rois_lod
=
rpn_rois
->
lod
().
back
();
auto
rpn_rois_lod
=
rpn_rois
->
lod
().
back
();
auto
gt_classes_lod
=
gt_classes
->
lod
().
back
();
auto
gt_classes_lod
=
gt_classes
->
lod
().
back
();
auto
gt_boxes_lod
=
gt_boxes
->
lod
().
back
();
auto
gt_boxes_lod
=
gt_boxes
->
lod
().
back
();
for
(
size_
t
i
=
0
;
i
<
n
;
++
i
)
{
for
(
in
t
i
=
0
;
i
<
n
;
++
i
)
{
Tensor
rpn_rois_slice
=
Tensor
rpn_rois_slice
=
rpn_rois
->
Slice
(
rpn_rois_lod
[
i
],
rpn_rois_lod
[
i
+
1
]);
rpn_rois
->
Slice
(
rpn_rois_lod
[
i
],
rpn_rois_lod
[
i
+
1
]);
Tensor
gt_classes_slice
=
Tensor
gt_classes_slice
=
...
...
paddle/fluid/operators/detection/generate_proposals_op.cc
浏览文件 @
459d4cc8
...
@@ -311,8 +311,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
...
@@ -311,8 +311,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
rpn_rois
->
mutable_data
<
T
>
({
bbox_deltas
->
numel
()
/
4
,
4
},
rpn_rois
->
mutable_data
<
T
>
({
bbox_deltas
->
numel
()
/
4
,
4
},
context
.
GetPlace
());
context
.
GetPlace
());
rpn_roi_probs
->
mutable_data
<
T
>
({
scores
->
numel
()
/
4
,
1
},
rpn_roi_probs
->
mutable_data
<
T
>
({
scores
->
numel
(),
1
},
context
.
GetPlace
());
context
.
GetPlace
());
Tensor
bbox_deltas_swap
,
scores_swap
;
Tensor
bbox_deltas_swap
,
scores_swap
;
bbox_deltas_swap
.
mutable_data
<
T
>
({
num
,
h_bbox
,
w_bbox
,
c_bbox
},
bbox_deltas_swap
.
mutable_data
<
T
>
({
num
,
h_bbox
,
w_bbox
,
c_bbox
},
...
@@ -421,7 +420,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
...
@@ -421,7 +420,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
CPUGather
<
T
>
(
ctx
,
proposals
,
keep
,
&
bbox_sel
);
CPUGather
<
T
>
(
ctx
,
proposals
,
keep
,
&
bbox_sel
);
CPUGather
<
T
>
(
ctx
,
scores_sel
,
keep
,
&
scores_filter
);
CPUGather
<
T
>
(
ctx
,
scores_sel
,
keep
,
&
scores_filter
);
if
(
nms_thresh
<=
0
)
{
if
(
nms_thresh
<=
0
)
{
return
std
::
make_pair
(
bbox_sel
,
scores_
sel
);
return
std
::
make_pair
(
bbox_sel
,
scores_
filter
);
}
}
Tensor
keep_nms
=
NMS
<
T
>
(
ctx
,
&
bbox_sel
,
&
scores_filter
,
nms_thresh
,
eta
);
Tensor
keep_nms
=
NMS
<
T
>
(
ctx
,
&
bbox_sel
,
&
scores_filter
,
nms_thresh
,
eta
);
...
...
paddle/fluid/operators/detection/rpn_target_assign_op.cc
浏览文件 @
459d4cc8
...
@@ -14,6 +14,7 @@ limitations under the License. */
...
@@ -14,6 +14,7 @@ limitations under the License. */
#include <random>
#include <random>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/bbox_util.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -46,156 +47,219 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
...
@@ -46,156 +47,219 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
auto
in_dims
=
ctx
->
GetInputDim
(
"DistMat"
);
auto
in_dims
=
ctx
->
GetInputDim
(
"DistMat"
);
PADDLE_ENFORCE_EQ
(
in_dims
.
size
(),
2
,
PADDLE_ENFORCE_EQ
(
in_dims
.
size
(),
2
,
"The rank of Input(DistMat) must be 2."
);
"The rank of Input(DistMat) must be 2."
);
ctx
->
SetOutputDim
(
"LocationIndex"
,
{
-
1
});
ctx
->
SetOutputDim
(
"ScoreIndex"
,
{
-
1
});
ctx
->
SetOutputDim
(
"TargetLabel"
,
{
-
1
,
1
});
ctx
->
SetOutputDim
(
"TargetBBox"
,
{
-
1
,
4
});
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
"DistMat"
)
->
type
()),
platform
::
CPUPlace
());
}
}
};
};
template
<
typename
T
>
template
<
typename
T
>
class
RpnTargetAssignKernel
:
public
framework
::
OpKernel
<
T
>
{
class
RpnTargetAssignKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
anchor_t
=
context
.
Input
<
Tensor
>
(
"Anchor"
);
// (H*W*A) * 4
auto
*
gt_bbox_t
=
context
.
Input
<
Tensor
>
(
"GtBox"
);
auto
*
dist_t
=
context
.
Input
<
LoDTensor
>
(
"DistMat"
);
auto
*
loc_index_t
=
context
.
Output
<
Tensor
>
(
"LocationIndex"
);
auto
*
score_index_t
=
context
.
Output
<
Tensor
>
(
"ScoreIndex"
);
auto
*
tgt_bbox_t
=
context
.
Output
<
Tensor
>
(
"TargetBBox"
);
auto
*
tgt_lbl_t
=
context
.
Output
<
Tensor
>
(
"TargetLabel"
);
auto
lod
=
dist_t
->
lod
().
back
();
int64_t
batch_num
=
static_cast
<
int64_t
>
(
lod
.
size
()
-
1
);
int64_t
anchor_num
=
dist_t
->
dims
()[
1
];
PADDLE_ENFORCE_EQ
(
anchor_num
,
anchor_t
->
dims
()[
0
]);
int
rpn_batch_size
=
context
.
Attr
<
int
>
(
"rpn_batch_size_per_im"
);
float
pos_threshold
=
context
.
Attr
<
float
>
(
"rpn_positive_overlap"
);
float
neg_threshold
=
context
.
Attr
<
float
>
(
"rpn_negative_overlap"
);
float
fg_fraction
=
context
.
Attr
<
float
>
(
"fg_fraction"
);
int
fg_num_per_batch
=
static_cast
<
int
>
(
rpn_batch_size
*
fg_fraction
);
int64_t
max_num
=
batch_num
*
anchor_num
;
auto
place
=
context
.
GetPlace
();
tgt_bbox_t
->
mutable_data
<
T
>
({
max_num
,
4
},
place
);
auto
*
loc_index
=
loc_index_t
->
mutable_data
<
int
>
({
max_num
},
place
);
auto
*
score_index
=
score_index_t
->
mutable_data
<
int
>
({
max_num
},
place
);
Tensor
tmp_tgt_lbl
;
auto
*
tmp_lbl_data
=
tmp_tgt_lbl
.
mutable_data
<
int64_t
>
({
max_num
},
place
);
auto
&
dev_ctx
=
context
.
device_context
<
platform
::
CPUDeviceContext
>
();
math
::
SetConstant
<
platform
::
CPUDeviceContext
,
int64_t
>
iset
;
iset
(
dev_ctx
,
&
tmp_tgt_lbl
,
static_cast
<
int64_t
>
(
-
1
));
std
::
random_device
rnd
;
std
::
minstd_rand
engine
;
int
seed
=
context
.
Attr
<
bool
>
(
"fix_seed"
)
?
context
.
Attr
<
int
>
(
"seed"
)
:
rnd
();
engine
.
seed
(
seed
);
int
fg_num
=
0
;
int
bg_num
=
0
;
for
(
int
i
=
0
;
i
<
batch_num
;
++
i
)
{
Tensor
dist
=
dist_t
->
Slice
(
lod
[
i
],
lod
[
i
+
1
]);
Tensor
gt_bbox
=
gt_bbox_t
->
Slice
(
lod
[
i
],
lod
[
i
+
1
]);
auto
fg_bg_gt
=
SampleFgBgGt
(
dev_ctx
,
dist
,
pos_threshold
,
neg_threshold
,
rpn_batch_size
,
fg_num_per_batch
,
engine
,
tmp_lbl_data
+
i
*
anchor_num
);
int
cur_fg_num
=
fg_bg_gt
[
0
].
size
();
int
cur_bg_num
=
fg_bg_gt
[
1
].
size
();
std
::
transform
(
fg_bg_gt
[
0
].
begin
(),
fg_bg_gt
[
0
].
end
(),
loc_index
,
[
i
,
anchor_num
](
int
d
)
{
return
d
+
i
*
anchor_num
;
});
memcpy
(
score_index
,
loc_index
,
cur_fg_num
*
sizeof
(
int
));
std
::
transform
(
fg_bg_gt
[
1
].
begin
(),
fg_bg_gt
[
1
].
end
(),
score_index
+
cur_fg_num
,
[
i
,
anchor_num
](
int
d
)
{
return
d
+
i
*
anchor_num
;
});
// get target bbox deltas
if
(
cur_fg_num
)
{
Tensor
fg_gt
;
T
*
gt_data
=
fg_gt
.
mutable_data
<
T
>
({
cur_fg_num
,
4
},
place
);
Tensor
tgt_bbox
=
tgt_bbox_t
->
Slice
(
fg_num
,
fg_num
+
cur_fg_num
);
T
*
tgt_data
=
tgt_bbox
.
data
<
T
>
();
Gather
<
T
>
(
anchor_t
->
data
<
T
>
(),
4
,
reinterpret_cast
<
int
*>
(
&
fg_bg_gt
[
0
][
0
]),
cur_fg_num
,
tgt_data
);
Gather
<
T
>
(
gt_bbox
.
data
<
T
>
(),
4
,
reinterpret_cast
<
int
*>
(
&
fg_bg_gt
[
2
][
0
]),
cur_fg_num
,
gt_data
);
BoxToDelta
<
T
>
(
cur_fg_num
,
tgt_bbox
,
fg_gt
,
nullptr
,
false
,
&
tgt_bbox
);
}
loc_index
+=
cur_fg_num
;
score_index
+=
cur_fg_num
+
cur_bg_num
;
fg_num
+=
cur_fg_num
;
bg_num
+=
cur_bg_num
;
}
int
lbl_num
=
fg_num
+
bg_num
;
PADDLE_ENFORCE_LE
(
fg_num
,
max_num
);
PADDLE_ENFORCE_LE
(
lbl_num
,
max_num
);
tgt_bbox_t
->
Resize
({
fg_num
,
4
});
loc_index_t
->
Resize
({
fg_num
});
score_index_t
->
Resize
({
lbl_num
});
auto
*
lbl_data
=
tgt_lbl_t
->
mutable_data
<
int64_t
>
({
lbl_num
,
1
},
place
);
Gather
<
int64_t
>
(
tmp_lbl_data
,
1
,
score_index_t
->
data
<
int
>
(),
lbl_num
,
lbl_data
);
}
private:
void
ScoreAssign
(
const
T
*
dist_data
,
const
Tensor
&
anchor_to_gt_max
,
void
ScoreAssign
(
const
T
*
dist_data
,
const
Tensor
&
anchor_to_gt_max
,
const
int
row
,
const
int
col
,
const
float
pos_threshold
,
const
int
row
,
const
int
col
,
const
float
pos_threshold
,
const
float
neg_threshold
,
int64_t
*
target_label
_data
,
const
float
neg_threshold
,
int64_t
*
target_label
,
std
::
vector
<
int
>*
fg_inds
,
std
::
vector
<
int
>*
bg_inds
)
const
{
std
::
vector
<
int
>*
fg_inds
,
std
::
vector
<
int
>*
bg_inds
)
const
{
int
fg_offset
=
fg_inds
->
size
();
float
epsilon
=
0.0001
;
int
bg_offset
=
bg_inds
->
size
();
for
(
int64_t
i
=
0
;
i
<
row
;
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
row
;
++
i
)
{
const
T
*
v
=
dist_data
+
i
*
col
;
const
T
*
v
=
dist_data
+
i
*
col
;
T
max
_dist
=
*
std
::
max_element
(
v
,
v
+
col
);
T
max
=
*
std
::
max_element
(
v
,
v
+
col
);
for
(
int64_t
j
=
0
;
j
<
col
;
++
j
)
{
for
(
int64_t
j
=
0
;
j
<
col
;
++
j
)
{
T
val
=
dist_data
[
i
*
col
+
j
];
if
(
std
::
abs
(
max
-
v
[
j
])
<
epsilon
)
{
if
(
val
==
max_dist
)
target_label_data
[
j
]
=
1
;
target_label
[
j
]
=
1
;
}
}
}
}
}
// Pick the fg/bg and count the number
// Pick the fg/bg
const
T
*
anchor_to_gt_max_data
=
anchor_to_gt_max
.
data
<
T
>
();
for
(
int64_t
j
=
0
;
j
<
col
;
++
j
)
{
for
(
int64_t
j
=
0
;
j
<
col
;
++
j
)
{
if
(
anchor_to_gt_max
.
data
<
T
>
()[
j
]
>
pos_threshold
)
{
if
(
anchor_to_gt_max
_data
[
j
]
>=
pos_threshold
)
{
target_label
_data
[
j
]
=
1
;
target_label
[
j
]
=
1
;
}
else
if
(
anchor_to_gt_max
.
data
<
T
>
()
[
j
]
<
neg_threshold
)
{
}
else
if
(
anchor_to_gt_max
_data
[
j
]
<
neg_threshold
)
{
target_label
_data
[
j
]
=
0
;
target_label
[
j
]
=
0
;
}
}
if
(
target_label
_data
[
j
]
==
1
)
{
if
(
target_label
[
j
]
==
1
)
{
fg_inds
->
push_back
(
fg_offset
+
j
);
fg_inds
->
push_back
(
j
);
}
else
if
(
target_label
_data
[
j
]
==
0
)
{
}
else
if
(
target_label
[
j
]
==
0
)
{
bg_inds
->
push_back
(
bg_offset
+
j
);
bg_inds
->
push_back
(
j
);
}
}
}
}
}
}
void
ReservoirSampling
(
const
int
num
,
const
int
offset
,
void
ReservoirSampling
(
const
int
num
,
std
::
minstd_rand
engine
,
std
::
minstd_rand
engine
,
std
::
vector
<
int
>*
inds
)
const
{
std
::
vector
<
int
>*
inds
)
const
{
std
::
uniform_real_distribution
<
float
>
uniform
(
0
,
1
);
std
::
uniform_real_distribution
<
float
>
uniform
(
0
,
1
);
const
int64_t
size
=
static_cast
<
int64_t
>
(
inds
->
size
()
-
offset
);
size_t
len
=
inds
->
size
(
);
if
(
size
>
num
)
{
if
(
len
>
static_cast
<
size_t
>
(
num
)
)
{
for
(
int64_t
i
=
num
;
i
<
size
;
++
i
)
{
for
(
size_t
i
=
num
;
i
<
len
;
++
i
)
{
int
rng_ind
=
std
::
floor
(
uniform
(
engine
)
*
i
);
int
rng_ind
=
std
::
floor
(
uniform
(
engine
)
*
i
);
if
(
rng_ind
<
num
)
if
(
rng_ind
<
num
)
std
::
iter_swap
(
inds
->
begin
()
+
rng_ind
+
offset
,
std
::
iter_swap
(
inds
->
begin
()
+
rng_ind
,
inds
->
begin
()
+
i
);
inds
->
begin
()
+
i
+
offset
);
}
}
inds
->
resize
(
num
);
}
}
}
}
void
RpnTargetAssign
(
const
framework
::
ExecutionContext
&
ctx
,
// std::vector<std::vector<int>> RpnTargetAssign(
const
Tensor
&
dist
,
const
float
pos_threshold
,
std
::
vector
<
std
::
vector
<
int
>>
SampleFgBgGt
(
const
float
neg_threshold
,
const
int
rpn_batch_size
,
const
platform
::
CPUDeviceContext
&
ctx
,
const
Tensor
&
dist
,
const
int
fg_num
,
std
::
minstd_rand
engine
,
const
float
pos_threshold
,
const
float
neg_threshold
,
std
::
vector
<
int
>*
fg_inds
,
std
::
vector
<
int
>*
bg_inds
,
const
int
rpn_batch_size
,
const
int
fg_num
,
std
::
minstd_rand
engine
,
int64_t
*
target_label_data
)
const
{
int64_t
*
target_label
)
const
{
auto
*
dist_data
=
dist
.
data
<
T
>
();
auto
*
dist_data
=
dist
.
data
<
T
>
();
int64_t
row
=
dist
.
dims
()[
0
];
int
row
=
dist
.
dims
()[
0
];
int64_t
col
=
dist
.
dims
()[
1
];
int
col
=
dist
.
dims
()[
1
];
int
fg_offset
=
fg_inds
->
size
();
int
bg_offset
=
bg_inds
->
size
();
std
::
vector
<
int
>
fg_inds
;
std
::
vector
<
int
>
bg_inds
;
std
::
vector
<
int
>
gt_inds
;
// Calculate the max IoU between anchors and gt boxes
// Calculate the max IoU between anchors and gt boxes
Tensor
anchor_to_gt_max
;
// Map from anchor to gt box that has highest overlap
anchor_to_gt_max
.
mutable_data
<
T
>
(
auto
place
=
ctx
.
GetPlace
();
framework
::
make_ddim
({
static_cast
<
int64_t
>
(
col
),
1
}),
Tensor
anchor_to_gt_max
,
anchor_to_gt_argmax
;
platform
::
CPUPlace
());
anchor_to_gt_max
.
mutable_data
<
T
>
({
col
},
place
);
auto
&
place
=
*
ctx
.
template
device_context
<
platform
::
CPUDeviceContext
>()
int
*
argmax
=
anchor_to_gt_argmax
.
mutable_data
<
int
>
({
col
},
place
);
.
eigen_device
();
auto
x
=
EigenMatrix
<
T
>::
From
(
dist
);
auto
x
=
framework
::
EigenMatrix
<
T
>::
From
(
dist
);
auto
x_col_max
=
EigenMatrix
<
T
>::
From
(
anchor_to_gt_max
);
auto
x_col_max
=
framework
::
EigenVector
<
T
>::
Flatten
(
anchor_to_gt_max
);
x_col_max
.
device
(
place
)
=
auto
x_col_argmax
=
x
.
maximum
(
Eigen
::
DSizes
<
int
,
1
>
(
0
))
framework
::
EigenVector
<
int
>::
Flatten
(
anchor_to_gt_argmax
);
.
reshape
(
Eigen
::
DSizes
<
int
,
2
>
(
static_cast
<
int64_t
>
(
col
),
1
));
x_col_max
=
x
.
maximum
(
Eigen
::
DSizes
<
int
,
1
>
(
0
));
x_col_argmax
=
x
.
argmax
(
0
).
template
cast
<
int
>();
// Follow the Faster RCNN's implementation
// Follow the Faster RCNN's implementation
ScoreAssign
(
dist_data
,
anchor_to_gt_max
,
row
,
col
,
pos_threshold
,
ScoreAssign
(
dist_data
,
anchor_to_gt_max
,
row
,
col
,
pos_threshold
,
neg_threshold
,
target_label
_data
,
fg_inds
,
bg_inds
);
neg_threshold
,
target_label
,
&
fg_inds
,
&
bg_inds
);
// Reservoir Sampling
// Reservoir Sampling
ReservoirSampling
(
fg_num
,
fg_offset
,
engine
,
fg_inds
);
ReservoirSampling
(
fg_num
,
engine
,
&
fg_inds
);
int
bg_num
=
rpn_batch_size
-
(
fg_inds
->
size
()
-
fg_offset
);
int
fg_num2
=
static_cast
<
int
>
(
fg_inds
.
size
()
);
ReservoirSampling
(
bg_num
,
bg_offset
,
engine
,
bg_inds
)
;
int
bg_num
=
rpn_batch_size
-
fg_num2
;
}
ReservoirSampling
(
bg_num
,
engine
,
&
bg_inds
);
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
gt_inds
.
reserve
(
fg_num2
);
auto
*
dist
=
context
.
Input
<
LoDTensor
>
(
"DistMat"
);
for
(
int
i
=
0
;
i
<
fg_num2
;
++
i
)
{
auto
*
loc_index
=
context
.
Output
<
Tensor
>
(
"LocationIndex"
);
gt_inds
.
emplace_back
(
argmax
[
fg_inds
[
i
]]);
auto
*
score_index
=
context
.
Output
<
Tensor
>
(
"ScoreIndex"
);
auto
*
tgt_lbl
=
context
.
Output
<
Tensor
>
(
"TargetLabel"
);
auto
col
=
dist
->
dims
()[
1
];
int64_t
n
=
dist
->
lod
().
size
()
==
0UL
?
1
:
static_cast
<
int64_t
>
(
dist
->
lod
().
back
().
size
()
-
1
);
if
(
dist
->
lod
().
size
())
{
PADDLE_ENFORCE_EQ
(
dist
->
lod
().
size
(),
1UL
,
"Only support 1 level of LoD."
);
}
}
int
rpn_batch_size
=
context
.
Attr
<
int
>
(
"rpn_batch_size_per_im"
);
std
::
vector
<
std
::
vector
<
int
>>
fg_bg_gt
;
float
pos_threshold
=
context
.
Attr
<
float
>
(
"rpn_positive_overlap"
);
fg_bg_gt
.
emplace_back
(
fg_inds
);
float
neg_threshold
=
context
.
Attr
<
float
>
(
"rpn_negative_overlap"
);
fg_bg_gt
.
emplace_back
(
bg_inds
);
float
fg_fraction
=
context
.
Attr
<
float
>
(
"fg_fraction"
);
fg_bg_gt
.
emplace_back
(
gt_inds
);
int
fg_num
=
static_cast
<
int
>
(
rpn_batch_size
*
fg_fraction
);
int64_t
*
target_label_data
=
tgt_lbl
->
mutable_data
<
int64_t
>
({
n
*
col
,
1
},
context
.
GetPlace
());
auto
&
dev_ctx
=
context
.
device_context
<
platform
::
CPUDeviceContext
>
();
return
fg_bg_gt
;
math
::
SetConstant
<
platform
::
CPUDeviceContext
,
int64_t
>
iset
;
iset
(
dev_ctx
,
tgt_lbl
,
static_cast
<
int
>
(
-
1
));
std
::
vector
<
int
>
fg_inds
;
std
::
vector
<
int
>
bg_inds
;
std
::
random_device
rnd
;
std
::
minstd_rand
engine
;
int
seed
=
context
.
Attr
<
bool
>
(
"fix_seed"
)
?
context
.
Attr
<
int
>
(
"seed"
)
:
rnd
();
engine
.
seed
(
seed
);
if
(
n
==
1
)
{
RpnTargetAssign
(
context
,
*
dist
,
pos_threshold
,
neg_threshold
,
rpn_batch_size
,
fg_num
,
engine
,
&
fg_inds
,
&
bg_inds
,
target_label_data
);
}
else
{
auto
lod
=
dist
->
lod
().
back
();
for
(
size_t
i
=
0
;
i
<
lod
.
size
()
-
1
;
++
i
)
{
Tensor
one_ins
=
dist
->
Slice
(
lod
[
i
],
lod
[
i
+
1
]);
RpnTargetAssign
(
context
,
one_ins
,
pos_threshold
,
neg_threshold
,
rpn_batch_size
,
fg_num
,
engine
,
&
fg_inds
,
&
bg_inds
,
target_label_data
+
i
*
col
);
}
}
int
*
loc_index_data
=
loc_index
->
mutable_data
<
int
>
(
{
static_cast
<
int
>
(
fg_inds
.
size
())},
context
.
GetPlace
());
int
*
score_index_data
=
score_index
->
mutable_data
<
int
>
(
{
static_cast
<
int
>
(
fg_inds
.
size
()
+
bg_inds
.
size
())},
context
.
GetPlace
());
memcpy
(
loc_index_data
,
reinterpret_cast
<
int
*>
(
&
fg_inds
[
0
]),
fg_inds
.
size
()
*
sizeof
(
int
));
memcpy
(
score_index_data
,
reinterpret_cast
<
int
*>
(
&
fg_inds
[
0
]),
fg_inds
.
size
()
*
sizeof
(
int
));
memcpy
(
score_index_data
+
fg_inds
.
size
(),
reinterpret_cast
<
int
*>
(
&
bg_inds
[
0
]),
bg_inds
.
size
()
*
sizeof
(
int
));
}
}
};
};
class
RpnTargetAssignOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
class
RpnTargetAssignOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
public:
void
Make
()
override
{
void
Make
()
override
{
AddInput
(
"Anchor"
,
"(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4]."
);
AddInput
(
"GtBox"
,
"(LoDTensor) input groud-truth bbox with shape [K, 4]."
);
AddInput
(
AddInput
(
"DistMat"
,
"DistMat"
,
"(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
"(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
...
@@ -241,12 +305,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -241,12 +305,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
"ScoreIndex"
,
"ScoreIndex"
,
"(Tensor), The indexes of foreground and background anchors in all "
"(Tensor), The indexes of foreground and background anchors in all "
"RPN anchors(The rest anchors are ignored). The shape of the "
"RPN anchors(The rest anchors are ignored). The shape of the "
"ScoreIndex is [F + B], F and B depend on the value of input "
"ScoreIndex is [F + B], F and B are sampled foreground and backgroud "
"tensor and attributes."
);
" number."
);
AddOutput
(
"TargetLabel"
,
AddOutput
(
"TargetBBox"
,
"(Tensor<int64_t>), The target labels of each anchor with shape "
"(Tensor<int64_t>), The target bbox deltas with shape "
"[K * M, 1], "
"[F, 4], F is the sampled foreground number."
);
"K and M is the same as they are in DistMat."
);
AddOutput
(
"TargetLabel"
,
"(Tensor<int64_t>), The target labels of each anchor with shape "
"[F + B, 1], F and B are sampled foreground and backgroud number."
);
AddComment
(
R"DOC(
AddComment
(
R"DOC(
This operator can be, for given the IoU between the ground truth bboxes and the
This operator can be, for given the IoU between the ground truth bboxes and the
anchors, to assign classification and regression targets to each prediction.
anchors, to assign classification and regression targets to each prediction.
...
...
paddle/fluid/operators/elementwise_op_function.h
浏览文件 @
459d4cc8
...
@@ -16,6 +16,7 @@ limitations under the License. */
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include <glog/logging.h>
#include <glog/logging.h>
#include <algorithm>
#include <algorithm>
#include <iterator>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
...
@@ -94,8 +95,11 @@ class RowwiseTransformIterator;
...
@@ -94,8 +95,11 @@ class RowwiseTransformIterator;
template
<
typename
T
,
typename
DeviceContext
>
template
<
typename
T
,
typename
DeviceContext
>
class
MidWiseTransformIterator
;
class
MidWiseTransformIterator
;
// NOTE(dzhwinter): ptrdiff_t in iterator is deperecated in c++17
template
<
typename
T
>
template
<
typename
T
>
class
RowwiseTransformIterator
<
T
,
platform
::
CPUDeviceContext
>
{
class
RowwiseTransformIterator
<
T
,
platform
::
CPUDeviceContext
>
:
public
std
::
iterator
<
std
::
random_access_iterator_tag
,
T
,
std
::
ptrdiff_t
,
T
*
,
T
&>
{
public:
public:
RowwiseTransformIterator
(
const
T
*
ptr
,
int
n
)
:
ptr_
(
ptr
),
i_
(
0
),
n_
(
n
)
{}
RowwiseTransformIterator
(
const
T
*
ptr
,
int
n
)
:
ptr_
(
ptr
),
i_
(
0
),
n_
(
n
)
{}
...
@@ -126,7 +130,9 @@ class RowwiseTransformIterator<T, platform::CPUDeviceContext> {
...
@@ -126,7 +130,9 @@ class RowwiseTransformIterator<T, platform::CPUDeviceContext> {
};
};
template
<
typename
T
>
template
<
typename
T
>
class
MidWiseTransformIterator
<
T
,
platform
::
CPUDeviceContext
>
{
class
MidWiseTransformIterator
<
T
,
platform
::
CPUDeviceContext
>
:
public
std
::
iterator
<
std
::
random_access_iterator_tag
,
T
,
std
::
ptrdiff_t
,
T
*
,
T
&>
{
public:
public:
MidWiseTransformIterator
(
const
T
*
ptr
,
int
n
,
int
post
)
MidWiseTransformIterator
(
const
T
*
ptr
,
int
n
,
int
post
)
:
ptr_
(
ptr
),
i_
(
0
),
j_
(
0
),
n_
(
n
),
post_
(
post
)
{}
:
ptr_
(
ptr
),
i_
(
0
),
j_
(
0
),
n_
(
n
),
post_
(
post
)
{}
...
@@ -479,8 +485,13 @@ void ElemwiseGradComputeNoBroadcast(
...
@@ -479,8 +485,13 @@ void ElemwiseGradComputeNoBroadcast(
const
framework
::
Tensor
&
dout
,
int
axis
,
framework
::
Tensor
*
dx
,
const
framework
::
Tensor
&
dout
,
int
axis
,
framework
::
Tensor
*
dx
,
framework
::
Tensor
*
dy
,
DX_OP
dx_op
,
DY_OP
dy_op
)
{
framework
::
Tensor
*
dy
,
DX_OP
dx_op
,
DY_OP
dy_op
)
{
size_t
N
=
static_cast
<
size_t
>
(
framework
::
product
(
x_dim
));
size_t
N
=
static_cast
<
size_t
>
(
framework
::
product
(
x_dim
));
#if !defined(_WIN32)
platform
::
ForRange
<
DeviceContext
>
for_range
(
platform
::
ForRange
<
DeviceContext
>
for_range
(
ctx
.
template
device_context
<
DeviceContext
>(),
N
);
ctx
.
template
device_context
<
DeviceContext
>(),
N
);
#else
platform
::
ForRange
<
DeviceContext
>
for_range
(
ctx
.
device_context
<
DeviceContext
>
(),
N
);
#endif // !_WIN32
for_range
(
ElemwiseGradNoBroadcast
<
T
,
DX_OP
,
DY_OP
>
{
for_range
(
ElemwiseGradNoBroadcast
<
T
,
DX_OP
,
DY_OP
>
{
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
dx_op
,
dy_op
,
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
dx_op
,
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
...
@@ -633,13 +644,13 @@ void ElementwiseGradCompute(const framework::ExecutionContext &ctx,
...
@@ -633,13 +644,13 @@ void ElementwiseGradCompute(const framework::ExecutionContext &ctx,
template
<
typename
Functor
,
typename
DeviceContext
,
typename
T
,
template
<
typename
Functor
,
typename
DeviceContext
,
typename
T
,
typename
OutType
=
T
>
typename
OutType
=
T
>
void
ElementwiseComputeEx
(
const
framework
::
ExecutionContext
&
ctx
,
void
ElementwiseComputeEx
(
const
framework
::
ExecutionContext
&
ctx
,
const
framework
::
Tensor
*
x
,
const
framework
::
Tensor
*
x
,
const
framework
::
Tensor
*
y
,
int
axis
,
Functor
func
,
const
framework
::
Tensor
*
y
,
int
axis
,
Functor
func
,
framework
::
Tensor
*
z
)
{
framework
::
Tensor
*
z
)
{
TransformFunctor
<
Functor
,
T
,
DeviceContext
,
OutType
>
functor
(
TransformFunctor
<
Functor
,
T
,
DeviceContext
,
OutType
>
functor
(
x
,
y
,
z
,
ctx
.
template
device_context
<
DeviceContext
>(),
func
);
x
,
y
,
z
,
ctx
.
template
device_context
<
DeviceContext
>(),
func
);
auto
x_dims
=
x
->
dims
();
auto
x_dims
=
x
->
dims
();
auto
y_dims_untrimed
=
y
->
dims
();
auto
y_dims_untrimed
=
y
->
dims
();
PADDLE_ENFORCE_GE
(
x_dims
.
size
(),
y_dims_untrimed
.
size
(),
PADDLE_ENFORCE_GE
(
x_dims
.
size
(),
y_dims_untrimed
.
size
(),
...
...
paddle/fluid/operators/fusion_gru_op.cc
浏览文件 @
459d4cc8
...
@@ -13,16 +13,13 @@ See the License for the specific language governing permissions and
...
@@ -13,16 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/fusion_gru_op.h"
#include "paddle/fluid/operators/fusion_gru_op.h"
#include <cstring> // for memcpy
#include <string>
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/gru_compute.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -35,12 +32,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
...
@@ -35,12 +32,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"Input(WeightH) of GRU should not be null."
);
"Input(WeightH) of GRU should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"XX"
),
"Output(XX) of GRU should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"XX"
),
"Output(XX) of GRU should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"
BatchedGate
"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"
ReorderedH0
"
),
"Output(
BatchedGate
) of GRU should not be null."
);
"Output(
ReorderedH0
) of GRU should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Batch
ResetHiddenPrev
"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Batch
edInput
"
),
"Output(Batch
ResetHiddenPrev
) of GRU should not be null."
);
"Output(Batch
edInput
) of GRU should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Batched
Hidden
"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Batched
Out
"
),
"Output(Batched
Hidden
) of GRU should not be null."
);
"Output(Batched
Out
) of GRU should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Hidden"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Hidden"
),
"Output(Hidden) of GRU should not be null."
);
"Output(Hidden) of GRU should not be null."
);
...
@@ -83,12 +80,16 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
...
@@ -83,12 +80,16 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
}
}
framework
::
DDim
out_dims
({
x_dims
[
0
],
frame_size
});
framework
::
DDim
out_dims
({
x_dims
[
0
],
frame_size
});
ctx
->
SetOutputDim
(
"Hidden"
,
out_dims
);
ctx
->
SetOutputDim
(
"Hidden"
,
out_dims
);
ctx
->
SetOutputDim
(
"BatchedGate"
,
{
x_dims
[
0
],
wx_dims
[
1
]});
ctx
->
SetOutputDim
(
"BatchedInput"
,
{
x_dims
[
0
],
wx_dims
[
1
]});
ctx
->
SetOutputDim
(
"BatchedHidden"
,
out_dims
);
ctx
->
SetOutputDim
(
"BatchedOut"
,
out_dims
);
ctx
->
SetOutputDim
(
"BatchResetHiddenPrev"
,
out_dims
);
ctx
->
ShareLoD
(
"X"
,
"Hidden"
);
ctx
->
ShareLoD
(
"X"
,
"Hidden"
);
int
xx_width
=
x_dims
[
1
]
>
wx_dims
[
1
]
?
wx_dims
[
1
]
:
x_dims
[
1
];
int
xx_width
;
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"use_seq"
))
{
xx_width
=
wx_dims
[
1
];
}
else
{
xx_width
=
x_dims
[
1
]
>
wx_dims
[
1
]
?
wx_dims
[
1
]
:
x_dims
[
1
];
}
ctx
->
SetOutputDim
(
"XX"
,
{
x_dims
[
0
],
xx_width
});
ctx
->
SetOutputDim
(
"XX"
,
{
x_dims
[
0
],
xx_width
});
ctx
->
ShareLoD
(
"X"
,
"XX"
);
ctx
->
ShareLoD
(
"X"
,
"XX"
);
}
}
...
@@ -115,22 +116,29 @@ void FusionGRUOpMaker::Make() {
...
@@ -115,22 +116,29 @@ void FusionGRUOpMaker::Make() {
"(Tensor) The FC weight with shape (M x 3D),"
"(Tensor) The FC weight with shape (M x 3D),"
"where M is the dim size of x, D is the hidden size. "
);
"where M is the dim size of x, D is the hidden size. "
);
AddInput
(
"WeightH"
,
AddInput
(
"WeightH"
,
"(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. "
);
"(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. "
"This weight is not exactly D x 3D as: {W_update, W_reset, W_state}"
"Acutally they are D x 2D and D x D two part weights."
"{W_update, W_reset; W_state}"
"{D x (D + D); D x D}"
);
AddInput
(
"Bias"
,
AddInput
(
"Bias"
,
"(Tensor, optional) (1 x 3D)."
"(Tensor, optional) (1 x 3D)."
"Almost same as GRUOp."
"Almost same as GRUOp."
"Note: if have FC bias it should be added on this bias."
)
"Note: if have FC bias it should be added on this bias."
)
.
AsDispensable
();
.
AsDispensable
();
AddOutput
(
"ReorderedH0"
,
"(Tensor) (N x D), which N is the min-batch size."
)
.
AsIntermediate
();
AddOutput
(
"XX"
,
AddOutput
(
"XX"
,
"(LoDTensor) the result after X * WeightX (size is T x
4
D)"
"(LoDTensor) the result after X * WeightX (size is T x
3
D)"
" or batched_X (size is T x M), this will be automatically chosen,"
" or batched_X (size is T x M), this will be automatically chosen,"
" where T is the total time steps in this mini-batch,"
" where T is the total time steps in this mini-batch,"
" D is the hidden size, M is the dim size of x input."
)
" D is the hidden size, M is the dim size of x input."
)
.
AsIntermediate
();
.
AsIntermediate
();
AddOutput
(
"BatchedGate"
,
"(LoDTensor) Same as GRUOp"
).
AsIntermediate
();
AddOutput
(
"BatchedInput"
,
AddOutput
(
"BatchResetHiddenPrev"
,
"(LoDTensor) (T x 3D) Same as GRUOp."
)
"(LoDTensor) This is the batched result of input X"
"or the batched result after fc, shape (T x 3D)"
)
.
AsIntermediate
();
.
AsIntermediate
();
AddOutput
(
"Batched
Hidden"
,
"(LoDTensor) (T X D) Same as GRUOp
."
)
AddOutput
(
"Batched
Out"
,
"(LoDTensor) (T X D) save batched hidden
."
)
.
AsIntermediate
();
.
AsIntermediate
();
AddOutput
(
"Hidden"
,
"(LoDTensor) (T x D) Same as GRUOp"
);
AddOutput
(
"Hidden"
,
"(LoDTensor) (T x D) Same as GRUOp"
);
AddAttr
<
std
::
string
>
(
"activation"
,
AddAttr
<
std
::
string
>
(
"activation"
,
...
@@ -146,6 +154,10 @@ void FusionGRUOpMaker::Make() {
...
@@ -146,6 +154,10 @@ void FusionGRUOpMaker::Make() {
"(bool, defalut: False) "
"(bool, defalut: False) "
"whether to compute reversed GRU."
)
"whether to compute reversed GRU."
)
.
SetDefault
(
false
);
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"use_seq"
,
"(bool, defalut: True) "
"whether to use seq mode to compute GRU."
)
.
SetDefault
(
true
);
AddComment
(
R"DOC(
AddComment
(
R"DOC(
The Fusion complete GRU Operator.
The Fusion complete GRU Operator.
This operator fuse the fully-connected operator into GRU,
This operator fuse the fully-connected operator into GRU,
...
@@ -153,172 +165,261 @@ more details can refer to GRU op.
...
@@ -153,172 +165,261 @@ more details can refer to GRU op.
)DOC"
);
)DOC"
);
}
}
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
T
>
inline
void
ReorderInitState
(
const
DeviceContext
&
ctx
,
const
framework
::
Tensor
&
src
,
framework
::
Vector
<
size_t
>
index_lod
,
framework
::
Tensor
*
dst
,
bool
indexed_src
)
{
math
::
CopyMatrixRowsFunctor
<
DeviceContext
,
T
>
row_shuffle
;
dst
->
mutable_data
<
T
>
(
src
.
dims
(),
ctx
.
GetPlace
());
row_shuffle
(
ctx
,
src
,
index_lod
,
dst
,
indexed_src
);
}
template
<
typename
DeviceContext
,
typename
T
>
class
FusionGRUKernel
:
public
framework
::
OpKernel
<
T
>
{
class
FusionGRUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
if
(
ctx
.
Attr
<
bool
>
(
"use_seq"
))
{
SeqCompute
(
ctx
);
}
else
{
BatchCompute
(
ctx
);
}
}
#define INIT_VEC_FUNC \
std::function<void(const int, const T *, T *)> act_gate, act_state; \
std::function<void(const int, const T*, const T*, const T*, T*)> cross; \
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \
auto& act_state_str = ctx.Attr<std::string>("activation"); \
if (platform::jit::MayIUse(platform::jit::avx)) { \
math::VecActivations<T, platform::jit::avx> act_functor; \
act_gate = act_functor(act_gate_str); \
act_state = act_functor(act_state_str); \
cross = math::vec_cross<T, platform::jit::avx>; \
} else { \
math::VecActivations<T, platform::jit::isa_any> act_functor; \
act_gate = act_functor(act_gate_str); \
act_state = act_functor(act_state_str); \
cross = math::vec_cross<T, platform::jit::isa_any>; \
}
#define INIT_BASE_INPUT_OUTPUT \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
auto* wh = ctx.Input<Tensor>("WeightH"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* xx = ctx.Output<LoDTensor>("XX"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse");
#define INIT_BASE_SIZES \
auto x_dims = x->dims();
/* T x M*/
\
auto wh_dims = wh->dims();
/* D x 3D*/
\
const int total_T = x_dims[0]; \
const int M = x_dims[1]; \
const int D = wh_dims[0]; \
const int D3 = wh_dims[1]; \
const int D2 = D * 2;
void
SeqCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
using
DeviceContext
=
paddle
::
platform
::
CPUDeviceContext
;
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
auto
*
wx
=
ctx
.
Input
<
Tensor
>
(
"WeightX"
);
INIT_BASE_INPUT_OUTPUT
auto
*
wh
=
ctx
.
Input
<
Tensor
>
(
"WeightH"
);
INIT_BASE_SIZES
auto
*
bias
=
ctx
.
Input
<
Tensor
>
(
"Bias"
);
INIT_VEC_FUNC
auto
*
h0
=
ctx
.
Input
<
Tensor
>
(
"H0"
);
auto
*
xx
=
ctx
.
Output
<
LoDTensor
>
(
"XX"
);
auto
*
batched_gate
=
ctx
.
Output
<
LoDTensor
>
(
"BatchedGate"
);
auto
*
batch_reset_hidden_prev
=
ctx
.
Output
<
LoDTensor
>
(
"BatchResetHiddenPrev"
);
auto
*
batch_hidden
=
ctx
.
Output
<
LoDTensor
>
(
"BatchedHidden"
);
auto
*
hidden_out
=
ctx
.
Output
<
LoDTensor
>
(
"Hidden"
);
bool
is_reverse
=
ctx
.
Attr
<
bool
>
(
"is_reverse"
);
auto
x_lod
=
x
->
lod
();
const
int
N
=
x_lod
[
0
].
size
()
-
1
;
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
h0_data
=
h0
?
h0
->
data
<
T
>
()
:
nullptr
;
const
T
*
wx_data
=
wx
->
data
<
T
>
();
const
T
*
wh_data
=
wh
->
data
<
T
>
();
const
T
*
wh_state_data
=
wh_data
+
D
*
D2
;
T
*
xx_data
=
xx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
xx_data
=
xx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
batched_gate_data
=
batched_gate
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
hidden_out_data
=
hidden_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
batch_reset_hidden_prev
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
batch_hidden
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
ctx
);
hidden_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
math
::
FCCompute
<
DeviceContext
,
T
>
(
blas
,
total_T
,
D3
,
M
,
x_data
,
wx_data
,
xx_data
,
bias
?
bias
->
data
<
T
>
()
:
nullptr
);
int
xx_offset
=
D3
;
int
gate_offset
=
D
;
if
(
is_reverse
)
{
const
int
offset
=
(
total_T
-
1
)
*
D
;
xx_data
=
xx_data
+
offset
*
3
;
hidden_out_data
=
hidden_out_data
+
offset
;
xx_offset
=
-
D3
;
gate_offset
=
-
D
;
}
auto
move_step
=
[
&
]()
{
xx_data
=
xx_data
+
xx_offset
;
hidden_out_data
=
hidden_out_data
+
gate_offset
;
};
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
int
bid
=
is_reverse
?
N
-
1
-
i
:
i
;
int
seq_len
=
x_lod
[
0
][
bid
+
1
]
-
x_lod
[
0
][
bid
];
const
T
*
prev_hidden_data
=
nullptr
;
int
tstart
=
0
;
if
(
h0_data
)
{
prev_hidden_data
=
h0_data
+
bid
*
D
;
}
else
{
// W: {W_update, W_reset; W_state}
// update gate
act_gate
(
D
,
xx_data
,
xx_data
);
// state gate
act_state
(
D
,
xx_data
+
D2
,
xx_data
+
D2
);
// out = a*b
blas
.
VMUL
(
D
,
xx_data
,
xx_data
+
D2
,
hidden_out_data
);
// save prev
prev_hidden_data
=
hidden_out_data
;
tstart
=
1
;
move_step
();
}
for
(
int
step
=
tstart
;
step
<
seq_len
;
++
step
)
{
// gemm prev * (Wu + Wr)
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
1
,
D2
,
D
,
static_cast
<
T
>
(
1
),
prev_hidden_data
,
D
,
wh_data
,
D2
,
static_cast
<
T
>
(
1
),
xx_data
,
D3
);
act_gate
(
D2
,
xx_data
,
xx_data
);
// rt = rt*ht_1 inplace result
blas
.
VMUL
(
D
,
prev_hidden_data
,
xx_data
+
D
,
hidden_out_data
);
// gemm rt * Ws
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
1
,
D
,
D
,
static_cast
<
T
>
(
1
),
hidden_out_data
,
D
,
wh_state_data
,
D
,
static_cast
<
T
>
(
1
),
xx_data
+
D2
,
D3
);
act_state
(
D
,
xx_data
+
D2
,
xx_data
+
D2
);
// out = zt*ht~ + (1-zt)*ht_1
cross
(
D
,
xx_data
,
xx_data
+
D2
,
prev_hidden_data
,
hidden_out_data
);
// save prev
prev_hidden_data
=
hidden_out_data
;
move_step
();
}
}
}
void
BatchCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
using
DeviceContext
=
paddle
::
platform
::
CPUDeviceContext
;
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
if
(
x
->
lod
()[
0
].
size
()
==
2
)
{
SeqCompute
(
ctx
);
return
;
}
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
INIT_VEC_FUNC
auto
*
reordered_h0
=
ctx
.
Output
<
Tensor
>
(
"ReorderedH0"
);
auto
*
batched_input
=
ctx
.
Output
<
LoDTensor
>
(
"BatchedInput"
);
auto
*
batched_out
=
ctx
.
Output
<
LoDTensor
>
(
"BatchedOut"
);
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
wx_data
=
wx
->
data
<
T
>
();
const
T
*
wx_data
=
wx
->
data
<
T
>
();
const
T
*
wh_data
=
wh
->
data
<
T
>
();
const
T
*
wh_data
=
wh
->
data
<
T
>
();
auto
x_dims
=
x
->
dims
();
T
*
xx_data
=
xx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
wx_dims
=
wx
->
dims
();
T
*
batched_input_data
=
batched_input
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
batched_out_data
=
batched_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
hidden_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
math
::
LoDTensor2BatchFunctor
<
DeviceContext
,
T
>
to_batch
;
math
::
LoDTensor2BatchFunctor
<
DeviceContext
,
T
>
to_batch
;
if
(
x_dims
[
1
]
>
wx_dims
[
1
]
)
{
if
(
M
>
D3
)
{
math
::
FCCompute
<
DeviceContext
,
T
>
(
blas
,
x_dims
[
0
],
wx_dims
[
1
],
x_dims
[
1
]
,
math
::
FCCompute
<
DeviceContext
,
T
>
(
blas
,
total_T
,
D3
,
M
,
x_data
,
wx_data
,
x
_data
,
wx_data
,
x
x_data
,
xx_data
,
bias
?
bias
->
data
<
T
>
()
:
NULL
);
bias
?
bias
->
data
<
T
>
()
:
nullptr
);
to_batch
(
dev_ctx
,
*
xx
,
batched_
gate
,
true
,
is_reverse
);
to_batch
(
dev_ctx
,
*
xx
,
batched_
input
,
true
,
is_reverse
);
}
else
{
}
else
{
to_batch
(
dev_ctx
,
*
x
,
xx
,
true
,
is_reverse
);
to_batch
(
dev_ctx
,
*
x
,
xx
,
true
,
is_reverse
);
batched_
gate
->
set_lod
(
xx
->
lod
());
batched_
input
->
set_lod
(
xx
->
lod
());
math
::
FCCompute
<
DeviceContext
,
T
>
(
blas
,
x_dims
[
0
],
wx_dims
[
1
],
x_dims
[
1
]
,
math
::
FCCompute
<
DeviceContext
,
T
>
(
blas
,
total_T
,
D3
,
M
,
xx_data
,
wx_data
,
xx_data
,
wx_data
,
batched_gate
_data
,
batched_input
_data
,
bias
?
bias
->
data
<
T
>
()
:
NULL
);
bias
?
bias
->
data
<
T
>
()
:
nullptr
);
}
}
int
frame_size
=
static_cast
<
int
>
(
wx_dims
[
1
]
/
3
);
auto
batched_lod
=
batched_input
->
lod
();
math
::
GRUMetaValue
<
T
>
gru_value
;
const
auto
&
seq_order
=
batched_lod
[
2
];
gru_value
.
gate_weight
=
const_cast
<
T
*>
(
wh_data
);
const
int
max_bs
=
seq_order
.
size
();
gru_value
.
state_weight
=
reordered_h0
->
Resize
({
max_bs
,
D
});
const_cast
<
T
*>
(
wh_data
+
2
*
frame_size
*
frame_size
);
Tensor
ordered_h0
;
framework
::
Vector
<
size_t
>
order
(
batched_gate
->
lod
()[
2
]);
int
tstart
=
0
;
T
*
prev_hidden_data
=
nullptr
;
if
(
h0
)
{
if
(
h0
)
{
ReorderInitState
<
DeviceContext
,
T
>
(
// reorder h0
ctx
.
template
device_context
<
DeviceContext
>(),
*
h0
,
order
,
&
ordered_h0
,
T
*
reordered_h0_data
=
reordered_h0
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
true
);
const
T
*
h0_data
=
h0
->
data
<
T
>
();
gru_value
.
prev_out_value
=
ordered_h0
.
data
<
T
>
();
prev_hidden_data
=
reordered_h0_data
;
size_t
sz
=
sizeof
(
T
)
*
D
;
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
std
::
memcpy
(
reordered_h0_data
,
h0_data
+
seq_order
[
i
]
*
D
,
sz
);
reordered_h0_data
+=
D
;
}
}
else
{
}
else
{
gru_value
.
prev_out_value
=
nullptr
;
// compute without h0
T
*
cur_in_data
=
batched_input_data
;
T
*
cur_out_data
=
batched_out_data
;
// W: {W_update, W_reset; W_state}
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
// update gate
act_gate
(
D
,
cur_in_data
,
cur_in_data
);
// state gate
act_state
(
D
,
cur_in_data
+
D2
,
cur_in_data
+
D2
);
// out = a*b
blas
.
VMUL
(
D
,
cur_in_data
,
cur_in_data
+
D2
,
cur_out_data
);
// add offset
cur_in_data
+=
D3
;
cur_out_data
+=
D
;
}
tstart
=
1
;
prev_hidden_data
=
batched_out_data
;
}
}
auto
batch_starts
=
batched_gate
->
lod
()[
0
];
// Then start from next
size_t
seq_len
=
batch_starts
.
size
()
-
1
;
const
T
*
wh_state_data
=
wh_data
+
D
*
D2
;
auto
active_node
=
const
auto
&
batch_starts
=
batched_lod
[
0
];
math
::
detail
::
GetActivationType
(
ctx
.
Attr
<
std
::
string
>
(
"activation"
));
const
int
max_seq_len
=
batch_starts
.
size
()
-
1
;
auto
active_gate
=
math
::
detail
::
GetActivationType
(
batched_input_data
=
batched_input_data
+
tstart
*
max_bs
*
D3
;
ctx
.
Attr
<
std
::
string
>
(
"gate_activation"
));
batched_out_data
=
batched_out_data
+
tstart
*
max_bs
*
D
;
for
(
int
step
=
tstart
;
step
<
max_seq_len
;
++
step
)
{
#ifdef PADDLE_WITH_MKLML
const
int
cur_bs
=
batch_starts
[
step
+
1
]
-
batch_starts
[
step
];
// use MKL packed to speedup GEMM
// gemm prev * (Wu + Wr)
if
(
FLAGS_paddle_num_threads
>=
4
)
{
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
cur_bs
,
D2
,
D
,
static_cast
<
T
>
(
1
),
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
prev_hidden_data
,
D
,
wh_data
,
D2
,
static_cast
<
T
>
(
1
),
T
*
packed_gate
=
blas
.
GEMM_ALLOC
(
CblasBMatrix
,
1
/*height of C*/
,
batched_input_data
,
D3
);
frame_size
*
2
/*width of weight*/
,
frame_size
/*height of height*/
);
T
*
cur_batched_data
=
batched_input_data
;
PADDLE_ENFORCE
(
packed_gate
);
T
*
cur_out_data
=
batched_out_data
;
blas
.
GEMM_PACK
(
CblasBMatrix
,
CblasNoTrans
,
1
/*cur bs?*/
,
frame_size
*
2
,
T
*
cur_prev_hidden_data
=
prev_hidden_data
;
frame_size
,
T
(
1.0
),
gru_value
.
gate_weight
,
frame_size
*
2
,
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
packed_gate
);
act_gate
(
D2
,
cur_batched_data
,
cur_batched_data
);
T
*
packed_state
=
blas
.
GEMM_ALLOC
(
CblasBMatrix
,
1
/*height of C*/
,
// rt = rt*ht_1 inplace result
frame_size
/*width of weight*/
,
blas
.
VMUL
(
D
,
cur_prev_hidden_data
,
cur_batched_data
+
D
,
cur_out_data
);
frame_size
/*height of height*/
);
PADDLE_ENFORCE
(
packed_state
);
cur_batched_data
+=
D3
;
blas
.
GEMM_PACK
(
CblasBMatrix
,
CblasNoTrans
,
1
/*cur bs?*/
,
frame_size
,
cur_prev_hidden_data
+=
D
;
frame_size
,
T
(
1.0
),
gru_value
.
state_weight
,
frame_size
,
cur_out_data
+=
D
;
packed_state
);
for
(
size_t
n
=
0
;
n
<
seq_len
;
n
++
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
int
cur_batch_size
=
bend
-
bstart
;
Tensor
gate_t
=
batched_gate
->
Slice
(
bstart
,
bend
);
Tensor
reset_hidden_prev_t
=
batch_reset_hidden_prev
->
Slice
(
bstart
,
bend
);
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
gru_value
.
output_value
=
hidden_t
.
data
<
T
>
();
gru_value
.
gate_value
=
gate_t
.
data
<
T
>
();
gru_value
.
reset_output_value
=
reset_hidden_prev_t
.
data
<
T
>
();
if
(
gru_value
.
prev_out_value
)
{
blas
.
GEMM_COMPUTE
(
CblasNoTrans
,
CblasPacked
,
cur_batch_size
,
frame_size
*
2
,
frame_size
,
gru_value
.
prev_out_value
,
frame_size
,
packed_gate
,
frame_size
*
2
,
T
(
1
),
gru_value
.
gate_value
,
frame_size
*
3
);
}
math
::
detail
::
forward_reset_output
(
math
::
detail
::
forward
::
gru_resetOutput
<
T
>
(),
gru_value
,
frame_size
,
cur_batch_size
,
active_gate
);
if
(
gru_value
.
prev_out_value
)
{
blas
.
GEMM_COMPUTE
(
CblasNoTrans
,
CblasPacked
,
cur_batch_size
,
frame_size
,
frame_size
,
gru_value
.
reset_output_value
,
frame_size
,
packed_state
,
frame_size
,
T
(
1
),
gru_value
.
gate_value
+
frame_size
*
2
,
frame_size
*
3
);
}
math
::
detail
::
forward_final_output
(
math
::
detail
::
forward
::
gru_finalOutput
<
T
>
(),
gru_value
,
frame_size
,
cur_batch_size
,
active_node
);
gru_value
.
prev_out_value
=
gru_value
.
output_value
;
}
}
blas
.
GEMM_FREE
(
packed_gate
);
cur_batched_data
=
batched_input_data
;
blas
.
GEMM_FREE
(
packed_state
);
cur_out_data
=
batched_out_data
;
}
else
{
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
cur_bs
,
D
,
D
,
static_cast
<
T
>
(
1
),
#endif
cur_out_data
,
D
,
wh_state_data
,
D
,
static_cast
<
T
>
(
1
),
for
(
size_t
n
=
0
;
n
<
seq_len
;
n
++
)
{
cur_batched_data
+
D2
,
D3
);
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
cur_prev_hidden_data
=
prev_hidden_data
;
int
cur_batch_size
=
bend
-
bstart
;
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
// ht~ = act_state(...)
Tensor
gate_t
=
batched_gate
->
Slice
(
bstart
,
bend
);
act_state
(
D
,
cur_batched_data
+
D2
,
cur_batched_data
+
D2
);
Tensor
reset_hidden_prev_t
=
// out = zt*ht~ + (1-zt)*ht_1
batch_reset_hidden_prev
->
Slice
(
bstart
,
bend
);
cross
(
D
,
cur_batched_data
,
cur_batched_data
+
D2
,
cur_prev_hidden_data
,
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
cur_out_data
);
gru_value
.
output_value
=
hidden_t
.
data
<
T
>
();
gru_value
.
gate_value
=
gate_t
.
data
<
T
>
();
cur_batched_data
+=
D3
;
gru_value
.
reset_output_value
=
reset_hidden_prev_t
.
data
<
T
>
();
cur_prev_hidden_data
+=
D
;
cur_out_data
+=
D
;
math
::
GRUUnitFunctor
<
DeviceContext
,
T
>::
compute
(
dev_ctx
,
gru_value
,
frame_size
,
cur_batch_size
,
active_node
,
active_gate
);
gru_value
.
prev_out_value
=
gru_value
.
output_value
;
}
}
#ifdef PADDLE_WITH_MKLML
prev_hidden_data
=
batched_out_data
;
batched_out_data
=
cur_out_data
;
batched_input_data
=
cur_batched_data
;
}
}
#endif
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
batch
_hidden
->
set_lod
(
batched_gate
->
lod
()
);
batch
ed_out
->
set_lod
(
batched_lod
);
to_seq
(
dev_ctx
,
*
batch
_hidden
,
hidden_out
);
to_seq
(
dev_ctx
,
*
batch
ed_out
,
hidden_out
);
}
}
#undef INIT_VEC_FUNC
#undef INIT_BASE_SIZES
#undef INIT_BASE_INPUT_OUTPUT
};
};
}
// namespace operators
}
// namespace operators
...
@@ -327,6 +428,5 @@ class FusionGRUKernel : public framework::OpKernel<T> {
...
@@ -327,6 +428,5 @@ class FusionGRUKernel : public framework::OpKernel<T> {
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
fusion_gru
,
ops
::
FusionGRUOp
,
ops
::
FusionGRUOpMaker
,
REGISTER_OPERATOR
(
fusion_gru
,
ops
::
FusionGRUOp
,
ops
::
FusionGRUOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OP_CPU_KERNEL
(
REGISTER_OP_CPU_KERNEL
(
fusion_gru
,
ops
::
FusionGRUKernel
<
float
>
,
fusion_gru
,
ops
::
FusionGRUKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
FusionGRUKernel
<
double
>
);
ops
::
FusionGRUKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
paddle/fluid/operators/fusion_lstm_op.cc
浏览文件 @
459d4cc8
...
@@ -16,14 +16,10 @@ limitations under the License. */
...
@@ -16,14 +16,10 @@ limitations under the License. */
#include <string>
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/cpu_info.h"
DEFINE_bool
(
seq_mode
,
true
,
"Use sequence mode"
);
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -42,10 +38,16 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
...
@@ -42,10 +38,16 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
"Output(Hidden) of LSTM should not be null."
);
"Output(Hidden) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Cell"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Cell"
),
"Output(Cell) of LSTM should not be null."
);
"Output(Cell) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"BatchedGate"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"BatchedInput"
),
"Output(BatchedGate) of LSTM should not be null."
);
"Output(BatchedInput) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"BatchCellPreAct"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"BatchedHidden"
),
"Output(BatchedGate) of LSTM should not be null."
);
"Output(BatchedHidden) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"BatchedCell"
),
"Output(BatchedCell) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ReorderedH0"
),
"Output(ReorderedH0) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ReorderedC0"
),
"Output(ReorderedC0) of LSTM should not be null."
);
auto
x_dims
=
ctx
->
GetInputDim
(
"X"
);
auto
x_dims
=
ctx
->
GetInputDim
(
"X"
);
PADDLE_ENFORCE_EQ
(
x_dims
.
size
(),
2
,
"Input(X)'s rank must be 2."
);
PADDLE_ENFORCE_EQ
(
x_dims
.
size
(),
2
,
"Input(X)'s rank must be 2."
);
...
@@ -97,13 +99,14 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
...
@@ -97,13 +99,14 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
framework
::
DDim
out_dims
({
x_dims
[
0
],
frame_size
});
framework
::
DDim
out_dims
({
x_dims
[
0
],
frame_size
});
ctx
->
SetOutputDim
(
"Hidden"
,
out_dims
);
ctx
->
SetOutputDim
(
"Hidden"
,
out_dims
);
ctx
->
SetOutputDim
(
"Cell"
,
out_dims
);
ctx
->
SetOutputDim
(
"Cell"
,
out_dims
);
ctx
->
SetOutputDim
(
"BatchedGate"
,
{
x_dims
[
0
],
wx_dims
[
1
]});
ctx
->
SetOutputDim
(
"BatchedInput"
,
{
x_dims
[
0
],
wx_dims
[
1
]});
ctx
->
SetOutputDim
(
"BatchCellPreAct"
,
out_dims
);
ctx
->
SetOutputDim
(
"BatchedHidden"
,
out_dims
);
ctx
->
SetOutputDim
(
"BatchedCell"
,
out_dims
);
ctx
->
ShareLoD
(
"X"
,
"Hidden"
);
ctx
->
ShareLoD
(
"X"
,
"Hidden"
);
ctx
->
ShareLoD
(
"X"
,
"Cell"
);
ctx
->
ShareLoD
(
"X"
,
"Cell"
);
int
xx_width
;
int
xx_width
;
if
(
FLAGS_seq_mode
)
{
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"use_seq"
)
)
{
xx_width
=
wx_dims
[
1
];
xx_width
=
wx_dims
[
1
];
}
else
{
}
else
{
xx_width
=
x_dims
[
1
]
>
wx_dims
[
1
]
?
wx_dims
[
1
]
:
x_dims
[
1
];
xx_width
=
x_dims
[
1
]
>
wx_dims
[
1
]
?
wx_dims
[
1
]
:
x_dims
[
1
];
...
@@ -169,9 +172,11 @@ void FusionLSTMOpMaker::Make() {
...
@@ -169,9 +172,11 @@ void FusionLSTMOpMaker::Make() {
" where T is the total time steps in this mini-batch,"
" where T is the total time steps in this mini-batch,"
" D is the hidden size, M is the dim size of x input."
)
" D is the hidden size, M is the dim size of x input."
)
.
AsIntermediate
();
.
AsIntermediate
();
AddOutput
(
"BatchedGate"
,
"(LoDTensor) (same as LSTMOp)."
).
AsIntermediate
();
AddOutput
(
"BatchedInput"
,
"(LoDTensor) (T x 4D)."
).
AsIntermediate
();
AddOutput
(
"BatchCellPreAct"
,
"(LoDTensor) (same as LSTMOp)."
)
AddOutput
(
"BatchedHidden"
,
"(LoDTensor) (T x D)."
).
AsIntermediate
();
.
AsIntermediate
();
AddOutput
(
"BatchedCell"
,
"(LoDTensor) (T x D)."
).
AsIntermediate
();
AddOutput
(
"ReorderedH0"
,
"(LoDTensor) (N x D)."
).
AsIntermediate
();
AddOutput
(
"ReorderedC0"
,
"(LoDTensor) (N x D)."
).
AsIntermediate
();
AddAttr
<
bool
>
(
"use_peepholes"
,
AddAttr
<
bool
>
(
"use_peepholes"
,
"(bool, defalut: True) "
"(bool, defalut: True) "
"whether to enable diagonal/peephole connections."
)
"whether to enable diagonal/peephole connections."
)
...
@@ -180,6 +185,10 @@ void FusionLSTMOpMaker::Make() {
...
@@ -180,6 +185,10 @@ void FusionLSTMOpMaker::Make() {
"(bool, defalut: False) "
"(bool, defalut: False) "
"whether to compute reversed LSTM."
)
"whether to compute reversed LSTM."
)
.
SetDefault
(
false
);
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"use_seq"
,
"(bool, defalut: True) "
"whether to use seq mode to compute."
)
.
SetDefault
(
true
);
AddAttr
<
std
::
string
>
(
"gate_activation"
,
AddAttr
<
std
::
string
>
(
"gate_activation"
,
"(string, default: sigmoid)"
"(string, default: sigmoid)"
"The activation for input gate, forget gate and output "
"The activation for input gate, forget gate and output "
...
@@ -203,64 +212,60 @@ This operator fuse the X into LSTM, more details can refer to LSTM op.
...
@@ -203,64 +212,60 @@ This operator fuse the X into LSTM, more details can refer to LSTM op.
)DOC"
);
)DOC"
);
}
}
template
<
typename
DeviceContext
,
typename
T
>
inline
void
ReorderInitState
(
const
DeviceContext
&
ctx
,
const
framework
::
Tensor
&
src
,
framework
::
Vector
<
size_t
>
index_lod
,
framework
::
Tensor
*
dst
,
bool
indexed_src
)
{
math
::
CopyMatrixRowsFunctor
<
DeviceContext
,
T
>
row_shuffle
;
dst
->
mutable_data
<
T
>
(
src
.
dims
(),
ctx
.
GetPlace
());
// TODO(TJ): check mem copy perf
row_shuffle
(
ctx
,
src
,
index_lod
,
dst
,
indexed_src
);
}
template
<
typename
T
>
template
<
typename
T
>
class
FuisonLSTMKernel
:
public
framework
::
OpKernel
<
T
>
{
class
FuisonLSTMKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
#define INIT_VEC_FUNC \
std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand; \
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \
auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); \
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); \
if (platform::jit::MayIUse(platform::jit::avx)) { \
math::VecActivations<T, platform::jit::avx> act_functor; \
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
} else { \
math::VecActivations<T, platform::jit::isa_any> act_functor; \
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
}
#define INIT_BASE_INPUT_OUTPUT \
auto* x = ctx.Input<LoDTensor>("X"); \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* c0 = ctx.Input<Tensor>("C0"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
auto* wh = ctx.Input<Tensor>("WeightH"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* xx = ctx.Output<LoDTensor>("XX"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
auto* cell_out = ctx.Output<LoDTensor>("Cell"); \
bool is_reverse = ctx.Attr<bool>("is_reverse");
#define INIT_BASE_SIZES \
auto x_dims = x->dims();
/* T x M*/
\
auto wh_dims = wh->dims();
/* D x 4D*/
\
const int M = x_dims[1]; \
const int D = wh_dims[0]; \
const int D2 = D * 2; \
const int D3 = D * 3; \
const int D4 = wh_dims[1];
void
SeqCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
void
SeqCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
using
DeviceContext
=
paddle
::
platform
::
CPUDeviceContext
;
using
DeviceContext
=
paddle
::
platform
::
CPUDeviceContext
;
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
INIT_BASE_INPUT_OUTPUT
auto
*
h0
=
ctx
.
Input
<
Tensor
>
(
"H0"
);
INIT_BASE_SIZES
auto
*
c0
=
ctx
.
Input
<
Tensor
>
(
"C0"
);
INIT_VEC_FUNC
auto
*
wx
=
ctx
.
Input
<
Tensor
>
(
"WeightX"
);
auto
*
wh
=
ctx
.
Input
<
Tensor
>
(
"WeightH"
);
auto
*
bias
=
ctx
.
Input
<
Tensor
>
(
"Bias"
);
auto
*
xx
=
ctx
.
Output
<
LoDTensor
>
(
"XX"
);
auto
*
hidden_out
=
ctx
.
Output
<
LoDTensor
>
(
"Hidden"
);
auto
*
cell_out
=
ctx
.
Output
<
LoDTensor
>
(
"Cell"
);
bool
is_reverse
=
ctx
.
Attr
<
bool
>
(
"is_reverse"
);
std
::
function
<
void
(
const
int
,
const
T
*
,
T
*
)
>
act_gate
,
act_cell
,
act_cand
;
auto
&
act_gate_str
=
ctx
.
Attr
<
std
::
string
>
(
"gate_activation"
);
auto
&
act_cell_str
=
ctx
.
Attr
<
std
::
string
>
(
"cell_activation"
);
auto
&
act_cand_str
=
ctx
.
Attr
<
std
::
string
>
(
"candidate_activation"
);
if
(
platform
::
jit
::
MayIUse
(
platform
::
jit
::
avx
))
{
math
::
VecActivations
<
T
,
platform
::
jit
::
avx
>
act_functor
;
act_gate
=
act_functor
(
act_gate_str
);
act_cell
=
act_functor
(
act_cell_str
);
act_cand
=
act_functor
(
act_cand_str
);
}
else
{
math
::
VecActivations
<
T
,
platform
::
jit
::
isa_any
>
act_functor
;
act_gate
=
act_functor
(
act_gate_str
);
act_cell
=
act_functor
(
act_cell_str
);
act_cand
=
act_functor
(
act_cand_str
);
}
auto
x_lod
=
x
->
lod
();
auto
x_lod
=
x
->
lod
();
auto
x_dims
=
x
->
dims
();
// T x M
auto
wh_dims
=
wh
->
dims
();
// D x 4D
const
int
total_T
=
x_dims
[
0
];
const
int
total_T
=
x_dims
[
0
];
const
int
N
=
x_lod
[
0
].
size
()
-
1
;
// batch size
const
int
N
=
x_lod
[
0
].
size
()
-
1
;
// batch size
const
int
M
=
x_dims
[
1
];
// x frame size
const
int
D
=
wh_dims
[
0
];
const
int
D2
=
D
*
2
;
const
int
D3
=
D
*
3
;
const
int
D4
=
wh_dims
[
1
];
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
h0_data
=
h0
?
h0
->
data
<
T
>
()
:
NULL
;
const
T
*
h0_data
=
h0
?
h0
->
data
<
T
>
()
:
nullptr
;
const
T
*
c0_data
=
c0
?
c0
->
data
<
T
>
()
:
NULL
;
const
T
*
c0_data
=
c0
?
c0
->
data
<
T
>
()
:
nullptr
;
const
T
*
wx_data
=
wx
->
data
<
T
>
();
const
T
*
wx_data
=
wx
->
data
<
T
>
();
const
T
*
wh_data
=
wh
->
data
<
T
>
();
const
T
*
wh_data
=
wh
->
data
<
T
>
();
T
*
xx_data
=
xx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
xx_data
=
xx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
...
@@ -290,12 +295,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -290,12 +295,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
int
bid
=
is_reverse
?
N
-
1
-
i
:
i
;
int
bid
=
is_reverse
?
N
-
1
-
i
:
i
;
int
seq_len
=
x_lod
[
0
][
bid
+
1
]
-
x_lod
[
0
][
bid
];
int
seq_len
=
x_lod
[
0
][
bid
+
1
]
-
x_lod
[
0
][
bid
];
const
T
*
prev_c
ell_data
=
NULL
;
const
T
*
prev_c
_data
=
nullptr
;
const
T
*
prev_h
idden_data
=
NULL
;
const
T
*
prev_h
_data
=
nullptr
;
int
tstart
=
0
;
int
tstart
=
0
;
if
(
h0_data
)
{
if
(
h0_data
)
{
prev_h
idden
_data
=
h0_data
+
bid
*
D
;
prev_h_data
=
h0_data
+
bid
*
D
;
prev_c
ell
_data
=
c0_data
+
bid
*
D
;
prev_c_data
=
c0_data
+
bid
*
D
;
}
else
{
}
else
{
// W_ch, W_ih, W_fh, W_oh
// W_ch, W_ih, W_fh, W_oh
act_gate
(
D3
,
xx_data
+
D
,
xx_data
+
D
);
act_gate
(
D3
,
xx_data
+
D
,
xx_data
+
D
);
...
@@ -307,23 +312,22 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -307,23 +312,22 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
blas
.
VMUL
(
D
,
xx_data
+
D2
,
xx_data
+
D3
,
hidden_out_data
);
blas
.
VMUL
(
D
,
xx_data
+
D2
,
xx_data
+
D3
,
hidden_out_data
);
// prev
// prev
prev_h
idden
_data
=
hidden_out_data
;
prev_h_data
=
hidden_out_data
;
prev_c
ell
_data
=
cell_out_data
;
prev_c_data
=
cell_out_data
;
tstart
=
1
;
tstart
=
1
;
move_step
();
move_step
();
}
}
for
(
int
step
=
tstart
;
step
<
seq_len
;
++
step
)
{
for
(
int
step
=
tstart
;
step
<
seq_len
;
++
step
)
{
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
1
,
D4
,
D
,
static_cast
<
T
>
(
1
),
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
1
,
D4
,
D
,
static_cast
<
T
>
(
1
),
prev_hidden_data
,
D
,
wh_data
,
D4
,
static_cast
<
T
>
(
1
),
xx_data
,
prev_h_data
,
D
,
wh_data
,
D4
,
static_cast
<
T
>
(
1
),
xx_data
,
D4
);
D4
);
// W_ch, W_ih, W_fh, W_oh
// W_ch, W_ih, W_fh, W_oh
act_gate
(
D3
,
xx_data
+
D
,
xx_data
+
D
);
act_gate
(
D3
,
xx_data
+
D
,
xx_data
+
D
);
act_cand
(
D
,
xx_data
,
xx_data
);
act_cand
(
D
,
xx_data
,
xx_data
);
// a = forget * prev_cell
// a = forget * prev_cell
blas
.
VMUL
(
D
,
xx_data
+
D2
,
prev_c
ell
_data
,
xx_data
+
D2
);
blas
.
VMUL
(
D
,
xx_data
+
D2
,
prev_c_data
,
xx_data
+
D2
);
// b = input * tilde
// b = input * tilde
blas
.
VMUL
(
D
,
xx_data
,
xx_data
+
D
,
xx_data
+
D
);
blas
.
VMUL
(
D
,
xx_data
,
xx_data
+
D
,
xx_data
+
D
);
...
@@ -336,8 +340,8 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -336,8 +340,8 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
blas
.
VMUL
(
D
,
xx_data
+
D2
,
xx_data
+
D3
,
hidden_out_data
);
blas
.
VMUL
(
D
,
xx_data
+
D2
,
xx_data
+
D3
,
hidden_out_data
);
// prev
// prev
prev_h
idden
_data
=
hidden_out_data
;
prev_h_data
=
hidden_out_data
;
prev_c
ell
_data
=
cell_out_data
;
prev_c_data
=
cell_out_data
;
move_step
();
move_step
();
}
}
...
@@ -346,143 +350,155 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -346,143 +350,155 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
void
BatchCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
void
BatchCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
using
DeviceContext
=
platform
::
CPUDeviceContext
;
using
DeviceContext
=
platform
::
CPUDeviceContext
;
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
INIT_BASE_INPUT_OUTPUT
auto
*
wx
=
ctx
.
Input
<
Tensor
>
(
"WeightX"
);
if
(
x
->
lod
()[
0
].
size
()
==
2
)
{
auto
*
wh
=
ctx
.
Input
<
Tensor
>
(
"WeightH"
);
SeqCompute
(
ctx
);
auto
*
bias
=
ctx
.
Input
<
Tensor
>
(
"Bias"
);
return
;
auto
*
hidden_t0
=
ctx
.
Input
<
Tensor
>
(
"H0"
);
}
auto
*
cell_t0
=
ctx
.
Input
<
Tensor
>
(
"C0"
);
INIT_BASE_SIZES
INIT_VEC_FUNC
auto
*
xx
=
ctx
.
Output
<
LoDTensor
>
(
"XX"
);
auto
*
batched_gate
=
ctx
.
Output
<
LoDTensor
>
(
"BatchedGate"
);
auto
*
hidden_out
=
ctx
.
Output
<
LoDTensor
>
(
"Hidden"
);
auto
*
cell_out
=
ctx
.
Output
<
LoDTensor
>
(
"Cell"
);
bool
is_reverse
=
ctx
.
Attr
<
bool
>
(
"is_reverse"
);
T
*
xx_data
=
xx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
reordered_h0
=
ctx
.
Output
<
Tensor
>
(
"ReorderedH0"
);
T
*
batched_gate_data
=
batched_gate
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
reordered_c0
=
ctx
.
Output
<
Tensor
>
(
"ReorderedC0"
);
hidden_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
batched_input
=
ctx
.
Output
<
LoDTensor
>
(
"BatchedInput"
);
cell_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
batched_c_out
=
ctx
.
Output
<
LoDTensor
>
(
"BatchedCell"
);
auto
*
batched_h_out
=
ctx
.
Output
<
LoDTensor
>
(
"BatchedHidden"
);
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
wx_data
=
wx
->
data
<
T
>
();
const
T
*
wx_data
=
wx
->
data
<
T
>
();
auto
x_dims
=
x
->
dims
();
const
T
*
wh_data
=
wh
->
data
<
T
>
();
auto
wx_dims
=
wx
->
dims
();
auto
place
=
ctx
.
GetPlace
();
T
*
xx_data
=
xx
->
mutable_data
<
T
>
(
place
);
T
*
batched_input_data
=
batched_input
->
mutable_data
<
T
>
(
place
);
T
*
batched_c_out_data
=
batched_c_out
->
mutable_data
<
T
>
(
place
);
T
*
batched_h_out_data
=
batched_h_out
->
mutable_data
<
T
>
(
place
);
hidden_out
->
mutable_data
<
T
>
(
place
);
cell_out
->
mutable_data
<
T
>
(
place
);
math
::
LoDTensor2BatchFunctor
<
DeviceContext
,
T
>
to_batch
;
math
::
LoDTensor2BatchFunctor
<
DeviceContext
,
T
>
to_batch
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
if
(
x_dims
[
1
]
>
wx_dims
[
1
])
{
if
(
M
>
D4
)
{
math
::
FCCompute
<
DeviceContext
,
T
>
(
blas
,
x_dims
[
0
],
wx_dims
[
1
],
x_dims
[
1
],
math
::
FCCompute
<
DeviceContext
,
T
>
(
blas
,
x_dims
[
0
],
D4
,
M
,
x_data
,
wx_data
,
x_data
,
wx_data
,
xx_data
,
xx_data
,
bias
->
data
<
T
>
());
bias
->
data
<
T
>
());
to_batch
(
dev_ctx
,
*
xx
,
batched_input
,
true
,
is_reverse
);
to_batch
(
dev_ctx
,
*
xx
,
batched_gate
,
true
,
is_reverse
);
}
else
{
}
else
{
to_batch
(
dev_ctx
,
*
x
,
xx
,
true
,
is_reverse
);
to_batch
(
dev_ctx
,
*
x
,
xx
,
true
,
is_reverse
);
batched_
gate
->
set_lod
(
xx
->
lod
());
batched_
input
->
set_lod
(
xx
->
lod
());
math
::
FCCompute
<
DeviceContext
,
T
>
(
blas
,
x_dims
[
0
],
wx_dims
[
1
],
x_dims
[
1
]
,
math
::
FCCompute
<
DeviceContext
,
T
>
(
blas
,
x_dims
[
0
],
D4
,
M
,
xx_data
,
xx_data
,
wx_data
,
batched_gate
_data
,
wx_data
,
batched_input
_data
,
bias
->
data
<
T
>
());
bias
->
data
<
T
>
());
}
}
int
frame_size
=
static_cast
<
int
>
(
wx_dims
[
1
]
/
4
);
auto
batched_lod
=
batched_input
->
lod
();
framework
::
DDim
out_dims
({
x_dims
[
0
],
frame_size
});
const
auto
&
seq_order
=
batched_lod
[
2
];
math
::
LstmMetaValue
<
T
>
lstm_value
;
const
int
max_bs
=
seq_order
.
size
();
// no peephole
reordered_h0
->
Resize
({
max_bs
,
D
});
lstm_value
.
check_ig
=
nullptr
;
reordered_c0
->
Resize
({
max_bs
,
D
});
lstm_value
.
check_fg
=
nullptr
;
lstm_value
.
check_og
=
nullptr
;
int
tstart
=
0
;
lstm_value
.
prev_state_value
=
nullptr
;
T
*
prev_h_data
=
nullptr
;
Tensor
ordered_c0
;
T
*
prev_c_data
=
nullptr
;
if
(
h0
)
{
framework
::
Vector
<
size_t
>
order
(
batched_gate
->
lod
()[
2
]);
// reorder h0, c0
T
*
reordered_h0_data
=
reordered_h0
->
mutable_data
<
T
>
(
place
);
if
(
cell_t0
)
{
T
*
reordered_c0_data
=
reordered_c0
->
mutable_data
<
T
>
(
place
);
// Since the batch computing for LSTM reorders the input sequence
const
T
*
h0_data
=
h0
->
data
<
T
>
();
// according to their length. The initialized cell state also needs
const
T
*
c0_data
=
c0
->
data
<
T
>
();
// to reorder.
prev_h_data
=
reordered_h0_data
;
ReorderInitState
<
DeviceContext
,
T
>
(
dev_ctx
,
*
cell_t0
,
order
,
&
ordered_c0
,
prev_c_data
=
reordered_c0_data
;
true
);
size_t
sz
=
sizeof
(
T
)
*
D
;
lstm_value
.
prev_state_value
=
ordered_c0
.
data
<
T
>
();
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
std
::
memcpy
(
reordered_h0_data
,
h0_data
+
seq_order
[
i
]
*
D
,
sz
);
std
::
memcpy
(
reordered_c0_data
,
c0_data
+
seq_order
[
i
]
*
D
,
sz
);
reordered_h0_data
+=
D
;
reordered_c0_data
+=
D
;
}
}
else
{
// compute without h0, c0
T
*
cur_in_data
=
batched_input_data
;
T
*
cur_h_out_data
=
batched_h_out_data
;
T
*
cur_c_out_data
=
batched_c_out_data
;
// W_ch, W_ih, W_fh, W_oh
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
act_gate
(
D3
,
cur_in_data
+
D
,
cur_in_data
+
D
);
act_cand
(
D
,
cur_in_data
,
cur_in_data
);
// cell out= input*tilde
blas
.
VMUL
(
D
,
cur_in_data
,
cur_in_data
+
D
,
cur_c_out_data
);
// hidden out= act_state(cellout) * outgate
act_cell
(
D
,
cur_c_out_data
,
cur_in_data
+
D2
);
blas
.
VMUL
(
D
,
cur_in_data
+
D2
,
cur_in_data
+
D3
,
cur_h_out_data
);
// add offset
cur_in_data
+=
D4
;
cur_c_out_data
+=
D
;
cur_h_out_data
+=
D
;
}
tstart
=
1
;
prev_h_data
=
batched_h_out_data
;
prev_c_data
=
batched_c_out_data
;
}
}
// Then start from next
const
auto
&
batch_starts
=
batched_lod
[
0
];
const
int
max_seq_len
=
batch_starts
.
size
()
-
1
;
const
int
offset
=
tstart
*
max_bs
*
D
;
batched_input_data
=
batched_input_data
+
offset
*
4
;
batched_h_out_data
=
batched_h_out_data
+
offset
;
batched_c_out_data
=
batched_c_out_data
+
offset
;
for
(
int
step
=
tstart
;
step
<
max_seq_len
;
++
step
)
{
const
int
cur_bs
=
batch_starts
[
step
+
1
]
-
batch_starts
[
step
];
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
cur_bs
,
D4
,
D
,
static_cast
<
T
>
(
1
),
prev_h_data
,
D
,
wh_data
,
D4
,
static_cast
<
T
>
(
1
),
batched_input_data
,
D4
);
T
*
cur_in_data
=
batched_input_data
;
T
*
cur_prev_c_data
=
prev_c_data
;
T
*
cur_c_out_data
=
batched_c_out_data
;
T
*
cur_h_out_data
=
batched_h_out_data
;
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
// W_ch, W_ih, W_fh, W_oh
act_gate
(
D3
,
cur_in_data
+
D
,
cur_in_data
+
D
);
act_cand
(
D
,
cur_in_data
,
cur_in_data
);
// a = forget * prev_cell
blas
.
VMUL
(
D
,
cur_in_data
+
D2
,
cur_prev_c_data
,
cur_in_data
+
D2
);
// b = input * tilde
blas
.
VMUL
(
D
,
cur_in_data
,
cur_in_data
+
D
,
cur_in_data
+
D
);
// cell out= a+b
blas
.
VADD
(
D
,
cur_in_data
+
D
,
cur_in_data
+
D2
,
cur_c_out_data
);
// hidden out= act_state(cellout) * outgate
act_cell
(
D
,
cur_c_out_data
,
cur_in_data
+
D2
);
blas
.
VMUL
(
D
,
cur_in_data
+
D2
,
cur_in_data
+
D3
,
cur_h_out_data
);
// Use the local variable as here.
cur_in_data
+=
D4
;
LoDTensor
batch_hidden
,
batch_cell
;
cur_prev_c_data
+=
D
;
auto
*
batch_cell_pre_act
=
ctx
.
Output
<
LoDTensor
>
(
"BatchCellPreAct"
);
cur_c_out_data
+=
D
;
batch_hidden
.
mutable_data
<
T
>
(
out_dims
,
ctx
.
GetPlace
());
cur_h_out_data
+=
D
;
batch_cell
.
mutable_data
<
T
>
(
out_dims
,
ctx
.
GetPlace
());
batch_cell_pre_act
->
mutable_data
<
T
>
(
out_dims
,
ctx
.
GetPlace
());
auto
batch_starts
=
batched_gate
->
lod
()[
0
];
size_t
max_seq_len
=
batch_starts
.
size
()
-
1
;
auto
gate_act
=
math
::
detail
::
GetActivationType
(
ctx
.
Attr
<
std
::
string
>
(
"gate_activation"
));
auto
cell_act
=
math
::
detail
::
GetActivationType
(
ctx
.
Attr
<
std
::
string
>
(
"cell_activation"
));
auto
cand_act
=
math
::
detail
::
GetActivationType
(
ctx
.
Attr
<
std
::
string
>
(
"candidate_activation"
));
for
(
size_t
n
=
0
;
n
<
max_seq_len
;
n
++
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
Tensor
gate_t
=
batched_gate
->
Slice
(
bstart
,
bend
);
Tensor
out_t
=
batch_hidden
.
Slice
(
bstart
,
bend
);
Tensor
cell_t
=
batch_cell
.
Slice
(
bstart
,
bend
);
Tensor
cell_pre_act_t
=
batch_cell_pre_act
->
Slice
(
bstart
,
bend
);
int
cur_batch_size
=
bend
-
bstart
;
if
(
n
>
0
)
{
int
pre_h_start
=
static_cast
<
int
>
(
batch_starts
[
n
-
1
]);
int
pre_h_end
=
pre_h_start
+
cur_batch_size
;
auto
pre_hidden_t
=
batch_hidden
.
Slice
(
pre_h_start
,
pre_h_end
);
// TODO(TJ): use gemm directly
blas
.
MatMul
(
pre_hidden_t
,
false
,
*
wh
,
false
,
static_cast
<
T
>
(
1.0
),
&
gate_t
,
static_cast
<
T
>
(
1.0
));
}
else
if
(
hidden_t0
)
{
// TODO(TJ): move h0 outside for
// If n == 0 and there is no initialized hidden state, that is to say
// the H0 is zeros, the calculation W_h * H0 will be skiped.
// If n == 0 and there is initialized hidden state, calculate W_h * H0.
// Since the batch computing for LSTM reorders the input sequence
// according to their length. The initialized hidden state also needs
// to reorder.
Tensor
ordered_h0
;
ReorderInitState
<
DeviceContext
,
T
>
(
dev_ctx
,
*
hidden_t0
,
order
,
&
ordered_h0
,
true
);
// TODO(TJ): use gemm directly
blas
.
MatMul
(
ordered_h0
,
false
,
*
wh
,
false
,
static_cast
<
T
>
(
1.0
),
&
gate_t
,
static_cast
<
T
>
(
1.0
));
}
}
lstm_value
.
gate_value
=
gate_t
.
data
<
T
>
();
prev_c_data
=
batched_c_out_data
;
lstm_value
.
output_value
=
out_t
.
data
<
T
>
();
prev_h_data
=
batched_h_out_data
;
lstm_value
.
state_value
=
cell_t
.
data
<
T
>
();
batched_c_out_data
=
cur_c_out_data
;
lstm_value
.
state_active_value
=
cell_pre_act_t
.
data
<
T
>
();
batched_h_out_data
=
cur_h_out_data
;
math
::
LstmUnitFunctor
<
DeviceContext
,
T
>::
compute
(
batched_input_data
=
cur_in_data
;
dev_ctx
,
lstm_value
,
frame_size
,
cur_batch_size
,
gate_act
,
cell_act
,
cand_act
);
lstm_value
.
prev_state_value
=
lstm_value
.
state_value
;
}
}
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
batch_hidden
.
set_lod
(
batched_gate
->
lod
());
batched_h_out
->
set_lod
(
batched_lod
);
// restore the output hidden in LoDTensor from the batch hidden
to_seq
(
dev_ctx
,
*
batched_h_out
,
hidden_out
);
to_seq
(
dev_ctx
,
batch_hidden
,
hidden_out
);
batched_c_out
->
set_lod
(
batched_lod
);
to_seq
(
dev_ctx
,
*
batched_c_out
,
cell_out
);
batch_cell
.
set_lod
(
batched_gate
->
lod
());
// restore the output cell state in LoDTensor from the batch cell
to_seq
(
dev_ctx
,
batch_cell
,
cell_out
);
}
}
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
if
(
FLAGS_seq_mode
)
{
if
(
ctx
.
Attr
<
bool
>
(
"use_seq"
)
)
{
SeqCompute
(
ctx
);
SeqCompute
(
ctx
);
}
else
{
}
else
{
BatchCompute
(
ctx
);
BatchCompute
(
ctx
);
}
}
}
}
#undef INIT_BASE_SIZES
#undef INIT_BASE_INPUT_OUTPUT
#undef INIT_VEC_FUNC
};
};
}
// namespace operators
}
// namespace operators
...
...
paddle/fluid/operators/gru_unit_op.h
浏览文件 @
459d4cc8
...
@@ -92,12 +92,12 @@ class GRUUnitKernel : public framework::OpKernel<T> {
...
@@ -92,12 +92,12 @@ class GRUUnitKernel : public framework::OpKernel<T> {
gate_data
,
frame_size
*
3
);
gate_data
,
frame_size
*
3
);
// calculate activited gate
// calculate activited gate
Eigen
::
array
<
int
,
2
>
extents
({{
batch_size
,
frame_size
}})
;
Eigen
::
array
<
int
,
2
>
extents
{{
batch_size
,
frame_size
}}
;
Eigen
::
array
<
int
,
2
>
u_offsets
({{
0
,
0
}})
;
Eigen
::
array
<
int
,
2
>
u_offsets
{{
0
,
0
}}
;
ActCompute
(
context
.
Attr
<
int
>
(
"gate_activation"
),
place
,
ActCompute
(
context
.
Attr
<
int
>
(
"gate_activation"
),
place
,
g
.
slice
(
u_offsets
,
extents
),
g
.
slice
(
u_offsets
,
extents
));
g
.
slice
(
u_offsets
,
extents
),
g
.
slice
(
u_offsets
,
extents
));
auto
u
=
g
.
slice
(
u_offsets
,
extents
);
// update gate
auto
u
=
g
.
slice
(
u_offsets
,
extents
);
// update gate
Eigen
::
array
<
int
,
2
>
r_offsets
({{
0
,
frame_size
}})
;
Eigen
::
array
<
int
,
2
>
r_offsets
{{
0
,
frame_size
}}
;
ActCompute
(
context
.
Attr
<
int
>
(
"gate_activation"
),
place
,
ActCompute
(
context
.
Attr
<
int
>
(
"gate_activation"
),
place
,
g
.
slice
(
r_offsets
,
extents
),
g
.
slice
(
r_offsets
,
extents
));
g
.
slice
(
r_offsets
,
extents
),
g
.
slice
(
r_offsets
,
extents
));
auto
r
=
g
.
slice
(
r_offsets
,
extents
);
// reset gate
auto
r
=
g
.
slice
(
r_offsets
,
extents
);
// reset gate
...
@@ -107,7 +107,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
...
@@ -107,7 +107,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
weight_data
+
frame_size
*
frame_size
*
2
,
frame_size
,
1
,
weight_data
+
frame_size
*
frame_size
*
2
,
frame_size
,
1
,
gate_data
+
frame_size
*
2
,
frame_size
*
3
);
gate_data
+
frame_size
*
2
,
frame_size
*
3
);
Eigen
::
array
<
int
,
2
>
c_offsets
({{
0
,
frame_size
*
2
}})
;
Eigen
::
array
<
int
,
2
>
c_offsets
{{
0
,
frame_size
*
2
}}
;
ActCompute
(
context
.
Attr
<
int
>
(
"activation"
),
place
,
ActCompute
(
context
.
Attr
<
int
>
(
"activation"
),
place
,
g
.
slice
(
c_offsets
,
extents
),
g
.
slice
(
c_offsets
,
extents
));
g
.
slice
(
c_offsets
,
extents
),
g
.
slice
(
c_offsets
,
extents
));
auto
c
=
g
.
slice
(
c_offsets
,
extents
);
// output candidate
auto
c
=
g
.
slice
(
c_offsets
,
extents
);
// output candidate
...
@@ -171,12 +171,12 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
...
@@ -171,12 +171,12 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
int
batch_size
=
input
->
dims
()[
0
];
int
batch_size
=
input
->
dims
()[
0
];
int
frame_size
=
hidden_prev
->
dims
()[
1
];
int
frame_size
=
hidden_prev
->
dims
()[
1
];
Eigen
::
array
<
int
,
2
>
extents
({{
batch_size
,
frame_size
}})
;
Eigen
::
array
<
int
,
2
>
extents
{{
batch_size
,
frame_size
}}
;
Eigen
::
array
<
int
,
2
>
u_offsets
({{
0
,
0
}})
;
Eigen
::
array
<
int
,
2
>
u_offsets
{{
0
,
0
}}
;
auto
u
=
g
.
slice
(
u_offsets
,
extents
);
// update gate
auto
u
=
g
.
slice
(
u_offsets
,
extents
);
// update gate
Eigen
::
array
<
int
,
2
>
r_offsets
({{
0
,
frame_size
}})
;
Eigen
::
array
<
int
,
2
>
r_offsets
{{
0
,
frame_size
}}
;
auto
r
=
g
.
slice
(
r_offsets
,
extents
);
// reset gate
auto
r
=
g
.
slice
(
r_offsets
,
extents
);
// reset gate
Eigen
::
array
<
int
,
2
>
c_offsets
({{
0
,
frame_size
*
2
}})
;
Eigen
::
array
<
int
,
2
>
c_offsets
{{
0
,
frame_size
*
2
}}
;
auto
c
=
g
.
slice
(
c_offsets
,
extents
);
// output candidate
auto
c
=
g
.
slice
(
c_offsets
,
extents
);
// output candidate
// backward for unactivated update gate
// backward for unactivated update gate
...
...
paddle/fluid/operators/label_smooth_op.h
浏览文件 @
459d4cc8
...
@@ -38,7 +38,8 @@ class LabelSmoothKernel : public framework::OpKernel<T> {
...
@@ -38,7 +38,8 @@ class LabelSmoothKernel : public framework::OpKernel<T> {
auto
dist
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
dist_t
);
auto
dist
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
dist_t
);
out
.
device
(
dev
)
=
out
.
device
(
dev
)
=
static_cast
<
T
>
(
1
-
epsilon
)
*
in
+
static_cast
<
T
>
(
1
-
epsilon
)
*
in
+
epsilon
*
dist
.
broadcast
(
Eigen
::
DSizes
<
int
,
1
>
(
in_t
->
numel
()));
static_cast
<
T
>
(
epsilon
)
*
dist
.
broadcast
(
Eigen
::
DSizes
<
int
,
1
>
(
in_t
->
numel
()));
}
else
{
}
else
{
out
.
device
(
dev
)
=
static_cast
<
T
>
(
1
-
epsilon
)
*
in
+
out
.
device
(
dev
)
=
static_cast
<
T
>
(
1
-
epsilon
)
*
in
+
static_cast
<
T
>
(
epsilon
/
label_dim
);
static_cast
<
T
>
(
epsilon
/
label_dim
);
...
...
paddle/fluid/operators/math/cpu_vec.h
浏览文件 @
459d4cc8
...
@@ -132,6 +132,121 @@ inline void vec_scal<float, platform::jit::avx512_common>(const int n,
...
@@ -132,6 +132,121 @@ inline void vec_scal<float, platform::jit::avx512_common>(const int n,
vec_scal
<
float
,
platform
::
jit
::
avx2
>
(
n
,
a
,
x
,
y
);
vec_scal
<
float
,
platform
::
jit
::
avx2
>
(
n
,
a
,
x
,
y
);
}
}
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
=
platform
::
jit
::
isa_any
>
inline
void
vec_bias_sub
(
const
int
n
,
const
T
a
,
const
T
*
x
,
T
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
a
-
x
[
i
];
}
}
template
<
>
inline
void
vec_bias_sub
<
float
,
platform
::
jit
::
avx
>
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
#ifdef __AVX__
constexpr
int
block
=
AVX_FLOAT_BLOCK
;
if
(
n
<
block
)
{
vec_bias_sub
<
float
,
platform
::
jit
::
isa_any
>
(
n
,
a
,
x
,
y
);
return
;
}
const
int
rest
=
n
%
block
;
const
int
end
=
n
-
rest
;
int
i
=
0
;
__m256
bias
=
_mm256_set1_ps
(
a
);
__m256
tmp
;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_sub_ps(bias, tmp); \
_mm256_storeu_ps(y + i, tmp)
for
(
i
=
0
;
i
<
end
;
i
+=
block
)
{
MOVE_ONE_STEP
;
}
#undef MOVE_ONE_STEP
if
(
rest
==
0
)
{
return
;
}
// can not continue move step if src and dst are inplace
for
(
i
=
n
-
rest
;
i
<
n
;
++
i
)
{
y
[
i
]
=
a
-
x
[
i
];
}
#else
vec_bias_sub
<
float
,
platform
::
jit
::
isa_any
>
(
n
,
a
,
x
,
y
);
#endif
}
template
<
>
inline
void
vec_bias_sub
<
float
,
platform
::
jit
::
avx2
>
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
vec_bias_sub
<
float
,
platform
::
jit
::
avx
>
(
n
,
a
,
x
,
y
);
}
template
<
>
inline
void
vec_bias_sub
<
float
,
platform
::
jit
::
avx512_common
>
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
// TODO(TJ): enable me
vec_bias_sub
<
float
,
platform
::
jit
::
avx2
>
(
n
,
a
,
x
,
y
);
}
// out = x*y + (1-x)*z
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
=
platform
::
jit
::
isa_any
>
inline
void
vec_cross
(
const
int
n
,
const
T
*
x
,
const
T
*
y
,
const
T
*
z
,
T
*
out
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
out
[
i
]
=
x
[
i
]
*
y
[
i
]
+
(
static_cast
<
T
>
(
1
)
-
x
[
i
])
*
z
[
i
];
}
}
template
<
>
inline
void
vec_cross
<
float
,
platform
::
jit
::
avx
>
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
const
float
*
z
,
float
*
out
)
{
#ifdef __AVX__
constexpr
int
block
=
AVX_FLOAT_BLOCK
;
if
(
n
<
block
)
{
vec_cross
<
float
,
platform
::
jit
::
isa_any
>
(
n
,
x
,
y
,
z
,
out
);
return
;
}
const
int
rest
=
n
%
block
;
const
int
end
=
n
-
rest
;
int
i
=
0
;
__m256
bias
=
_mm256_set1_ps
(
1.
f
);
__m256
tmpx
,
tmpy
,
tmpz
;
for
(
i
=
0
;
i
<
end
;
i
+=
block
)
{
tmpx
=
_mm256_loadu_ps
(
x
+
i
);
tmpy
=
_mm256_loadu_ps
(
y
+
i
);
tmpz
=
_mm256_loadu_ps
(
z
+
i
);
tmpy
=
_mm256_mul_ps
(
tmpx
,
tmpy
);
tmpx
=
_mm256_sub_ps
(
bias
,
tmpx
);
tmpz
=
_mm256_mul_ps
(
tmpx
,
tmpz
);
tmpz
=
_mm256_add_ps
(
tmpy
,
tmpz
);
_mm256_storeu_ps
(
out
+
i
,
tmpz
);
}
if
(
rest
==
0
)
{
return
;
}
// can not continue move step if src and dst are inplace
for
(
i
=
n
-
rest
;
i
<
n
;
++
i
)
{
out
[
i
]
=
x
[
i
]
*
y
[
i
]
+
(
1.
f
-
x
[
i
])
*
z
[
i
];
}
#else
vec_cross
<
float
,
platform
::
jit
::
isa_any
>
(
n
,
x
,
y
,
z
,
out
);
#endif
}
template
<
>
inline
void
vec_cross
<
float
,
platform
::
jit
::
avx2
>
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
const
float
*
z
,
float
*
out
)
{
vec_cross
<
float
,
platform
::
jit
::
avx
>
(
n
,
x
,
y
,
z
,
out
);
}
template
<
>
inline
void
vec_cross
<
float
,
platform
::
jit
::
avx512_common
>
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
const
float
*
z
,
float
*
out
)
{
// TODO(TJ): enable me
vec_cross
<
float
,
platform
::
jit
::
avx
>
(
n
,
x
,
y
,
z
,
out
);
}
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
=
platform
::
jit
::
isa_any
>
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
=
platform
::
jit
::
isa_any
>
inline
void
vec_add_bias
(
const
int
n
,
const
T
a
,
const
T
*
x
,
T
*
y
)
{
inline
void
vec_add_bias
(
const
int
n
,
const
T
a
,
const
T
*
x
,
T
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
...
...
paddle/fluid/operators/math/matrix_bit_code.h
浏览文件 @
459d4cc8
...
@@ -17,6 +17,11 @@ limitations under the License. */
...
@@ -17,6 +17,11 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/device_context.h"
#if defined(_WIN32)
#include <intrin.h>
#include <windows.h>
#endif // _WIN32
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
...
@@ -55,12 +60,38 @@ namespace math {
...
@@ -55,12 +60,38 @@ namespace math {
* FindLastSet(x) = 1 + \floor*{\log_{2}x}
* FindLastSet(x) = 1 + \floor*{\log_{2}x}
* \f]
* \f]
*/
*/
#if !defined(_WIN32)
inline
constexpr
size_t
FindLastSet
(
size_t
x
)
{
inline
constexpr
size_t
FindLastSet
(
size_t
x
)
{
return
std
::
is_same
<
size_t
,
unsigned
int
>::
value
return
std
::
is_same
<
size_t
,
unsigned
int
>::
value
?
(
x
?
8
*
sizeof
(
x
)
-
__builtin_clz
(
x
)
:
0
)
?
(
x
?
8
*
sizeof
(
x
)
-
__builtin_clz
(
x
)
:
0
)
:
(
std
::
is_same
<
size_t
,
unsigned
long
>::
value
// NOLINT
:
(
std
::
is_same
<
size_t
,
unsigned
long
>::
value
// NOLINT
?
(
x
?
8
*
sizeof
(
x
)
-
__builtin_clzl
(
x
)
:
0
)
?
(
x
?
8
*
sizeof
(
x
)
-
__builtin_clzl
(
x
)
:
0
)
:
(
x
?
8
*
sizeof
(
x
)
-
__builtin_clzll
(
x
)
:
0
));
:
(
x
?
8
*
sizeof
(
x
)
-
__builtin_clzll
(
x
)
:
0
));
#else
// windows don't have built-in clz, ctz function
template
<
typename
T
>
inline
int
ctz
(
const
T
&
value
)
{
DWORD
trailing_zero
=
0
;
if
(
_BitScanForward
(
&
trailing_zero
,
value
))
{
return
static_cast
<
int
>
(
trailing_zero
);
}
else
{
return
static_cast
<
int
>
(
0
);
}
}
template
<
typename
T
>
inline
int
clz
(
const
T
&
value
)
{
DWORD
leadning_zero
=
0
;
if
(
_BitScanReverse
(
&
leadning_zero
,
value
))
{
return
static_cast
<
int
>
(
sizeof
(
T
)
*
8
-
leadning_zero
);
}
else
{
return
static_cast
<
int
>
(
0
);
}
}
inline
size_t
FindLastSet
(
size_t
x
)
{
return
sizeof
(
size_t
)
*
8
-
clz
(
x
);
}
#endif // !_WIN32
}
}
struct
SimpleCode
{
struct
SimpleCode
{
...
...
paddle/fluid/operators/math/maxouting.h
浏览文件 @
459d4cc8
...
@@ -16,13 +16,12 @@ limitations under the License. */
...
@@ -16,13 +16,12 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/macros.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
#define FLT_MAX __FLT_MAX__
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
class
MaxOutFunctor
{
class
MaxOutFunctor
{
public:
public:
...
...
paddle/fluid/operators/math/pooling.h
浏览文件 @
459d4cc8
...
@@ -18,15 +18,12 @@ limitations under the License. */
...
@@ -18,15 +18,12 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/macros.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
#define FLT_MAX \
__FLT_MAX__ // TODO(zcd) :It might need to be placed in another file, but I'm
// still wondering where to put it.
/*
/*
* \brief Extracting simple operations from pooling.
* \brief Extracting simple operations from pooling.
* Both MaxPool and AvgPool need "initial", "compute" and "finalize"
* Both MaxPool and AvgPool need "initial", "compute" and "finalize"
...
...
paddle/fluid/operators/math/sequence2batch.h
浏览文件 @
459d4cc8
...
@@ -92,7 +92,7 @@ class LoDTensor2BatchFunctor {
...
@@ -92,7 +92,7 @@ class LoDTensor2BatchFunctor {
// Calculate the start position of each batch.
// Calculate the start position of each batch.
// example: sequences = {s0, s1, s2}
// example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
//
num_batch
= 5,
//
max_seqlen
= 5,
// batchIndex = {b0, b1, b2, b3, b4}
// batchIndex = {b0, b1, b2, b3, b4}
// b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
// b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
// batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
// batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
...
@@ -109,7 +109,7 @@ class LoDTensor2BatchFunctor {
...
@@ -109,7 +109,7 @@ class LoDTensor2BatchFunctor {
// where 1 is the second sequence,
// where 1 is the second sequence,
// 0 is the first sequence,
// 0 is the first sequence,
// 2 is the third sequence.
// 2 is the third sequence.
// The
num_batch
represents batch size after rearranging the
// The
max_seqlen
represents batch size after rearranging the
// input LodTensor. It is also the maximum length of input sequence.
// input LodTensor. It is also the maximum length of input sequence.
paddle
::
framework
::
LoD
batch_lods
;
paddle
::
framework
::
LoD
batch_lods
;
...
@@ -118,8 +118,8 @@ class LoDTensor2BatchFunctor {
...
@@ -118,8 +118,8 @@ class LoDTensor2BatchFunctor {
batch_lods
.
emplace_back
(
std
::
vector
<
size_t
>
{
0
});
batch_lods
.
emplace_back
(
std
::
vector
<
size_t
>
{
0
});
// batch_lods[0] is the start positions for batch LoDTensor
// batch_lods[0] is the start positions for batch LoDTensor
int
num_batch
=
seq_info
[
0
].
length
;
int
max_seqlen
=
seq_info
[
0
].
length
;
batch_lods
[
0
].
resize
(
static_cast
<
size_t
>
(
num_batch
+
1
));
batch_lods
[
0
].
resize
(
static_cast
<
size_t
>
(
max_seqlen
+
1
));
// batch_lods[1] is the raw index in the input LoDTensor
// batch_lods[1] is the raw index in the input LoDTensor
batch_lods
[
1
].
resize
(
static_cast
<
size_t
>
(
lod_tensor
.
dims
()[
0
]));
batch_lods
[
1
].
resize
(
static_cast
<
size_t
>
(
lod_tensor
.
dims
()[
0
]));
// batch_lods[2] is the sort order for the input LoDTensor.
// batch_lods[2] is the sort order for the input LoDTensor.
...
@@ -128,7 +128,7 @@ class LoDTensor2BatchFunctor {
...
@@ -128,7 +128,7 @@ class LoDTensor2BatchFunctor {
size_t
*
batch_starts
=
batch_lods
[
0
].
data
();
size_t
*
batch_starts
=
batch_lods
[
0
].
data
();
size_t
*
seq2batch_idx
=
batch_lods
[
1
].
data
();
size_t
*
seq2batch_idx
=
batch_lods
[
1
].
data
();
batch_starts
[
0
]
=
0
;
batch_starts
[
0
]
=
0
;
for
(
int
n
=
0
;
n
<
num_batch
;
n
++
)
{
for
(
int
n
=
0
;
n
<
max_seqlen
;
n
++
)
{
auto
batch_id
=
static_cast
<
int
>
(
batch_starts
[
n
]);
auto
batch_id
=
static_cast
<
int
>
(
batch_starts
[
n
]);
for
(
size_t
i
=
0
;
i
<
seq_info
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
seq_info
.
size
();
++
i
)
{
int
seq_len
=
seq_info
[
i
].
length
;
int
seq_len
=
seq_info
[
i
].
length
;
...
...
paddle/fluid/operators/roi_pool_op.cu
浏览文件 @
459d4cc8
...
@@ -31,7 +31,7 @@ static inline int NumBlocks(const int N) {
...
@@ -31,7 +31,7 @@ static inline int NumBlocks(const int N) {
template
<
typename
T
>
template
<
typename
T
>
__global__
void
GPUROIPoolForward
(
__global__
void
GPUROIPoolForward
(
const
int
nthreads
,
const
T
*
input_data
,
const
int64_t
*
input_rois
,
const
int
nthreads
,
const
T
*
input_data
,
const
T
*
input_rois
,
const
float
spatial_scale
,
const
int
channels
,
const
int
height
,
const
float
spatial_scale
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
width
,
const
int
pooled_height
,
const
int
pooled_width
,
int
*
roi_batch_id_data
,
T
*
output_data
,
int64_t
*
argmax_data
)
{
int
*
roi_batch_id_data
,
T
*
output_data
,
int64_t
*
argmax_data
)
{
...
@@ -43,7 +43,7 @@ __global__ void GPUROIPoolForward(
...
@@ -43,7 +43,7 @@ __global__ void GPUROIPoolForward(
int
c
=
(
i
/
pooled_width
/
pooled_height
)
%
channels
;
int
c
=
(
i
/
pooled_width
/
pooled_height
)
%
channels
;
int
n
=
i
/
pooled_width
/
pooled_height
/
channels
;
int
n
=
i
/
pooled_width
/
pooled_height
/
channels
;
const
int64_t
*
offset_input_rois
=
input_rois
+
n
*
kROISize
;
const
T
*
offset_input_rois
=
input_rois
+
n
*
kROISize
;
int
roi_batch_ind
=
roi_batch_id_data
[
n
];
int
roi_batch_ind
=
roi_batch_id_data
[
n
];
int
roi_start_w
=
round
(
offset_input_rois
[
0
]
*
spatial_scale
);
int
roi_start_w
=
round
(
offset_input_rois
[
0
]
*
spatial_scale
);
int
roi_start_h
=
round
(
offset_input_rois
[
1
]
*
spatial_scale
);
int
roi_start_h
=
round
(
offset_input_rois
[
1
]
*
spatial_scale
);
...
@@ -93,7 +93,7 @@ __global__ void GPUROIPoolForward(
...
@@ -93,7 +93,7 @@ __global__ void GPUROIPoolForward(
template
<
typename
T
>
template
<
typename
T
>
__global__
void
GPUROIPoolBackward
(
__global__
void
GPUROIPoolBackward
(
const
int
nthreads
,
const
int64_t
*
input_rois
,
const
T
*
output_grad
,
const
int
nthreads
,
const
T
*
input_rois
,
const
T
*
output_grad
,
const
int64_t
*
argmax_data
,
const
int
num_rois
,
const
float
spatial_scale
,
const
int64_t
*
argmax_data
,
const
int
num_rois
,
const
float
spatial_scale
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
pooled_height
,
const
int
pooled_width
,
int
*
roi_batch_id_data
,
const
int
pooled_height
,
const
int
pooled_width
,
int
*
roi_batch_id_data
,
...
@@ -174,8 +174,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
...
@@ -174,8 +174,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
GPUROIPoolForward
<
GPUROIPoolForward
<
T
><<<
blocks
,
threads
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
T
><<<
blocks
,
threads
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
output_size
,
in
->
data
<
T
>
(),
rois
->
data
<
int64_t
>
(),
spatial_scale
,
output_size
,
in
->
data
<
T
>
(),
rois
->
data
<
T
>
(),
spatial_scale
,
channels
,
channels
,
height
,
width
,
pooled_height
,
pooled_width
,
height
,
width
,
pooled_height
,
pooled_width
,
roi_batch_id_list_gpu
.
data
<
int
>
(),
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
roi_batch_id_list_gpu
.
data
<
int
>
(),
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
argmax
->
mutable_data
<
int64_t
>
(
ctx
.
GetPlace
()));
argmax
->
mutable_data
<
int64_t
>
(
ctx
.
GetPlace
()));
}
}
...
@@ -228,7 +228,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
...
@@ -228,7 +228,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
if
(
output_grad_size
>
0
)
{
if
(
output_grad_size
>
0
)
{
GPUROIPoolBackward
<
GPUROIPoolBackward
<
T
><<<
blocks
,
threads
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
T
><<<
blocks
,
threads
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
output_grad_size
,
rois
->
data
<
int64_t
>
(),
out_grad
->
data
<
T
>
(),
output_grad_size
,
rois
->
data
<
T
>
(),
out_grad
->
data
<
T
>
(),
argmax
->
data
<
int64_t
>
(),
rois_num
,
spatial_scale
,
channels
,
height
,
argmax
->
data
<
int64_t
>
(),
rois_num
,
spatial_scale
,
channels
,
height
,
width
,
pooled_height
,
pooled_width
,
width
,
pooled_height
,
pooled_width
,
roi_batch_id_list_gpu
.
data
<
int
>
(),
roi_batch_id_list_gpu
.
data
<
int
>
(),
...
...
paddle/fluid/operators/roi_pool_op.h
浏览文件 @
459d4cc8
...
@@ -72,7 +72,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> {
...
@@ -72,7 +72,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> {
T
*
output_data
=
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
output_data
=
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
int64_t
*
argmax_data
=
argmax
->
mutable_data
<
int64_t
>
(
ctx
.
GetPlace
());
int64_t
*
argmax_data
=
argmax
->
mutable_data
<
int64_t
>
(
ctx
.
GetPlace
());
const
int64_t
*
rois_data
=
rois
->
data
<
int64_t
>
();
const
T
*
rois_data
=
rois
->
data
<
T
>
();
for
(
int
n
=
0
;
n
<
rois_num
;
++
n
)
{
for
(
int
n
=
0
;
n
<
rois_num
;
++
n
)
{
int
roi_batch_id
=
roi_batch_id_data
[
n
];
int
roi_batch_id
=
roi_batch_id_data
[
n
];
int
roi_start_w
=
round
(
rois_data
[
0
]
*
spatial_scale
);
int
roi_start_w
=
round
(
rois_data
[
0
]
*
spatial_scale
);
...
@@ -171,7 +171,7 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
...
@@ -171,7 +171,7 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
}
}
}
}
const
int64_t
*
rois_data
=
rois
->
data
<
int64_t
>
();
const
T
*
rois_data
=
rois
->
data
<
T
>
();
const
T
*
out_grad_data
=
out_grad
->
data
<
T
>
();
const
T
*
out_grad_data
=
out_grad
->
data
<
T
>
();
const
int64_t
*
argmax_data
=
argmax
->
data
<
int64_t
>
();
const
int64_t
*
argmax_data
=
argmax
->
data
<
int64_t
>
();
T
*
in_grad_data
=
in_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
in_grad_data
=
in_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
...
...
paddle/fluid/operators/save_combine_op.cc
浏览文件 @
459d4cc8
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include <stdint.h>
#include <stdint.h>
#include <sys/stat.h>
#include <fstream>
#include <fstream>
#include <numeric>
#include <numeric>
#include <sstream>
#include <sstream>
...
@@ -23,40 +22,11 @@ limitations under the License. */
...
@@ -23,40 +22,11 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
// TODO(sidgoyal78): These function are needed by other files (save_op), move
// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
constexpr
char
kSEP
=
'/'
;
static
bool
FileExists
(
const
std
::
string
&
filepath
)
{
struct
stat
buffer
;
return
(
stat
(
filepath
.
c_str
(),
&
buffer
)
==
0
);
}
static
std
::
string
DirName
(
const
std
::
string
&
filepath
)
{
auto
pos
=
filepath
.
rfind
(
kSEP
);
if
(
pos
==
std
::
string
::
npos
)
{
return
""
;
}
return
filepath
.
substr
(
0
,
pos
);
}
static
void
MkDir
(
const
char
*
path
)
{
if
(
mkdir
(
path
,
0755
))
{
PADDLE_ENFORCE_EQ
(
errno
,
EEXIST
,
"%s mkdir failed!"
,
path
);
}
}
static
void
MkDirRecursively
(
const
char
*
fullpath
)
{
if
(
*
fullpath
==
'\0'
)
return
;
// empty string
if
(
FileExists
(
fullpath
))
return
;
MkDirRecursively
(
DirName
(
fullpath
).
c_str
());
MkDir
(
fullpath
);
}
class
SaveCombineOp
:
public
framework
::
OperatorBase
{
class
SaveCombineOp
:
public
framework
::
OperatorBase
{
public:
public:
SaveCombineOp
(
const
std
::
string
&
type
,
SaveCombineOp
(
const
std
::
string
&
type
,
...
...
paddle/fluid/operators/save_op.cc
浏览文件 @
459d4cc8
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include <stdint.h>
#include <stdint.h>
#include <sys/stat.h>
#include <fstream>
#include <fstream>
#include <numeric>
#include <numeric>
...
@@ -25,6 +24,7 @@ limitations under the License. */
...
@@ -25,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -33,36 +33,6 @@ namespace operators {
...
@@ -33,36 +33,6 @@ namespace operators {
// to directory specified.
// to directory specified.
constexpr
char
LOOKUP_TABLE_PATH
[]
=
"kLookupTablePath"
;
constexpr
char
LOOKUP_TABLE_PATH
[]
=
"kLookupTablePath"
;
// TODO(yuyang18): If the functions below are needed by other files, move them
// to paddle::filesystem namespace.
constexpr
char
kSEP
=
'/'
;
static
bool
FileExists
(
const
std
::
string
&
filepath
)
{
struct
stat
buffer
;
return
(
stat
(
filepath
.
c_str
(),
&
buffer
)
==
0
);
}
static
std
::
string
DirName
(
const
std
::
string
&
filepath
)
{
auto
pos
=
filepath
.
rfind
(
kSEP
);
if
(
pos
==
std
::
string
::
npos
)
{
return
""
;
}
return
filepath
.
substr
(
0
,
pos
);
}
static
void
MkDir
(
const
char
*
path
)
{
if
(
mkdir
(
path
,
0755
))
{
PADDLE_ENFORCE_EQ
(
errno
,
EEXIST
,
"%s mkdir failed!"
,
path
);
}
}
static
void
MkDirRecursively
(
const
char
*
fullpath
)
{
if
(
*
fullpath
==
'\0'
)
return
;
// empty string
if
(
FileExists
(
fullpath
))
return
;
MkDirRecursively
(
DirName
(
fullpath
).
c_str
());
MkDir
(
fullpath
);
}
class
SaveOp
:
public
framework
::
OperatorBase
{
class
SaveOp
:
public
framework
::
OperatorBase
{
public:
public:
SaveOp
(
const
std
::
string
&
type
,
const
framework
::
VariableNameMap
&
inputs
,
SaveOp
(
const
std
::
string
&
type
,
const
framework
::
VariableNameMap
&
inputs
,
...
...
paddle/fluid/operators/sequence_enumerate_op.cc
0 → 100644
浏览文件 @
459d4cc8
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/sequence_enumerate_op.h"
namespace
paddle
{
namespace
operators
{
class
SequenceEnumerateOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) of SequecceEnumerate operator should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(X) of SequenceEnumerate operator should not be null."
);
const
auto
x_dims
=
ctx
->
GetInputDim
(
"X"
);
PADDLE_ENFORCE_EQ
(
x_dims
.
size
(),
2UL
,
"Input(X) of SequenceEnumerate operator's rank should be 2."
);
PADDLE_ENFORCE_EQ
(
x_dims
[
1
],
1UL
,
"Input(X) of SequenceEnumerate operator's 2nd dimension should be 1."
);
const
auto
win_size
=
ctx
->
Attrs
().
Get
<
int
>
(
"win_size"
);
ctx
->
SetOutputDim
(
"Out"
,
{
x_dims
[
0
],
win_size
});
ctx
->
ShareLoD
(
"X"
,
"Out"
);
}
};
class
SequenceEnumerateOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"(2-D LoDTensor with the 2nd dimension equal to 1) "
"Input LoDTensor of SequenceEnumerate operator."
);
AddOutput
(
"Out"
,
"(2-D LoDTensor with the 2nd dimension equal to win_size) "
"Output LoDTensor of SequenceEnumerate operator."
);
AddAttr
<
int
>
(
"win_size"
,
"(int) The enumerate sequence window size."
)
.
AddCustomChecker
([](
const
int
&
win_size
)
{
PADDLE_ENFORCE
(
win_size
>=
2
,
"The window size should be not less than 2."
);
});
AddAttr
<
int
>
(
"pad_value"
,
"(int) The enumerate sequence padding value."
)
.
SetDefault
(
0
);
AddComment
(
R"DOC(
Sequence Enumerate Operator.
Generate a new sequence for the input index sequence, which enumerates all the
sub-sequences with length `win_size` of the input.
The enumerated sequence has the same 1st dimension with variable `input`, and
the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
Examples:
Case 1:
Input:
X.lod = [[0, 3, 5]]
X.data = [[1], [2], [3], [4], [5]]
X.dims = [5, 1]
Attrs:
win_size = 2
pad_value = 0
Output:
Out.lod = [[0, 3, 5]]
Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
Out.dims = [5, 2]
)DOC"
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_WITHOUT_GRADIENT
(
sequence_enumerate
,
ops
::
SequenceEnumerateOp
,
ops
::
SequenceEnumerateOpMaker
);
REGISTER_OP_CPU_KERNEL
(
sequence_enumerate
,
ops
::
SequenceEnumerateKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int32_t
>
,
ops
::
SequenceEnumerateKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int64_t
>
);
paddle/fluid/operators/sequence_enumerate_op.cu
0 → 100644
浏览文件 @
459d4cc8
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include "paddle/fluid/operators/sequence_enumerate_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace
paddle
{
namespace
operators
{
using
platform
::
PADDLE_CUDA_NUM_THREADS
;
using
LoDTensor
=
framework
::
LoDTensor
;
template
<
typename
T
>
__global__
void
CalcOutPut
(
const
T
*
in_data
,
const
size_t
*
in_lod
,
const
size_t
lod_len
,
const
int64_t
win_size
,
const
int64_t
pad_value
,
T
*
out_data
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
index
<
in_lod
[
lod_len
-
1
])
{
int
end_idx
=
0
;
// Get LoD interval of index
for
(
int
i
=
1
;
i
<
lod_len
;
++
i
)
{
if
(
index
<
in_lod
[
i
])
{
end_idx
=
in_lod
[
i
];
break
;
}
}
for
(
size_t
i
=
0
;
i
<
win_size
;
++
i
)
{
int
word_pos
=
index
+
i
;
out_data
[
index
*
win_size
+
i
]
=
word_pos
<
end_idx
?
in_data
[
word_pos
]
:
pad_value
;
}
}
}
template
<
typename
T
>
class
SequenceEnumerateOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
in
=
context
.
Input
<
LoDTensor
>
(
"X"
);
auto
*
out
=
context
.
Output
<
LoDTensor
>
(
"Out"
);
int
win_size
=
context
.
Attr
<
int
>
(
"win_size"
);
int
pad_value
=
context
.
Attr
<
int
>
(
"pad_value"
);
auto
in_dims
=
in
->
dims
();
auto
in_lod
=
in
->
lod
();
PADDLE_ENFORCE_EQ
(
static_cast
<
uint64_t
>
(
in_dims
[
0
]),
in_lod
[
0
].
back
(),
"The actual input data's size mismatched with LoD information."
);
/* Generate enumerate sequence set */
auto
stream
=
context
.
cuda_device_context
().
stream
();
auto
lod0
=
in_lod
[
0
];
auto
in_len
=
in
->
numel
();
auto
in_data
=
in
->
data
<
T
>
();
auto
out_data
=
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
// Copy LoD to GPU
const
size_t
*
dev_in_lod_ptr
=
lod0
.
CUDAData
(
context
.
GetPlace
());
// Calc output tensor
CalcOutPut
<<<
(
in_len
-
1
)
/
PADDLE_CUDA_NUM_THREADS
+
1
,
PADDLE_CUDA_NUM_THREADS
,
0
,
stream
>>>
(
in_data
,
dev_in_lod_ptr
,
lod0
.
size
(),
win_size
,
pad_value
,
out_data
);
}
};
}
// namespace operators
}
// namespace paddle
REGISTER_OP_CUDA_KERNEL
(
sequence_enumerate
,
paddle
::
operators
::
SequenceEnumerateOpCUDAKernel
<
int32_t
>
,
paddle
::
operators
::
SequenceEnumerateOpCUDAKernel
<
int64_t
>
);
paddle/fluid/operators/sequence_enumerate_op.h
0 → 100644
浏览文件 @
459d4cc8
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
using
LoDTensor
=
framework
::
LoDTensor
;
template
<
typename
DeviceContext
,
typename
T
>
class
SequenceEnumerateKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
in
=
context
.
Input
<
LoDTensor
>
(
"X"
);
auto
*
out
=
context
.
Output
<
LoDTensor
>
(
"Out"
);
int
win_size
=
context
.
Attr
<
int
>
(
"win_size"
);
int
pad_value
=
context
.
Attr
<
int
>
(
"pad_value"
);
auto
in_dims
=
in
->
dims
();
auto
in_lod
=
in
->
lod
();
PADDLE_ENFORCE_EQ
(
static_cast
<
uint64_t
>
(
in_dims
[
0
]),
in_lod
[
0
].
back
(),
"The actual input data's size mismatched with LoD information."
);
// Generate enumerate sequence set
auto
lod0
=
in_lod
[
0
];
auto
in_data
=
in
->
data
<
T
>
();
auto
out_data
=
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
for
(
size_t
i
=
0
;
i
<
lod0
.
size
()
-
1
;
++
i
)
{
for
(
size_t
idx
=
lod0
[
i
];
idx
<
lod0
[
i
+
1
];
++
idx
)
{
for
(
int
word_idx
=
0
;
word_idx
<
win_size
;
++
word_idx
)
{
size_t
word_pos
=
idx
+
word_idx
;
out_data
[
win_size
*
idx
+
word_idx
]
=
word_pos
<
lod0
[
i
+
1
]
?
in_data
[
word_pos
]
:
pad_value
;
}
}
}
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/platform/macros.h
浏览文件 @
459d4cc8
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#pragma once
#pragma once
#include <cfloat>
// Disable the copy and assignment operator for a class.
// Disable the copy and assignment operator for a class.
#ifndef DISABLE_COPY_AND_ASSIGN
#ifndef DISABLE_COPY_AND_ASSIGN
...
@@ -23,3 +24,7 @@ limitations under the License. */
...
@@ -23,3 +24,7 @@ limitations under the License. */
classname& operator=(const classname&) = delete; \
classname& operator=(const classname&) = delete; \
classname& operator=(classname&&) = delete
classname& operator=(classname&&) = delete
#endif
#endif
#if defined(__FLT_MAX__)
#define FLT_MAX __FLT_MAX__
#endif // __FLT_MAX__
paddle/fluid/platform/port.h
浏览文件 @
459d4cc8
...
@@ -14,24 +14,141 @@
...
@@ -14,24 +14,141 @@
#pragma once
#pragma once
#include <cstdio>
#include <stdexcept>
#include <stdexcept>
#include <memory>
#include <string>
#include <string>
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "glog/logging.h"
#if !defined(_WIN32)
#if !defined(_WIN32)
#include <dlfcn.h> // for dladdr
#define UNUSED __attribute__((unused))
#include <execinfo.h> // for backtrace
#include <dlfcn.h> // dladdr
#include <execinfo.h> // backtrace
#include <sys/stat.h>
#include <algorithm> // std::accumulate
#else
#else
#include <Shlwapi.h>
#include <io.h> // _popen, _pclose
#include <Windows.h>
#include <windows.h>
#if defined(_WIN32)
#include <numeric> // std::accumulate in msvc
#endif
// windows version of __attribute__((unused))
#define UNUSED __pragma(warning(suppress : 4100))
static
void
*
dlsym
(
void
*
handle
,
const
char
*
symbol_name
)
{
#ifndef S_ISDIR // windows port for sys/stat.h
#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
#endif // S_ISDIR
static
void
*
dlsym
(
void
*
handle
,
const
char
*
symbol_name
)
{
FARPROC
found_symbol
;
FARPROC
found_symbol
;
found_symbol
=
GetProcAddress
((
HMODULE
)
handle
,
symbol_name
);
found_symbol
=
GetProcAddress
((
HMODULE
)
handle
,
symbol_name
);
if
(
found_symbol
==
NULL
)
{
if
(
found_symbol
==
NULL
)
{
throw
std
::
runtime_error
(
std
::
string
(
symbol_name
)
+
" not found."
);
throw
std
::
runtime_error
(
std
::
string
(
symbol_name
)
+
" not found."
);
}
}
return
reinterpret_cast
<
void
*>
(
found_symbol
);
return
reinterpret_cast
<
void
*>
(
found_symbol
);
}
}
#endif
static
void
*
dlopen
(
const
char
*
filename
,
int
flag
)
{
std
::
string
file_name
(
filename
);
file_name
.
replace
(
0
,
file_name
.
size
()
-
1
,
'/'
,
'\\'
);
HMODULE
hModule
=
LoadLibrary
(
file_name
.
c_str
());
if
(
!
hModule
)
{
throw
std
::
runtime_error
(
file_name
+
" not found."
);
}
return
reinterpret_cast
<
void
*>
(
hModule
);
}
#endif // !_WIN32
static
void
ExecShellCommand
(
const
std
::
string
&
cmd
,
std
::
string
*
message
)
{
char
buffer
[
128
];
#if !defined(_WIN32)
std
::
shared_ptr
<
FILE
>
pipe
(
popen
(
cmd
.
c_str
(),
"r"
),
pclose
);
#else
std
::
shared_ptr
<
FILE
>
pipe
(
_popen
(
cmd
.
c_str
(),
"r"
),
_pclose
);
#endif // _WIN32
if
(
!
pipe
)
{
LOG
(
ERROR
)
<<
"error running command: "
<<
cmd
;
return
;
}
while
(
!
feof
(
pipe
.
get
()))
{
if
(
fgets
(
buffer
,
128
,
pipe
.
get
())
!=
nullptr
)
{
*
message
+=
buffer
;
}
}
}
static
bool
PathExists
(
const
std
::
string
&
path
)
{
#if !defined(_WIN32)
struct
stat
statbuf
;
if
(
stat
(
path
.
c_str
(),
&
statbuf
)
!=
-
1
)
{
if
(
S_ISDIR
(
statbuf
.
st_mode
))
{
return
true
;
}
}
#else
struct
_stat
statbuf
;
if
(
_stat
(
path
.
c_str
(),
&
statbuf
)
!=
-
1
)
{
if
(
S_ISDIR
(
statbuf
.
st_mode
))
{
return
true
;
}
}
#endif // !_WIN32
return
false
;
}
// TODO(yuyang18): If the functions below are needed by other files, move them
// to paddle::filesystem namespace.
#if !defined(_WIN32)
constexpr
char
kSEP
=
'/'
;
#else
constexpr
char
kSEP
=
'\\'
;
#endif // _WIN32
static
bool
FileExists
(
const
std
::
string
&
filepath
)
{
#if !defined(_WIN32)
struct
stat
buffer
;
return
(
stat
(
filepath
.
c_str
(),
&
buffer
)
==
0
);
#else
struct
_stat
buffer
;
return
(
_stat
(
filepath
.
c_str
(),
&
buffer
)
==
0
);
#endif // !_WIN32
}
static
std
::
string
DirName
(
const
std
::
string
&
filepath
)
{
auto
pos
=
filepath
.
rfind
(
kSEP
);
if
(
pos
==
std
::
string
::
npos
)
{
return
""
;
}
return
filepath
.
substr
(
0
,
pos
);
}
static
void
MkDir
(
const
char
*
path
)
{
std
::
string
path_error
(
path
);
path_error
+=
" mkdir failed!"
;
#if !defined(_WIN32)
if
(
mkdir
(
path
,
0755
))
{
if
(
errno
!=
EEXIST
)
{
throw
std
::
runtime_error
(
path_error
);
}
}
#else
CreateDirectory
(
path
,
NULL
);
auto
errorno
=
GetLastError
();
if
(
errorno
!=
ERROR_ALREADY_EXISTS
)
{
throw
std
::
runtime_error
(
path_error
);
}
#endif // !_WIN32
}
static
void
MkDirRecursively
(
const
char
*
fullpath
)
{
if
(
*
fullpath
==
'\0'
)
return
;
// empty string
if
(
FileExists
(
fullpath
))
return
;
MkDirRecursively
(
DirName
(
fullpath
).
c_str
());
MkDir
(
fullpath
);
}
python/paddle/fluid/inferencer.py
浏览文件 @
459d4cc8
...
@@ -98,10 +98,9 @@ class Inferencer(object):
...
@@ -98,10 +98,9 @@ class Inferencer(object):
raise
ValueError
(
raise
ValueError
(
"inputs should be a map of {'input_name': input_var}"
)
"inputs should be a map of {'input_name': input_var}"
)
with
executor
.
scope_guard
(
self
.
scope
):
with
self
.
_prog_and_scope_guard
():
results
=
self
.
exe
.
run
(
self
.
inference_program
,
results
=
self
.
exe
.
run
(
feed
=
inputs
,
feed
=
inputs
,
fetch_list
=
[
self
.
predict_var
.
name
],
fetch_list
=
[
self
.
predict_var
],
return_numpy
=
return_numpy
)
return_numpy
=
return_numpy
)
return
results
return
results
...
...
python/paddle/fluid/layers/detection.py
浏览文件 @
459d4cc8
...
@@ -145,26 +145,23 @@ def rpn_target_assign(loc,
...
@@ -145,26 +145,23 @@ def rpn_target_assign(loc,
"""
"""
helper
=
LayerHelper
(
'rpn_target_assign'
,
**
locals
())
helper
=
LayerHelper
(
'rpn_target_assign'
,
**
locals
())
# 1. Compute the regression target bboxes
# Compute overlaps between the prior boxes and the gt boxes overlaps
target_bbox
=
box_coder
(
prior_box
=
anchor_box
,
prior_box_var
=
anchor_var
,
target_box
=
gt_box
,
code_type
=
'encode_center_size'
,
box_normalized
=
False
)
# 2. Compute overlaps between the prior boxes and the gt boxes overlaps
iou
=
iou_similarity
(
x
=
gt_box
,
y
=
anchor_box
)
iou
=
iou_similarity
(
x
=
gt_box
,
y
=
anchor_box
)
# 3. Assign target label to anchors
# Assign target label to anchors
loc_index
=
helper
.
create_tmp_variable
(
dtype
=
anchor_box
.
dtype
)
loc_index
=
helper
.
create_tmp_variable
(
dtype
=
'int32'
)
score_index
=
helper
.
create_tmp_variable
(
dtype
=
anchor_box
.
dtype
)
score_index
=
helper
.
create_tmp_variable
(
dtype
=
'int32'
)
target_label
=
helper
.
create_tmp_variable
(
dtype
=
anchor_box
.
dtype
)
target_label
=
helper
.
create_tmp_variable
(
dtype
=
'int64'
)
target_bbox
=
helper
.
create_tmp_variable
(
dtype
=
anchor_box
.
dtype
)
helper
.
append_op
(
helper
.
append_op
(
type
=
"rpn_target_assign"
,
type
=
"rpn_target_assign"
,
inputs
=
{
'DistMat'
:
iou
},
inputs
=
{
'Anchor'
:
anchor_box
,
'GtBox'
:
gt_box
,
'DistMat'
:
iou
},
outputs
=
{
outputs
=
{
'LocationIndex'
:
loc_index
,
'LocationIndex'
:
loc_index
,
'ScoreIndex'
:
score_index
,
'ScoreIndex'
:
score_index
,
'TargetLabel'
:
target_label
'TargetLabel'
:
target_label
,
'TargetBBox'
:
target_bbox
,
},
},
attrs
=
{
attrs
=
{
'rpn_batch_size_per_im'
:
rpn_batch_size_per_im
,
'rpn_batch_size_per_im'
:
rpn_batch_size_per_im
,
...
@@ -173,16 +170,16 @@ def rpn_target_assign(loc,
...
@@ -173,16 +170,16 @@ def rpn_target_assign(loc,
'fg_fraction'
:
fg_fraction
'fg_fraction'
:
fg_fraction
})
})
# 4. Reshape and gather the target entry
loc_index
.
stop_gradient
=
True
scores
=
nn
.
reshape
(
x
=
scores
,
shape
=
(
-
1
,
2
))
score_index
.
stop_gradient
=
True
loc
=
nn
.
reshape
(
x
=
loc
,
shape
=
(
-
1
,
4
))
target_label
.
stop_gradient
=
True
target_label
=
nn
.
reshape
(
x
=
target_label
,
shape
=
(
-
1
,
1
))
target_bbox
.
stop_gradient
=
True
target_bbox
=
nn
.
reshape
(
x
=
target_bbox
,
shape
=
(
-
1
,
4
))
scores
=
nn
.
reshape
(
x
=
scores
,
shape
=
(
-
1
,
1
))
loc
=
nn
.
reshape
(
x
=
loc
,
shape
=
(
-
1
,
4
))
predicted_scores
=
nn
.
gather
(
scores
,
score_index
)
predicted_scores
=
nn
.
gather
(
scores
,
score_index
)
predicted_location
=
nn
.
gather
(
loc
,
loc_index
)
predicted_location
=
nn
.
gather
(
loc
,
loc_index
)
target_label
=
nn
.
gather
(
target_label
,
score_index
)
target_bbox
=
nn
.
gather
(
target_bbox
,
loc_index
)
return
predicted_scores
,
predicted_location
,
target_label
,
target_bbox
return
predicted_scores
,
predicted_location
,
target_label
,
target_bbox
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
459d4cc8
...
@@ -111,6 +111,7 @@ __all__ = [
...
@@ -111,6 +111,7 @@ __all__ = [
'stack'
,
'stack'
,
'pad2d'
,
'pad2d'
,
'unstack'
,
'unstack'
,
'sequence_enumerate'
,
]
]
...
@@ -5823,6 +5824,51 @@ def flatten(x, axis=1, name=None):
...
@@ -5823,6 +5824,51 @@ def flatten(x, axis=1, name=None):
return
out
return
out
def
sequence_enumerate
(
input
,
win_size
,
pad_value
=
0
,
name
=
None
):
"""
Generate a new sequence for the input index sequence, which enumerates all the
sub-sequences with length `win_size` of the input.
The enumerated sequence has the same 1st dimension with variable `input`, and
the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
Examples:
Case 1:
Input:
X.lod = [[0, 3, 5]]
X.data = [[1], [2], [3], [4], [5]]
X.dims = [5, 1]
Attrs:
win_size = 2
pad_value = 0
Output:
Out.lod = [[0, 3, 5]]
Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
Out.dims = [5, 2]
Args:
input (Variable): The input variable which is a index sequence.
win_size (int): The window size for enumerating all sub-sequences.
pad_value (int): The padding value, default 0.
Returns:
Variable: The enumerate sequence variable which is a LoDTensor.
Examples:
.. code-block:: python
x = fluid.layers.data(shape[30, 1], dtype='int32', lod_level=1)
out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0)
"""
helper
=
LayerHelper
(
'sequence_enumerate'
,
**
locals
())
out
=
helper
.
create_tmp_variable
(
helper
.
input_dtype
(),
stop_gradient
=
True
)
helper
.
append_op
(
type
=
'sequence_enumerate'
,
inputs
=
{
'X'
:
input
},
outputs
=
{
'Out'
:
out
},
attrs
=
{
'win_size'
:
win_size
,
'pad_value'
:
pad_value
})
def
sequence_mask
(
x
,
maxlen
=
None
,
dtype
=
'int64'
,
name
=
None
):
def
sequence_mask
(
x
,
maxlen
=
None
,
dtype
=
'int64'
,
name
=
None
):
"""
"""
**SequenceMask Layer**
**SequenceMask Layer**
...
@@ -5902,6 +5948,7 @@ def stack(x, axis=0):
...
@@ -5902,6 +5948,7 @@ def stack(x, axis=0):
helper
.
append_op
(
helper
.
append_op
(
type
=
'stack'
,
inputs
=
{
'X'
:
x
},
outputs
=
{
'Y'
:
out
},
type
=
'stack'
,
inputs
=
{
'X'
:
x
},
outputs
=
{
'Y'
:
out
},
attrs
=
{
'axis'
:
axis
})
attrs
=
{
'axis'
:
axis
})
return
out
return
out
...
...
python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
浏览文件 @
459d4cc8
...
@@ -16,7 +16,9 @@ from __future__ import print_function
...
@@ -16,7 +16,9 @@ from __future__ import print_function
import
paddle
import
paddle
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
import
numpy
import
numpy
import
os
import
cifar10_small_test_set
import
cifar10_small_test_set
...
@@ -89,7 +91,7 @@ def optimizer_func():
...
@@ -89,7 +91,7 @@ def optimizer_func():
return
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
return
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
def
train
(
use_cuda
,
train_program
,
params_dirname
):
def
train
(
use_cuda
,
train_program
,
para
llel
,
para
ms_dirname
):
BATCH_SIZE
=
128
BATCH_SIZE
=
128
EPOCH_NUM
=
1
EPOCH_NUM
=
1
...
@@ -116,7 +118,10 @@ def train(use_cuda, train_program, params_dirname):
...
@@ -116,7 +118,10 @@ def train(use_cuda, train_program, params_dirname):
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
trainer
=
fluid
.
Trainer
(
trainer
=
fluid
.
Trainer
(
train_func
=
train_program
,
optimizer_func
=
optimizer_func
,
place
=
place
)
train_func
=
train_program
,
optimizer_func
=
optimizer_func
,
place
=
place
,
parallel
=
parallel
)
trainer
.
train
(
trainer
.
train
(
reader
=
train_reader
,
reader
=
train_reader
,
...
@@ -125,10 +130,13 @@ def train(use_cuda, train_program, params_dirname):
...
@@ -125,10 +130,13 @@ def train(use_cuda, train_program, params_dirname):
feed_order
=
[
'pixel'
,
'label'
])
feed_order
=
[
'pixel'
,
'label'
])
def
infer
(
use_cuda
,
inference_program
,
params_dirname
=
None
):
def
infer
(
use_cuda
,
inference_program
,
para
llel
,
para
ms_dirname
=
None
):
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
inferencer
=
fluid
.
Inferencer
(
inferencer
=
fluid
.
Inferencer
(
infer_func
=
inference_program
,
param_path
=
params_dirname
,
place
=
place
)
infer_func
=
inference_program
,
param_path
=
params_dirname
,
place
=
place
,
parallel
=
parallel
)
# The input's dimension of conv should be 4-D or 5-D.
# The input's dimension of conv should be 4-D or 5-D.
# Use normilized image pixels as input data, which should be in the range
# Use normilized image pixels as input data, which should be in the range
...
@@ -139,22 +147,34 @@ def infer(use_cuda, inference_program, params_dirname=None):
...
@@ -139,22 +147,34 @@ def infer(use_cuda, inference_program, params_dirname=None):
print
(
"infer results: "
,
results
)
print
(
"infer results: "
,
results
)
def
main
(
use_cuda
):
def
main
(
use_cuda
,
parallel
):
if
use_cuda
and
not
fluid
.
core
.
is_compiled_with_cuda
():
if
use_cuda
and
not
fluid
.
core
.
is_compiled_with_cuda
():
return
return
save_path
=
"image_classification_resnet.inference.model"
save_path
=
"image_classification_resnet.inference.model"
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
train
(
train
(
use_cuda
=
use_cuda
,
use_cuda
=
use_cuda
,
train_program
=
train_network
,
train_program
=
train_network
,
params_dirname
=
save_path
)
params_dirname
=
save_path
,
parallel
=
parallel
)
# FIXME(zcd): in the inference stage, the number of
# input data is one, it is not appropriate to use parallel.
if
parallel
and
use_cuda
:
return
os
.
environ
[
'CPU_NUM'
]
=
str
(
1
)
infer
(
infer
(
use_cuda
=
use_cuda
,
use_cuda
=
use_cuda
,
inference_program
=
inference_network
,
inference_program
=
inference_network
,
params_dirname
=
save_path
)
params_dirname
=
save_path
,
parallel
=
parallel
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
for
use_cuda
in
(
False
,
True
):
for
use_cuda
in
(
False
,
True
):
main
(
use_cuda
=
use_cuda
)
for
parallel
in
(
False
,
True
):
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
continue
main
(
use_cuda
=
use_cuda
,
parallel
=
parallel
)
python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
浏览文件 @
459d4cc8
...
@@ -16,7 +16,9 @@ from __future__ import print_function
...
@@ -16,7 +16,9 @@ from __future__ import print_function
import
paddle
import
paddle
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
import
numpy
import
numpy
import
os
import
cifar10_small_test_set
import
cifar10_small_test_set
...
@@ -68,7 +70,7 @@ def optimizer_func():
...
@@ -68,7 +70,7 @@ def optimizer_func():
return
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
return
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
def
train
(
use_cuda
,
train_program
,
params_dirname
):
def
train
(
use_cuda
,
train_program
,
para
llel
,
para
ms_dirname
):
BATCH_SIZE
=
128
BATCH_SIZE
=
128
train_reader
=
paddle
.
batch
(
train_reader
=
paddle
.
batch
(
paddle
.
reader
.
shuffle
(
paddle
.
reader
.
shuffle
(
...
@@ -93,7 +95,10 @@ def train(use_cuda, train_program, params_dirname):
...
@@ -93,7 +95,10 @@ def train(use_cuda, train_program, params_dirname):
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
trainer
=
fluid
.
Trainer
(
trainer
=
fluid
.
Trainer
(
train_func
=
train_program
,
place
=
place
,
optimizer_func
=
optimizer_func
)
train_func
=
train_program
,
place
=
place
,
optimizer_func
=
optimizer_func
,
parallel
=
parallel
)
trainer
.
train
(
trainer
.
train
(
reader
=
train_reader
,
reader
=
train_reader
,
...
@@ -102,10 +107,13 @@ def train(use_cuda, train_program, params_dirname):
...
@@ -102,10 +107,13 @@ def train(use_cuda, train_program, params_dirname):
feed_order
=
[
'pixel'
,
'label'
])
feed_order
=
[
'pixel'
,
'label'
])
def
infer
(
use_cuda
,
inference_program
,
params_dirname
=
None
):
def
infer
(
use_cuda
,
inference_program
,
para
llel
,
para
ms_dirname
=
None
):
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
inferencer
=
fluid
.
Inferencer
(
inferencer
=
fluid
.
Inferencer
(
infer_func
=
inference_program
,
param_path
=
params_dirname
,
place
=
place
)
infer_func
=
inference_program
,
param_path
=
params_dirname
,
place
=
place
,
parallel
=
parallel
)
# The input's dimension of conv should be 4-D or 5-D.
# The input's dimension of conv should be 4-D or 5-D.
# Use normilized image pixels as input data, which should be in the range
# Use normilized image pixels as input data, which should be in the range
...
@@ -116,22 +124,31 @@ def infer(use_cuda, inference_program, params_dirname=None):
...
@@ -116,22 +124,31 @@ def infer(use_cuda, inference_program, params_dirname=None):
print
(
"infer results: "
,
results
)
print
(
"infer results: "
,
results
)
def
main
(
use_cuda
):
def
main
(
use_cuda
,
parallel
):
if
use_cuda
and
not
fluid
.
core
.
is_compiled_with_cuda
():
return
save_path
=
"image_classification_vgg.inference.model"
save_path
=
"image_classification_vgg.inference.model"
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
train
(
train
(
use_cuda
=
use_cuda
,
use_cuda
=
use_cuda
,
train_program
=
train_network
,
train_program
=
train_network
,
params_dirname
=
save_path
)
params_dirname
=
save_path
,
parallel
=
parallel
)
# FIXME(zcd): in the inference stage, the number of
# input data is one, it is not appropriate to use parallel.
if
parallel
and
use_cuda
:
return
os
.
environ
[
'CPU_NUM'
]
=
str
(
1
)
infer
(
infer
(
use_cuda
=
use_cuda
,
use_cuda
=
use_cuda
,
inference_program
=
inference_network
,
inference_program
=
inference_network
,
params_dirname
=
save_path
)
params_dirname
=
save_path
,
parallel
=
parallel
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
for
use_cuda
in
(
False
,
True
):
for
use_cuda
in
(
False
,
True
):
main
(
use_cuda
=
use_cuda
)
for
parallel
in
(
False
,
True
):
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
continue
main
(
use_cuda
=
use_cuda
,
parallel
=
parallel
)
python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
浏览文件 @
459d4cc8
...
@@ -64,14 +64,14 @@ def optimizer_func():
...
@@ -64,14 +64,14 @@ def optimizer_func():
return
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
return
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
def
train
(
use_cuda
,
train_program
,
params_dirname
):
def
train
(
use_cuda
,
train_program
,
para
llel
,
para
ms_dirname
):
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
trainer
=
fluid
.
Trainer
(
trainer
=
fluid
.
Trainer
(
train_func
=
train_program
,
train_func
=
train_program
,
place
=
place
,
place
=
place
,
optimizer_func
=
optimizer_func
,
optimizer_func
=
optimizer_func
,
parallel
=
True
)
parallel
=
parallel
)
def
event_handler
(
event
):
def
event_handler
(
event
):
if
isinstance
(
event
,
fluid
.
EndEpochEvent
):
if
isinstance
(
event
,
fluid
.
EndEpochEvent
):
...
@@ -108,11 +108,14 @@ def train(use_cuda, train_program, params_dirname):
...
@@ -108,11 +108,14 @@ def train(use_cuda, train_program, params_dirname):
feed_order
=
[
'img'
,
'label'
])
feed_order
=
[
'img'
,
'label'
])
def
infer
(
use_cuda
,
inference_program
,
params_dirname
=
None
):
def
infer
(
use_cuda
,
inference_program
,
para
llel
,
para
ms_dirname
=
None
):
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
inferencer
=
fluid
.
Inferencer
(
inferencer
=
fluid
.
Inferencer
(
infer_func
=
inference_program
,
param_path
=
params_dirname
,
place
=
place
)
infer_func
=
inference_program
,
param_path
=
params_dirname
,
place
=
place
,
parallel
=
parallel
)
batch_size
=
1
batch_size
=
1
tensor_img
=
numpy
.
random
.
uniform
(
-
1.0
,
1.0
,
tensor_img
=
numpy
.
random
.
uniform
(
-
1.0
,
1.0
,
...
@@ -123,20 +126,32 @@ def infer(use_cuda, inference_program, params_dirname=None):
...
@@ -123,20 +126,32 @@ def infer(use_cuda, inference_program, params_dirname=None):
print
(
"infer results: "
,
results
[
0
])
print
(
"infer results: "
,
results
[
0
])
def
main
(
use_cuda
):
def
main
(
use_cuda
,
parallel
):
params_dirname
=
"recognize_digits_conv.inference.model"
params_dirname
=
"recognize_digits_conv.inference.model"
# call train() with is_local argument to run distributed train
# call train() with is_local argument to run distributed train
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
train
(
train
(
use_cuda
=
use_cuda
,
use_cuda
=
use_cuda
,
train_program
=
train_program
,
train_program
=
train_program
,
params_dirname
=
params_dirname
)
params_dirname
=
params_dirname
,
parallel
=
parallel
)
# FIXME(zcd): in the inference stage, the number of
# input data is one, it is not appropriate to use parallel.
if
parallel
and
use_cuda
:
return
os
.
environ
[
'CPU_NUM'
]
=
str
(
1
)
infer
(
infer
(
use_cuda
=
use_cuda
,
use_cuda
=
use_cuda
,
inference_program
=
inference_program
,
inference_program
=
inference_program
,
params_dirname
=
params_dirname
)
params_dirname
=
params_dirname
,
parallel
=
parallel
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
# for use_cuda in (False, True):
for
use_cuda
in
(
False
,
True
):
main
(
use_cuda
=
core
.
is_compiled_with_cuda
())
for
parallel
in
(
False
,
True
):
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
continue
main
(
use_cuda
=
use_cuda
,
parallel
=
parallel
)
python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
浏览文件 @
459d4cc8
...
@@ -16,6 +16,7 @@ from __future__ import print_function
...
@@ -16,6 +16,7 @@ from __future__ import print_function
import
argparse
import
argparse
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
import
paddle
import
paddle
import
sys
import
sys
import
numpy
import
numpy
...
@@ -50,11 +51,14 @@ def optimizer_func():
...
@@ -50,11 +51,14 @@ def optimizer_func():
return
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
return
fluid
.
optimizer
.
Adam
(
learning_rate
=
0.001
)
def
train
(
use_cuda
,
train_program
,
params_dirname
):
def
train
(
use_cuda
,
train_program
,
params_dirname
,
parallel
):
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
trainer
=
fluid
.
Trainer
(
trainer
=
fluid
.
Trainer
(
train_func
=
train_program
,
place
=
place
,
optimizer_func
=
optimizer_func
)
train_func
=
train_program
,
place
=
place
,
optimizer_func
=
optimizer_func
,
parallel
=
parallel
)
def
event_handler
(
event
):
def
event_handler
(
event
):
if
isinstance
(
event
,
fluid
.
EndEpochEvent
):
if
isinstance
(
event
,
fluid
.
EndEpochEvent
):
...
@@ -86,11 +90,14 @@ def train(use_cuda, train_program, params_dirname):
...
@@ -86,11 +90,14 @@ def train(use_cuda, train_program, params_dirname):
feed_order
=
[
'img'
,
'label'
])
feed_order
=
[
'img'
,
'label'
])
def
infer
(
use_cuda
,
inference_program
,
params_dirname
=
None
):
def
infer
(
use_cuda
,
inference_program
,
para
llel
,
para
ms_dirname
=
None
):
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
inferencer
=
fluid
.
Inferencer
(
inferencer
=
fluid
.
Inferencer
(
infer_func
=
inference_program
,
param_path
=
params_dirname
,
place
=
place
)
infer_func
=
inference_program
,
param_path
=
params_dirname
,
place
=
place
,
parallel
=
parallel
)
batch_size
=
1
batch_size
=
1
tensor_img
=
numpy
.
random
.
uniform
(
-
1.0
,
1.0
,
tensor_img
=
numpy
.
random
.
uniform
(
-
1.0
,
1.0
,
...
@@ -101,20 +108,32 @@ def infer(use_cuda, inference_program, params_dirname=None):
...
@@ -101,20 +108,32 @@ def infer(use_cuda, inference_program, params_dirname=None):
print
(
"infer results: "
,
results
[
0
])
print
(
"infer results: "
,
results
[
0
])
def
main
(
use_cuda
):
def
main
(
use_cuda
,
parallel
):
params_dirname
=
"recognize_digits_mlp.inference.model"
params_dirname
=
"recognize_digits_mlp.inference.model"
# call train() with is_local argument to run distributed train
# call train() with is_local argument to run distributed train
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
train
(
train
(
use_cuda
=
use_cuda
,
use_cuda
=
use_cuda
,
train_program
=
train_program
,
train_program
=
train_program
,
params_dirname
=
params_dirname
)
params_dirname
=
params_dirname
,
parallel
=
parallel
)
# FIXME(zcd): in the inference stage, the number of
# input data is one, it is not appropriate to use parallel.
if
parallel
and
use_cuda
:
return
os
.
environ
[
'CPU_NUM'
]
=
str
(
1
)
infer
(
infer
(
use_cuda
=
use_cuda
,
use_cuda
=
use_cuda
,
inference_program
=
inference_program
,
inference_program
=
inference_program
,
params_dirname
=
params_dirname
)
params_dirname
=
params_dirname
,
parallel
=
parallel
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
# for use_cuda in (False, True):
for
use_cuda
in
(
False
,
True
):
main
(
use_cuda
=
False
)
for
parallel
in
(
False
,
True
):
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
continue
main
(
use_cuda
=
use_cuda
,
parallel
=
parallel
)
python/paddle/fluid/tests/test_detection.py
浏览文件 @
459d4cc8
...
@@ -281,7 +281,7 @@ class TestRpnTargetAssign(unittest.TestCase):
...
@@ -281,7 +281,7 @@ class TestRpnTargetAssign(unittest.TestCase):
gt_box
=
layers
.
data
(
gt_box
=
layers
.
data
(
name
=
'gt_box'
,
shape
=
[
4
],
lod_level
=
1
,
dtype
=
'float32'
)
name
=
'gt_box'
,
shape
=
[
4
],
lod_level
=
1
,
dtype
=
'float32'
)
pred
icted_scores
,
predicted_location
,
target_label
,
targe
t_bbox
=
layers
.
rpn_target_assign
(
pred
_scores
,
pred_loc
,
tgt_lbl
,
tg
t_bbox
=
layers
.
rpn_target_assign
(
loc
=
loc
,
loc
=
loc
,
scores
=
scores
,
scores
=
scores
,
anchor_box
=
anchor_box
,
anchor_box
=
anchor_box
,
...
@@ -292,15 +292,13 @@ class TestRpnTargetAssign(unittest.TestCase):
...
@@ -292,15 +292,13 @@ class TestRpnTargetAssign(unittest.TestCase):
rpn_positive_overlap
=
0.7
,
rpn_positive_overlap
=
0.7
,
rpn_negative_overlap
=
0.3
)
rpn_negative_overlap
=
0.3
)
self
.
assertIsNotNone
(
predicted_scores
)
self
.
assertIsNotNone
(
pred_scores
)
self
.
assertIsNotNone
(
predicted_location
)
self
.
assertIsNotNone
(
pred_loc
)
self
.
assertIsNotNone
(
target_label
)
self
.
assertIsNotNone
(
tgt_lbl
)
self
.
assertIsNotNone
(
target_bbox
)
self
.
assertIsNotNone
(
tgt_bbox
)
assert
predicted_scores
.
shape
[
1
]
==
2
assert
pred_scores
.
shape
[
1
]
==
1
assert
predicted_location
.
shape
[
1
]
==
4
assert
pred_loc
.
shape
[
1
]
==
4
assert
predicted_location
.
shape
[
1
]
==
target_bbox
.
shape
[
1
]
assert
pred_loc
.
shape
[
1
]
==
tgt_bbox
.
shape
[
1
]
print
(
str
(
program
))
class
TestGenerateProposals
(
unittest
.
TestCase
):
class
TestGenerateProposals
(
unittest
.
TestCase
):
...
...
python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
浏览文件 @
459d4cc8
...
@@ -37,7 +37,7 @@ def fusion_gru(
...
@@ -37,7 +37,7 @@ def fusion_gru(
h0
,
h0
,
wh
,
wh
,
np
.
zeros
(
np
.
zeros
(
(
1
,
wh
.
shape
[
1
]),
dtype
=
'float
64
'
),
(
1
,
wh
.
shape
[
1
]),
dtype
=
'float
32
'
),
is_reverse
,
is_reverse
,
act_state
,
act_state
,
act_gate
)
act_gate
)
...
@@ -62,15 +62,15 @@ class TestFusionGRUOp(OpTest):
...
@@ -62,15 +62,15 @@ class TestFusionGRUOp(OpTest):
T
=
sum
(
self
.
lod
[
0
])
T
=
sum
(
self
.
lod
[
0
])
N
=
len
(
self
.
lod
[
0
])
N
=
len
(
self
.
lod
[
0
])
x
=
np
.
random
.
rand
(
T
,
self
.
M
).
astype
(
'float
64
'
)
x
=
np
.
random
.
rand
(
T
,
self
.
M
).
astype
(
'float
32
'
)
wx
=
np
.
random
.
rand
(
self
.
M
,
3
*
self
.
D
).
astype
(
'float
64
'
)
wx
=
np
.
random
.
rand
(
self
.
M
,
3
*
self
.
D
).
astype
(
'float
32
'
)
wh
=
np
.
random
.
rand
(
self
.
D
,
3
*
self
.
D
).
astype
(
'float
64
'
)
wh
=
np
.
random
.
rand
(
self
.
D
,
3
*
self
.
D
).
astype
(
'float
32
'
)
bias
=
np
.
random
.
rand
(
bias
=
np
.
random
.
rand
(
1
,
3
*
self
.
D
).
astype
(
'float
64
'
)
if
self
.
with_bias
else
np
.
zeros
(
1
,
3
*
self
.
D
).
astype
(
'float
32
'
)
if
self
.
with_bias
else
np
.
zeros
(
(
1
,
3
*
self
.
D
),
dtype
=
'float
64
'
)
(
1
,
3
*
self
.
D
),
dtype
=
'float
32
'
)
h0
=
np
.
random
.
rand
(
h0
=
np
.
random
.
rand
(
N
,
self
.
D
).
astype
(
'float
64
'
)
if
self
.
with_h0
else
np
.
zeros
(
N
,
self
.
D
).
astype
(
'float
32
'
)
if
self
.
with_h0
else
np
.
zeros
(
(
N
,
self
.
D
),
dtype
=
'float
64
'
)
(
N
,
self
.
D
),
dtype
=
'float
32
'
)
_
,
_
,
_
,
hidden
=
fusion_gru
(
_
,
_
,
_
,
hidden
=
fusion_gru
(
x
,
self
.
lod
,
h0
,
wx
,
wh
,
bias
,
self
.
is_reverse
,
x
,
self
.
lod
,
h0
,
wx
,
wh
,
bias
,
self
.
is_reverse
,
...
@@ -93,7 +93,9 @@ class TestFusionGRUOp(OpTest):
...
@@ -93,7 +93,9 @@ class TestFusionGRUOp(OpTest):
}
}
def
test_check_output
(
self
):
def
test_check_output
(
self
):
self
.
check_output
(
atol
=
1e-8
)
for
use_seq
in
{
True
,
False
}:
self
.
attrs
[
'use_seq'
]
=
use_seq
self
.
check_output
()
class
TestFusionGRUOpNoInitial
(
TestFusionGRUOp
):
class
TestFusionGRUOpNoInitial
(
TestFusionGRUOp
):
...
...
python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
浏览文件 @
459d4cc8
...
@@ -114,7 +114,9 @@ class TestFusionLSTMOp(OpTest):
...
@@ -114,7 +114,9 @@ class TestFusionLSTMOp(OpTest):
}
}
def
test_check_output
(
self
):
def
test_check_output
(
self
):
self
.
check_output
()
for
use_seq
in
{
True
,
False
}:
self
.
attrs
[
'use_seq'
]
=
use_seq
self
.
check_output
()
class
TestFusionLSTMOpInit
(
TestFusionLSTMOp
):
class
TestFusionLSTMOpInit
(
TestFusionLSTMOp
):
...
...
python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py
浏览文件 @
459d4cc8
...
@@ -177,8 +177,8 @@ def _box_to_delta(ex_boxes, gt_boxes, weights):
...
@@ -177,8 +177,8 @@ def _box_to_delta(ex_boxes, gt_boxes, weights):
dx
=
(
gt_ctr_x
-
ex_ctr_x
)
/
ex_w
/
weights
[
0
]
dx
=
(
gt_ctr_x
-
ex_ctr_x
)
/
ex_w
/
weights
[
0
]
dy
=
(
gt_ctr_y
-
ex_ctr_y
)
/
ex_h
/
weights
[
1
]
dy
=
(
gt_ctr_y
-
ex_ctr_y
)
/
ex_h
/
weights
[
1
]
dw
=
(
np
.
log
(
gt_w
/
ex_w
))
/
ex_w
/
weights
[
2
]
dw
=
(
np
.
log
(
gt_w
/
ex_w
))
/
weights
[
2
]
dh
=
(
np
.
log
(
gt_h
/
ex_h
))
/
ex_h
/
weights
[
3
]
dh
=
(
np
.
log
(
gt_h
/
ex_h
))
/
weights
[
3
]
targets
=
np
.
vstack
([
dx
,
dy
,
dw
,
dh
]).
transpose
()
targets
=
np
.
vstack
([
dx
,
dy
,
dw
,
dh
]).
transpose
()
return
targets
return
targets
...
...
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
459d4cc8
...
@@ -549,6 +549,13 @@ class TestBook(unittest.TestCase):
...
@@ -549,6 +549,13 @@ class TestBook(unittest.TestCase):
self
.
assertIsNotNone
(
out
)
self
.
assertIsNotNone
(
out
)
print
(
str
(
program
))
print
(
str
(
program
))
def
test_sequence_enumerate
(
self
):
program
=
Program
()
with
program_guard
(
program
):
x
=
layers
.
data
(
name
=
"input"
,
shape
=
[
1
],
dtype
=
'int32'
,
lod_level
=
1
)
out
=
layers
.
sequence_enumerate
(
input
=
x
,
win_size
=
2
,
pad_value
=
0
)
print
(
str
(
program
))
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_roi_pool_op.py
浏览文件 @
459d4cc8
...
@@ -61,7 +61,7 @@ class TestROIPoolOp(OpTest):
...
@@ -61,7 +61,7 @@ class TestROIPoolOp(OpTest):
for
i
in
range
(
self
.
rois_num
):
for
i
in
range
(
self
.
rois_num
):
roi
=
self
.
rois
[
i
]
roi
=
self
.
rois
[
i
]
roi_batch_id
=
roi
[
0
]
roi_batch_id
=
int
(
roi
[
0
])
roi_start_w
=
int
(
cpt
.
round
(
roi
[
1
]
*
self
.
spatial_scale
))
roi_start_w
=
int
(
cpt
.
round
(
roi
[
1
]
*
self
.
spatial_scale
))
roi_start_h
=
int
(
cpt
.
round
(
roi
[
2
]
*
self
.
spatial_scale
))
roi_start_h
=
int
(
cpt
.
round
(
roi
[
2
]
*
self
.
spatial_scale
))
roi_end_w
=
int
(
cpt
.
round
(
roi
[
3
]
*
self
.
spatial_scale
))
roi_end_w
=
int
(
cpt
.
round
(
roi
[
3
]
*
self
.
spatial_scale
))
...
@@ -125,7 +125,7 @@ class TestROIPoolOp(OpTest):
...
@@ -125,7 +125,7 @@ class TestROIPoolOp(OpTest):
roi
=
[
bno
,
x1
,
y1
,
x2
,
y2
]
roi
=
[
bno
,
x1
,
y1
,
x2
,
y2
]
rois
.
append
(
roi
)
rois
.
append
(
roi
)
self
.
rois_num
=
len
(
rois
)
self
.
rois_num
=
len
(
rois
)
self
.
rois
=
np
.
array
(
rois
).
astype
(
"
int64
"
)
self
.
rois
=
np
.
array
(
rois
).
astype
(
"
float32
"
)
def
setUp
(
self
):
def
setUp
(
self
):
self
.
op_type
=
"roi_pool"
self
.
op_type
=
"roi_pool"
...
...
python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
浏览文件 @
459d4cc8
...
@@ -18,12 +18,17 @@ import unittest
...
@@ -18,12 +18,17 @@ import unittest
import
numpy
as
np
import
numpy
as
np
import
paddle.fluid.core
as
core
import
paddle.fluid.core
as
core
from
op_test
import
OpTest
from
op_test
import
OpTest
from
test_anchor_generator_op
import
anchor_generator_in_python
from
test_generate_proposal_labels
import
_generate_groundtruth
from
test_generate_proposal_labels
import
_bbox_overlaps
,
_box_to_delta
def
rpn_target_assign
(
iou
,
rpn_batch_size_per_im
,
rpn_positive_overlap
,
def
rpn_target_assign
(
gt_anchor_iou
,
rpn_batch_size_per_im
,
rpn_negative_overlap
,
fg_fraction
):
rpn_
positive_overlap
,
rpn_
negative_overlap
,
fg_fraction
):
iou
=
np
.
transpose
(
iou
)
iou
=
np
.
transpose
(
gt_anchor_
iou
)
anchor_to_gt_max
=
iou
.
max
(
axis
=
1
)
anchor_to_gt_max
=
iou
.
max
(
axis
=
1
)
anchor_to_gt_argmax
=
iou
.
argmax
(
axis
=
1
)
gt_to_anchor_argmax
=
iou
.
argmax
(
axis
=
0
)
gt_to_anchor_argmax
=
iou
.
argmax
(
axis
=
0
)
gt_to_anchor_max
=
iou
[
gt_to_anchor_argmax
,
np
.
arange
(
iou
.
shape
[
1
])]
gt_to_anchor_max
=
iou
[
gt_to_anchor_argmax
,
np
.
arange
(
iou
.
shape
[
1
])]
anchors_with_max_overlap
=
np
.
where
(
iou
==
gt_to_anchor_max
)[
0
]
anchors_with_max_overlap
=
np
.
where
(
iou
==
gt_to_anchor_max
)[
0
]
...
@@ -42,59 +47,113 @@ def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap,
...
@@ -42,59 +47,113 @@ def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap,
num_bg
=
rpn_batch_size_per_im
-
np
.
sum
(
tgt_lbl
==
1
)
num_bg
=
rpn_batch_size_per_im
-
np
.
sum
(
tgt_lbl
==
1
)
bg_inds
=
np
.
where
(
anchor_to_gt_max
<
rpn_negative_overlap
)[
0
]
bg_inds
=
np
.
where
(
anchor_to_gt_max
<
rpn_negative_overlap
)[
0
]
tgt_lbl
[
bg_inds
]
=
0
if
len
(
bg_inds
)
>
num_bg
:
if
len
(
bg_inds
)
>
num_bg
:
enable_inds
=
bg_inds
[
np
.
random
.
randint
(
len
(
bg_inds
),
size
=
num_bg
)]
enable_inds
=
bg_inds
[
np
.
random
.
randint
(
len
(
bg_inds
),
size
=
num_bg
)]
tgt_lbl
[
enable_inds
]
=
0
tgt_lbl
[
enable_inds
]
=
0
bg_inds
=
np
.
where
(
tgt_lbl
==
0
)[
0
]
bg_inds
=
np
.
where
(
tgt_lbl
==
0
)[
0
]
tgt_lbl
[
bg_inds
]
=
0
loc_index
=
fg_inds
loc_index
=
fg_inds
score_index
=
np
.
hstack
((
fg_inds
,
bg_inds
))
score_index
=
np
.
hstack
((
fg_inds
,
bg_inds
))
tgt_lbl
=
np
.
expand_dims
(
tgt_lbl
,
axis
=
1
)
tgt_lbl
=
np
.
expand_dims
(
tgt_lbl
,
axis
=
1
)
return
loc_index
,
score_index
,
tgt_lbl
gt_inds
=
anchor_to_gt_argmax
[
fg_inds
]
return
loc_index
,
score_index
,
tgt_lbl
,
gt_inds
def
get_anchor
(
n
,
c
,
h
,
w
):
input_feat
=
np
.
random
.
random
((
n
,
c
,
h
,
w
)).
astype
(
'float32'
)
anchors
,
_
=
anchor_generator_in_python
(
input_feat
=
input_feat
,
anchor_sizes
=
[
32.
,
64.
],
aspect_ratios
=
[
0.5
,
1.0
],
variances
=
[
1.0
,
1.0
,
1.0
,
1.0
],
stride
=
[
16.0
,
16.0
],
offset
=
0.5
)
return
anchors
def
rpn_blob
(
anchor
,
gt_boxes
,
iou
,
lod
,
rpn_batch_size_per_im
,
rpn_positive_overlap
,
rpn_negative_overlap
,
fg_fraction
):
loc_indexes
=
[]
score_indexes
=
[]
tmp_tgt_labels
=
[]
tgt_bboxes
=
[]
anchor_num
=
anchor
.
shape
[
0
]
batch_size
=
len
(
lod
)
-
1
for
i
in
range
(
batch_size
):
b
,
e
=
lod
[
i
],
lod
[
i
+
1
]
iou_slice
=
iou
[
b
:
e
,
:]
bboxes_slice
=
gt_boxes
[
b
:
e
,
:]
loc_idx
,
score_idx
,
tgt_lbl
,
gt_inds
=
rpn_target_assign
(
iou_slice
,
rpn_batch_size_per_im
,
rpn_positive_overlap
,
rpn_negative_overlap
,
fg_fraction
)
fg_bboxes
=
bboxes_slice
[
gt_inds
]
fg_anchors
=
anchor
[
loc_idx
]
box_deltas
=
_box_to_delta
(
fg_anchors
,
fg_bboxes
,
[
1.
,
1.
,
1.
,
1.
])
if
i
==
0
:
loc_indexes
=
loc_idx
score_indexes
=
score_idx
tmp_tgt_labels
=
tgt_lbl
tgt_bboxes
=
box_deltas
else
:
loc_indexes
=
np
.
concatenate
(
[
loc_indexes
,
loc_idx
+
i
*
anchor_num
])
score_indexes
=
np
.
concatenate
(
[
score_indexes
,
score_idx
+
i
*
anchor_num
])
tmp_tgt_labels
=
np
.
concatenate
([
tmp_tgt_labels
,
tgt_lbl
])
tgt_bboxes
=
np
.
vstack
([
tgt_bboxes
,
box_deltas
])
tgt_labels
=
tmp_tgt_labels
[
score_indexes
]
return
loc_indexes
,
score_indexes
,
tgt_bboxes
,
tgt_labels
class
TestRpnTargetAssignOp
(
OpTest
):
class
TestRpnTargetAssignOp
(
OpTest
):
def
setUp
(
self
):
def
setUp
(
self
):
iou
=
np
.
random
.
random
((
10
,
8
)).
astype
(
"float32"
)
n
,
c
,
h
,
w
=
2
,
4
,
14
,
14
self
.
op_type
=
"rpn_target_assign"
anchor
=
get_anchor
(
n
,
c
,
h
,
w
)
self
.
inputs
=
{
'DistMat'
:
iou
}
gt_num
=
10
self
.
attrs
=
{
anchor
=
anchor
.
reshape
(
-
1
,
4
)
'rpn_batch_size_per_im'
:
256
,
anchor_num
=
anchor
.
shape
[
0
]
'rpn_positive_overlap'
:
0.95
,
'rpn_negative_overlap'
:
0.3
,
'fg_fraction'
:
0.25
,
'fix_seed'
:
True
}
loc_index
,
score_index
,
tgt_lbl
=
rpn_target_assign
(
iou
,
256
,
0.95
,
0.3
,
0.25
)
self
.
outputs
=
{
'LocationIndex'
:
loc_index
,
'ScoreIndex'
:
score_index
,
'TargetLabel'
:
tgt_lbl
,
}
def
test_check_output
(
self
):
im_shapes
=
[[
64
,
64
],
[
64
,
64
]]
self
.
check_output
()
gt_box
,
lod
=
_generate_groundtruth
(
im_shapes
,
3
,
4
)
bbox
=
np
.
vstack
([
v
[
'boxes'
]
for
v
in
gt_box
])
iou
=
_bbox_overlaps
(
bbox
,
anchor
)
anchor
=
anchor
.
astype
(
'float32'
)
bbox
=
bbox
.
astype
(
'float32'
)
iou
=
iou
.
astype
(
'float32'
)
loc_index
,
score_index
,
tgt_bbox
,
tgt_lbl
=
rpn_blob
(
anchor
,
bbox
,
iou
,
[
0
,
4
,
8
],
25600
,
0.95
,
0.03
,
0.25
)
class
TestRpnTargetAssignOp2
(
OpTest
):
def
setUp
(
self
):
iou
=
np
.
random
.
random
((
10
,
20
)).
astype
(
"float32"
)
self
.
op_type
=
"rpn_target_assign"
self
.
op_type
=
"rpn_target_assign"
self
.
inputs
=
{
'DistMat'
:
iou
}
self
.
inputs
=
{
'Anchor'
:
anchor
,
'GtBox'
:
(
bbox
,
[[
4
,
4
]]),
'DistMat'
:
(
iou
,
[[
4
,
4
]]),
}
self
.
attrs
=
{
self
.
attrs
=
{
'rpn_batch_size_per_im'
:
128
,
'rpn_batch_size_per_im'
:
25600
,
'rpn_positive_overlap'
:
0.5
,
'rpn_positive_overlap'
:
0.
9
5
,
'rpn_negative_overlap'
:
0.
5
,
'rpn_negative_overlap'
:
0.
03
,
'fg_fraction'
:
0.5
,
'fg_fraction'
:
0.
2
5
,
'fix_seed'
:
True
'fix_seed'
:
True
}
}
loc_index
,
score_index
,
tgt_lbl
=
rpn_target_assign
(
iou
,
128
,
0.5
,
0.5
,
0.5
)
self
.
outputs
=
{
self
.
outputs
=
{
'LocationIndex'
:
loc_index
,
'LocationIndex'
:
loc_index
.
astype
(
'int32'
),
'ScoreIndex'
:
score_index
,
'ScoreIndex'
:
score_index
.
astype
(
'int32'
),
'TargetLabel'
:
tgt_lbl
,
'TargetBBox'
:
tgt_bbox
.
astype
(
'float32'
),
'TargetLabel'
:
tgt_lbl
.
astype
(
'int64'
),
}
}
def
test_check_output
(
self
):
def
test_check_output
(
self
):
...
...
python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py
0 → 100644
浏览文件 @
459d4cc8
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
def
sequence_enumerate
(
input_seq
,
in_lod
,
win_size
,
pad_value
):
lod0
=
[
0
]
for
i
in
range
(
0
,
len
(
in_lod
[
0
])):
lod0
.
append
(
lod0
[
i
]
+
in_lod
[
0
][
i
])
out_seq
=
[]
for
i
in
range
(
0
,
len
(
lod0
)
-
1
):
for
idx
in
range
(
lod0
[
i
],
lod0
[
i
+
1
]):
single_seq
=
[]
for
word_idx
in
range
(
win_size
):
word_pos
=
idx
+
word_idx
dat
=
input_seq
[
word_pos
]
if
word_pos
<
lod0
[
i
+
1
]
\
else
pad_value
single_seq
.
append
(
dat
)
out_seq
.
append
(
single_seq
)
return
out_seq
class
TestSequenceEnumerateOp
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"sequence_enumerate"
self
.
init_test_case
()
self
.
inputs
=
{
'X'
:
(
self
.
in_seq
,
self
.
lod
)}
self
.
attrs
=
{
'win_size'
:
self
.
win_size
,
'pad_value'
:
self
.
pad_value
}
self
.
outputs
=
{
'Out'
:
(
self
.
out_seq
,
self
.
lod
)}
def
test_check_output
(
self
):
self
.
check_output
()
def
init_test_case
(
self
):
self
.
in_seq
=
np
.
random
.
randint
(
0
,
10
,
(
30
,
1
)).
astype
(
"int32"
)
self
.
lod
=
[[
9
,
4
,
11
,
6
]]
self
.
win_size
=
2
self
.
pad_value
=
0
out_seq
=
sequence_enumerate
(
self
.
in_seq
,
self
.
lod
,
self
.
win_size
,
self
.
pad_value
)
self
.
out_seq
=
np
.
array
(
out_seq
).
astype
(
"int32"
)
class
TesSequenceEnumerateOpInt64
(
TestSequenceEnumerateOp
):
def
init_test_case
(
self
):
self
.
in_seq
=
np
.
random
.
randint
(
0
,
10
,
(
30
,
1
)).
astype
(
"int64"
)
self
.
lod
=
[[
9
,
4
,
11
,
6
]]
self
.
win_size
=
2
self
.
pad_value
=
0
out_seq
=
sequence_enumerate
(
self
.
in_seq
,
self
.
lod
,
self
.
win_size
,
self
.
pad_value
)
self
.
out_seq
=
np
.
array
(
out_seq
).
astype
(
"int64"
)
class
TestSequenceEnumerateOpLargeWinSize
(
TestSequenceEnumerateOp
):
def
init_test_case
(
self
):
self
.
in_seq
=
np
.
random
.
randint
(
0
,
10
,
(
30
,
1
)).
astype
(
"int32"
)
self
.
lod
=
[[
9
,
4
,
11
,
6
]]
self
.
win_size
=
5
self
.
pad_value
=
0
out_seq
=
sequence_enumerate
(
self
.
in_seq
,
self
.
lod
,
self
.
win_size
,
self
.
pad_value
)
self
.
out_seq
=
np
.
array
(
out_seq
).
astype
(
"int32"
)
class
TestSequenceEnumerateOpMaxWinSize
(
TestSequenceEnumerateOp
):
def
init_test_case
(
self
):
self
.
in_seq
=
np
.
random
.
randint
(
0
,
10
,
(
30
,
1
)).
astype
(
"int32"
)
self
.
lod
=
[[
9
,
4
,
11
,
6
]]
self
.
win_size
=
30
self
.
pad_value
=
0
out_seq
=
sequence_enumerate
(
self
.
in_seq
,
self
.
lod
,
self
.
win_size
,
self
.
pad_value
)
self
.
out_seq
=
np
.
array
(
out_seq
).
astype
(
"int32"
)
class
TestSequenceEnumerateOpLargePadValue
(
TestSequenceEnumerateOp
):
def
init_test_case
(
self
):
self
.
in_seq
=
np
.
random
.
randint
(
0
,
10
,
(
30
,
1
)).
astype
(
"int32"
)
self
.
lod
=
[[
9
,
4
,
11
,
6
]]
self
.
win_size
=
5
self
.
pad_value
=
5
out_seq
=
sequence_enumerate
(
self
.
in_seq
,
self
.
lod
,
self
.
win_size
,
self
.
pad_value
)
self
.
out_seq
=
np
.
array
(
out_seq
).
astype
(
"int32"
)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
459d4cc8
...
@@ -1096,7 +1096,8 @@ class DistributeTranspiler(object):
...
@@ -1096,7 +1096,8 @@ class DistributeTranspiler(object):
self
.
table_name
]
self
.
table_name
]
zero_dim
=
int
(
zero_dim
=
int
(
math
.
ceil
(
origin_param_var
.
shape
[
0
]
/
len
(
self
.
pserver_endpoints
)))
math
.
ceil
(
origin_param_var
.
shape
[
0
]
/
float
(
len
(
self
.
pserver_endpoints
))))
table_shape
=
list
(
origin_param_var
.
shape
)
table_shape
=
list
(
origin_param_var
.
shape
)
table_shape
[
0
]
=
zero_dim
table_shape
[
0
]
=
zero_dim
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录