Commit d5a88b93 (s920243400/PaddleDetection, fork of PaddlePaddle/PaddleDetection)

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into overlap_memcpy_with_dist

Author: Yancey1989
Authored on: June 08, 2018
Parents: 23433def, 0aa9546e

86 changed files, with 2956 additions and 1041 deletions.
Changed files:

benchmark/fluid/Dockerfile  +1 -1
benchmark/fluid/README.md  +10 -0
benchmark/fluid/fluid_benchmark.py  +86 -28
benchmark/fluid/models/machine_translation.py  +3 -1
benchmark/fluid/models/mnist.py  +20 -4
benchmark/fluid/models/resnet.py  +44 -15
benchmark/fluid/models/stacked_dynamic_lstm.py  +4 -1
benchmark/fluid/models/vgg.py  +20 -4
benchmark/fluid/recordio_converter.py  +164 -0
cmake/configure.cmake  +3 -0
paddle/contrib/inference/CMakeLists.txt  +23 -17
paddle/contrib/inference/demo/simple_on_word2vec.cc  +55 -1
paddle/contrib/inference/paddle_inference_api.h  +0 -1
paddle/contrib/inference/paddle_inference_api_anakin_engine.cc  +43 -11
paddle/contrib/inference/paddle_inference_api_anakin_engine.h  +16 -6
paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc  +41 -3
paddle/contrib/inference/test_paddle_inference_api_impl.cc  +149 -2
paddle/fluid/framework/data_layout.h  +6 -1
paddle/fluid/framework/data_layout_transform.cc  +83 -0
paddle/fluid/framework/data_layout_transform.h  +45 -0
paddle/fluid/framework/data_transform.cc  +26 -3
paddle/fluid/framework/details/CMakeLists.txt  +2 -1
paddle/fluid/framework/details/fuse_vars_op_handle.cc  +51 -0
paddle/fluid/framework/details/fuse_vars_op_handle.h  +63 -0
paddle/fluid/framework/details/multi_devices_graph_builder.cc  +5 -6
paddle/fluid/framework/details/multi_devices_graph_builder.h  +1 -1
paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc  +19 -10
paddle/fluid/framework/details/op_handle_base.cc  +10 -0
paddle/fluid/framework/details/op_handle_base.h  +2 -0
paddle/fluid/framework/details/rpc_op_handle.cc  +4 -4
paddle/fluid/framework/details/rpc_op_handle.h  +2 -2
paddle/fluid/framework/details/ssa_graph_builder.h  +3 -1
paddle/fluid/framework/details/ssa_graph_printer.h  +1 -1
paddle/fluid/framework/op_kernel_type.h  +8 -1
paddle/fluid/framework/op_registry.h  +15 -8
paddle/fluid/framework/operator.cc  +57 -7
paddle/fluid/framework/operator.h  +2 -2
paddle/fluid/framework/parallel_executor.cc  +1 -5
paddle/fluid/framework/tensor.h  +26 -2
paddle/fluid/framework/tensor_test.cc  +1 -1
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt  +2 -5
paddle/fluid/inference/tensorrt/convert/activation_op.cc  +8 -2
paddle/fluid/inference/tensorrt/convert/test_activation_op.cc  +28 -87
paddle/fluid/operators/CMakeLists.txt  +0 -2
paddle/fluid/operators/activation_op.cc  +3 -1
paddle/fluid/operators/batch_norm_op.cc  +8 -5
paddle/fluid/operators/conv_op.cc  +11 -6
paddle/fluid/operators/fc_op.cc  +2 -2
paddle/fluid/operators/lrn_op.cc  +4 -3
paddle/fluid/operators/pool_mkldnn_op.cc  +7 -4
paddle/fluid/operators/pool_op.cc  +8 -4
paddle/fluid/operators/reduce_max_op.cc  +34 -0
paddle/fluid/operators/reduce_max_op.cu  +34 -0
paddle/fluid/operators/reduce_mean_op.cc  +35 -0
paddle/fluid/operators/reduce_mean_op.cu  +34 -0
paddle/fluid/operators/reduce_mean_op.h  +39 -0
paddle/fluid/operators/reduce_min_max_op.h  +50 -0
paddle/fluid/operators/reduce_min_op.cc  +34 -0
paddle/fluid/operators/reduce_min_op.cu  +34 -0
paddle/fluid/operators/reduce_op.cc  +0 -186
paddle/fluid/operators/reduce_op.cu  +0 -41
paddle/fluid/operators/reduce_op.h  +165 -187
paddle/fluid/operators/reduce_op_function.h  +109 -0
paddle/fluid/operators/reduce_prod_op.cc  +35 -0
paddle/fluid/operators/reduce_prod_op.cu  +34 -0
paddle/fluid/operators/reduce_prod_op.h  +39 -0
paddle/fluid/operators/reduce_sum_op.cc  +34 -0
paddle/fluid/operators/reduce_sum_op.cu  +34 -0
paddle/fluid/operators/reduce_sum_op.h  +39 -0
paddle/fluid/operators/reverse_op.cc  +107 -0
paddle/fluid/operators/reverse_op.cu  +24 -0
paddle/fluid/operators/reverse_op.h  +87 -0
paddle/fluid/operators/softmax_op.cc  +6 -3
paddle/fluid/platform/mkldnn_helper.h  +13 -0
paddle/scripts/paddle_build.sh  +1 -1
python/paddle/fluid/executor.py  +2 -0
python/paddle/fluid/framework.py  +16 -9
python/paddle/fluid/layers/io.py  +1 -1
python/paddle/fluid/layers/tensor.py  +34 -0
python/paddle/fluid/tests/unittests/benchmark.py  +113 -0
python/paddle/fluid/tests/unittests/benchmark_sum_op.py  +82 -0
python/paddle/fluid/tests/unittests/op_test.py  +124 -240
python/paddle/fluid/tests/unittests/test_elementwise_add_op.py  +20 -0
python/paddle/fluid/tests/unittests/test_lstm_op.py  +98 -101
python/paddle/fluid/tests/unittests/test_reverse_op.py  +67 -0
python/paddle/fluid/tests/unittests/testsuite.py  +182 -0
benchmark/fluid/Dockerfile

@@ -19,4 +19,4 @@ ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD fluid_benchmark.py dataset.py models/ /workspace/
+ADD fluid_benchmark.py recordio_converter.py models/ /workspace/
benchmark/fluid/README.md

@@ -44,6 +44,16 @@ Currently supported `--model` argument include:
 PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
 ```

+## Prepare the RecordIO file to Achieve Better Performance
+
+Run the following command will generate RecordIO files like "mnist.recordio" under the path
+and batch_size you choose, you can use batch_size=1 so that later reader can change the batch_size
+at any time using `fluid.batch`.
+
+```bash
+python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
+```
+
 ## Run Distributed Benchmark on Kubernetes Cluster

 You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will
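The prepare_* helpers referenced in that README hunk are introduced by this commit in benchmark/fluid/recordio_converter.py (shown further below). A minimal usage sketch for the other supported datasets, assuming an existing output directory "data" (the path is a placeholder, not part of the commit):

```python
# Sketch only: uses the prepare_* helpers added by this commit in
# benchmark/fluid/recordio_converter.py. "data" is a placeholder directory.
from recordio_converter import prepare_mnist, prepare_cifar10, prepare_flowers

prepare_mnist("data", 1)    # writes data/mnist.recordio
prepare_cifar10("data", 1)  # writes data/cifar.recordio
prepare_flowers("data", 1)  # writes data/flowers.recordio
```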
benchmark/fluid/fluid_benchmark.py

@@ -38,10 +38,12 @@ def parse_args():
         default='resnet',
         help='The model to run benchmark with.')
     parser.add_argument(
-        '--batch_size', type=int, default=32, help='The minibatch size.')
+        '--batch_size', type=int, default=32, help='The batch size on each gpu.')
     parser.add_argument(
         '--learning_rate', type=float, default=0.001, help='The learning rate.')
+    # TODO(wuyi): add "--use_fake_data" option back.
     parser.add_argument(
         '--skip_batch_num',
         type=int,

@@ -49,7 +51,10 @@ def parse_args():
         help='The first num of minibatch num to skip, for better performance test')
     parser.add_argument(
-        '--iterations', type=int, default=80, help='The number of minibatches.')
+        '--iterations',
+        type=int,
+        default=80,
+        help='The number of minibatches, set to -1 to run all batches.')
     parser.add_argument(
         '--pass_num', type=int, default=100, help='The number of passes.')
     parser.add_argument(

@@ -69,6 +74,7 @@ def parse_args():
         type=int,
         default=1,
         help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
+    # this option is available only for vgg and resnet.
     parser.add_argument(
         '--cpus',
         type=int,

@@ -78,7 +84,7 @@ def parse_args():
         '--data_set',
         type=str,
         default='flowers',
-        choices=['cifar10', 'flowers'],
+        choices=['cifar10', 'flowers', 'imagenet'],
         help='Optional dataset for benchmark.')
     parser.add_argument(
         '--infer_only', action='store_true', help='If set, run forward only.')

@@ -108,6 +114,16 @@ def parse_args():
         default='local',
         choices=['local', 'pserver', 'nccl2'],
         help='Choose parameter update method, can be local, pserver, nccl2.')
+    parser.add_argument(
+        '--use_reader_op',
+        action='store_true',
+        help='Whether to use reader op, and must specify the data path if set this to true.')
+    parser.add_argument(
+        '--data_path',
+        type=str,
+        default="",
+        help='Directory that contains all the training recordio files.')
     args = parser.parse_args()
     return args

@@ -210,26 +226,50 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
     place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
     exe = fluid.Executor(place)
     exe.run(startup_prog)
-    feed_var_list = [
-        var for var in train_prog.global_block().vars.itervalues()
-        if var.is_data
-    ]
-    feeder = fluid.DataFeeder(feed_var_list, place)
+    if not args.use_reader_op:
+        feed_var_list = [
+            var for var in train_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+        feeder = fluid.DataFeeder(feed_var_list, place)

     iters, num_samples, start_time = 0, 0, time.time()
     for pass_id in range(args.pass_num):
         train_losses = []
-        for batch_id, data in enumerate(train_reader()):
+        if not args.use_reader_op:
+            reader_generator = train_reader()
+        batch_id = 0
+        data = None
+        while True:
+            if not args.use_reader_op:
+                data = next(reader_generator, None)
+                if data == None:
+                    break
+            if iters == args.iterations:
+                break
             if iters == args.skip_batch_num:
                 start_time = time.time()
                 num_samples = 0
-            if iters == args.iterations:
-                break
-            loss = exe.run(train_prog,
-                           feed=feeder.feed(data),
-                           fetch_list=[avg_loss])
+            if args.use_reader_op:
+                try:
+                    loss = exe.run(train_prog, fetch_list=[avg_loss])
+                except fluid.core.EnforceNotMet as ex:
+                    break
+            else:
+                loss = exe.run(train_prog,
+                               feed=feeder.feed(data),
+                               fetch_list=[avg_loss])
             iters += 1
-            num_samples += len(data)
+            batch_id += 1
+            # FIXME(wuyi): For use_reader_op, if the current
+            # pass is not the last, the last batch of this pass
+            # is also equal to args.batch_size.
+            if args.use_reader_op:
+                num_samples += args.batch_size * args.gpus
+            else:
+                num_samples += len(data)
             train_losses.append(loss)
             print("Pass: %d, Iter: %d, Loss: %f\n" %
                   (pass_id, iters, np.mean(train_losses)))

@@ -250,10 +290,14 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
 def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                    batch_acc, args, train_prog, startup_prog, nccl_id_var,
                    num_trainers, trainer_id):
-    feed_var_list = [
-        var for var in train_prog.global_block().vars.itervalues()
-        if var.is_data
-    ]
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    if not args.use_reader_op:
+        feed_var_list = [
+            var for var in train_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+        feeder = fluid.DataFeeder(feed_var_list, place)
     # generate fake:
     if args.use_fake_data:
         for var in feed_var_list:

@@ -270,7 +314,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                 "value": 1.0,
                 "dtype": var.dtype})

-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
     if nccl_id_var and trainer_id == 0:
         #FIXME(wuyi): wait other trainer to start listening
         time.sleep(30)

@@ -287,12 +330,21 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
         num_trainers=num_trainers,
         trainer_id=trainer_id)

-    feeder = fluid.DataFeeder(feed_var_list, place)
     for pass_id in range(args.pass_num):
         num_samples = 0
         iters = 0
         start_time = time.time()
-        for batch_id, data in enumerate(train_reader()):
+        if not args.use_reader_op:
+            reader_generator = train_reader()
+        batch_id = 0
+        data = None
+        while True:
+            if not args.use_reader_op:
+                data = next(reader_generator, None)
+                if data == None:
+                    break
+            if iters == args.iterations:
+                break
             if args.profile and pass_id == 0 and batch_id == 5:
                 profiler.start_profiler("All")
             elif args.profile and pass_id == 0 and batch_id == 10:

@@ -301,19 +353,25 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
             if iters == args.skip_batch_num:
                 start_time = time.time()
                 num_samples = 0
-            if iters == args.iterations:
-                break
-            if args.use_fake_data:
-                loss, = exe.run([avg_loss.name])
+            if args.use_fake_data or args.use_reader_op:
+                try:
+                    loss, = exe.run([avg_loss.name])
+                except fluid.core.EnforceNotMet as ex:
+                    break
             else:
                 loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
             if args.update_method == "pserver":
                 exe.bcast_params()
-            num_samples += len(data)
+            if args.use_reader_op:
+                num_samples += args.batch_size * args.gpus
+            else:
+                num_samples += len(data)
             iters += 1
             if batch_id % 1 == 0:
                 print("Pass %d, batch %d, loss %s" %
                       (pass_id, batch_id, np.array(loss)))
+            batch_id += 1
         print_train_time(start_time, time.time(), num_samples)
         if not args.no_test and batch_acc:
             test_acc = test(startup_exe, infer_prog, test_reader, feeder,
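When --use_reader_op is set, the benchmark models build their input pipeline from RecordIO files instead of a Python feeder, and the loops above simply run until the reader op raises fluid.core.EnforceNotMet. A condensed sketch of that pattern, with placeholder shapes and paths (the helper name below is illustrative, not part of the commit):

```python
import os
import paddle.fluid as fluid


def reader_op_inputs(data_path, shape, batch_size, gpus, pass_num):
    # Open all RecordIO files under data_path and decode (image, label) pairs,
    # mirroring the use_reader_op branches added to the benchmark models.
    filelist = [os.path.join(data_path, f) for f in os.listdir(data_path)]
    data_file = fluid.layers.open_files(
        filenames=filelist,
        shapes=[[-1] + shape, (-1, 1)],
        lod_levels=[0, 0],
        dtypes=["float32", "int64"],
        thread_num=gpus,
        pass_num=pass_num)
    data_file = fluid.layers.double_buffer(
        fluid.layers.batch(data_file, batch_size=batch_size))
    # read_file yields the data and label variables consumed by the model.
    # Once the files are exhausted, running the program raises EnforceNotMet,
    # which the training loops above catch to end the pass.
    return fluid.layers.read_file(data_file)
```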
benchmark/fluid/models/machine_translation.py

@@ -197,6 +197,8 @@ def lodtensor_to_ndarray(lod_tensor):
 def get_model(args):
+    if args.use_reader_op:
+        raise Exception("machine_translation do not support reader op for now.")
     embedding_dim = 512
     encoder_size = 512
     decoder_size = 512

@@ -221,7 +223,7 @@ def get_model(args):
     train_batch_generator = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus)

     test_batch_generator = paddle.batch(
         paddle.reader.shuffle(
benchmark/fluid/models/mnist.py

@@ -20,6 +20,7 @@ import numpy as np
 import argparse
 import time
 import cProfile
+import os

 import paddle
 import paddle.fluid as fluid

@@ -65,9 +66,24 @@ def cnn_model(data):

 def get_model(args):
-    # Input data
-    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    if args.use_reader_op:
+        filelist = [
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1, 1, 28, 28], (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        images, label = fluid.layers.read_file(data_file)
+    else:
+        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')

     if args.device == 'CPU' and args.cpus > 1:
         places = fluid.layers.get_places(args.cpus)

@@ -103,7 +119,7 @@ def get_model(args):
     # Reader
     train_reader = paddle.batch(
-        paddle.dataset.mnist.train(), batch_size=args.batch_size)
+        paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
     test_reader = paddle.batch(
         paddle.dataset.mnist.test(), batch_size=args.batch_size)
     return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
benchmark/fluid/models/resnet.py

@@ -19,6 +19,7 @@ from __future__ import print_function
 import functools
 import numpy as np
 import time
+import os

 import cProfile, pstats, StringIO

@@ -26,6 +27,7 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler
+from recordio_converter import imagenet_train, imagenet_test


 def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):

@@ -122,16 +124,48 @@ def get_model(args):
         else:
             dshape = [32, 32, 3]
         model = resnet_cifar10
-    else:
+        train_reader = paddle.dataset.cifar.train10()
+        test_reader = paddle.dataset.cifar.test10()
+    elif args.data_set == "flowers":
         class_dim = 102
         if args.data_format == 'NCHW':
             dshape = [3, 224, 224]
         else:
             dshape = [224, 224, 3]
         model = resnet_imagenet
-
-    input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        train_reader = paddle.dataset.flowers.train()
+        test_reader = paddle.dataset.flowers.test()
+    elif args.data_set == "imagenet":
+        class_dim = 1000
+        if args.data_format == 'NCHW':
+            dshape = [3, 224, 224]
+        else:
+            dshape = [224, 224, 3]
+        model = resnet_imagenet
+        if not args.data_path:
+            raise Exception(
+                "Must specify --data_path when training with imagenet")
+        train_reader = imagenet_train(args.data_path)
+        test_reader = imagenet_test(args.data_path)
+
+    if args.use_reader_op:
+        filelist = [
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1] + dshape, (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        input, label = fluid.layers.read_file(data_file)
+    else:
+        input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')

     if args.device == 'CPU' and args.cpus > 1:
         places = fluid.layers.get_places(args.cpus)

@@ -162,15 +196,10 @@ def get_model(args):
     optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)

-    train_reader = paddle.batch(
+    batched_train_reader = paddle.batch(
         paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
-            buf_size=5120),
-        batch_size=args.batch_size)
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10()
-        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-        batch_size=args.batch_size)
+            train_reader, buf_size=5120),
+        batch_size=args.batch_size * args.gpus)
+    batched_test_reader = paddle.batch(train_reader, batch_size=args.batch_size)

-    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
+    return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc
benchmark/fluid/models/stacked_dynamic_lstm.py

@@ -44,6 +44,9 @@ def crop_sentence(reader, crop_size):
 def get_model(args):
+    if args.use_reader_op:
+        raise Exception(
+            "stacked_dynamic_lstm do not support reader op for now.")
     lstm_size = 512
     emb_dim = 512
     crop_size = 1500

@@ -114,7 +117,7 @@ def get_model(args):
     train_reader = batch(
         paddle.reader.shuffle(
             crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
-        batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus)
     test_reader = batch(
         paddle.reader.shuffle(
             crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
benchmark/fluid/models/vgg.py

@@ -22,6 +22,7 @@ import paddle.fluid as fluid
 import paddle.fluid.core as core
 import argparse
 import functools
+import os


 def vgg16_bn_drop(input):

@@ -65,9 +66,24 @@ def get_model(args):
     else:
         data_shape = [224, 224, 3]

-    # Input data
-    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    if args.use_reader_op:
+        filelist = [
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1] + data_shape, (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        images, label = fluid.layers.read_file(data_file)
+    else:
+        images = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')

     # Train program
     net = vgg16_bn_drop(images)

@@ -95,7 +111,7 @@ def get_model(args):
             paddle.dataset.cifar.train10()
             if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
             buf_size=5120),
-        batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus)
     test_reader = paddle.batch(
         paddle.dataset.cifar.test10()
         if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
benchmark/fluid/recordio_converter.py (new file, mode 100644)

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import random
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.dataset import mnist, cifar, flowers, image


def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data,
                       shape_label):
    num_batches = 0
    with fluid.program_guard(fluid.Program(), fluid.Program()):
        reader = paddle.batch(py_reader(), batch_size=batch_size)
        feeder = fluid.DataFeeder(
            feed_list=[  # order is image and label
                fluid.layers.data(
                    name='image', shape=shape_data),
                fluid.layers.data(
                    name='label', shape=shape_label, dtype='int64'),
            ],
            place=fluid.CPUPlace())
        num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
            outfilepath, reader, feeder)
    return num_batches


def prepare_mnist(outpath, batch_size):
    outfilepath = os.path.join(outpath, "mnist.recordio")
    convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1])


def prepare_cifar10(outpath, batch_size):
    outfilepath = os.path.join(outpath, "cifar.recordio")
    convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1])


def prepare_flowers(outpath, batch_size):
    outfilepath = os.path.join(outpath, "flowers.recordio")
    convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224],
                       [1])


def default_mapper(sample):
    img, label = sample
    img = image.simple_transform(
        img, 256, 224, True, mean=[103.94, 116.78, 123.68])
    return img.flatten().astype('float32'), label


def imagenet_train(data_dir):
    contents = os.listdir(data_dir)
    if set(contents) != set(
        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
        raise Exception("Imagenet data contents error!")
    img2label = dict()
    imgfilelist = []
    with open(os.path.join(data_dir, "train.txt")) as fn:
        while 1:
            l = fn.readline()
            if not l:
                break
            img, lbl = l[:-1].split(" ")
            img2label[img] = int(lbl)
            imgfilelist.append(img)
    # shuffle all, this is slow
    random.shuffle(imgfilelist)

    def train_reader():
        for idx, imgfile in enumerate(imgfilelist):
            data = image.load_image(
                os.path.join(data_dir, "train", imgfile.lower()))
            label = [img2label[imgfile], ]
            yield [data, label]

    return paddle.reader.map_readers(default_mapper, train_reader)


def imagenet_test(data_dir):
    contents = os.listdir(data_dir)
    if set(contents) != set(
        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
        raise Exception("Imagenet data contents error!")
    img2label = dict()
    imgfilelist = []
    with open(os.path.join(data_dir, "val.txt")) as fn:
        while 1:
            l = fn.readline()
            if not l:
                break
            img, lbl = l[:-1].split(" ")
            img2label[img] = int(lbl)
            imgfilelist.append(img)

    def test_reader():
        for idx, imgfile in enumerate(imgfilelist):
            base_path = os.path.join(data_dir, "val", imgfile.split(".")[0])
            image_path = ".".join([base_path, "jpeg"])
            data = image.load_image(image_path)
            label = [img2label[imgfile], ]
            yield [data, label]

    return paddle.reader.map_readers(default_mapper, test_reader)


# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged
def convert_reader_to_recordio_files(
        filename,
        batch_per_file,
        reader_creator,
        feeder,
        compressor=core.RecordIOWriter.Compressor.Snappy,
        max_num_records=1000,
        feed_order=None):
    if feed_order is None:
        feed_order = feeder.feed_names
    f_name, f_ext = os.path.splitext(filename)
    assert (f_ext == ".recordio")

    lines = []
    f_idx = 0
    counter = 0
    for idx, batch in enumerate(reader_creator()):
        lines.append(batch)
        if idx >= batch_per_file and idx % batch_per_file == 0:
            filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
            with fluid.recordio_writer.create_recordio_writer(
                    filename, compressor, max_num_records) as writer:
                for l in lines:
                    res = feeder.feed(l)
                    for each in feed_order:
                        writer.append_tensor(res[each])
                    writer.complete_append_tensor()
                    counter += 1
                lines = []
                f_idx += 1
            print("written file: ", filename)
    return counter


def prepare_imagenet(inpath, outpath, batch_size):
    r = paddle.batch(imagenet_train(inpath), batch_size=batch_size)
    feeder = fluid.DataFeeder(
        feed_list=[
            fluid.layers.data(
                name="image", shape=[3, 224, 224]),
            fluid.layers.data(
                name="label", shape=[1], dtype='int64')
        ],
        place=fluid.CPUPlace())
    outpath = os.path.join(outpath, "imagenet.recordio")
    convert_reader_to_recordio_files(outpath, 10000, r, feeder)
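A minimal usage sketch for the ImageNet converter above, assuming a raw ImageNet tree (train/, train.txt, val/, val_set, val.txt, unzip.sh) under a placeholder path:

```python
# Sketch only: the /data paths are placeholders, not part of the commit.
from recordio_converter import imagenet_train, prepare_imagenet

# Writes sharded files imagenet-00000.recordio, imagenet-00001.recordio, ...
# under /data/recordio, 10000 batches per shard
# (see convert_reader_to_recordio_files above).
prepare_imagenet("/data/imagenet", "/data/recordio", 1)

# The readers can also be used directly, without conversion:
reader = imagenet_train("/data/imagenet")
first_image, first_label = next(reader())  # flattened float32 image, label list
```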
cmake/configure.cmake

@@ -92,6 +92,9 @@ if(WITH_GPU)
     if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
       message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
     endif()
+    if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
+      message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
+    endif()
     include_directories(${TENSORRT_INCLUDE_DIR})
   endif()
 elseif(WITH_AMD_GPU)
paddle/contrib/inference/CMakeLists.txt

@@ -24,31 +24,37 @@ set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
 set(inference_deps paddle_inference_api paddle_fluid_api)

 # if anakin is set enable anakin api implementation
-if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY)
+if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
   set(ANAKIN_FOUND ON)
 else()
   set(ANAKIN_FOUND OFF)
 endif()

+function(fetch_include_recursively root_dir)
+  if (IS_DIRECTORY ${root_dir})
+    include_directories(${root_dir})
+  endif()
+
+  file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
+  foreach(sub ${ALL_SUB})
+    if (IS_DIRECTORY ${root_dir}/${sub})
+      fetch_include_recursively(${root_dir}/${sub})
+    endif()
+  endforeach()
+endfunction()
+
 if (ANAKIN_FOUND)
   # Anakin's code style doesn't follow google c style.
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=comment -Wno-error=reorder -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-error=cpp")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp")

   message(STATUS "Anakin for inference is enabled")
   message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")

-  include_directories("${ANAKIN_INCLUDE}")
-  # Anakin's source path is a mass, need to set sub-directories trivially.
-  include_directories("${ANAKIN_INCLUDE}/saber")
-  link_directories("${ANAKIN_LIBRARY}")
+  fetch_include_recursively(${ANAKIN_INCLUDE})
+  link_directories(${ANAKIN_LIBRARY})

-  nv_library(inference_anakin_api SRCS paddle_inference_api_anakin_engine.cc)
-  target_link_libraries(inference_anakin_api anakin)
+  nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
+  target_link_libraries(inference_anakin_api anakin anakin_saber_common)
   list(APPEND inference_deps inference_anakin_api)
 endif()

@@ -73,7 +79,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)

 cc_library(paddle_inference_api
-  SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
+    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
   DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})

 cc_test(test_paddle_inference_api

@@ -84,8 +90,8 @@ inference_api_test(test_paddle_inference_api_impl
     ARGS test_word2vec test_image_classification)

 if (ANAKIN_FOUND)
-  nv_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
-    DEPS ${inference_deps} protobuf)
+  cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
+    DEPS ${inference_deps})
 endif()

 if(WITH_TESTING)
paddle/contrib/inference/demo/simple_on_word2vec.cc

@@ -19,8 +19,8 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <memory>
+#include <thread>
 #include "paddle/contrib/inference/paddle_inference_api.h"

 namespace paddle {
 namespace demo {

@@ -61,13 +61,67 @@ void Main(bool use_gpu) {
     for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
       LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
     }
     // TODO(Superjomn): this is should be free automatically
     free(outputs[0].data.data);
   }
 }

+void MainThreads(int num_threads, bool use_gpu) {
+  // Multi-threads only support on CPU
+  // 0. Create PaddlePredictor with a config.
+  NativeConfig config;
+  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.use_gpu = use_gpu;
+  config.fraction_of_gpu_memory = 0.15;
+  config.device = 0;
+  auto main_predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_threads; ++tid) {
+    threads.emplace_back([&, tid]() {
+      // 1. clone a predictor which shares the same parameters
+      auto predictor = main_predictor->Clone();
+      constexpr int num_batches = 3;
+      for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
+        // 2. Dummy Input Data
+        int64_t data[4] = {1, 2, 3, 4};
+        PaddleBuf buf{.data = data, .length = sizeof(data)};
+        PaddleTensor tensor{.name = "",
+                            .shape = std::vector<int>({4, 1}),
+                            .data = buf,
+                            .dtype = PaddleDType::INT64};
+        std::vector<PaddleTensor> inputs(4, tensor);
+        std::vector<PaddleTensor> outputs;
+        // 3. Run
+        CHECK(predictor->Run(inputs, &outputs));
+        // 4. Get output.
+        ASSERT_EQ(outputs.size(), 1UL);
+        LOG(INFO) << "TID: " << tid << ", "
+                  << "output buffer size: " << outputs.front().data.length;
+        const size_t num_elements = outputs.front().data.length / sizeof(float);
+        // The outputs' buffers are in CPU memory.
+        for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+          LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
+        }
+        free(outputs[0].data.data);
+      }
+    });
+  }
+  for (int i = 0; i < num_threads; ++i) {
+    threads[i].join();
+  }
+}
+
 TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); }

 #ifdef PADDLE_WITH_CUDA
 TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); }
 #endif

 }  // namespace demo
paddle/contrib/inference/paddle_inference_api.h

@@ -113,5 +113,4 @@ struct AnakinConfig : public PaddlePredictor::Config {
 // Similarly, each engine kind should map to a unique predictor implementation.
 template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
-
 }  // namespace paddle
paddle/contrib/inference/paddle_inference_api_anakin_engine.cc

@@ -24,8 +24,16 @@ PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
 }

 bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
-  // TODO(Superjomn) Tell anakin to support return code.
-  engine_.Build(config.model_file, config.max_batch_size);
+  if (!(graph_.load(config.model_file))) {
+    return false;
+  }
+  graph_.ResetBatchSize("input_0", config.max_batch_size);
+  // optimization for graph
+  if (!(graph_.Optimize())) {
+    return false;
+  }
+  // construct executer
+  executor_.init(graph_);
   return true;
 }

@@ -38,24 +46,30 @@ bool PaddleInferenceAnakinPredictor::Run(
                  << "'s type is not float";
       return false;
     }
-    engine_.SetInputFromCPU(
-        input.name, static_cast<float *>(input.data.data), input.data.length);
+    auto d_tensor_in_p = executor_.get_in(input.name);
+    float *d_data_p = d_tensor_in_p->mutable_data();
+    if (cudaMemcpy(d_data_p,
+                   static_cast<float *>(input.data.data),
+                   d_tensor_in_p->valid_size() * sizeof(float),
+                   cudaMemcpyHostToDevice) != 0) {
+      LOG(ERROR) << "copy data from CPU to GPU error";
+      return false;
+    }
   }

-  // TODO(Superjomn) Tell anakin to support return code.
-  engine_.Execute();
+  executor_.prediction();

   if (output_data->empty()) {
     LOG(ERROR) << "At least one output should be set with tensors' names.";
     return false;
   }
   for (auto &output : *output_data) {
-    auto *tensor = engine_.GetOutputInGPU(output.name);
+    auto *tensor = executor_.get_out(output.name);
     output.shape = tensor->shape();
     // Copy data from GPU -> CPU
     if (cudaMemcpy(output.data.data,
-                   tensor->data(),
-                   tensor->size(),
+                   tensor->mutable_data(),
+                   tensor->valid_size() * sizeof(float),
                    cudaMemcpyDeviceToHost) != 0) {
       LOG(ERROR) << "copy data from GPU to CPU error";
       return false;

@@ -64,9 +78,26 @@ bool PaddleInferenceAnakinPredictor::Run(
   return true;
 }

-// TODO(Superjomn) To implement latter.
+anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+    &PaddleInferenceAnakinPredictor::get_executer() {
+  return executor_;
+}
+
+// the cloned new Predictor of anakin share the same net weights from original
+// Predictor
 std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
-  return nullptr;
+  VLOG(3) << "Anakin Predictor::clone";
+  std::unique_ptr<PaddlePredictor> cls(new PaddleInferenceAnakinPredictor());
+  // construct executer from other graph
+  auto anakin_predictor_p =
+      dynamic_cast<PaddleInferenceAnakinPredictor *>(cls.get());
+  if (!anakin_predictor_p) {
+    LOG(ERROR) << "fail to call Init";
+    return nullptr;
+  }
+  anakin_predictor_p->get_executer().init(graph_);
+
+  return std::move(cls);
 }

 // A factory to help create difference predictor.

@@ -74,6 +105,7 @@ template <>
 std::unique_ptr<PaddlePredictor>
 CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(
     const AnakinConfig &config) {
+  VLOG(3) << "Anakin Predictor create.";
   std::unique_ptr<PaddlePredictor> x(
       new PaddleInferenceAnakinPredictor(config));
   return x;
paddle/contrib/inference/paddle_inference_api_anakin_engine.h

@@ -20,32 +20,42 @@ limitations under the License. */
 #pragma once

 // NOTE This header file do not have namespace.
 // TODO(Superjomn) Tell Anakin to provide better APIs.
-#include <test/framework/net/paddle_api.h>
+//#include <test/framework/net/paddle_api.h>
 #include "paddle/contrib/inference/paddle_inference_api.h"

+#include "framework/core/net/net.h"
+#include "saber/saber_types.h"
+
 namespace paddle {

 class PaddleInferenceAnakinPredictor : public PaddlePredictor {
  public:
+  PaddleInferenceAnakinPredictor() {}
+
   PaddleInferenceAnakinPredictor(const AnakinConfig& config);

   // NOTE Unlike the native engine, the buffers of anakin engine's output_data
   // should be allocated first.
   // TODO(Superjomn) should unify all the behaviors of output_data accross all
   // the engines.
   bool Run(const std::vector<PaddleTensor>& inputs,
            std::vector<PaddleTensor>* output_data) override;

   std::unique_ptr<PaddlePredictor> Clone() override;

+  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
+  get_executer();
+
+  ~PaddleInferenceAnakinPredictor() override{};
+
  private:
   bool Init(const AnakinConfig& config);

-  anakin::AnakinEngine<anakin::NV,
-                       anakin::saber::AK_FLOAT,
-                       anakin::Precision::FP32>
-      engine_;
+  anakin::graph::Graph<anakin::NV,
+                       anakin::saber::AK_FLOAT,
+                       anakin::Precision::FP32>
+      graph_;
+  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+      executor_;
   AnakinConfig config_;
 };

 }  // namespace paddle
paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc

@@ -12,16 +12,54 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/contrib/inference/paddle_inference_api.h"
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "paddle/contrib/inference/paddle_inference_api.h"

 namespace paddle {

-TEST(inference, anakin) {
+AnakinConfig GetConfig() {
   AnakinConfig config;
+  config.model_file = "./mobilenet_v2.anakin.bin";
+  config.device = 0;
+  config.max_batch_size = 1;
+  return config;
+}

-  auto engine =
+TEST(inference, anakin) {
+  AnakinConfig config = GetConfig();
+  auto predictor =
       CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
+
+  float data[1 * 3 * 224 * 224] = {1.0f};
+  PaddleBuf buf{.data = data, .length = sizeof(data)};
+  PaddleTensor tensor{.name = "input_0",
+                      .shape = std::vector<int>({1, 3, 224, 224}),
+                      .data = buf,
+                      .dtype = PaddleDType::FLOAT32};
+
+  // For simplicity, we set all the slots with the same data.
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+
+  float data_out[1000];
+  PaddleBuf buf_out{.data = data_out, .length = sizeof(data)};
+  PaddleTensor tensor_out{.name = "prob_out",
+                          .shape = std::vector<int>({1000, 1}),
+                          .data = buf_out,
+                          .dtype = PaddleDType::FLOAT32};
+
+  std::vector<PaddleTensor> outputs(1, tensor_out);
+
+  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
+
+  float* data_o = static_cast<float*>(outputs[0].data.data);
+  for (size_t j = 0; j < 1000; ++j) {
+    LOG(INFO) << "output[" << j << "]: " << data_o[j];
+  }
 }

 }  // namespace paddle
paddle/contrib/inference/test_paddle_inference_api_impl.cc

@@ -15,6 +15,8 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <thread>
+
 #include "gflags/gflags.h"
 #include "paddle/contrib/inference/paddle_inference_api_impl.h"
 #include "paddle/fluid/inference/tests/test_helper.h"

@@ -45,14 +47,19 @@ NativeConfig GetConfig() {
   config.model_dir = FLAGS_dirname + "word2vec.inference.model";
   LOG(INFO) << "dirname  " << config.model_dir;
   config.fraction_of_gpu_memory = 0.15;
+#ifdef PADDLE_WITH_CUDA
   config.use_gpu = true;
+#else
+  config.use_gpu = false;
+#endif
   config.device = 0;
   return config;
 }

-TEST(paddle_inference_api_impl, word2vec) {
+void MainWord2Vec(bool use_gpu) {
   NativeConfig config = GetConfig();
   auto predictor = CreatePaddlePredictor<NativeConfig>(config);
+  config.use_gpu = use_gpu;

   framework::LoDTensor first_word, second_word, third_word, fourth_word;
   framework::LoD lod{{0, 1}};

@@ -100,11 +107,12 @@ TEST(paddle_inference_api_impl, word2vec) {
   free(outputs[0].data.data);
 }

-TEST(paddle_inference_api_impl, image_classification) {
+void MainImageClassification(bool use_gpu) {
   int batch_size = 2;
   bool use_mkldnn = false;
   bool repeat = false;
   NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
   config.model_dir =
       FLAGS_dirname + "image_classification_resnet.inference.model";

@@ -149,4 +157,143 @@ TEST(paddle_inference_api_impl, image_classification) {
   free(data);
 }

+void MainThreadsWord2Vec(bool use_gpu) {
+  NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
+  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
+
+  // prepare inputs data and reference results
+  constexpr int num_jobs = 3;
+  std::vector<std::vector<framework::LoDTensor>> jobs(num_jobs);
+  std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
+  std::vector<framework::LoDTensor> refs(num_jobs);
+  for (size_t i = 0; i < jobs.size(); ++i) {
+    // each job has 4 words
+    jobs[i].resize(4);
+    for (size_t j = 0; j < 4; ++j) {
+      framework::LoD lod{{0, 1}};
+      int64_t dict_size = 2073;  // The size of dictionary
+      SetupLoDTensor(&jobs[i][j], lod, static_cast<int64_t>(0), dict_size - 1);
+      paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i][j]));
+    }
+
+    // get reference result of each job
+    std::vector<paddle::framework::LoDTensor*> ref_feeds;
+    std::vector<paddle::framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    for (auto& word : jobs[i]) {
+      ref_feeds.push_back(&word);
+    }
+    TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
+  }
+
+  // create threads and each thread run 1 job
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    threads.emplace_back([&, tid]() {
+      auto predictor = main_predictor->Clone();
+      auto& local_inputs = paddle_tensor_feeds[tid];
+      std::vector<PaddleTensor> local_outputs;
+      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
+
+      // check outputs range
+      ASSERT_EQ(local_outputs.size(), 1UL);
+      const size_t len = local_outputs[0].data.length;
+      float* data = static_cast<float*>(local_outputs[0].data.data);
+      for (size_t j = 0; j < len / sizeof(float); ++j) {
+        ASSERT_LT(data[j], 1.0);
+        ASSERT_GT(data[j], -1.0);
+      }
+
+      // check outputs correctness
+      float* ref_data = refs[tid].data<float>();
+      EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
+      for (int i = 0; i < refs[tid].numel(); ++i) {
+        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+      }
+      free(data);
+    });
+  }
+  for (int i = 0; i < num_jobs; ++i) {
+    threads[i].join();
+  }
+}
+
+void MainThreadsImageClassification(bool use_gpu) {
+  constexpr int num_jobs = 4;  // each job run 1 batch
+  constexpr int batch_size = 1;
+  NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
+  config.model_dir =
+      FLAGS_dirname + "image_classification_resnet.inference.model";
+
+  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
+  std::vector<framework::LoDTensor> jobs(num_jobs);
+  std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
+  std::vector<framework::LoDTensor> refs(num_jobs);
+  for (size_t i = 0; i < jobs.size(); ++i) {
+    // prepare inputs
+    std::vector<std::vector<int64_t>> feed_target_shapes =
+        GetFeedTargetShapes(config.model_dir, /*is_combined*/ false);
+    feed_target_shapes[0][0] = batch_size;
+    framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
+    SetupTensor<float>(&jobs[i], input_dims, 0.f, 1.f);
+    paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i]));
+
+    // get reference result of each job
+    std::vector<framework::LoDTensor*> ref_feeds(1, &jobs[i]);
+    std::vector<framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
+  }
+
+  // create threads and each thread run 1 job
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    threads.emplace_back([&, tid]() {
+      auto predictor = main_predictor->Clone();
+      auto& local_inputs = paddle_tensor_feeds[tid];
+      std::vector<PaddleTensor> local_outputs;
+      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
+
+      // check outputs correctness
+      ASSERT_EQ(local_outputs.size(), 1UL);
+      const size_t len = local_outputs[0].data.length;
+      float* data = static_cast<float*>(local_outputs[0].data.data);
+      float* ref_data = refs[tid].data<float>();
+      EXPECT_EQ(refs[tid].numel(), len / sizeof(float));
+      for (int i = 0; i < refs[tid].numel(); ++i) {
+        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+      }
+      free(data);
+    });
+  }
+  for (int i = 0; i < num_jobs; ++i) {
+    threads[i].join();
+  }
+}
+
+TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); }
+TEST(inference_api_native, word2vec_cpu_threads) {
+  MainThreadsWord2Vec(false /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_cpu) {
+  MainThreadsImageClassification(false /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_cpu_threads) {
+  MainThreadsImageClassification(false /*use_gpu*/);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); }
+TEST(inference_api_native, word2vec_gpu_threads) {
+  MainThreadsWord2Vec(true /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_gpu) {
+  MainThreadsImageClassification(true /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_gpu_threads) {
+  MainThreadsImageClassification(true /*use_gpu*/);
+}
+#endif
+
 }  // namespace paddle
paddle/fluid/framework/data_layout.h

@@ -27,6 +27,7 @@ enum class DataLayout {
   kNHWC = 0,
   kNCHW = 1,
   kAnyLayout = 2,
+  kMKLDNN = 3,  // all layouts supported by MKLDNN internally
 };

 inline DataLayout StringToDataLayout(const std::string& str) {

@@ -41,6 +42,8 @@ inline DataLayout StringToDataLayout(const std::string& str) {
     return DataLayout::kNCHW;
   } else if (s == "ANYLAYOUT") {
     return DataLayout::kAnyLayout;
+  } else if (s == "MKLDNNLAYOUT") {
+    return DataLayout::kMKLDNN;
   } else {
     PADDLE_THROW("Unknown storage order string: %s", s);
   }

@@ -54,8 +57,10 @@ inline std::string DataLayoutToString(const DataLayout& data_layout) {
       return "NCHW";
     case DataLayout::kAnyLayout:
       return "ANY_LAYOUT";
+    case DataLayout::kMKLDNN:
+      return "MKLDNNLAYOUT";
     default:
-      PADDLE_THROW("unknown DataLayou %d", data_layout);
+      PADDLE_THROW("unknown DataLayout %d", data_layout);
   }
 }
paddle/fluid/framework/data_layout_transform.cc

@@ -16,6 +16,9 @@
 #include <vector>

 #include "paddle/fluid/operators/math/math_function.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif

 namespace paddle {
 namespace framework {

@@ -88,5 +91,85 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var,
   out->set_layout(expected_kernel_type.data_layout_);
 }

+#ifdef PADDLE_WITH_MKLDNN
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+
+void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) {
+  switch (type) {
+    case mkldnn::memory::data_type::f32:
+      return platform::to_void_cast(tensor.data<float>());
+    case mkldnn::memory::data_type::s8:
+      return platform::to_void_cast(tensor.data<char>());
+    case mkldnn::memory::data_type::u8:
+      return platform::to_void_cast(tensor.data<unsigned char>());
+    case mkldnn::memory::data_type::s16:
+      return platform::to_void_cast(tensor.data<int16_t>());
+    case mkldnn::memory::data_type::s32:
+      return platform::to_void_cast(tensor.data<int32_t>());
+    default:
+      PADDLE_THROW("wrong mkldnn type provided");
+  }
+}
+#endif
+
+void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
+                               const OpKernelType& expected_kernel_type,
+                               const Tensor& in,
+                               Tensor* out) {
+  auto in_layout = kernel_type_for_var.data_layout_;
+  auto out_layout = expected_kernel_type.data_layout_;
+
+  PADDLE_ENFORCE(
+      in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN,
+      "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to "
+      "non-MKLDNN");
+
+#ifdef PADDLE_WITH_MKLDNN
+  PADDLE_ENFORCE(in.format() != memory::format::format_undef &&
+                     in.format() != memory::format::any,
+                 "Input tensor should have specified memory format");
+
+  // Set default as NCHW in case not specified
+  out_layout =
+      out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
+
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
+      pool.Get(expected_kernel_type.place_));
+  auto& cpu_engine = dev_ctx->GetEngine();
+
+  std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
+  std::vector<int> out_tz = in_tz;
+
+  memory::data_type in_type = ToMKLDNNDataType(in.type());
+  PADDLE_ENFORCE(in_type != memory::data_type::data_undef,
+                 "Input tensor type is not supported: ", in.type().name());
+  memory::data_type out_type = in_type;
+
+  memory::format in_format =
+      in_tz.size() == 2 ? memory::format::nc : in.format();
+  memory::format out_format =
+      out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout);
+
+  void* in_data = GetDataFromTensor(in, in_type);
+
+  // output tensor has the same dims as input. Reorder don't change dims
+  out->Resize(in.dims());
+
+  auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
+
+  auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
+  auto out_memory =
+      memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
+
+  platform::Reorder(in_memory, out_memory);
+
+  out->set_layout(out_layout);
+  // reset format since the out tensor will be feed to non-MKLDNN OPkernel
+  out->set_format(memory::format::format_undef);
+#endif
+}
+
 }  // namespace framework
 }  // namespace paddle
paddle/fluid/framework/data_layout_transform.h

@@ -14,6 +14,7 @@
 #pragma once

+#include <map>
 #include <vector>
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/tensor.h"

@@ -22,6 +23,50 @@
 namespace paddle {
 namespace framework {

+#ifdef PADDLE_WITH_MKLDNN
+using MKLDNNFormat = mkldnn::memory::format;
+using MKLDNNDataType = mkldnn::memory::data_type;
+
+inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) {
+  switch (layout) {
+    case DataLayout::kNHWC:
+      return MKLDNNFormat::nhwc;
+    case DataLayout::kNCHW:
+      return MKLDNNFormat::nchw;
+    default:
+      PADDLE_THROW("Fail to convert layout %s to MKLDNN format",
+                   DataLayoutToString(layout));
+  }
+}
+
+inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) {
+  switch (format) {
+    case MKLDNNFormat::nhwc:
+      return DataLayout::kNHWC;
+    case MKLDNNFormat::nchw:
+      return DataLayout::kNCHW;
+    default:
+      PADDLE_THROW("Fail to convert MKLDNN format to paddle layout");
+  }
+}
+
+inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
+  static const std::map<std::type_index, MKLDNNDataType> dict{
+      {std::type_index(typeid(float)), MKLDNNDataType::f32},  // NOLINT
+      {std::type_index(typeid(char)), MKLDNNDataType::s8},    // NOLINT
+      {std::type_index(typeid(unsigned char)), MKLDNNDataType::u8},
+      {std::type_index(typeid(int16_t)), MKLDNNDataType::s16},
+      {std::type_index(typeid(int32_t)), MKLDNNDataType::s32}};
+  auto iter = dict.find(type);
+  if (iter != dict.end()) return iter->second;
+  return MKLDNNDataType::data_undef;
+}
+#endif
+
+void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
+                               const OpKernelType& expected_kernel_type,
+                               const Tensor& in,
+                               Tensor* out);
+
 std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);

 void TransDataLayout(const OpKernelType& kernel_type_for_var,

paddle/fluid/framework/data_transform.cc
...
@@ -33,11 +33,34 @@ void DataTransform(const OpKernelType& expected_kernel_type,
  Tensor in;
  in.ShareDataWith(input_tensor);
  Tensor out;
+  DataLayout lin = kernel_type_for_var.data_layout_;
+  DataLayout lout = expected_kernel_type.data_layout_;

  // do layout transform
-  if (NeedTransformLayout(expected_kernel_type.data_layout_,
-                          kernel_type_for_var.data_layout_)) {
-    TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
+  if (NeedTransformLayout(lout, lin)) {
+    if (lin == DataLayout::kMKLDNN || lout == DataLayout::kMKLDNN) {
+      PADDLE_ENFORCE(
+          !(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN),
+          "No layout transform needed between two MKLDNN OPKernels");
+
+      if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN) {
+#ifdef PADDLE_WITH_MKLDNN
+        // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
+        // Just set layout/format. No real transform occur
+        out.ShareDataWith(input_tensor);
+        out.set_layout(DataLayout::kMKLDNN);
+        out.set_format(ToMKLDNNFormat(lin));
+#endif
+      } else {
+        // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
+        // Do transform via MKLDNN lib
+        TransDataLayoutFromMKLDNN(kernel_type_for_var, expected_kernel_type,
+                                  in, &out);
+      }
+    } else {
+      // Case3 - transfrom between Non-MKLDNN OPKernels
+      TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
+    }
    transformed = true;
    PassTensorData(&out, &in);
  }
...
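
The three branches above reduce to a small routing decision. The standalone sketch below only illustrates that decision; Layout and Route are illustrative stand-ins, not Paddle types, and the real transforms are elided.

// Standalone sketch of the layout-transform routing in DataTransform above.
#include <cassert>
#include <stdexcept>

enum class Layout { kAnyLayout, kNHWC, kNCHW, kMKLDNN };
enum class Route { kSetMKLDNNMeta, kFromMKLDNN, kPlainTransform };

Route PickRoute(Layout lin, Layout lout) {
  if (lin == Layout::kMKLDNN || lout == Layout::kMKLDNN) {
    if (lin == Layout::kMKLDNN && lout == Layout::kMKLDNN)
      throw std::logic_error("no transform needed between two MKLDNN kernels");
    // Case1: entering an MKLDNN kernel - only layout/format metadata is set.
    if (lin != Layout::kMKLDNN && lout == Layout::kMKLDNN)
      return Route::kSetMKLDNNMeta;
    // Case2: leaving an MKLDNN kernel - a real reorder via the MKLDNN lib.
    return Route::kFromMKLDNN;
  }
  // Case3: both kernels are plain - an ordinary axis transpose.
  return Route::kPlainTransform;
}

int main() {
  assert(PickRoute(Layout::kNCHW, Layout::kMKLDNN) == Route::kSetMKLDNNMeta);
  assert(PickRoute(Layout::kMKLDNN, Layout::kNCHW) == Route::kFromMKLDNN);
  assert(PickRoute(Layout::kNHWC, Layout::kNCHW) == Route::kPlainTransform);
}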

paddle/fluid/framework/details/CMakeLists.txt
...
@@ -13,7 +13,7 @@ cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_ro
if (WITH_GPU)
  nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-          dynload_cuda)
+          dynload_cuda variable_visitor)
  set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
  nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
  nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
...
@@ -25,6 +25,7 @@ else()
endif()

cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
+cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)

cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
           scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
...

paddle/fluid/framework/details/fuse_vars_op_handle.cc
0 → 100644
//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/details/fuse_vars_op_handle.h"

namespace paddle {
namespace framework {
namespace details {

void FuseVarsOpHandle::RunImpl() {
  WaitInputVarGenerated(place_);

  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
  PADDLE_ENFORCE_EQ(in_var_handles.size(), 0);
  PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");

  auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();

  auto out_var_handle = out_var_handles[0];
  auto out_var = scope->Var(out_var_handle->name_);

  auto out_tensor = out_var->GetMutable<LoDTensor>();
  out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);

  int64_t s = 0;
  for (size_t i = 1; i < out_var_handles.size(); ++i) {
    auto out_name = out_var_handles[i]->name_;
    auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
    auto numel = this->inputs_numel_.at(out_name);
    out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
    s += numel;
  }
  this->RunAndRecordEvent([this] {});
}

std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
}  // namespace details
}  // namespace framework
}  // namespace paddle

paddle/fluid/framework/details/fuse_vars_op_handle.h
0 → 100644
//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <map>
#include <string>
#include <vector>

#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"

namespace paddle {
namespace framework {
namespace details {

struct FuseVarsOpHandle : public OpHandleBase {
 public:
  FuseVarsOpHandle(Scope *local_scope, const platform::Place &place,
                   const std::unordered_map<std::string, int64_t> &inputs_numel,
                   const std::type_index &var_type)
      : local_scope_(local_scope),
        place_(place),
        inputs_numel_(inputs_numel),
        type_(var_type) {
    total_numel_ = 0;
    for (auto in_numel : inputs_numel) {
      PADDLE_ENFORCE_GT(in_numel.second, 0);
      total_numel_ += in_numel.second;
    }
  }

  std::string Name() const override;

  bool IsMultiDeviceTransfer() override { return false; };

 protected:
  void RunImpl() override;

 private:
  Scope *local_scope_;
  const platform::Place place_;
  const std::unordered_map<std::string, int64_t> inputs_numel_;
  const std::type_index type_;
  int64_t total_numel_;
};
}  // namespace details
}  // namespace framework
}  // namespace paddle
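
The bookkeeping done by RunImpl above can be illustrated without the framework: every fused variable becomes a view over a contiguous slice [s, s + numel) of one large buffer. The sketch below is standalone; the variable names (w0, w1, w2) and the float buffer are hypothetical stand-ins for LoDTensor slices.

// Standalone sketch of the offset bookkeeping in FuseVarsOpHandle::RunImpl.
#include <cassert>
#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Hypothetical element counts per variable (inputs_numel_ in the handle).
  std::map<std::string, int64_t> inputs_numel{{"w0", 4}, {"w1", 6}, {"w2", 2}};

  int64_t total = 0;
  for (const auto& kv : inputs_numel) total += kv.second;
  std::vector<float> fused(total);  // plays the role of the fused tensor

  // Hand out [s, s + numel) slices over the fused buffer; RunImpl walks the
  // output handles and looks each element count up by name.
  std::map<std::string, std::pair<int64_t, int64_t>> slices;
  int64_t s = 0;
  for (const auto& kv : inputs_numel) {
    slices[kv.first] = {s, s + kv.second};
    s += kv.second;
  }
  assert(s == total);
  assert(slices["w1"].first == 4 && slices["w1"].second == 10);
}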

paddle/fluid/framework/details/multi_devices_graph_builder.cc
...
@@ -230,7 +230,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
      if (op->Type() == "concat") {
        auto got = remote_vars_devices_.find(op->InputArgumentNames()[0]);
        PADDLE_ENFORCE(got != remote_vars_devices_.end(),
-                       "can not find right place to concat received var.");
+                       "can not find right place to concatenate received var.");
        CreateDistTrainOp(&result, *op, got->second);
      } else {
        CreateDistTrainOp(&result, *op, 0);
...
@@ -503,10 +503,9 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
}

void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, const OpDesc &op,
-                                          int place_id) const {
-  auto &p = places_[place_id];
-  auto *s = local_scopes_[place_id];
-  result->ops_.emplace_back(new RPCOpHandle(op, s, p, op.Type()));
+                                          int device_id) const {
+  result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[device_id],
+                                            op.Type(), places_[device_id]));

  if (op.Type() == "send_barrier") {
    ConnectOp(result, result->ops_.back().get(), "send_vars");
...
@@ -524,7 +523,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, const OpDesc &op,
  // TODO(Yancey1989): schedule rpc op on different place may
  // increate throughput
-  CreateOpHandleIOs(result, op, place_id);
+  CreateOpHandleIOs(result, op, device_id);
}

bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
...

paddle/fluid/framework/details/multi_devices_graph_builder.h
...
@@ -58,7 +58,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 private:
  void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
-                        size_t place_id) const;
+                        size_t device_id) const;

 private:
  std::string loss_var_name_;
...

paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
...
@@ -11,10 +11,12 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

+#include <algorithm>
+
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
-#include <algorithm>
#include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"

namespace paddle {
namespace framework {
...
@@ -30,27 +32,34 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
}

void NCCLAllReduceOpHandle::RunImpl() {
-  if (inputs_.size() == 1) {
+  if (NoDummyInputSize() == 1) {
    return;  // No need to all reduce when GPU count = 1;
  } else {
    // Wait input done
    WaitInputVarGenerated();
-    auto &var_name = static_cast<VarHandle *>(this->inputs_[0])->name_;
-    int dtype = -1;
-    size_t numel = 0;
+    auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+    auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+    PADDLE_ENFORCE_EQ(
+        in_var_handles.size(), places_.size(),
+        "The NoDummyInputSize should be equal to the number of places.");
+    PADDLE_ENFORCE_EQ(
+        in_var_handles.size(), out_var_handles.size(),
+        "The NoDummyInputSize and NoDummyOutputSize should be equal.");

    std::vector<const LoDTensor *> lod_tensors;
    for (size_t i = 0; i < local_scopes_.size(); ++i) {
      auto *s = local_scopes_[i];
      auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
-      auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
+      auto &lod_tensor =
+          local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();
      lod_tensors.emplace_back(&lod_tensor);
+      PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
+                        "The name of input and output should be equal.");
    }

    if (platform::is_gpu_place(lod_tensors[0]->place())) {
+      int dtype = -1;
+      size_t numel = 0;
      std::vector<std::function<void()>> all_reduce_calls;
      for (size_t i = 0; i < local_scopes_.size(); ++i) {
        auto &p = places_[i];
...
@@ -96,7 +105,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
        auto &scope =
            *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
        auto &p = places_[i];
-        auto *var = scope.FindVar(var_name);
+        auto *var = scope.FindVar(in_var_handles[i]->name_);
        auto *dev_ctx = dev_ctxes_[p];

        RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
...

paddle/fluid/framework/details/op_handle_base.cc
...
@@ -104,6 +104,16 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
  }
}

+size_t OpHandleBase::NoDummyInputSize() const {
+  size_t cnt = 0;
+  for (auto *in : inputs_) {
+    if (dynamic_cast<DummyVarHandle *>(in) == nullptr) {
+      ++cnt;
+    }
+  }
+  return cnt;
+}
+
bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
  return in_var && in_var->generated_op_;
}
...

paddle/fluid/framework/details/op_handle_base.h
...
@@ -80,6 +80,8 @@ class OpHandleBase {
  const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }

+  size_t NoDummyInputSize() const;
+
 protected:
  void RunAndRecordEvent(const std::function<void()> &callback);
...

paddle/fluid/framework/details/rpc_op_handle.cc
...
@@ -19,12 +19,12 @@ namespace framework {
namespace details {

RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc,
-                         const Scope *local_scope, const platform::Place &place,
-                         const std::string &name)
+                         const Scope *local_scope, const std::string &name,
+                         const platform::Place &place)
    : op_(framework::OpRegistry::CreateOp(op_desc)),
      local_scope_(local_scope),
-      place_(place),
-      name_(name) {}
+      name_(name),
+      place_(place) {}

void RPCOpHandle::RunImpl() {
  // TODO(wuyi): need further analysis whether wait VarDummyHandle.
...

paddle/fluid/framework/details/rpc_op_handle.h
...
@@ -29,7 +29,7 @@ namespace details {
struct RPCOpHandle : public OpHandleBase {
  RPCOpHandle(const framework::OpDesc &op_desc, const Scope *local_scope,
-              const platform::Place &place, const std::string &name);
+              const std::string &name, const platform::Place &place);

  std::string Name() const override;
...
@@ -43,8 +43,8 @@ struct RPCOpHandle : public OpHandleBase {
 private:
  std::unique_ptr<OperatorBase> op_;
  const Scope *local_scope_;
-  const platform::Place &place_;
  const std::string name_;
+  platform::Place place_;
};

}  // namespace details
...

paddle/fluid/framework/details/ssa_graph_builder.h
...
@@ -30,7 +30,9 @@ class SSAGraphBuilder {
  SSAGraphBuilder() {}
  virtual ~SSAGraphBuilder() {}
  virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
-  virtual int GetRemoteVarDeviceId(const std::string &var_name) const = 0;
+  virtual int GetRemoteVarDeviceId(const std::string &var_name) const {
+    return -1;
+  }

  DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
...

paddle/fluid/framework/details/ssa_graph_printer.h
...
@@ -20,7 +20,7 @@
namespace paddle {
namespace framework {
namespace details {
-class SSAGraph;
+struct SSAGraph;

class SSAGraphPrinter {
 public:
  virtual ~SSAGraphPrinter() {}
...

paddle/fluid/framework/op_kernel_type.h
...
@@ -87,7 +87,14 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
}

inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
-  return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r;
+  bool ret =
+      (l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r);
+#ifdef PADDLE_WITH_MKLDNN
+  // Layout transform needed for either non-MKLDNN to MKLDNN or vice versa
+  ret |= (l != DataLayout::kMKLDNN && r == DataLayout::kMKLDNN);
+  ret |= (l == DataLayout::kMKLDNN && r != DataLayout::kMKLDNN);
+#endif
+  return ret;
}

inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) {
...
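
The extended check is pure boolean logic, so a standalone sketch can exercise it without the framework. Layout below is an illustrative stand-in for DataLayout; the sketch mirrors the hunk above and is not Paddle's header.

// Standalone sketch of the extended NeedTransformLayout logic above.
#include <cassert>

enum class Layout { kAnyLayout, kNHWC, kNCHW, kMKLDNN };

bool NeedTransformLayout(Layout l, Layout r) {
  bool ret = (l != Layout::kAnyLayout && r != Layout::kAnyLayout && l != r);
  // With MKLDNN enabled, crossing the MKLDNN boundary always needs a transform.
  ret |= (l != Layout::kMKLDNN && r == Layout::kMKLDNN);
  ret |= (l == Layout::kMKLDNN && r != Layout::kMKLDNN);
  return ret;
}

int main() {
  assert(!NeedTransformLayout(Layout::kNCHW, Layout::kNCHW));
  assert(NeedTransformLayout(Layout::kNCHW, Layout::kNHWC));
  // kAnyLayout normally suppresses the transform, but not across MKLDNN.
  assert(NeedTransformLayout(Layout::kAnyLayout, Layout::kMKLDNN));
}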

paddle/fluid/framework/op_registry.h
...
@@ -83,8 +83,14 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
  void operator()(const char* op_type, const char* library_type) const {
    using T = typename KERNEL_TYPE::ELEMENT_TYPE;
+    std::string library(library_type);
+    std::string data_layout = "ANYLAYOUT";
+    if (library == "MKLDNN") {
+      data_layout = "MKLDNNLAYOUT";
+    }
    OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
-                     DataLayout::kAnyLayout, StringToLibraryType(library_type));
+                     StringToDataLayout(data_layout),
+                     StringToLibraryType(library_type));
    OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);

    constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
...
@@ -99,7 +105,8 @@ struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
  void operator()(const char* op_type, const char* library_type) const {}
};

-// User can register many kernel in one place. The data type could be different.
+// User can register many kernel in one place. The data type could be
+// different.
template <typename PlaceType, typename... KernelType>
class OpKernelRegistrar : public Registrar {
 public:
...
@@ -149,15 +156,15 @@ class OpKernelRegistrar : public Registrar {
/**
 * Macro to register OperatorKernel.
 */
-#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...)        \
+#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...)        \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                                           \
-      __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__,                       \
+      __reg_op_kernel_##op_type##_##library_type##__,                       \
      "REGISTER_OP_KERNEL must be called in global namespace");             \
  static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__>   \
-      __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type,        \
-                                                           #LIBRARY_TYPE);  \
-  int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() {                 \
-    __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch();           \
+      __op_kernel_registrar_##op_type##_##library_type##__(#op_type,        \
+                                                           #library_type);  \
+  int TouchOpKernelRegistrar_##op_type##_##library_type() {                 \
+    __op_kernel_registrar_##op_type##_##library_type##__.Touch();           \
    return 0;                                                               \
  }
...
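
The functional change in the first hunk is small: kernels registered under the MKLDNN library now carry an MKLDNN layout key instead of the generic "any layout" key. The standalone sketch below mirrors only that string-level choice; LayoutForLibrary is an illustrative helper, not a Paddle function.

// Standalone sketch of the registration-time layout choice above.
#include <cassert>
#include <string>

std::string LayoutForLibrary(const std::string& library) {
  std::string data_layout = "ANYLAYOUT";
  if (library == "MKLDNN") data_layout = "MKLDNNLAYOUT";
  return data_layout;
}

int main() {
  assert(LayoutForLibrary("PLAIN") == "ANYLAYOUT");
  assert(LayoutForLibrary("MKLDNN") == "MKLDNNLAYOUT");
}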

paddle/fluid/framework/operator.cc
...
@@ -293,6 +293,38 @@ static Tensor* GetMutableTensorFromVar(Variable* var) {
  }
}

+bool ExecutionContext::HasInput(const std::string& name) const {
+  if (!op_.HasInputs(name)) {
+    return false;
+  }
+  auto& ins = Inputs(name);
+  size_t length = ins.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Input %s should not have more than one inputs", name);
+  auto arg = ins[0];
+  auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
+  return var != nullptr;
+}
+
+bool ExecutionContext::HasOutput(const std::string& name) const {
+  if (!op_.HasOutputs(name)) {
+    return false;
+  }
+  auto& outs = Outputs(name);
+  size_t length = outs.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Output %s should not have more than one inputs", name);
+  auto arg = outs[0];
+  auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
+  return var != nullptr;
+}
+
template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
  auto* var = InputVar(name);
...
@@ -444,10 +476,25 @@ class RuntimeInferShapeContext : public InferShapeContext {
    auto* out_tensor = out_var->GetMutable<LoDTensor>();
    out_tensor->set_lod(in_tensor.lod());
-    // TODO(dzhwinter) : reuse ShareLoD in most operators.
-    // Need to call ShareLayout explicitly in sequence related ops.
-    // Shall we have a better method to shared info between in/out Tensor?
-    out_tensor->set_layout(in_tensor.layout());
+
+    // TODO(dzhwinter) : reuse ShareLoD in most operators.
+    // Need to call ShareLayout explicitly in sequence related ops.
+    // Shall we have a better method to shared info between in/out Tensor?
+#ifdef PADDLE_WITH_MKLDNN
+    // Fix me: ugly workaround below
+    // Correct solution:
+    //   set_layout() should NOT be called here (i.e. ShareLoD). Instead,
+    //   layout of output tensor should be set "manually" in Compute()
+    //   of each OPKernel. The reason layout should NOT be shared between
+    //   input and output "automatically" (now by InferShape()->ShareLoD())
+    //   is that layout transform may occur after InferShape().
+    // Workaround:
+    //   Skip set_layout() when input layout is kMKLDNN
+    //   This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN
+    //   OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
+    //   in Compute()
+    if (in_tensor.layout() != DataLayout::kMKLDNN)
+#endif
+      out_tensor->set_layout(in_tensor.layout());
  }

  void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
...
@@ -646,8 +693,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
    }
    if (t != nullptr) {
      int tmp = static_cast<int>(ToDataType(t->type()));
-      PADDLE_ENFORCE(tmp == data_type || data_type == -1,
-                     "DataType of Paddle Op %s must be the same.", Type());
+      PADDLE_ENFORCE(
+          tmp == data_type || data_type == -1,
+          "DataType of Paddle Op %s must be the same. Get %d != %d", Type(),
+          data_type, tmp);
      data_type = tmp;
    }
  }
...
@@ -665,7 +714,8 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType(
OpKernelType OperatorWithKernel::GetKernelTypeForVar(
    const std::string& var_name, const Tensor& tensor,
    const OpKernelType& expected_kernel_type) const {
-  return OpKernelType(expected_kernel_type.data_type_, tensor.place());
+  return OpKernelType(expected_kernel_type.data_type_, tensor.place(),
+                      tensor.layout());
}

}  // namespace framework
...
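
The strengthened HasInput/HasOutput checks above require the name to be declared, carry exactly one argument, and resolve to a live variable. The standalone sketch below mirrors that decision with plain containers; the sentinel value and the sample variable names are hypothetical, not Paddle's.

// Standalone sketch of the strengthened HasInput check above.
#include <cassert>
#include <map>
#include <string>
#include <vector>

const char kEmptyVarName[] = "@EMPTY@";  // illustrative sentinel for this sketch

bool HasInput(const std::map<std::string, std::vector<std::string>>& inputs,
              const std::map<std::string, int>& scope, const std::string& name) {
  auto it = inputs.find(name);
  if (it == inputs.end() || it->second.empty()) return false;
  assert(it->second.size() == 1u);  // "should not have more than one input"
  const std::string& arg = it->second[0];
  return arg != kEmptyVarName && scope.count(arg) > 0;
}

int main() {
  std::map<std::string, std::vector<std::string>> ins{{"X", {"x0"}}, {"Bias", {}}};
  std::map<std::string, int> scope{{"x0", 1}};
  assert(HasInput(ins, scope, "X"));
  assert(!HasInput(ins, scope, "Bias"));   // declared but empty
  assert(!HasInput(ins, scope, "Scale"));  // not declared at all
}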

paddle/fluid/framework/operator.h
...
@@ -191,9 +191,9 @@ class ExecutionContext {
    return op_.Attr<T>(name);
  }

-  bool HasInput(const std::string& name) const { return op_.HasInputs(name); }
+  bool HasInput(const std::string& name) const;

-  bool HasOutput(const std::string& name) const {
-    return op_.HasOutputs(name);
-  }
+  bool HasOutput(const std::string& name) const;

  size_t InputSize(const std::string& name) const {
    return op_.Inputs(name).size();
...

paddle/fluid/framework/parallel_executor.cc
...
@@ -111,11 +111,7 @@ ParallelExecutor::ParallelExecutor(
#ifdef PADDLE_WITH_CUDA
  builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
#endif
-  builder_.reset(builder_factory.Create().get());
-  if (builder_.get() == nullptr) {
-    VLOG(3) << "builder is null.";
-  }
+  builder_ = std::move(builder_factory.Create());
  member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, places,
      builder_->Build(main_program)));
...

paddle/fluid/framework/tensor.h
...
@@ -34,6 +34,28 @@ namespace framework {
class LoDTensor;

class Tensor {
+#ifdef PADDLE_WITH_MKLDNN
+
+ public:
+  inline mkldnn::memory::format format() const { return format_; }
+
+  inline void set_format(const mkldnn::memory::format format) {
+    format_ = format;
+  }
+
+ protected:
+  /**
+   * @brief the detail format of memory block which have layout as kMKLDNN
+   *
+   * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
+   *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
+   *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
+   *       this field.
+   */
+  mkldnn::memory::format format_ = mkldnn::memory::format::format_undef;
+#endif
+
 public:
  template <typename T, size_t D, int MajorType, typename IndexType>
  friend struct EigenTensor;
...
@@ -195,8 +217,10 @@ class Tensor {
   * N,C,H,W for respectively the batch size, the number of
   * feature maps, the height.
   */
-  DataLayout layout_ = DataLayout::kNHWC;
+  // Fix me: here just change the default layout to kNCHW
+  // it doesn't fix the real issue, i.e. feeder should set up tensor layout
+  // according to actual input data
+  DataLayout layout_ = DataLayout::kNCHW;

  /**
   * @brief A PlaceHolder may be shared by more than one tensor.
...
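
The new fields can be pictured as two pieces of metadata that travel with a tensor: a logical layout (now defaulting to NCHW) and, for MKLDNN data only, the concrete memory format. The sketch below is a standalone illustration with stand-in enums, not the Tensor class itself.

// Standalone sketch of the tensor metadata added above.
#include <cassert>

enum class Layout { kNHWC, kNCHW, kAnyLayout, kMKLDNN };
enum class MemFormat { format_undef, nchw, nChw8c, nChw16c };

struct TensorMeta {
  Layout layout = Layout::kNCHW;               // new default in this change
  MemFormat format = MemFormat::format_undef;  // only meaningful when kMKLDNN
};

int main() {
  TensorMeta t;
  assert(t.layout == Layout::kNCHW);
  // An MKLDNN kernel sets both fields when it writes a blocked-format output.
  t.layout = Layout::kMKLDNN;
  t.format = MemFormat::nChw16c;
  assert(t.format != MemFormat::format_undef);
}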

paddle/fluid/framework/tensor_test.cc
...
@@ -209,7 +209,7 @@ TEST(Tensor, ReshapeToMatrix) {
TEST(Tensor, Layout) {
  framework::Tensor src;
-  ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC);
+  ASSERT_EQ(src.layout(), framework::DataLayout::kNCHW);
  src.set_layout(framework::DataLayout::kAnyLayout);
  ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
}

paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
# Add TRT tests
-# This test is not stable
-# See https://paddleci.ngrok.io/viewLog.html?tab=buildLog&buildTypeId=Paddle_PrCi2&buildId=36834&_focus=8828
-#nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
-#        DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine
-#        SERIAL)
nv_library(tensorrt_converter
  SRCS mul_op.cc conv2d_op.cc fc_op.cc
  DEPS tensorrt_engine mul_op)
...
@@ -16,3 +11,5 @@ nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)

paddle/fluid/inference/tensorrt/convert/activation_op.cc
...
@@ -22,7 +22,8 @@ namespace tensorrt {
class ReluOpConverter : public OpConverter {
 public:
  ReluOpConverter() {}
-  void operator()(const framework::proto::OpDesc& op) override {
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
    // Here the two nullptr looks strange, that's because the
    // framework::OpDesc's constructor is strange.
    framework::OpDesc op_desc(op, nullptr);
...
@@ -33,7 +34,12 @@ class ReluOpConverter : public OpConverter {
    nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
        engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
        nvinfer1::ActivationType::kRELU);
-    engine_->SetITensor(op_desc.Output("Out")[0], layer->getOutput(0));
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
  }
};
...

paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <gtest/gtest.h>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/place.h"
-
-USE_OP(relu);
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"

namespace paddle {
namespace inference {
namespace tensorrt {

-void Compare(const std::string op_type, float input, float expect) {
+TEST(ReluOpConverter, main) {
  framework::Scope scope;
-  platform::CUDAPlace place;
-  platform::CUDADeviceContext ctx(place);
-
-  // init fluid op and variable
-  auto x_var = scope.Var("X");
-  auto x_tensor = x_var->GetMutable<framework::LoDTensor>();
-  x_tensor->Resize({1, 1});
-  x_tensor->mutable_data<float>(place);
-  std::vector<float> init;
-  init.push_back(input);
-  framework::TensorFromVector(init, ctx, x_tensor);
-
-  auto out_var = scope.Var("Out");
-  auto out_tensor = out_var->GetMutable<framework::LoDTensor>();
-  out_tensor->Resize({1, 1});
-  out_tensor->mutable_data<float>(place);
-
-  framework::OpDesc op_desc;
-  op_desc.SetType(op_type);
-  op_desc.SetInput("X", {"X"});
-  op_desc.SetOutput("Out", {"Out"});
-
-  auto op = framework::OpRegistry::CreateOp(*op_desc.Proto());
-
-  // run fluid op
-  op->Run(scope, place);
-  // get fluid output
-  std::vector<float> out1;
-  framework::TensorToVector(*out_tensor, ctx, &out1);
-
-  // init tensorrt op
-  cudaStream_t stream;
-  ASSERT_EQ(0, cudaStreamCreate(&stream));
-  TensorRTEngine* engine = new TensorRTEngine(1, 1 << 10, &stream);
-  engine->InitNetwork();
-  engine->DeclareInput("X", nvinfer1::DataType::kFLOAT,
-                       nvinfer1::DimsCHW{1, 1, 1});
-  // convert op
-  OpConverter op_converter;
-  op_converter.ConvertOp(*op_desc.Proto(), engine);
-
-  engine->DeclareOutput("Out");
-  engine->FreezeNetwork();
-
-  // convert LoDTensor to ITensor
-  size_t size = x_tensor->memory_size();
-  EngineIOConverter::ConvertInput(op_type, *x_tensor,
-                                  engine->buffer("X").buffer, size, &stream);
-  // run tensorrt Outp
-  engine->Execute(1);
-  // convert ITensor to LoDTensor
-  EngineIOConverter::ConvertOutput(op_type, engine->buffer("Out").buffer,
-                                   out_tensor, size, &stream);
-  // get tensorrt output
-  std::vector<float> out2;
-  framework::TensorToVector(*out_tensor, ctx, &out2);
-
-  // compare
-  ASSERT_EQ(out1[0], out2[0]);
-  ASSERT_EQ(out1[0], expect);
-
-  delete engine;
-  cudaStreamDestroy(stream);
-}
-
-TEST(OpConverter, ConvertRelu) {
-  Compare("relu", 1, 1);   // relu(1) = 1
-  Compare("relu", -5, 0);  // relu(-5) = 0
+  std::unordered_set<std::string> parameters;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6));
+  validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6));

+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("relu");
+  desc.SetInput("X", {"relu-X"});
+  desc.SetOutput("Out", {"relu-Out"});
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(10);
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

-USE_OP(activation);
+USE_OP(relu);

paddle/fluid/operators/CMakeLists.txt
...
@@ -166,8 +166,6 @@ function(op_library TARGET)
  # NOTE(*): activation use macro to regist the kernels, set use_op manually.
  if (${TARGET} STREQUAL "activation")
    file(APPEND ${pybind_file} "USE_OP(relu);\n")
-  elseif(${TARGET} STREQUAL "reduce")
-    file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
  elseif(${TARGET} STREQUAL "fake_dequantize")
    file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
  else()
...

paddle/fluid/operators/activation_op.cc
...
@@ -58,14 +58,16 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
                                      const framework::OperatorWithKernel& oper,
                                      const std::string& name) {
  framework::LibraryType library{framework::LibraryType::kPlain};
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN
  auto it = oper.Attrs().find("use_mkldnn");
  if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() &&
      platform::CanMKLDNNBeUsed(ctx)) {
    library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
  }
#endif
-  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<framework::Tensor>(name)->type()),
      ctx.GetPlace(), layout, library);
...

paddle/fluid/operators/batch_norm_op.cc
...
@@ -111,14 +111,16 @@ class BatchNormOp : public framework::OperatorWithKernel {
                      "Variance input should be of float type");

    framework::LibraryType library_{framework::LibraryType::kPlain};
+    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN
    if (library_ == framework::LibraryType::kPlain &&
        platform::CanMKLDNNBeUsed(ctx)) {
      library_ = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
    }
#endif
-    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::DataLayout layout = framework::DataLayout::kAnyLayout;

    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
                                   library_);
  }
...
@@ -367,17 +369,18 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
    }

    framework::LibraryType library_{framework::LibraryType::kPlain};
+    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN
    if (library_ == framework::LibraryType::kPlain &&
        platform::CanMKLDNNBeUsed(ctx)) {
      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
    }
#endif
-    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::DataLayout layout = framework::DataLayout::kAnyLayout;

    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        layout, library_);
+        layout_, library_);
  }
};
...

paddle/fluid/operators/conv_op.cc
...
@@ -75,6 +75,11 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  framework::DataLayout layout = framework::StringToDataLayout(data_format);
+
#ifdef PADDLE_WITH_CUDA
  if (platform::CanCUDNNBeUsed(ctx)) {
    library = framework::LibraryType::kCUDNN;
...
@@ -84,6 +89,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
  if (library == framework::LibraryType::kPlain &&
      platform::CanMKLDNNBeUsed(ctx)) {
    library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
  }
#endif
...
@@ -99,9 +105,6 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
                      "float16 can only be used when CUDNN is used");
  }

-  std::string data_format = ctx.Attr<std::string>("data_format");
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout = framework::StringToDataLayout(data_format);
  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
                                 library);
}
...
@@ -309,6 +312,10 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library_{framework::LibraryType::kPlain};
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
#ifdef PADDLE_WITH_CUDA
  if (platform::CanCUDNNBeUsed(ctx)) {
    library_ = framework::LibraryType::kCUDNN;
...
@@ -318,12 +325,10 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
  if (library_ == framework::LibraryType::kPlain &&
      platform::CanMKLDNNBeUsed(ctx)) {
    library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
  }
#endif

-  std::string data_format = ctx.Attr<std::string>("data_format");
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
      layout_, library_);
...

paddle/fluid/operators/fc_op.cc
...
@@ -43,7 +43,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType FCOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library{framework::LibraryType::kMKLDNN};
-  framework::DataLayout layout{framework::DataLayout::kAnyLayout};
+  framework::DataLayout layout{framework::DataLayout::kMKLDNN};

  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
...
@@ -65,7 +65,7 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType FCOpGrad::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library{framework::LibraryType::kMKLDNN};
-  framework::DataLayout layout{framework::DataLayout::kAnyLayout};
+  framework::DataLayout layout{framework::DataLayout::kMKLDNN};

  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
...

paddle/fluid/operators/lrn_op.cc
...
@@ -124,16 +124,17 @@ namespace {
framework::OpKernelType GetExpectedLRNKernel(
    const framework::ExecutionContext& ctx) {
  framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_MKLDNN
  if (library_ == framework::LibraryType::kPlain &&
      platform::CanMKLDNNBeUsed(ctx)) {
    library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
  }
#endif

-  std::string data_format = ctx.Attr<std::string>("data_format");
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
      layout_, library_);
...

paddle/fluid/operators/pool_mkldnn_op.cc
...
@@ -24,10 +24,13 @@ using mkldnn::pooling_backward;

// Generate keys for storing/retriving primitives for this operator
// TODO(jczaja): Make hashing function more optimial
-static std::string gethash(memory::dims& input_dims, std::string& pooling_type,
-                           std::vector<int>& ksize, std::vector<int>& strides,
-                           std::vector<int>& paddings, std::string suffix) {
-  auto dims2str = [](memory::dims& operand_dims) {
+static std::string gethash(const memory::dims& input_dims,
+                           const std::string& pooling_type,
+                           const std::vector<int>& ksize,
+                           const std::vector<int>& strides,
+                           const std::vector<int>& paddings,
+                           const std::string& suffix) {
+  auto dims2str = [](const memory::dims& operand_dims) {
    std::string dstr = "";
    for (size_t i = 0; i < operand_dims.size(); ++i) {
      dstr += std::to_string(operand_dims[i]) + "-";
...

paddle/fluid/operators/pool_op.cc
...
@@ -83,6 +83,9 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
framework::OpKernelType PoolOp::GetExpectedKernelType(
    const framework::ExecutionContext &ctx) const {
  framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
#ifdef PADDLE_WITH_CUDA
  if (platform::CanCUDNNBeUsed(ctx)) {
    library_ = framework::LibraryType::kCUDNN;
...
@@ -92,11 +95,10 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
  if (library_ == framework::LibraryType::kPlain &&
      platform::CanMKLDNNBeUsed(ctx)) {
    library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
  }
#endif

-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
      layout_, library_);
...
@@ -112,6 +114,9 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
    const framework::ExecutionContext &ctx) const {
  framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
#ifdef PADDLE_WITH_CUDA
  if (platform::CanCUDNNBeUsed(ctx)) {
    library_ = framework::LibraryType::kCUDNN;
...
@@ -121,6 +126,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
  if (library_ == framework::LibraryType::kPlain &&
      platform::CanMKLDNNBeUsed(ctx)) {
    library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
  }
#endif
...
@@ -129,8 +135,6 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
    PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
                      "float16 can only be used when CUDNN is used");
  }
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
                                 library_);
}
...

paddle/fluid/operators/reduce_max_op.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/reduce_min_max_op.h"

REGISTER_REDUCE_OP(reduce_max);
REGISTER_OP_CPU_KERNEL(
    reduce_max,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, float, ops::MaxFunctor>,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double, ops::MaxFunctor>,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MaxFunctor>,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t, ops::MaxFunctor>);
REGISTER_OP_CPU_KERNEL(
    reduce_max_grad,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t, ops::MaxOrMinGradFunctor>);

paddle/fluid/operators/reduce_max_op.cu
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/reduce_min_max_op.h"

REGISTER_OP_CUDA_KERNEL(
    reduce_max,
    ops::ReduceKernel<paddle::platform::CUDADeviceContext, float, ops::MaxFunctor>,
    ops::ReduceKernel<paddle::platform::CUDADeviceContext, double, ops::MaxFunctor>,
    ops::ReduceKernel<paddle::platform::CUDADeviceContext, int, ops::MaxFunctor>,
    ops::ReduceKernel<paddle::platform::CUDADeviceContext, int64_t, ops::MaxFunctor>);
REGISTER_OP_CUDA_KERNEL(
    reduce_max_grad,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t, ops::MaxOrMinGradFunctor>);

paddle/fluid/operators/reduce_mean_op.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/reduce_mean_op.h"

REGISTER_REDUCE_OP(reduce_mean);
REGISTER_OP_CPU_KERNEL(
    reduce_mean,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, float, ops::MeanFunctor>,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double, ops::MeanFunctor>,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MeanFunctor>,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t, ops::MeanFunctor>);
REGISTER_OP_CPU_KERNEL(
    reduce_mean_grad,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float, ops::MeanGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double, ops::MeanGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int, ops::MeanGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t, ops::MeanGradFunctor>);

paddle/fluid/operators/reduce_mean_op.cu
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/reduce_mean_op.h"

REGISTER_OP_CUDA_KERNEL(
    reduce_mean,
    ops::ReduceKernel<paddle::platform::CUDADeviceContext, float, ops::MeanFunctor>,
    ops::ReduceKernel<paddle::platform::CUDADeviceContext, double, ops::MeanFunctor>,
    ops::ReduceKernel<paddle::platform::CUDADeviceContext, int, ops::MeanFunctor>,
    ops::ReduceKernel<paddle::platform::CUDADeviceContext, int64_t, ops::MeanFunctor>);
REGISTER_OP_CUDA_KERNEL(
    reduce_mean_grad,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float, ops::MeanGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double, ops::MeanGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int, ops::MeanGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t, ops::MeanGradFunctor>);

paddle/fluid/operators/reduce_mean_op.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/operators/reduce_op.h"

namespace paddle {
namespace operators {

struct MeanFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    y->device(place) = x->mean(dim);
  }
};

struct MeanGradFunctor {
  template <typename DeviceContext, typename X, typename Y, typename DX,
            typename DY, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                  const Dim& dim, int size) {
    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
  }
};

}  // namespace operators
}  // namespace paddle
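
Numerically, MeanGradFunctor spreads the upstream gradient evenly over the reduced elements: dx = broadcast(dy) / size. The standalone sketch below works this out on a 2x3 matrix without Eigen; the concrete numbers are illustrative only.

// Standalone numeric sketch of MeanFunctor / MeanGradFunctor above.
#include <cassert>
#include <vector>

int main() {
  const int rows = 2, cols = 3;
  std::vector<double> x = {1, 2, 3,
                           4, 5, 6};

  // Forward: y[i] is the mean of row i (reduce over dim 1).
  std::vector<double> y(rows, 0.0);
  for (int i = 0; i < rows; ++i) {
    double sum = 0.0;
    for (int j = 0; j < cols; ++j) sum += x[i * cols + j];
    y[i] = sum / cols;
  }
  assert(y[0] == 2.0 && y[1] == 5.0);

  // Backward: dx = broadcast(dy) / size, matching MeanGradFunctor.
  std::vector<double> dy = {3.0, 6.0};
  std::vector<double> dx(rows * cols, 0.0);
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j) dx[i * cols + j] = dy[i] / cols;
  assert(dx[0] == 1.0 && dx[5] == 2.0);
}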

paddle/fluid/operators/reduce_min_max_op.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/operators/reduce_op.h"

namespace paddle {
namespace operators {

struct MaxFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    y->device(place) = x->maximum(dim);
  }
};

struct MinFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    y->device(place) = x->minimum(dim);
  }
};

struct MaxOrMinGradFunctor {
  template <typename DeviceContext, typename X, typename Y, typename DX,
            typename DY, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                  const Dim& dim, int size) {
    auto equals = (*x) == y->broadcast(dim);
    auto ones = dx->constant(1);
    auto zeros = dx->constant(0);
    // If there are multiple minimum or maximum elements, the subgradient of
    // each is the set [0, 1], and we pass gradient to all of them here.
    dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
  }
};

}  // namespace operators
}  // namespace paddle
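
As the comment in MaxOrMinGradFunctor notes, the upstream gradient is routed to every element that equals the reduced extremum, including ties. The standalone sketch below reproduces that rule element-wise with made-up numbers; it is an illustration, not the Eigen expression itself.

// Standalone numeric sketch of MaxOrMinGradFunctor above.
#include <cassert>
#include <vector>

int main() {
  std::vector<double> x = {1.0, 7.0, 7.0, 3.0};  // reduce over the whole row
  double y = 7.0;                                // forward max
  double dy = 5.0;                               // upstream gradient

  std::vector<double> dx(x.size(), 0.0);
  for (size_t i = 0; i < x.size(); ++i)
    dx[i] = (x[i] == y) ? dy : 0.0;  // dy->broadcast(dim) * equals.select(1, 0)

  assert(dx[0] == 0.0 && dx[1] == 5.0 && dx[2] == 5.0 && dx[3] == 0.0);
}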

paddle/fluid/operators/reduce_min_op.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/reduce_min_max_op.h"

REGISTER_REDUCE_OP(reduce_min);
REGISTER_OP_CPU_KERNEL(
    reduce_min,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, float, ops::MinFunctor>,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double, ops::MinFunctor>,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MinFunctor>,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t, ops::MinFunctor>);
REGISTER_OP_CPU_KERNEL(
    reduce_min_grad,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t, ops::MaxOrMinGradFunctor>);

paddle/fluid/operators/reduce_min_op.cu
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/reduce_min_max_op.h"

REGISTER_OP_CUDA_KERNEL(
    reduce_min,
    ops::ReduceKernel<paddle::platform::CUDADeviceContext, float, ops::MinFunctor>,
    ops::ReduceKernel<paddle::platform::CUDADeviceContext, double, ops::MinFunctor>,
    ops::ReduceKernel<paddle::platform::CUDADeviceContext, int, ops::MinFunctor>,
    ops::ReduceKernel<paddle::platform::CUDADeviceContext, int64_t, ops::MinFunctor>);
REGISTER_OP_CUDA_KERNEL(
    reduce_min_grad,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t, ops::MaxOrMinGradFunctor>);
paddle/fluid/operators/reduce_op.cc
已删除
100644 → 0
浏览文件 @
23433def
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reduce_op.h"
#include <algorithm>
#include <string>
#include <vector>
namespace paddle {
namespace operators {

using framework::Tensor;

class ReduceOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of ReduceOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of ReduceOp should not be null.");
    auto x_dims = ctx->GetInputDim("X");
    auto x_rank = x_dims.size();
    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
    for (size_t i = 0; i < dims.size(); ++i) {
      if (dims[i] < 0) dims[i] = x_rank + dims[i];
      PADDLE_ENFORCE_LT(
          dims[i], x_rank,
          "The dim should be in the range [-rank(input), rank(input)).");
    }
    sort(dims.begin(), dims.end());
    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
    if (reduce_all) {
      if (keep_dim)
        ctx->SetOutputDim(
            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
      else
        ctx->SetOutputDim("Out", {1});
    } else {
      auto dims_vector = vectorize(x_dims);
      if (keep_dim) {
        for (size_t i = 0; i < dims.size(); ++i) {
          dims_vector[dims[i]] = 1;
        }
      } else {
        const int kDelFlag = -2;
        for (size_t i = 0; i < dims.size(); ++i) {
          dims_vector[dims[i]] = kDelFlag;
        }
        dims_vector.erase(
            remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
            dims_vector.end());
      }
      auto out_dims = framework::make_ddim(dims_vector);
      ctx->SetOutputDim("Out", out_dims);
      if (dims[0] != 0) {
        // Only pass LoD when not reducing on the first dim.
        ctx->ShareLoD("X", /*->*/ "Out");
      }
    }
  }
};

class ReduceGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null.");
    auto x_dims = ctx->GetInputDim("X");
    auto x_rank = x_dims.size();
    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
    for (size_t i = 0; i < dims.size(); ++i) {
      if (dims[i] < 0) dims[i] = x_rank + dims[i];
      PADDLE_ENFORCE_LT(
          dims[i], x_rank,
          "The dim should be in the range [-rank(input), rank(input)).");
    }
    sort(dims.begin(), dims.end());
    auto x_grad_name = framework::GradVarName("X");
    if (ctx->HasOutput(x_grad_name)) {
      ctx->SetOutputDim(x_grad_name, x_dims);
      ctx->ShareLoD("X", /*->*/ x_grad_name);
    }
  }
};

class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() final {
    AddInput("X",
             "(Tensor) The input tensor. Tensors with rank at most 6 are "
             "supported.");
    AddOutput("Out", "(Tensor) The result tensor.");
    AddAttr<std::vector<int>>(
        "dim",
        "(list<int>, default {0}) The dimensions to reduce. "
        "Must be in the range [-rank(input), rank(input)). "
        "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
        "Note that reducing on the first dim will make the LoD info lost.")
        .SetDefault({0});
    AddAttr<bool>("keep_dim",
                  "(bool, default false) "
                  "If true, retain the reduced dimension with length 1.")
        .SetDefault(false);
    AddAttr<bool>("reduce_all",
                  "(bool, default false) "
                  "If true, output a scalar reduced along all dimensions.")
        .SetDefault(false);
    AddComment(string::Sprintf(R"DOC(
%s Operator.

This operator computes the %s of input tensor along the given dimension.
The result tensor has 1 fewer dimension than the input unless keep_dim is true.
If reduce_all is true, just reduce along all dimensions and output a scalar.

)DOC",
                               GetOpType(), GetName()));
  }

 protected:
  virtual std::string GetName() const = 0;
  virtual std::string GetOpType() const = 0;
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

#define REGISTER_REDUCE_OP(op_name)                                        \
  class __##op_name##Maker__ : public ops::ReduceOpMaker {                 \
   protected:                                                              \
    virtual std::string GetName() const { return #op_name; }               \
    virtual std::string GetOpType() const { return "Reduce " #op_name; }   \
  };                                                                       \
  REGISTER_OPERATOR(reduce_##op_name, ops::ReduceOp, __##op_name##Maker__, \
                    paddle::framework::DefaultGradOpDescMaker<true>);      \
  REGISTER_OPERATOR(reduce_##op_name##_grad, ops::ReduceGradOp)

REGISTER_REDUCE_OP(sum);
REGISTER_REDUCE_OP(mean);
REGISTER_REDUCE_OP(max);
REGISTER_REDUCE_OP(min);
REGISTER_REDUCE_OP(prod);

#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor)         \
  REGISTER_OP_CPU_KERNEL(reduce_type,                                          \
                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
                                           float, ops::functor>,               \
                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
                                           double, ops::functor>,              \
                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
                                           int, ops::functor>,                 \
                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
                                           int64_t, ops::functor>);            \
  REGISTER_OP_CPU_KERNEL(                                                      \
      reduce_type##_grad,                                                      \
      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float,         \
                            ops::grad_functor>,                                \
      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,        \
                            ops::grad_functor>,                                \
      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,           \
                            ops::grad_functor>,                                \
      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,       \
                            ops::grad_functor>);

FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL);
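ReduceOp::InferShape above is where the output shape is derived from the dim, keep_dim and reduce_all attributes. The NumPy-only sketch below (illustration only, not part of the commit) mirrors that logic so the three attribute combinations are easy to check by hand.

import numpy as np

def reduce_out_shape(x_shape, dims, keep_dim=False, reduce_all=False):
    # Mirror of ReduceOp::InferShape: compute only the output shape.
    rank = len(x_shape)
    dims = sorted(d + rank if d < 0 else d for d in dims)
    if reduce_all:
        return [1] * rank if keep_dim else [1]
    if keep_dim:
        return [1 if i in dims else d for i, d in enumerate(x_shape)]
    return [d for i, d in enumerate(x_shape) if i not in dims]

assert reduce_out_shape([2, 3, 4], [1]) == [2, 4]
assert reduce_out_shape([2, 3, 4], [-1], keep_dim=True) == [2, 3, 1]
assert reduce_out_shape([2, 3, 4], [0], reduce_all=True) == [1]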
paddle/fluid/operators/reduce_op.cu (deleted, 100644 → 0)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/reduce_op.h"
namespace ops = paddle::operators;
#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \
reduce_type, ops::ReduceKernel<paddle::platform::CUDADeviceContext, \
float, ops::functor>, \
ops::ReduceKernel<paddle::platform::CUDADeviceContext, double, \
ops::functor>, \
ops::ReduceKernel<paddle::platform::CUDADeviceContext, int, \
ops::functor>, \
ops::ReduceKernel<paddle::platform::CUDADeviceContext, int64_t, \
ops::functor>); \
REGISTER_OP_CUDA_KERNEL( \
reduce_type##_grad, \
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int, \
ops::grad_functor>, \
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t, \
ops::grad_functor>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL);
paddle/fluid/operators/reduce_op.h
@@ -14,105 +14,20 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/reduce_op_function.h"
namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using DDim = framework::DDim;

template <typename T, size_t D, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

struct SumFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    y->device(place) = x->sum(dim);
  }
};

struct SumGradFunctor {
  template <typename DeviceContext, typename X, typename Y, typename DX,
            typename DY, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                  const Dim& dim, int size) {
    dx->device(place) = dy->broadcast(dim);
  }
};

struct MeanFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    y->device(place) = x->mean(dim);
  }
};

struct MeanGradFunctor {
  template <typename DeviceContext, typename X, typename Y, typename DX,
            typename DY, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                  const Dim& dim, int size) {
    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
  }
};

struct MaxFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    y->device(place) = x->maximum(dim);
  }
};

struct MinFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    y->device(place) = x->minimum(dim);
  }
};

struct MaxOrMinGradFunctor {
  template <typename DeviceContext, typename X, typename Y, typename DX,
            typename DY, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                  const Dim& dim, int size) {
    auto equals = (*x) == y->broadcast(dim);
    auto ones = dx->constant(1);
    auto zeros = dx->constant(0);
    // If there are multiple minimum or maximum elements, the subgradient of
    // each is the set [0, 1], and we pass gradient to all of them here.
    dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
  }
};

struct ProdFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    y->device(place) = x->prod(dim);
  }
};

struct ProdGradFunctor {
  template <typename DeviceContext, typename X, typename Y, typename DX,
            typename DY, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                  const Dim& dim, int size) {
    dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
  }
};
#define HANDLE_DIM(NDIM, RDIM) \
if (ndim == NDIM && rdim == RDIM) { \
ReduceCompute<NDIM, RDIM>(context); \
#define HANDLE_DIM(NDIM, RDIM) \
if (ndim == NDIM && rdim == RDIM) { \
ReduceFunctor<DeviceContext, T, NDIM, RDIM, Functor>( \
context.template device_context<DeviceContext>(), *input, output, \
dims, keep_dim); \
}
template <typename DeviceContext, typename T, typename Functor>
@@ -120,11 +35,15 @@ class ReduceKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    bool reduce_all = context.Attr<bool>("reduce_all");
    auto* input = context.Input<Tensor>("X");
    auto* output = context.Output<Tensor>("Out");
    output->mutable_data<T>(context.GetPlace());

    auto dims = context.Attr<std::vector<int>>("dim");
    bool keep_dim = context.Attr<bool>("keep_dim");

    if (reduce_all) {
      // Flatten and reduce 1-D tensor
      auto* input = context.Input<Tensor>("X");
      auto* output = context.Output<Tensor>("Out");
      output->mutable_data<T>(context.GetPlace());
      auto x = EigenVector<T>::Flatten(*input);
      auto out = EigenScalar<T>::From(*output);
      auto& place =
@@ -133,8 +52,8 @@ class ReduceKernel : public framework::OpKernel<T> {
      Functor functor;
      functor(place, &x, &out, reduce_dim);
    } else {
      int ndim = context.Input<Tensor>("X")->dims().size();
      int rdim = context.Attr<std::vector<int>>("dim").size();
      int ndim = input->dims().size();
      int rdim = dims.size();
// comments for accelerating compiling temporarily.
// HANDLE_DIM(6, 5);
// HANDLE_DIM(6, 4);
@@ -154,48 +73,6 @@ class ReduceKernel : public framework::OpKernel<T> {
      HANDLE_DIM(1, 1);
    }
  }

 private:
  template <size_t D, size_t R_D>
  void ReduceCompute(const framework::ExecutionContext& context) const {
    auto* input = context.Input<Tensor>("X");
    auto* output = context.Output<Tensor>("Out");
    output->mutable_data<T>(context.GetPlace());

    auto x = EigenTensor<T, D>::From(*input);
    auto x_rank = static_cast<int>(x.dimensions().size());
    auto dims = context.Attr<std::vector<int>>("dim");
    auto reduce_dim = Eigen::array<int, R_D>();
    for (size_t i = 0; i < dims.size(); ++i) {
      if (dims[i] < 0) dims[i] = x_rank + dims[i];
      reduce_dim[i] = dims[i];
    }
    // construct the squeezed output tensor
    bool keep_dim = context.Attr<bool>("keep_dim");
    DDim out_dims = output->dims();
    if (keep_dim && x_rank > 1) {
      const int kDelFlag = -2;
      auto dims_vector = vectorize(out_dims);
      for (size_t i = 0; i < dims.size(); ++i) {
        dims_vector[dims[i]] = kDelFlag;
      }
      dims_vector.erase(
          remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
          dims_vector.end());
      out_dims = framework::make_ddim(dims_vector);
    }
    auto& place =
        *context.template device_context<DeviceContext>().eigen_device();
    Functor functor;

    if (D == 1) {
      auto out = EigenScalar<T>::From(*output);
      functor(place, &x, &out, reduce_dim);
    } else {
      auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
      functor(place, &x, &out, reduce_dim);
    }
  }
};

template <typename DeviceContext, typename T, typename Functor>
@@ -203,12 +80,15 @@ class ReduceGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    bool reduce_all = context.Attr<bool>("reduce_all");
    auto dims = context.Attr<std::vector<int>>("dim");
    auto* input0 = context.Input<Tensor>("X");
    auto* input1 = context.Input<Tensor>("Out");
    auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
    auto* output = context.Output<Tensor>(framework::GradVarName("X"));
    output->mutable_data<T>(context.GetPlace());

    if (reduce_all) {
      auto* input0 = context.Input<Tensor>("X");
      auto* input1 = context.Input<Tensor>("Out");
      auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
      auto* output = context.Output<Tensor>(framework::GradVarName("X"));
      output->mutable_data<T>(context.GetPlace());
      auto x = EigenVector<T>::Flatten(*input0);
      auto x_reduce = EigenVector<T>::From(*input1);
      auto x_reduce_grad = EigenVector<T>::From(*input2);
@@ -221,74 +101,172 @@ class ReduceGradKernel : public framework::OpKernel<T> {
      functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
              broadcast_dim[0]);
    } else {
      int rank = context.Input<Tensor>("X")->dims().size();
      int rank = input0->dims().size();
      switch (rank) {
        case 1:
          ReduceGradCompute<1>(context);
          ReduceGradFunctor<DeviceContext, T, 1, Functor>(
              context.template device_context<DeviceContext>(), *input0,
              *input1, *input2, output, dims);
          break;
        case 2:
          ReduceGradCompute<2>(context);
          ReduceGradFunctor<DeviceContext, T, 2, Functor>(
              context.template device_context<DeviceContext>(), *input0,
              *input1, *input2, output, dims);
          break;
        case 3:
          ReduceGradCompute<3>(context);
          ReduceGradFunctor<DeviceContext, T, 3, Functor>(
              context.template device_context<DeviceContext>(), *input0,
              *input1, *input2, output, dims);
          break;
        case 4:
          ReduceGradCompute<4>(context);
          ReduceGradFunctor<DeviceContext, T, 4, Functor>(
              context.template device_context<DeviceContext>(), *input0,
              *input1, *input2, output, dims);
          break;
        case 5:
          ReduceGradCompute<5>(context);
          ReduceGradFunctor<DeviceContext, T, 5, Functor>(
              context.template device_context<DeviceContext>(), *input0,
              *input1, *input2, output, dims);
          break;
        case 6:
          ReduceGradCompute<6>(context);
          ReduceGradFunctor<DeviceContext, T, 6, Functor>(
              context.template device_context<DeviceContext>(), *input0,
              *input1, *input2, output, dims);
          break;
      }
    }
  }
};

 private:
  template <size_t D>
  void ReduceGradCompute(const framework::ExecutionContext& context) const {
    auto* input0 = context.Input<Tensor>("X");
    auto* input1 = context.Input<Tensor>("Out");
    auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
    auto* output = context.Output<Tensor>(framework::GradVarName("X"));

class ReduceOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

    output->mutable_data<T>(context.GetPlace());
    auto x = EigenTensor<T, D>::From(*input0);
    auto x_grad = EigenTensor<T, D>::From(*output);
    auto x_rank = static_cast<int>(x.dimensions().size());
    auto dims = context.Attr<std::vector<int>>("dim");
    auto x_dims = input0->dims();
    auto reduced_dims_v = vectorize(x_dims);
    Eigen::array<int, D> broadcast_dim;
    for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of ReduceOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of ReduceOp should not be null.");
    auto x_dims = ctx->GetInputDim("X");
    auto x_rank = x_dims.size();
    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
    for (size_t i = 0; i < dims.size(); ++i) {
      if (dims[i] < 0) dims[i] = x_rank + dims[i];
      PADDLE_ENFORCE_LT(
          dims[i], x_rank,
          "The dim should be in the range [-rank(input), rank(input)).");
    }
    sort(dims.begin(), dims.end());
    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
    if (reduce_all) {
      if (keep_dim)
        ctx->SetOutputDim(
            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
      else
        ctx->SetOutputDim("Out", {1});
    } else {
      auto dims_vector = vectorize(x_dims);
      if (keep_dim) {
        for (size_t i = 0; i < dims.size(); ++i) {
          dims_vector[dims[i]] = 1;
        }
      } else {
        const int kDelFlag = -2;
        for (size_t i = 0; i < dims.size(); ++i) {
          dims_vector[dims[i]] = kDelFlag;
        }
        dims_vector.erase(
            remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
            dims_vector.end());
      }
      auto out_dims = framework::make_ddim(dims_vector);
      ctx->SetOutputDim("Out", out_dims);
      if (dims[0] != 0) {
        // Only pass LoD when not reducing on the first dim.
        ctx->ShareLoD("X", /*->*/ "Out");
      }
    }
  }
};

class ReduceGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

    int broad_cats_times = 1;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null.");
    auto x_dims = ctx->GetInputDim("X");
    auto x_rank = x_dims.size();
    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
    for (size_t i = 0; i < dims.size(); ++i) {
      if (dims[i] < 0) dims[i] = x_rank + dims[i];
      reduced_dims_v[dims[i]] = 1;
      broadcast_dim[dims[i]] = x_dims[dims[i]];
      broad_cats_times *= x_dims[dims[i]];
      PADDLE_ENFORCE_LT(
          dims[i], x_rank,
          "The dim should be in the range [-rank(input), rank(input)).");
    }
    sort(dims.begin(), dims.end());
    auto x_grad_name = framework::GradVarName("X");
    if (ctx->HasOutput(x_grad_name)) {
      ctx->SetOutputDim(x_grad_name, x_dims);
      ctx->ShareLoD("X", /*->*/ x_grad_name);
    }
    auto reduced_dims = framework::make_ddim(reduced_dims_v);
    auto x_reduce = EigenTensor<T, D>::From(*input1, reduced_dims);
    auto x_reduce_grad = EigenTensor<T, D>::From(*input2, reduced_dims);
  }
};

class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() final {
    AddInput("X",
             "(Tensor) The input tensor. Tensors with rank at most 6 are "
             "supported.");
    AddOutput("Out", "(Tensor) The result tensor.");
    AddAttr<std::vector<int>>(
        "dim",
        "(list<int>, default {0}) The dimensions to reduce. "
        "Must be in the range [-rank(input), rank(input)). "
        "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
        "Note that reducing on the first dim will make the LoD info lost.")
        .SetDefault({0});
    AddAttr<bool>("keep_dim",
                  "(bool, default false) "
                  "If true, retain the reduced dimension with length 1.")
        .SetDefault(false);
    AddAttr<bool>("reduce_all",
                  "(bool, default false) "
                  "If true, output a scalar reduced along all dimensions.")
        .SetDefault(false);
    AddComment(string::Sprintf(R"DOC(
%s Operator.

    auto& place =
        *context.template device_context<DeviceContext>().eigen_device();

This operator computes the %s of input tensor along the given dimension.
The result tensor has 1 fewer dimension than the input unless keep_dim is true.
If reduce_all is true, just reduce along all dimensions and output a scalar.

    Functor functor;
    functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
            broad_cats_times);

)DOC",
                               GetOpType(), GetName()));
  }

 protected:
  virtual std::string GetName() const = 0;
  virtual std::string GetOpType() const = 0;
};

}  // namespace operators
}  // namespace paddle
#define FOR_EACH_KERNEL_FUNCTOR(__macro) \
__macro(reduce_sum, SumFunctor, SumGradFunctor); \
__macro(reduce_mean, MeanFunctor, MeanGradFunctor); \
__macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \
__macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); \
__macro(reduce_prod, ProdFunctor, ProdGradFunctor);
namespace ops = paddle::operators;
#define REGISTER_REDUCE_OP(op_name) \
class __##op_name##Maker__ : public ops::ReduceOpMaker { \
protected: \
virtual std::string GetName() const { return #op_name; } \
virtual std::string GetOpType() const { return "Reduce " #op_name; } \
}; \
REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__, \
paddle::framework::DefaultGradOpDescMaker<true>); \
REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp)
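MaxOrMinGradFunctor in the block removed above routes the upstream gradient to every element that ties for the reduced maximum or minimum (the subgradient comment in the code). A small NumPy sketch of the same rule for a max reduction over one axis, for illustration only:

import numpy as np

def max_grad(x, dy, axis):
    # Gradient of y = x.max(axis): pass dy through at every position equal to the max.
    y = x.max(axis=axis, keepdims=True)
    equals = (x == y).astype(x.dtype)   # ties all receive the gradient
    dy = np.expand_dims(dy, axis)       # reshape dy to the kept-dim form
    return dy * equals

x = np.array([[1., 3., 3.], [2., 0., 1.]])
dy = np.array([10., 20.])
assert np.allclose(max_grad(x, dy, axis=1), [[0., 10., 10.], [20., 0., 0.]])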
paddle/fluid/operators/reduce_op_function.h (new file, 0 → 100644)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using DDim = framework::DDim;

template <typename T, size_t D, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

template <typename DeviceContext, typename T, size_t D, size_t R_D,
          typename Functor>
void ReduceFunctor(const DeviceContext& context,
                   const framework::Tensor& input, framework::Tensor* output,
                   const std::vector<int>& dims, bool keep_dim) {
  auto x = EigenTensor<T, D>::From(input);
  auto x_rank = static_cast<int>(x.dimensions().size());
  auto reduce_dim = Eigen::array<int, R_D>();
  std::vector<int> dims_ref = dims;
  for (size_t i = 0; i < dims_ref.size(); ++i) {
    if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
    reduce_dim[i] = dims_ref[i];
  }
  // construct the squeezed output tensor
  DDim out_dims = output->dims();
  if (keep_dim && x_rank > 1) {
    const int kDelFlag = -2;
    auto dims_vector = framework::vectorize(out_dims);
    for (size_t i = 0; i < dims_ref.size(); ++i) {
      dims_vector[dims_ref[i]] = kDelFlag;
    }
    dims_vector.erase(
        remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
        dims_vector.end());
    out_dims = framework::make_ddim(dims_vector);
  }
  auto& place = *context.eigen_device();
  Functor functor;

  if (D == 1) {
    auto out = EigenScalar<T>::From(*output);
    functor(place, &x, &out, reduce_dim);
  } else {
    auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
    functor(place, &x, &out, reduce_dim);
  }
}

template <typename DeviceContext, typename T, size_t D, typename Functor>
void ReduceGradFunctor(const DeviceContext& context,
                       const framework::Tensor& input0,
                       const framework::Tensor& input1,
                       const framework::Tensor& input2,
                       framework::Tensor* output,
                       const std::vector<int>& dims) {
  auto x = EigenTensor<T, D>::From(input0);
  auto x_grad = EigenTensor<T, D>::From(*output);
  auto x_rank = static_cast<int>(x.dimensions().size());
  auto x_dims = input0.dims();
  auto reduced_dims_v = framework::vectorize(x_dims);
  std::vector<int> dims_ref = dims;
  Eigen::array<int, D> broadcast_dim;
  for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;

  int broad_cats_times = 1;
  for (size_t i = 0; i < dims_ref.size(); ++i) {
    if (dims_ref[i] < 0) {
      dims_ref[i] = x_rank + dims_ref[i];
    }
    reduced_dims_v[dims_ref[i]] = 1;
    broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]];
    broad_cats_times *= x_dims[dims_ref[i]];
  }
  auto reduced_dims = framework::make_ddim(reduced_dims_v);
  auto x_reduce = EigenTensor<T, D>::From(input1, reduced_dims);
  auto x_reduce_grad = EigenTensor<T, D>::From(input2, reduced_dims);

  auto& place = *context.eigen_device();
  Functor functor;
  functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
          broad_cats_times);
}

}  // namespace operators
}  // namespace paddle
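ReduceGradFunctor above reshapes the already-reduced tensors back to rank D (size 1 on every reduced axis) and hands the element-wise functor a broadcast pattern plus broad_cats_times, the total number of reduced elements. The NumPy sketch below (illustration only, not part of the commit) shows the same reshape-then-broadcast pattern for the sum and mean gradients.

import numpy as np

def reduce_grad(x, dy, dims, op='sum'):
    # Broadcast dy (reduced over `dims`) back to x's shape, as ReduceGradFunctor does.
    reduced_shape = [1 if i in dims else s for i, s in enumerate(x.shape)]
    dy_full = np.broadcast_to(dy.reshape(reduced_shape), x.shape)
    if op == 'sum':
        return dy_full                                   # SumGradFunctor: plain broadcast
    if op == 'mean':
        size = int(np.prod([x.shape[i] for i in dims]))  # 'broad_cats_times' in the C++ code
        return dy_full / size                            # MeanGradFunctor divides by it
    raise ValueError(op)

x = np.arange(6, dtype='float64').reshape(2, 3)
dy = np.array([1., 2.])  # gradient of x.sum(axis=1)
assert np.allclose(reduce_grad(x, dy, [1], 'sum'), [[1., 1., 1.], [2., 2., 2.]])
assert np.allclose(reduce_grad(x, dy, [1], 'mean'),
                   np.array([[1., 1., 1.], [2., 2., 2.]]) / 3.0)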
paddle/fluid/operators/reduce_prod_op.cc (new file, 0 → 100644)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_prod_op.h"
REGISTER_REDUCE_OP(reduce_prod);
REGISTER_OP_CPU_KERNEL(reduce_prod,
                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                         float, ops::ProdFunctor>,
                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                         double, ops::ProdFunctor>,
                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                         int, ops::ProdFunctor>,
                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                         int64_t, ops::ProdFunctor>);
REGISTER_OP_CPU_KERNEL(
    reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
                                            float, ops::ProdGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
                          ops::ProdGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
                          ops::ProdGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
                          ops::ProdGradFunctor>);
paddle/fluid/operators/reduce_prod_op.cu (new file, 0 → 100644)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_prod_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_prod,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          float, ops::ProdFunctor>,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          double, ops::ProdFunctor>,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          int, ops::ProdFunctor>,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          int64_t, ops::ProdFunctor>);
REGISTER_OP_CUDA_KERNEL(
    reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                            float, ops::ProdGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                          ops::ProdGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
                          ops::ProdGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
                          ops::ProdGradFunctor>);
paddle/fluid/operators/reduce_prod_op.h (new file, 0 → 100644)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_op.h"
namespace paddle {
namespace operators {

struct ProdFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    y->device(place) = x->prod(dim);
  }
};

struct ProdGradFunctor {
  template <typename DeviceContext, typename X, typename Y, typename DX,
            typename DY, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                  const Dim& dim, int size) {
    dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
  }
};

}  // namespace operators
}  // namespace paddle
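ProdGradFunctor above implements dx = dy * y / x with dy and y broadcast over the reduced dimension, which is the derivative of a product as long as no input element is zero. A NumPy check of that identity, for illustration only:

import numpy as np

x = np.array([[1., 2., 4.], [3., 5., 2.]])
y = x.prod(axis=1, keepdims=True)   # forward: reduce_prod over dim 1
dy = np.ones_like(y)                # upstream gradient

dx = dy * y / x                     # ProdGradFunctor: dy * y * x^{-1}
# For p = x0 * x1 * ..., dp/dxi = p / xi, so each entry is the product of the others.
assert np.allclose(dx, [[8., 4., 2.], [10., 6., 15.]])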
paddle/fluid/operators/reduce_sum_op.cc (new file, 0 → 100644)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_sum_op.h"
REGISTER_REDUCE_OP(reduce_sum);
REGISTER_OP_CPU_KERNEL(reduce_sum,
                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                         float, ops::SumFunctor>,
                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                         double, ops::SumFunctor>,
                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                         int, ops::SumFunctor>,
                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                         int64_t, ops::SumFunctor>);
REGISTER_OP_CPU_KERNEL(
    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
                                           float, ops::SumGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
                          ops::SumGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
                          ops::SumGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
                          ops::SumGradFunctor>);
paddle/fluid/operators/reduce_sum_op.cu (new file, 0 → 100644)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_sum_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_sum,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          float, ops::SumFunctor>,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          double, ops::SumFunctor>,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          int, ops::SumFunctor>,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          int64_t, ops::SumFunctor>);
REGISTER_OP_CUDA_KERNEL(
    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::SumGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                          ops::SumGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
                          ops::SumGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
                          ops::SumGradFunctor>);
paddle/fluid/operators/reduce_sum_op.h (new file, 0 → 100644)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_op.h"
namespace paddle {
namespace operators {

struct SumFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    y->device(place) = x->sum(dim);
  }
};

struct SumGradFunctor {
  template <typename DeviceContext, typename X, typename Y, typename DX,
            typename DY, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                  const Dim& dim, int size) {
    dx->device(place) = dy->broadcast(dim);
  }
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/reverse_op.cc (new file, 0 → 100644)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reverse_op.h"
#include <vector>
namespace paddle {
namespace operators {

class ReverseOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
    const auto& x_dims = ctx->GetInputDim("X");
    const auto& axis = ctx->Attrs().Get<std::vector<int>>("axis");
    PADDLE_ENFORCE(!axis.empty(), "'axis' can not be empty.");
    for (int a : axis) {
      PADDLE_ENFORCE_LT(a, x_dims.size(),
                        "The axis must be less than input tensor's rank.");
    }
    ctx->SetOutputDim("Out", x_dims);
  }
};

class ReverseOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X", "The LoDTensor to be flipped.");
    AddOutput("Out", "The LoDTensor after flipping.");
    AddAttr<std::vector<int>>(
        "axis", "The axises that along which order of elements is reversed.");
    AddComment(R"DOC(
Reverse Operator.
Reverse the order of elements in the input LoDTensor along given axises.
Case 1:
Given
X = [[1, 2, 3, 4, 5]
[6, 7, 8, 9, 10]
[11, 12, 13, 14, 15]],
and
axis = [0],
we get:
Out = [[11, 12, 13, 14, 15]
[6, 7, 8, 9, 10]
[1, 2, 3, 4, 5]].
Case 2:
Given
X = [[[1, 2, 3, 4]
[5, 6, 7, 8]]
[[9, 10, 11, 12]
[13, 14, 15, 16]]],
and
axis = [0, 2],
we get:
Out = [[[12, 11, 10, 9]
[16, 15, 14, 13]]
[[4, 3, 2, 1]
[8, 7, 6, 5]]],
)DOC");
  }
};

class ReverseGradMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

  std::unique_ptr<framework::OpDesc> Apply() const override {
    auto* grad_op = new framework::OpDesc();
    grad_op->SetType("reverse");
    grad_op->SetInput("X", OutputGrad("Out"));
    grad_op->SetOutput("Out", InputGrad("X"));
    grad_op->SetAttr("axis", GetAttr("axis"));
    return std::unique_ptr<framework::OpDesc>(grad_op);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OPERATOR(reverse, ops::ReverseOp, ops::ReverseOpMaker,
                  ops::ReverseGradMaker);
REGISTER_OPERATOR(reverse_grad, ops::ReverseOp);
REGISTER_OP_CPU_KERNEL(
    reverse, ops::ReverseKernel<paddle::platform::CPUDeviceContext, int>,
    ops::ReverseKernel<paddle::platform::CPUDeviceContext, uint8_t>,
    ops::ReverseKernel<paddle::platform::CPUDeviceContext, int64_t>,
    ops::ReverseKernel<paddle::platform::CPUDeviceContext, bool>,
    ops::ReverseKernel<paddle::platform::CPUDeviceContext, float>,
    ops::ReverseKernel<paddle::platform::CPUDeviceContext, double>)
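The two cases in the ReverseOp comment above can be reproduced with NumPy's flip, which performs the same axis-wise reversal the Eigen kernel implements (illustration only, not part of the commit):

import numpy as np

# Case 1: reverse along axis 0.
X = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]])
assert (np.flip(X, axis=0)[0] == [11, 12, 13, 14, 15]).all()

# Case 2: reverse along axes 0 and 2.
X = np.arange(1, 17).reshape(2, 2, 4)
out = np.flip(np.flip(X, axis=0), axis=2)
assert (out[0, 0] == [12, 11, 10, 9]).all()
assert (out[1, 1] == [8, 7, 6, 5]).all()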
paddle/fluid/operators/reverse_op.cu (new file, 0 → 100644)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reverse_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    reverse, ops::ReverseKernel<paddle::platform::CUDADeviceContext, int>,
    ops::ReverseKernel<paddle::platform::CUDADeviceContext, uint8_t>,
    ops::ReverseKernel<paddle::platform::CUDADeviceContext, int64_t>,
    ops::ReverseKernel<paddle::platform::CUDADeviceContext, bool>,
    ops::ReverseKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ReverseKernel<paddle::platform::CUDADeviceContext, double>)
paddle/fluid/operators/reverse_op.h (new file, 0 → 100644)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T, int Rank>
struct ReverseFunctor {
  void operator()(const DeviceContext& context, const framework::LoDTensor& in,
                  framework::LoDTensor* out, const std::vector<int>& axis) {
    Eigen::array<bool, Rank> reverse_axis;
    for (int i = 0; i < Rank; ++i) {
      reverse_axis[i] = false;
    }
    for (int a : axis) {
      reverse_axis[a] = true;
    }

    auto in_eigen = framework::EigenTensor<T, Rank>::From(in);
    auto out_eigen = framework::EigenTensor<T, Rank>::From(*out);
    auto* dev = context.eigen_device();

    out_eigen.device(*dev) = in_eigen.reverse(reverse_axis);
  }
};

template <typename DeviceContext, typename T>
class ReverseKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<framework::LoDTensor>("X");
    auto* out = context.Output<framework::LoDTensor>("Out");
    out->mutable_data<T>(context.GetPlace());
    const auto& axis = context.Attr<std::vector<int>>("axis");
    int rank = x->dims().size();
    auto& dev_ctx = context.template device_context<DeviceContext>();

    switch (rank) {
      case 1:
        ReverseFunctor<DeviceContext, T, 1> functor1;
        functor1(dev_ctx, *x, out, axis);
        break;
      case 2:
        ReverseFunctor<DeviceContext, T, 2> functor2;
        functor2(dev_ctx, *x, out, axis);
        break;
      case 3:
        ReverseFunctor<DeviceContext, T, 3> functor3;
        functor3(dev_ctx, *x, out, axis);
        break;
      case 4:
        ReverseFunctor<DeviceContext, T, 4> functor4;
        functor4(dev_ctx, *x, out, axis);
        break;
      case 5:
        ReverseFunctor<DeviceContext, T, 5> functor5;
        functor5(dev_ctx, *x, out, axis);
        break;
      case 6:
        ReverseFunctor<DeviceContext, T, 6> functor6;
        functor6(dev_ctx, *x, out, axis);
        break;
      default:
        PADDLE_THROW(
            "Reserve operator doesn't supports tensors whose ranks are greater "
            "than 6.");
    }
  }
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/softmax_op.cc
@@ -49,6 +49,9 @@ class SoftmaxOp : public framework::OperatorWithKernel {
      const framework::ExecutionContext& ctx) const override {
    // choose cudnn kernel if the runtime supported.
    framework::LibraryType library_{framework::LibraryType::kPlain};
    std::string data_format = ctx.Attr<std::string>("data_format");
    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);

#ifdef PADDLE_WITH_CUDA
    if (platform::CanCUDNNBeUsed(ctx)) {
      library_ = framework::LibraryType::kCUDNN;
@@ -58,6 +61,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
    if (library_ == framework::LibraryType::kPlain &&
        platform::CanMKLDNNBeUsed(ctx)) {
      library_ = framework::LibraryType::kMKLDNN;
      layout_ = framework::DataLayout::kMKLDNN;
    }
#endif
@@ -68,9 +72,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
"float16 can only be used on GPU place"
);
}
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
return
framework
::
OpKernelType
(
input_data_type
,
ctx
.
GetPlace
(),
framework
::
StringToDataLayout
(
data_format
),
return
framework
::
OpKernelType
(
input_data_type
,
ctx
.
GetPlace
(),
layout_
,
library_
);
}
};
@@ -142,6 +144,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
      const framework::ExecutionContext& ctx) const override {
    // choose cudnn kernel if the runtime supported.
    framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
    if (platform::CanCUDNNBeUsed(ctx)) {
      library_ = framework::LibraryType::kCUDNN;
paddle/fluid/platform/mkldnn_helper.h
@@ -16,6 +16,7 @@ limitations under the License. */
#include <mkldnn.h>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace platform {
@@ -86,5 +87,17 @@ inline mkldnn::memory::data_type MKLDNNGetDataType<float>() {
  return mkldnn::memory::f32;
}

inline void Reorder(const mkldnn::memory& src, const mkldnn::memory& dst) {
  auto reorder_prim = mkldnn::reorder(src, dst);
  std::vector<mkldnn::primitive> pipeline;
  pipeline.push_back(reorder_prim);
  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}

inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) {
  return static_cast<mkldnn::memory::format>(
      memory.get_primitive_desc().desc().data.format);
}

}  // namespace platform
}  // namespace paddle
paddle/scripts/paddle_build.sh
@@ -447,7 +447,7 @@ EOF
# run paddle version to install python packages first
    RUN apt-get update && \
        ${NCCL_DEPS} \
        apt-get install -y wget python-pip python-opencv dmidecode python-tk && easy_install -U pip && \
        apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
        pip install /*.whl; apt-get install -f -y && \
        apt-get clean -y && \
        rm -f /*.whl && \
python/paddle/fluid/executor.py
@@ -170,6 +170,8 @@ def get_program_cache_key(feed, fetch_list):
            return var.desc.name()
        elif isinstance(var, str):
            return var
        elif isinstance(var, basestring):
            return str(var)
        else:
            raise TypeError(str(var) + " should be Variable or str")
python/paddle/fluid/framework.py
@@ -72,6 +72,8 @@ def convert_np_dtype_to_dtype_(np_dtype):
        return core.VarDesc.VarType.INT64
    elif dtype == np.bool:
        return core.VarDesc.VarType.BOOL
    elif dtype == np.uint16:
        return core.VarDesc.VarType.INT16
    elif dtype == np.uint8:
        return core.VarDesc.VarType.UINT8
    else:
@@ -368,6 +370,13 @@ class Operator(object):
Block. Users can use the build in instructions to describe their neural
network.
"""
    OP_WITHOUT_KERNEL_SET = {
        'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
        'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
        'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
        'ncclInit', 'channel_create', 'channel_close', 'channel_send',
        'channel_recv', 'select'
    }

    def __init__(self,
                 block,
@@ -504,17 +513,13 @@ class Operator(object):
                else:
                    self.desc.set_attr(attr_name, self.attrs[attr_name])

        self.desc.check_attrs()
        no_kernel_op_set = {
            'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
            'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
            'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
            'load_combine', 'ncclInit', 'channel_create', 'channel_close',
            'channel_send', 'channel_recv', 'select', 'gen_nccl_id'
        }
        if type not in no_kernel_op_set:
        if self.has_kernel(type):
            self.desc.infer_var_type(self.block.desc)
            self.desc.infer_shape(self.block.desc)

    def has_kernel(self, op_type):
        return op_type not in self.OP_WITHOUT_KERNEL_SET

    def to_string(self, throw_on_error):
        """
        To debug string.
@@ -742,7 +747,9 @@ class Block(object):
    def var(self, name):
        if not isinstance(name, basestring):
            raise TypeError()
            raise TypeError(
                "var require string as parameter, but get %s instead." %
                (type(name)))
        v = self.vars.get(name, None)
        if v is None:
            raise ValueError("var %s not in this block" % name)
python/paddle/fluid/layers/io.py
@@ -434,7 +434,7 @@ def open_files(filenames,
               shapes,
               lod_levels,
               dtypes,
               thread_num,
               thread_num=1,
               buffer_size=None,
               pass_num=1,
               for_parallel=True):
python/paddle/fluid/layers/tensor.py
@@ -363,6 +363,40 @@ def zeros(shape, dtype, force_cpu=False):
    return fill_constant(value=0.0, **locals())


def reverse(x, axis):
    """
    **reverse**

    This function reverses the input 'x' along the given axes.

    Args:
        x(Variable): the input to be reversed.
        axis(int|tuple|list): Axis that along which order of elements
                    is reversed. If it is a tuple or a list, reversing
                    will be applied on each axis in the tuple or list.

    Returns:
        Variable: The reversed tensor.

    Examples:
        .. code-block:: python

          out = fluid.layers.reverse(x=in, axis=0)
          # or:
          out = fluid.layers.reverse(x=in, axis=[0,1])
    """
    if isinstance(axis, int):
        axis = [axis]
    helper = LayerHelper("reverse", **locals())
    out = helper.create_tmp_variable(dtype=x.dtype)
    helper.append_op(
        type='reverse',
        inputs={'Input': x},
        outputs={'Out': [out]},
        attrs={'axis': axis})
    return out
def save(x, file_path, overwrite=True):
    """
    Saves a variable as a file.
python/paddle/fluid/tests/unittests/benchmark.py (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import unittest
import time
import itertools

import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from op_test import OpTest


class BenchmarkSuite(OpTest):
    def timeit_function(self, callback, iters, *args, **kwargs):
        assert iters != 0, "Iters should >= 1"
        start = time.time()
        for i in range(iters):
            callback(*args, **kwargs)
        elapse = time.time() - start
        return elapse / iters

    def _assert_cpu_gpu_same(self, cpu_outs, gpu_outs, fetch_list, atol):
        for item_cpu_out, item_gpu_out, variable in zip(cpu_outs, gpu_outs,
                                                        fetch_list):
            # the cpu version is baseline, expect gpu version keep same with cpu version.
            expect = item_cpu_out
            expect_t = np.array(item_cpu_out)
            actual = item_gpu_out
            actual_t = np.array(item_gpu_out)
            var_name = variable if isinstance(variable,
                                              basestring) else variable.name
            self.assertTrue(
                np.allclose(
                    actual_t, expect_t, atol=atol),
                "Output (" + var_name + ") has diff" + str(actual_t) + "\n" +
                str(expect_t))
            self.assertListEqual(actual.lod(),
                                 expect.lod(),
                                 "Output (" + var_name + ") has different lod")

    def _get_input_names(self):
        inputs = []
        for name, value in self.inputs.iteritems():
            if isinstance(value, list):
                inputs.extend([sub_name for sub_name, _ in value])
            inputs.append(name)
        return inputs

    def _get_output_names(self):
        outputs = []
        for var_name, var in self.outputs.iteritems():
            if isinstance(var, list):
                for sub_var_name, sub_var in var:
                    outputs.append(sub_var_name)
            else:
                outputs.append(var_name)
        if len(outputs) == 0:
            for out_name, out_dup in Operator.get_op_outputs(self.op_type):
                outputs.append(str(out_name))
        return outputs

    def check_output_stability(self, atol=1e-8):
        places = self._get_places()
        if len(places) < 2:
            return
        cpu_outs, fetch_list = self._calc_output(places[0])
        gpu_outs, _ = self._calc_output(places[1])
        self._assert_cpu_gpu_same(cpu_outs, gpu_outs, fetch_list, atol)

    def timeit_output_with_place(self, place, iters):
        return self.timeit_function(self.calc_output, iters, place)

    def timeit_output(self, iters=100):
        places = self._get_places()
        elapses = []
        for place in places:
            elapses.append(self.timeit_output_with_place(place, iters))
        for place, elapse in zip(places, elapses):
            print("One pass of ({2}_op) at {0} cost {1}".format(
                str(place), elapse, self.op_type))

    def timeit_grad_with_place(self, place, iters=100):
        inputs_to_check = self._get_input_names()
        output_names = self._get_output_names()
        return self.timeit_function(
            self._get_gradient,
            iters,
            inputs_to_check,
            place,
            output_names,
            no_grad_set=None)

    def timeit_grad(self, iters=100):
        places = self._get_places()
        elapses = []
        for place in places:
            elapses.append(self.timeit_grad_with_place(place, iters))
        for place, elapse in zip(places, elapses):
            print("One pass of ({2}_grad_op) at {0} cost {1}".format(
                str(place), elapse, self.op_type))
python/paddle/fluid/tests/unittests/benchmark_sum_op.py (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid as fluid
from benchmark import BenchmarkSuite
from op_test import OpTest

# This is a demo op test case for operator benchmarking and high resolution number stability alignment.


class TestSumOp(BenchmarkSuite):
    def setUp(self):
        self.op_type = "sum"
        self.customize_testcase()
        self.customize_fetch_list()

    def customize_fetch_list(self):
        """
        customize fetch list, configure the wanted variables.
        >>> self.fetch_list = ["Out"]
        """
        self.fetch_list = ["Out"]
        # pass

    def customize_testcase(self):
        # a test case
        x0 = np.random.random((300, 400)).astype('float32')
        x1 = np.random.random((300, 400)).astype('float32')
        x2 = np.random.random((300, 400)).astype('float32')

        # NOTE: if the output is empty, then it will be auto-filled by BenchmarkSuite.
        # only the output dtype is used, the shape, lod and data is computed from input.
        self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
        self.outputs = {"Out": x0 + x1 + x2}

    def test_check_output(self):
        """
        compare the output with customized output. In this case,
        you should set the correct output by hands.
        >>> self.outputs = {"Out": x0 + x1 + x2}
        """
        self.check_output(atol=1e-8)

    def test_output_stability(self):
        # compare the cpu gpu output in high resolution.
        self.check_output_stability()

    def test_timeit_output(self):
        """
        perf the op, time cost will be averaged in iters.
        output example
        >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818
        >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596
        """
        self.timeit_output(iters=100)

    def test_timeit_grad(self):
        """
        perf the op gradient, time cost will be averaged in iters.
        output example
        >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536
        >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653
        """
        self.timeit_grad(iters=100)


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/op_test.py
@@ -15,13 +15,17 @@
import unittest
import numpy as np
import random
import time
import itertools
import paddle.fluid.core as core
import collections
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.backward import append_backward
from paddle.fluid.op import Operator
from paddle.fluid.executor import Executor
from paddle.fluid.framework import Program, OpProtoHolder
from paddle.fluid.framework import Program, OpProtoHolder, Variable
from testsuite import create_op, set_input, append_input_output, append_loss_ops


def randomize_probability(batch_size, class_num, dtype='float32'):
...
@@ -33,73 +37,6 @@ def randomize_probability(batch_size, class_num, dtype='float32'):
    return prob


def create_op(scope, op_type, inputs, outputs, attrs):
    kwargs = dict()

    op_maker = core.op_proto_and_checker_maker
    op_role_attr_name = op_maker.kOpRoleAttrName()

    if op_role_attr_name not in attrs:
        attrs[op_role_attr_name] = int(op_maker.OpRole.Forward)

    def __create_var__(name, var_name):
        scope.var(var_name).get_tensor()
        kwargs[name].append(var_name)

    for in_name, in_dup in Operator.get_op_inputs(op_type):
        if in_name in inputs:
            kwargs[in_name] = []
            if in_dup:
                sub_in = inputs[in_name]
                for item in sub_in:
                    sub_in_name, _ = item[0], item[1]
                    __create_var__(in_name, sub_in_name)
            else:
                __create_var__(in_name, in_name)

    for out_name, out_dup in Operator.get_op_outputs(op_type):
        if out_name in outputs:
            kwargs[out_name] = []
            if out_dup:
                sub_out = outputs[out_name]
                for item in sub_out:
                    sub_out_name, _ = item[0], item[1]
                    __create_var__(out_name, sub_out_name)
            else:
                __create_var__(out_name, out_name)

    for attr_name in Operator.get_op_attr_names(op_type):
        if attr_name in attrs:
            kwargs[attr_name] = attrs[attr_name]

    return Operator(op_type, **kwargs)


def set_input(scope, op, inputs, place):
    def __set_input__(var_name, var):
        if isinstance(var, tuple) or isinstance(var, np.ndarray):
            tensor = scope.find_var(var_name).get_tensor()
            if isinstance(var, tuple):
                tensor.set_lod(var[1])
                var = var[0]
            tensor.set_dims(var.shape)
            tensor.set(var, place)
        elif isinstance(var, float):
            scope.find_var(var_name).set_float(var)
        elif isinstance(var, int):
            scope.find_var(var_name).set_int(var)

    for in_name, in_dup in Operator.get_op_inputs(op.type()):
        if in_name in inputs:
            if in_dup:
                sub_in = inputs[in_name]
                for item in sub_in:
                    sub_in_name, sub_in_val = item[0], item[1]
                    __set_input__(sub_in_name, sub_in_val)
            else:
                __set_input__(in_name, inputs[in_name])


def get_numeric_gradient(place, scope, op,
...
@@ -173,54 +110,15 @@ def get_numeric_gradient(place,
    return gradient_flat.reshape(tensor_to_check.get_dims())


def append_input_output(block, op_proto, np_list, is_input):
    '''Insert VarDesc and generate Python variable instance'''
    proto_list = op_proto.inputs if is_input else op_proto.outputs

    def create_var(block, name, np_list, var_proto):
        if name not in np_list:
            assert var_proto.intermediate, "{} not found".format(name)
            shape = None
            lod_level = None
        else:
            np_value = np_list[name]
            if isinstance(np_value, tuple):
                shape = list(np_value[0].shape)
                lod_level = len(np_value[1])
            else:
                shape = list(np_value.shape)
                lod_level = 0
        return block.create_var(
            dtype="float32", shape=shape, lod_level=lod_level, name=name)

    var_dict = {}
    for var_proto in proto_list:
        var_name = str(var_proto.name)
        if is_input:
            if (var_name not in np_list) and var_proto.dispensable:
                continue
            assert (var_name in np_list) or (var_proto.dispensable), \
                "Missing {} as input".format(var_name)
        if var_proto.duplicable:
            assert isinstance(np_list[var_name], list), \
                "Duplicable {} should be set as list".format(var_name)
            var_list = []
            for (name, np_value) in np_list[var_name]:
                var_list.append(
                    create_var(block, name, {name: np_value}, var_proto))
            var_dict[var_name] = var_list
        else:
            var_dict[var_name] = create_var(block, var_name, np_list, var_proto)

    return var_dict
class OpTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        '''Fix random seeds to remove randomness from tests'''
        cls._np_rand_state = np.random.get_state()
        cls._py_rand_state = random.getstate()
        cls.call_once = False
        cls.dtype = "float32"
        cls.outputs = {}

        np.random.seed(123)
        random.seed(124)
...
@@ -231,6 +129,31 @@ class OpTest(unittest.TestCase):
        np.random.set_state(cls._np_rand_state)
        random.setstate(cls._py_rand_state)

    def try_call_once(self, data_type):
        if not self.call_once:
            self.call_once = True
            self.dtype = data_type

    def infer_dtype_from_inputs_outputs(self, inputs, outputs):
        def infer_dtype(numpy_dict):
            assert isinstance(
                numpy_dict,
                dict), "self.inputs, self.outputs must be numpy_dict"
            for var_name, var_value in numpy_dict.iteritems():
                if isinstance(var_value, (np.ndarray, np.generic)):
                    self.try_call_once(var_value.dtype)
                elif isinstance(var_value, (list, tuple)):
                    # the case of self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
                    if len(var_value) > 1 and isinstance(
                            var_value[1], (np.ndarray, np.generic)):
                        instance = var_value[1]
                        self.try_call_once(instance[1].dtype)
                    else:
                        self.try_call_once("float32")

        infer_dtype(inputs)
        infer_dtype(outputs)
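For reference, the two inputs layouts this inference has to handle are the plain form and the duplicable form already used by the sum benchmark above (the values below are illustrative):

import numpy as np

# Plain input: one ndarray per input name (see TestReverseOp later in this diff).
plain_inputs = {"X": np.random.random((3, 4)).astype('float32')}

# Duplicable input: a list of (sub_name, ndarray) pairs under one name
# (see the "sum" benchmark above).
duplicable_inputs = {"X": [("x0", np.zeros((2, 2), dtype='float32')),
                           ("x1", np.ones((2, 2), dtype='float32'))]}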
    def feed_var(self, input_vars, place):
        feed_map = {}
        for var_name in input_vars:
...
@@ -254,18 +177,14 @@ class OpTest(unittest.TestCase):
        return feed_map

    def calc_output(self, place):
        outs, _ = self._calc_output(place)
        return outs

    def _calc_output(self, place):
    def _append_ops(self, block):
        op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
        program = Program()
        block = program.global_block()
        inputs = append_input_output(block, op_proto, self.inputs, True)
        outputs = append_input_output(block, op_proto, self.outputs, False)
        "infer datatype from inputs and outputs for this test case"
        self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
        inputs = append_input_output(block, op_proto, self.inputs, True,
                                     self.dtype)
        outputs = append_input_output(block, op_proto, self.outputs, False,
                                      self.dtype)
        op = block.append_op(
            type=self.op_type,
            inputs=inputs,
...
@@ -275,22 +194,68 @@ class OpTest(unittest.TestCase):
        op.desc.infer_var_type(block.desc)
        op.desc.infer_shape(block.desc)

        fetch_list = []
        for var_name, var in outputs.iteritems():
            if var_name in self.outputs:
    def _get_io_vars(self, block, numpy_inputs):
        inputs = {}
        for name, value in numpy_inputs.iteritems():
            if isinstance(value, list):
                var_list = [
                    block.var(sub_name) for sub_name, sub_value in value
                ]
                inputs[name] = var_list
            else:
                inputs[name] = block.var(name)
        return inputs

    def _get_inputs(self, block):
        return self._get_io_vars(block, self.inputs)

    def _get_outputs(self, block):
        return self._get_io_vars(block, self.outputs)

    def calc_output(self, place):
        outs, _ = self._calc_output(place)
        return outs

    def _calc_output(self, place, parallel=False):
        program = Program()
        block = program.global_block()
        self._append_ops(block)

        inputs = self._get_inputs(block)
        outputs = self._get_outputs(block)
        feed_map = self.feed_var(inputs, place)

        if parallel:
            use_cuda = False
            if isinstance(place, fluid.CUDAPlace(0)):
                use_cuda = True
            executor = fluid.ParallelExecutor(
                use_cuda=use_cuda, loss_name=loss.name, main_program=program)
        else:
            executor = Executor(place)

        fetch_list = getattr(self, "fetch_list", [])
        # if the fetch_list is customized by user, we use it directly.
        # if not, fill the fetch_list by the user configured outputs in test.
        if len(fetch_list) == 0:
            for var_name, var in outputs.iteritems():
                if isinstance(var, list):
                    for v in var:
                        fetch_list.append(v)
                else:
                    fetch_list.append(var)

        feed_map = self.feed_var(inputs, place)

        exe = Executor(place)
        outs = exe.run(program,
                       feed=feed_map,
                       fetch_list=fetch_list,
                       return_numpy=False)
        # if the fetch_list still empty, fill the fetch_list by the operator output.
        if len(fetch_list) == 0:
            for out_name, out_dup in Operator.get_op_outputs(self.op_type):
                fetch_list.append(str(out_name))
        # fetch_list = map(block.var, fetch_list)
        if not isinstance(fetch_list[0], Variable):
            fetch_list = map(block.var, fetch_list)
        outs = executor.run(program,
                            feed=feed_map,
                            fetch_list=fetch_list,
                            return_numpy=False)
        return outs, fetch_list

    def check_output_with_place(self, place, atol):
...
@@ -346,17 +311,19 @@ class OpTest(unittest.TestCase):
                    "Output (" + out_name + ") has different lod at " + str(place))

    def check_output(self, atol=1e-5):
        places = [core.CPUPlace()]
    def _get_places(self):
        places = [fluid.CPUPlace()]
        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
            places.append(core.CUDAPlace(0))
        return places

    def check_output(self, atol=1e-5):
        places = self._get_places()
        for place in places:
            self.check_output_with_place(place, atol)

    def check_output_customized(self, checker):
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
            places.append(core.CUDAPlace(0))
        places = self._get_places()
        for place in places:
            outs = self.calc_output(place)
            outs = [np.array(out) for out in outs]
...
@@ -389,9 +356,7 @@ class OpTest(unittest.TestCase):
                   in_place=False,
                   max_relative_error=0.005,
                   user_defined_grads=None):
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
            places.append(core.CUDAPlace(0))
        places = self._get_places()
        for place in places:
            self.check_grad_with_place(place, inputs_to_check, output_names,
                                       no_grad_set, numeric_grad_delta,
...
@@ -438,35 +403,6 @@ class OpTest(unittest.TestCase):
                        max_relative_error,
                        "Gradient Check On %s" % str(place))

    @staticmethod
    def _create_var_descs_(block, var_dict):
        # FIXME: Try unify with `append_input_output`
        for param_name in var_dict:
            var = var_dict[param_name]
            if not isinstance(var, list) and not isinstance(var, tuple):
                var = [(param_name, var, None)]
            if not isinstance(var[0], list) and not isinstance(var[0], tuple):
                var = [(param_name, var[0], var[1])]

            for i, item in enumerate(var):
                if not isinstance(item[0], basestring):
                    item = [[param_name] + list(item)]
                if len(item) == 2:
                    if isinstance(item[1], tuple):
                        var[i] = [item[0], item[1][0], item[1][1]]
                    else:
                        # only set var name and value, set lod to None
                        var[i] = list(item) + [None]
            var_descs = [(
                block.create_var(
                    name=name, shape=each.shape, dtype=each.dtype), each, lod)
                         for name, each, lod in var]

            yield param_name, var_descs

    @staticmethod
    def _merge_list(iterable):
        return reduce(lambda a, b: list(a) + list(b), iterable, [])

    @staticmethod
    def _numpy_to_lod_tensor(np_value, lod, place):
        tensor = core.LoDTensor()
...
@@ -497,83 +433,31 @@ class OpTest(unittest.TestCase):
            input.dtype = np.uint16
        return input

    def _get_gradient(self, input_to_check, place, output_names, no_grad_set):
    def _get_gradient(self,
                      input_to_check,
                      place,
                      output_names,
                      no_grad_set,
                      parallel=False):
        prog = Program()
        block = prog.global_block()
        inputs_with_np = {
            key: value
            for (key, value) in OpTest._create_var_descs_(
                block, getattr(self, 'inputs', {}))
        }
        outputs_with_np = {
            key: val
            for (key, val) in OpTest._create_var_descs_(
                block, getattr(self, 'outputs', {}))
        }
        inputs = {
            k: [item[0] for item in inputs_with_np[k]]
            for k in inputs_with_np
        }
        outputs = {
            k: [item[0] for item in outputs_with_np[k]]
            for k in outputs_with_np
        }

        op = block.append_op(
            type=self.op_type,
            inputs=inputs,
            outputs=outputs,
            attrs=getattr(self, 'attrs', {}))

        # infer variable type and infer shape in compile-time
        op.desc.infer_var_type(block.desc)
        op.desc.infer_shape(block.desc)

        mean_inputs = map(block.var, output_names)

        if len(mean_inputs) == 1:
            loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
            op = block.append_op(
                inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
            op.desc.infer_var_type(block.desc)
            op.desc.infer_shape(block.desc)
        else:
            avg_sum = []
            for cur_loss in mean_inputs:
                cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
                op = block.append_op(
                    inputs={"X": [cur_loss]},
                    outputs={"Out": [cur_avg_loss]},
                    type="mean")
                op.desc.infer_var_type(block.desc)
                op.desc.infer_shape(block.desc)
                avg_sum.append(cur_avg_loss)

            loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
            op_sum = block.append_op(
                inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
            op_sum.desc.infer_var_type(block.desc)
            op_sum.desc.infer_shape(block.desc)

            loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
            op_loss = block.append_op(
                inputs={"X": loss_sum},
                outputs={"Out": loss},
                type='scale',
                attrs={'scale': 1.0 / float(len(avg_sum))})
            op_loss.desc.infer_var_type(block.desc)
            op_loss.desc.infer_shape(block.desc)
        self._append_ops(block)
        loss = append_loss_ops(block, output_names)
        param_grad_list = append_backward(
            loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)

        feed_dict = {
            item[0].name: OpTest._numpy_to_lod_tensor(item[1], item[2], place)
            for p_name in inputs_with_np for item in inputs_with_np[p_name]
        }
        inputs = self._get_inputs(block)
        feed_dict = self.feed_var(inputs, place)

        fetch_list = [g for p, g in param_grad_list]
        executor = Executor(place)
        if parallel:
            use_cuda = False
            if isinstance(place, fluid.CUDAPlace(0)):
                use_cuda = True
            executor = fluid.ParallelExecutor(
                use_cuda=use_cuda, loss_name=loss.name, main_program=program)
        else:
            executor = Executor(place)
        return map(np.array,
                   executor.run(prog, feed_dict, fetch_list,
                                return_numpy=False))
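The loss-construction code removed above now lives in testsuite.append_loss_ops (imported at the top of the file). Conceptually it reduces each named output to a scalar with a mean op, sums the per-output means, and rescales by their count; a rough NumPy sketch of the same reduction (illustrative only, not the testsuite.py code):

import numpy as np

def scalar_loss(outputs):
    # Mean over each fetched output, then average the per-output means,
    # mirroring the mean -> sum -> scale(1/len) chain built above.
    per_output_means = [np.mean(np.asarray(o)) for o in outputs]
    return np.sum(per_output_means) / float(len(per_output_means))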
python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
View file @ d5a88b93
...
@@ -252,5 +252,25 @@ class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp):
        self.axis = 1


class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
    def init_input_output(self):
        self.x = np.random.rand(3, 20, 20).astype(self.dtype)
        self.y = np.random.rand(3, 1, 1).astype(self.dtype)
        self.out = self.x + self.y

    def init_axis(self):
        self.axis = -1


class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp):
    def init_input_output(self):
        self.x = np.random.rand(3, 10, 20).astype(self.dtype)
        self.y = np.random.rand(3, 1, 1).astype(self.dtype)
        self.out = self.x + self.y

    def init_axis(self):
        self.axis = -1
if __name__ == '__main__':
    unittest.main()
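Both channelwise cases build their reference output via NumPy broadcasting: a (3, 1, 1) tensor added to a (3, H, W) tensor is expanded along the trailing axes. A quick standalone check of that reference computation:

import numpy as np

x = np.random.rand(3, 20, 20).astype('float32')
y = np.random.rand(3, 1, 1).astype('float32')
out = x + y                      # y is broadcast over the last two axes
assert out.shape == (3, 20, 20)
assert np.allclose(out[1, 4, 7], x[1, 4, 7] + y[1, 0, 0])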
python/paddle/fluid/tests/unittests/test_lstm_op.py
View file @ d5a88b93
...
@@ -194,107 +194,104 @@ class TestLstmOp(OpTest):
            ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4)


class TestLstmOpHasInitial(TestLstmOp):
    def set_argument(self):
        self.lod = [[0, 2, 5, 7]]
        self.D = 16

        self.act_gate = 'sigmoid'
        self.act_cell = 'tanh'
        self.act_cand = 'tanh'

        self.has_initial_state = True
        self.is_reverse = True
        self.use_peepholes = True

    def test_check_grad(self):
        # TODO(qingqing) remove folowing lines after the check_grad is refined.
        N = len(self.lod[0]) - 1
        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
        self.outputs['BatchCellPreAct'] = np.zeros(
            (N, self.D)).astype('float64')
        self.check_grad(
            ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
            max_relative_error=5e-4)

    def test_check_grad_ingore_bias(self):
        N = len(self.lod[0]) - 1
        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
        self.outputs['BatchCellPreAct'] = np.zeros(
            (N, self.D)).astype('float64')
        self.check_grad(
            ['Input', 'Weight'], ['Hidden'],
            max_relative_error=5e-4,
            no_grad_set=set('Bias'))

    def test_check_grad_ingore_weight(self):
        N = len(self.lod[0]) - 1
        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
        self.outputs['BatchCellPreAct'] = np.zeros(
            (N, self.D)).astype('float64')
        self.check_grad(
            ['Input', 'Bias'], ['Hidden'],
            max_relative_error=5e-4,
            no_grad_set=set('Weight'))

    def test_check_grad_ingore_input(self):
        N = len(self.lod[0]) - 1
        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
        self.outputs['BatchCellPreAct'] = np.zeros(
            (N, self.D)).astype('float64')
        self.check_grad(
            ['Weight', 'Bias'], ['Hidden'],
            max_relative_error=5e-4,
            no_grad_set=set('Input'))

    def test_check_grad_ingore_h0(self):
        N = len(self.lod[0]) - 1
        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
        self.outputs['BatchCellPreAct'] = np.zeros(
            (N, self.D)).astype('float64')
        self.check_grad(
            ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
            max_relative_error=5e-4,
            no_grad_set=set('H0'))

    def test_check_grad_ingore_c0(self):
        N = len(self.lod[0]) - 1
        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
        self.outputs['BatchCellPreAct'] = np.zeros(
            (N, self.D)).astype('float64')
        self.check_grad(
            ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
            max_relative_error=5e-4,
            no_grad_set=set('C0'))


class TestLstmOpRerverse(TestLstmOp):
    def set_argument(self):
        self.lod = [[0, 2, 5, 7]]
        self.D = 16

        self.act_gate = 'sigmoid'
        self.act_cell = 'tanh'
        self.act_cand = 'tanh'

        self.has_initial_state = False
        self.is_reverse = True
        self.use_peepholes = True


class TestLstmOpNotUsePeepholes(TestLstmOp):
    def set_argument(self):
        self.lod = [[0, 2, 5, 7]]
        self.D = 16

        self.act_gate = 'sigmoid'
        self.act_cell = 'tanh'
        self.act_cand = 'tanh'

        self.has_initial_state = False
        self.is_reverse = True
        self.use_peepholes = False
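All of the grad checks above derive the batch size from the LoD: with lod = [[0, 2, 5, 7]] there are N = len(lod[0]) - 1 = 3 sequences (of lengths 2, 3 and 2), so the zero-filled BatchGate placeholder has shape (N, 4 * D). For example:

import numpy as np

lod = [[0, 2, 5, 7]]
D = 16
N = len(lod[0]) - 1                              # 3 sequences
batch_gate = np.zeros((N, 4 * D)).astype('float64')
print(batch_gate.shape)                          # (3, 64)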
# class TestLstmOpHasInitial(TestLstmOp):
# def set_argument(self):
# self.lod = [[0, 2, 5, 7]]
# self.D = 16
# self.act_gate = 'sigmoid'
# self.act_cell = 'tanh'
# self.act_cand = 'tanh'
# self.has_initial_state = True
# self.is_reverse = True
# self.use_peepholes = True
# def test_check_grad(self):
# # TODO(qingqing) remove folowing lines after the check_grad is refined.
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
# max_relative_error=5e-4)
# def test_check_grad_ingore_bias(self):
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Input', 'Weight'], ['Hidden'],
# max_relative_error=5e-4,
# no_grad_set=set('Bias'))
# def test_check_grad_ingore_weight(self):
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Input', 'Bias'], ['Hidden'],
# max_relative_error=5e-4,
# no_grad_set=set('Weight'))
# def test_check_grad_ingore_input(self):
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Weight', 'Bias'], ['Hidden'],
# max_relative_error=5e-4,
# no_grad_set=set('Input'))
# def test_check_grad_ingore_h0(self):
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
# max_relative_error=5e-4,
# no_grad_set=set('H0'))
# def test_check_grad_ingore_c0(self):
# N = len(self.lod[0]) - 1
# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
# self.outputs['BatchCellPreAct'] = np.zeros(
# (N, self.D)).astype('float64')
# self.check_grad(
# ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
# max_relative_error=5e-4,
# no_grad_set=set('C0'))
# class TestLstmOpRerverse(TestLstmOp):
# def set_argument(self):
# self.lod = [[0, 2, 5, 7]]
# self.D = 16
# self.act_gate = 'sigmoid'
# self.act_cell = 'tanh'
# self.act_cand = 'tanh'
# self.has_initial_state = False
# self.is_reverse = True
# self.use_peepholes = True
# class TestLstmOpNotUsePeepholes(TestLstmOp):
# def set_argument(self):
# self.lod = [[0, 2, 5, 7]]
# self.D = 16
# self.act_gate = 'sigmoid'
# self.act_cell = 'tanh'
# self.act_cand = 'tanh'
# self.has_initial_state = False
# self.is_reverse = True
# self.use_peepholes = False
if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_reverse_op.py
0 → 100644
View file @ d5a88b93
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest


class TestReverseOp(OpTest):
    def initTestCase(self):
        self.x = np.random.random((3, 4)).astype('float32')
        self.axis = [0]

    def setUp(self):
        self.initTestCase()
        self.op_type = "reverse"
        self.inputs = {"X": self.x}
        self.attrs = {'axis': self.axis}
        out = self.x
        for a in self.axis:
            out = np.flip(out, axis=a)
        self.outputs = {'Out': out}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(['X'], 'Out')


class TestCase0(TestReverseOp):
    def initTestCase(self):
        self.x = np.random.random((3, 4)).astype('float32')
        self.axis = [1]


class TestCase1(TestReverseOp):
    def initTestCase(self):
        self.x = np.random.random((3, 4)).astype('float32')
        self.axis = [0, 1]


class TestCase2(TestReverseOp):
    def initTestCase(self):
        self.x = np.random.random((3, 4, 5)).astype('float32')
        self.axis = [0, 2]


class TestCase3(TestReverseOp):
    def initTestCase(self):
        self.x = np.random.random((3, 4, 5)).astype('float32')
        self.axis = [1, 2]


if __name__ == '__main__':
    unittest.main()
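The reference output is built by flipping along each requested axis in turn, so multi-axis cases are just composed flips. A quick check of that reference computation with NumPy:

import numpy as np

x = np.arange(6).reshape(2, 3)
out = np.flip(np.flip(x, axis=0), axis=1)   # axis = [0, 1]
# [[0 1 2],        [[5 4 3],
#  [3 4 5]]   -->   [2 1 0]]
print(out)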
python/paddle/fluid/tests/unittests/testsuite.py
0 → 100644
浏览文件 @
d5a88b93
This diff is collapsed; its contents are not shown here.