Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
234a1d92
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
234a1d92
编写于
11月 08, 2018
作者:
D
dzhwinter
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'origin/develop' into windows/debug
test=develop
上级
2835e044
a270fdf2
变更
79
展开全部
隐藏空白更改
内联
并排
Showing
79 changed file
with
1279 addition
and
1419 deletion
+1279
-1419
paddle/fluid/framework/details/execution_strategy.h
paddle/fluid/framework/details/execution_strategy.h
+2
-0
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
...uid/framework/details/fast_threaded_ssa_graph_executor.cc
+3
-1
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
...le/fluid/framework/details/threaded_ssa_graph_executor.cc
+3
-1
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+1
-1
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+12
-11
paddle/fluid/framework/threadpool.cc
paddle/fluid/framework/threadpool.cc
+18
-12
paddle/fluid/framework/threadpool.h
paddle/fluid/framework/threadpool.h
+8
-3
paddle/fluid/inference/CMakeLists.txt
paddle/fluid/inference/CMakeLists.txt
+1
-1
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+20
-1
paddle/fluid/inference/tensorrt/engine.cc
paddle/fluid/inference/tensorrt/engine.cc
+4
-0
paddle/fluid/inference/tensorrt/engine.h
paddle/fluid/inference/tensorrt/engine.h
+12
-0
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+16
-7
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+2
-28
paddle/fluid/inference/tests/api/tester_helper.h
paddle/fluid/inference/tests/api/tester_helper.h
+29
-0
paddle/fluid/inference/tests/api/trt_models_tester.cc
paddle/fluid/inference/tests/api/trt_models_tester.cc
+11
-6
paddle/fluid/inference/tests/test.cmake
paddle/fluid/inference/tests/test.cmake
+0
-0
paddle/fluid/inference/tests/test_helper.h
paddle/fluid/inference/tests/test_helper.h
+8
-18
paddle/fluid/operators/activation_op.cu
paddle/fluid/operators/activation_op.cu
+3
-1
paddle/fluid/operators/activation_op.h
paddle/fluid/operators/activation_op.h
+2
-3
paddle/fluid/operators/adagrad_op.cc
paddle/fluid/operators/adagrad_op.cc
+2
-2
paddle/fluid/operators/adagrad_op.cu
paddle/fluid/operators/adagrad_op.cu
+2
-2
paddle/fluid/operators/adagrad_op.h
paddle/fluid/operators/adagrad_op.h
+14
-0
paddle/fluid/operators/batch_norm_op.cu.cc
paddle/fluid/operators/batch_norm_op.cu.cc
+12
-9
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+4
-1
paddle/fluid/operators/cross_entropy_op.cu
paddle/fluid/operators/cross_entropy_op.cu
+9
-4
paddle/fluid/operators/distributed/grpc_variable_response.cc
paddle/fluid/operators/distributed/grpc_variable_response.cc
+2
-2
paddle/fluid/operators/distributed/request_handler_impl.cc
paddle/fluid/operators/distributed/request_handler_impl.cc
+0
-1
paddle/fluid/operators/distributed/rpc_server.cc
paddle/fluid/operators/distributed/rpc_server.cc
+0
-32
paddle/fluid/operators/distributed/rpc_server.h
paddle/fluid/operators/distributed/rpc_server.h
+0
-18
paddle/fluid/operators/distributed/variable_response.cc
paddle/fluid/operators/distributed/variable_response.cc
+3
-0
paddle/fluid/operators/distributed/variable_response.h
paddle/fluid/operators/distributed/variable_response.h
+2
-0
paddle/fluid/operators/elementwise_add_op.cu
paddle/fluid/operators/elementwise_add_op.cu
+2
-1
paddle/fluid/operators/elementwise_op_function.h
paddle/fluid/operators/elementwise_op_function.h
+2
-2
paddle/fluid/operators/listen_and_serv_op.cc
paddle/fluid/operators/listen_and_serv_op.cc
+0
-1
paddle/fluid/operators/math/cos_sim_functor.cu
paddle/fluid/operators/math/cos_sim_functor.cu
+1
-1
paddle/fluid/operators/math/cross_entropy.cu
paddle/fluid/operators/math/cross_entropy.cu
+16
-6
paddle/fluid/operators/math/cross_entropy.h
paddle/fluid/operators/math/cross_entropy.h
+21
-0
paddle/fluid/operators/math/fc_compute.h
paddle/fluid/operators/math/fc_compute.h
+2
-2
paddle/fluid/operators/math/jit_code.cc
paddle/fluid/operators/math/jit_code.cc
+37
-7
paddle/fluid/operators/math/jit_code.h
paddle/fluid/operators/math/jit_code.h
+27
-8
paddle/fluid/operators/math/jit_kernel.h
paddle/fluid/operators/math/jit_kernel.h
+7
-7
paddle/fluid/operators/math/jit_kernel_blas.cc
paddle/fluid/operators/math/jit_kernel_blas.cc
+114
-141
paddle/fluid/operators/math/jit_kernel_rnn.cc
paddle/fluid/operators/math/jit_kernel_rnn.cc
+5
-5
paddle/fluid/operators/math/jit_kernel_test.cc
paddle/fluid/operators/math/jit_kernel_test.cc
+11
-7
paddle/fluid/operators/math/selected_rows_functor.cu
paddle/fluid/operators/math/selected_rows_functor.cu
+18
-9
paddle/fluid/operators/math/selected_rows_functor.h
paddle/fluid/operators/math/selected_rows_functor.h
+0
-51
paddle/fluid/operators/math/softmax.cu
paddle/fluid/operators/math/softmax.cu
+3
-0
paddle/fluid/operators/mean_op.cu
paddle/fluid/operators/mean_op.cu
+6
-2
paddle/fluid/operators/mean_op.h
paddle/fluid/operators/mean_op.h
+1
-2
paddle/fluid/operators/mul_op.cu.cc
paddle/fluid/operators/mul_op.cu.cc
+4
-3
paddle/fluid/operators/pool_cudnn_op.cu.cc
paddle/fluid/operators/pool_cudnn_op.cu.cc
+2
-1
paddle/fluid/operators/scale_op.cu
paddle/fluid/operators/scale_op.cu
+5
-1
paddle/fluid/operators/softmax_cudnn_op.cu.cc
paddle/fluid/operators/softmax_cudnn_op.cu.cc
+2
-1
paddle/fluid/operators/softmax_op.cu.cc
paddle/fluid/operators/softmax_op.cu.cc
+2
-1
paddle/fluid/operators/sum_op.cu
paddle/fluid/operators/sum_op.cu
+4
-1
paddle/fluid/operators/sum_op.h
paddle/fluid/operators/sum_op.h
+1
-1
paddle/fluid/operators/tensorrt_engine_op.h
paddle/fluid/operators/tensorrt_engine_op.h
+3
-1
paddle/fluid/platform/init.cc
paddle/fluid/platform/init.cc
+3
-1
paddle/fluid/platform/profiler.cc
paddle/fluid/platform/profiler.cc
+1
-1
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+6
-1
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+0
-1
python/paddle/fluid/io.py
python/paddle/fluid/io.py
+7
-2
python/paddle/fluid/layers/io.py
python/paddle/fluid/layers/io.py
+1
-1
python/paddle/fluid/recordio_writer.py
python/paddle/fluid/recordio_writer.py
+0
-3
python/paddle/fluid/tests/unittests/op_test.py
python/paddle/fluid/tests/unittests/op_test.py
+9
-8
python/paddle/fluid/tests/unittests/test_activation_op.py
python/paddle/fluid/tests/unittests/test_activation_op.py
+90
-537
python/paddle/fluid/tests/unittests/test_conv2d_op.py
python/paddle/fluid/tests/unittests/test_conv2d_op.py
+63
-88
python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
+201
-137
python/paddle/fluid/tests/unittests/test_mean_op.py
python/paddle/fluid/tests/unittests/test_mean_op.py
+25
-1
python/paddle/fluid/tests/unittests/test_mul_op.py
python/paddle/fluid/tests/unittests/test_mul_op.py
+78
-32
python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
...addle/fluid/tests/unittests/test_parallel_executor_crf.py
+13
-8
python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
...e/fluid/tests/unittests/test_parallel_executor_dry_run.py
+80
-0
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
...dle/fluid/tests/unittests/test_parallel_executor_mnist.py
+9
-52
python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
+2
-2
python/paddle/fluid/tests/unittests/test_pool2d_op.py
python/paddle/fluid/tests/unittests/test_pool2d_op.py
+74
-100
python/paddle/fluid/tests/unittests/test_scale_op.py
python/paddle/fluid/tests/unittests/test_scale_op.py
+52
-3
python/paddle/fluid/tests/unittests/test_softmax_op.py
python/paddle/fluid/tests/unittests/test_softmax_op.py
+18
-6
python/paddle/fluid/tests/unittests/test_sum_op.py
python/paddle/fluid/tests/unittests/test_sum_op.py
+42
-4
python/setup.py.in
python/setup.py.in
+4
-4
未找到文件。
paddle/fluid/framework/details/execution_strategy.h
浏览文件 @
234a1d92
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
// limitations under the License.
// limitations under the License.
#pragma once
#pragma once
#include <cstddef> // for size_t
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
...
@@ -26,6 +27,7 @@ struct ExecutionStrategy {
...
@@ -26,6 +27,7 @@ struct ExecutionStrategy {
bool
allow_op_delay_
{
false
};
bool
allow_op_delay_
{
false
};
size_t
num_iteration_per_drop_scope_
{
100
};
size_t
num_iteration_per_drop_scope_
{
100
};
ExecutorType
type_
{
kDefault
};
ExecutorType
type_
{
kDefault
};
bool
dry_run_
{
false
};
};
};
}
// namespace details
}
// namespace details
...
...
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
浏览文件 @
234a1d92
...
@@ -128,7 +128,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
...
@@ -128,7 +128,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
size_t
complete
=
0
;
size_t
complete
=
0
;
while
(
op_to_run
!=
nullptr
)
{
while
(
op_to_run
!=
nullptr
)
{
try
{
try
{
op_to_run
->
Run
(
strategy_
.
use_cuda_
);
if
(
LIKELY
(
!
strategy_
.
dry_run_
))
{
op_to_run
->
Run
(
strategy_
.
use_cuda_
);
}
++
complete
;
++
complete
;
}
catch
(...)
{
}
catch
(...)
{
exception_
.
Catch
(
std
::
current_exception
());
exception_
.
Catch
(
std
::
current_exception
());
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
浏览文件 @
234a1d92
...
@@ -211,7 +211,9 @@ void ThreadedSSAGraphExecutor::RunOp(
...
@@ -211,7 +211,9 @@ void ThreadedSSAGraphExecutor::RunOp(
if
(
VLOG_IS_ON
(
10
))
{
if
(
VLOG_IS_ON
(
10
))
{
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" : "
<<
op
->
DebugString
();
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" : "
<<
op
->
DebugString
();
}
}
op
->
Run
(
strategy_
.
use_cuda_
);
if
(
LIKELY
(
!
strategy_
.
dry_run_
))
{
op
->
Run
(
strategy_
.
use_cuda_
);
}
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" Done "
;
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" Done "
;
running_ops_
--
;
running_ops_
--
;
ready_var_q
->
Extend
(
op
->
Outputs
());
ready_var_q
->
Extend
(
op
->
Outputs
());
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
浏览文件 @
234a1d92
...
@@ -48,7 +48,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
...
@@ -48,7 +48,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
// Use topological sort algorithm
// Use topological sort algorithm
FeedFetchList
Run
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
)
override
;
FeedFetchList
Run
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
)
override
;
~
ThreadedSSAGraphExecutor
()
{}
~
ThreadedSSAGraphExecutor
()
final
=
default
;
private:
private:
void
RunOp
(
const
std
::
shared_ptr
<
BlockingQueue
<
VarHandleBase
*>>
&
ready_var_q
,
void
RunOp
(
const
std
::
shared_ptr
<
BlockingQueue
<
VarHandleBase
*>>
&
ready_var_q
,
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
234a1d92
...
@@ -38,9 +38,20 @@ class ParallelExecutorPrivate {
...
@@ -38,9 +38,20 @@ class ParallelExecutorPrivate {
explicit
ParallelExecutorPrivate
(
const
std
::
vector
<
platform
::
Place
>
&
places
)
explicit
ParallelExecutorPrivate
(
const
std
::
vector
<
platform
::
Place
>
&
places
)
:
places_
(
places
)
{}
:
places_
(
places
)
{}
~
ParallelExecutorPrivate
()
{
if
(
own_local_scope_
)
{
for
(
size_t
i
=
1
;
i
<
local_scopes_
.
size
();
++
i
)
{
// Skip the first scope, since it is the global scope.
Scope
*
local_scope
=
local_scopes_
[
i
];
if
(
global_scope_
->
HasKid
(
local_scope
))
{
global_scope_
->
DeleteScope
(
local_scope
);
}
}
}
}
std
::
vector
<
platform
::
Place
>
places_
;
std
::
vector
<
platform
::
Place
>
places_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
Scope
*>
local_scopes_
;
Scope
*
global_scope_
;
Scope
*
global_scope_
;
// not owned
std
::
unique_ptr
<
details
::
SSAGraphExecutor
>
executor_
;
std
::
unique_ptr
<
details
::
SSAGraphExecutor
>
executor_
;
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
...
@@ -306,16 +317,6 @@ ParallelExecutor::~ParallelExecutor() {
...
@@ -306,16 +317,6 @@ ParallelExecutor::~ParallelExecutor() {
for
(
auto
&
p
:
member_
->
places_
)
{
for
(
auto
&
p
:
member_
->
places_
)
{
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
)
->
Wait
();
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
)
->
Wait
();
}
}
if
(
member_
->
own_local_scope_
)
{
for
(
size_t
i
=
1
;
i
<
member_
->
local_scopes_
.
size
();
++
i
)
{
Scope
*
local_scope
=
member_
->
local_scopes_
[
i
];
if
(
member_
->
global_scope_
->
HasKid
(
local_scope
))
{
member_
->
global_scope_
->
DeleteScope
(
local_scope
);
}
}
}
// member_ must be destructed before gcs_ since the destructor of
// member_ must be destructed before gcs_ since the destructor of
// ReferenceCountOpHandle use raw pointers of gcs_ inside.
// ReferenceCountOpHandle use raw pointers of gcs_ inside.
member_
.
reset
();
member_
.
reset
();
...
...
paddle/fluid/framework/threadpool.cc
浏览文件 @
234a1d92
...
@@ -57,10 +57,10 @@ ThreadPool::ThreadPool(int num_threads) : running_(true) {
...
@@ -57,10 +57,10 @@ ThreadPool::ThreadPool(int num_threads) : running_(true) {
ThreadPool
::~
ThreadPool
()
{
ThreadPool
::~
ThreadPool
()
{
{
{
// notify all threads to stop running
// notify all threads to stop running
std
::
lock_guard
<
std
::
mutex
>
l
(
mutex_
);
std
::
unique_lock
<
std
::
mutex
>
l
(
mutex_
);
running_
=
false
;
running_
=
false
;
scheduled_
.
notify_all
();
}
}
scheduled_
.
notify_all
();
for
(
auto
&
t
:
threads_
)
{
for
(
auto
&
t
:
threads_
)
{
t
->
join
();
t
->
join
();
...
@@ -70,19 +70,25 @@ ThreadPool::~ThreadPool() {
...
@@ -70,19 +70,25 @@ ThreadPool::~ThreadPool() {
void
ThreadPool
::
TaskLoop
()
{
void
ThreadPool
::
TaskLoop
()
{
while
(
true
)
{
while
(
true
)
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
)
;
Task
task
;
scheduled_
.
wait
(
{
lock
,
[
this
]
{
return
!
this
->
tasks_
.
empty
()
||
!
this
->
running_
;
});
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
scheduled_
.
wait
(
lock
,
[
this
]
{
return
!
this
->
tasks_
.
empty
()
||
!
this
->
running_
;
});
if
(
!
running_
||
tasks_
.
empty
())
{
if
(
!
running_
&&
tasks_
.
empty
())
{
return
;
return
;
}
}
if
(
tasks_
.
empty
())
{
PADDLE_THROW
(
"This thread has no task to Run"
);
}
// pop a task from the task queue
// pop a task from the task queue
auto
task
=
std
::
move
(
tasks_
.
front
());
task
=
std
::
move
(
tasks_
.
front
());
tasks_
.
pop
();
tasks_
.
pop
();
lock
.
unlock
();
}
// run the task
// run the task
task
();
task
();
...
...
paddle/fluid/framework/threadpool.h
浏览文件 @
234a1d92
...
@@ -58,7 +58,7 @@ class ThreadPool {
...
@@ -58,7 +58,7 @@ class ThreadPool {
~
ThreadPool
();
~
ThreadPool
();
// Run pushes a function to the task queue and returns a std::future
// Run pushes a function to the task queue and returns a std::future
// object.
To wait for the completion of the task, call
// object. To wait for the completion of the task, call
// std::future::wait().
// std::future::wait().
template
<
typename
Callback
>
template
<
typename
Callback
>
std
::
future
<
void
>
Run
(
Callback
fn
)
{
std
::
future
<
void
>
Run
(
Callback
fn
)
{
...
@@ -69,7 +69,6 @@ class ThreadPool {
...
@@ -69,7 +69,6 @@ class ThreadPool {
template
<
typename
Callback
>
template
<
typename
Callback
>
std
::
future
<
std
::
unique_ptr
<
platform
::
EnforceNotMet
>>
RunAndGetException
(
std
::
future
<
std
::
unique_ptr
<
platform
::
EnforceNotMet
>>
RunAndGetException
(
Callback
fn
)
{
Callback
fn
)
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
Task
task
([
fn
]()
->
std
::
unique_ptr
<
platform
::
EnforceNotMet
>
{
Task
task
([
fn
]()
->
std
::
unique_ptr
<
platform
::
EnforceNotMet
>
{
try
{
try
{
fn
();
fn
();
...
@@ -84,7 +83,13 @@ class ThreadPool {
...
@@ -84,7 +83,13 @@ class ThreadPool {
return
nullptr
;
return
nullptr
;
});
});
std
::
future
<
std
::
unique_ptr
<
platform
::
EnforceNotMet
>>
f
=
task
.
get_future
();
std
::
future
<
std
::
unique_ptr
<
platform
::
EnforceNotMet
>>
f
=
task
.
get_future
();
tasks_
.
push
(
std
::
move
(
task
));
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
if
(
!
running_
)
{
PADDLE_THROW
(
"enqueue on stopped ThreadPool"
);
}
tasks_
.
push
(
std
::
move
(
task
));
}
scheduled_
.
notify_one
();
scheduled_
.
notify_one
();
return
f
;
return
f
;
}
}
...
...
paddle/fluid/inference/CMakeLists.txt
浏览文件 @
234a1d92
if
(
WITH_TESTING
)
if
(
WITH_TESTING
)
include
(
test.cmake
)
# some generic cmake funtion for inference
include
(
test
s/test
.cmake
)
# some generic cmake funtion for inference
endif
()
endif
()
# analysis and tensorrt must be added before creating static library,
# analysis and tensorrt must be added before creating static library,
# otherwise, there would be undefined reference to them in static library.
# otherwise, there would be undefined reference to them in static library.
...
...
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
浏览文件 @
234a1d92
...
@@ -18,6 +18,21 @@ namespace paddle {
...
@@ -18,6 +18,21 @@ namespace paddle {
namespace
inference
{
namespace
inference
{
namespace
tensorrt
{
namespace
tensorrt
{
bool
to_skip_merging_optimize
(
TensorRTEngine
*
engine_
,
const
std
::
vector
<
int
>&
filters
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
std
::
string
input_name
)
{
if
(
engine_
->
itensor_quote_num
[
input_name
]
>
0
)
{
return
true
;
}
if
(
filters
[
0
]
==
1
&&
filters
[
1
]
==
1
&&
strides
[
0
]
==
1
&&
strides
[
1
]
==
1
&&
paddings
[
0
]
==
0
&&
paddings
[
1
]
==
0
)
engine_
->
itensor_quote_num
[
input_name
]
+=
1
;
return
false
;
}
class
Conv2dOpConverter
:
public
OpConverter
{
class
Conv2dOpConverter
:
public
OpConverter
{
public:
public:
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
...
@@ -31,6 +46,7 @@ class Conv2dOpConverter : public OpConverter {
...
@@ -31,6 +46,7 @@ class Conv2dOpConverter : public OpConverter {
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Output"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Output"
).
size
(),
1
);
auto
*
X
=
engine_
->
GetITensor
(
op_desc
.
Input
(
"Input"
).
front
());
auto
*
X
=
engine_
->
GetITensor
(
op_desc
.
Input
(
"Input"
).
front
());
// Declare weights
// Declare weights
auto
*
Y_v
=
scope
.
FindVar
(
op_desc
.
Input
(
"Filter"
).
front
());
auto
*
Y_v
=
scope
.
FindVar
(
op_desc
.
Input
(
"Filter"
).
front
());
PADDLE_ENFORCE_NOT_NULL
(
Y_v
);
PADDLE_ENFORCE_NOT_NULL
(
Y_v
);
...
@@ -83,7 +99,10 @@ class Conv2dOpConverter : public OpConverter {
...
@@ -83,7 +99,10 @@ class Conv2dOpConverter : public OpConverter {
std
::
move
(
weight_tensor
);
std
::
move
(
weight_tensor
);
layer
->
getOutput
(
0
)
->
setName
(
output_name
.
c_str
());
layer
->
getOutput
(
0
)
->
setName
(
output_name
.
c_str
());
engine_
->
SetITensor
(
output_name
,
layer
->
getOutput
(
0
));
engine_
->
SetITensor
(
output_name
,
layer
->
getOutput
(
0
));
if
(
test_mode
)
{
if
(
test_mode
||
to_skip_merging_optimize
(
engine_
,
{
filter_h
,
filter_w
},
strides
,
paddings
,
op_desc
.
Input
(
"Input"
).
front
()))
{
engine_
->
DeclareOutput
(
output_name
);
engine_
->
DeclareOutput
(
output_name
);
}
}
}
}
...
...
paddle/fluid/inference/tensorrt/engine.cc
浏览文件 @
234a1d92
...
@@ -133,6 +133,10 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
...
@@ -133,6 +133,10 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
buffer_sizes_
[
name
]
=
0
;
buffer_sizes_
[
name
]
=
0
;
}
}
bool
TensorRTEngine
::
HasDeclared
(
const
std
::
string
&
name
)
{
return
buffer_sizes_
.
count
(
name
)
>
0
;
}
void
TensorRTEngine
::
DeclareOutput
(
const
std
::
string
&
name
)
{
void
TensorRTEngine
::
DeclareOutput
(
const
std
::
string
&
name
)
{
PADDLE_ENFORCE_EQ
(
0
,
buffer_sizes_
.
count
(
name
),
"duplicate output name %s"
,
PADDLE_ENFORCE_EQ
(
0
,
buffer_sizes_
.
count
(
name
),
"duplicate output name %s"
,
name
);
name
);
...
...
paddle/fluid/inference/tensorrt/engine.h
浏览文件 @
234a1d92
...
@@ -91,6 +91,8 @@ class TensorRTEngine : public EngineBase {
...
@@ -91,6 +91,8 @@ class TensorRTEngine : public EngineBase {
const
std
::
string
&
name
);
const
std
::
string
&
name
);
// Set the itensor_map_[name] as the network's output, and set its name.
// Set the itensor_map_[name] as the network's output, and set its name.
void
DeclareOutput
(
const
std
::
string
&
name
);
void
DeclareOutput
(
const
std
::
string
&
name
);
// Check if the ITensor has been declared
bool
HasDeclared
(
const
std
::
string
&
name
);
// GPU memory address for an ITensor with specific name. One can operate on
// GPU memory address for an ITensor with specific name. One can operate on
// these memory directly for acceleration, for example, output the converted
// these memory directly for acceleration, for example, output the converted
...
@@ -132,6 +134,16 @@ class TensorRTEngine : public EngineBase {
...
@@ -132,6 +134,16 @@ class TensorRTEngine : public EngineBase {
std
::
unordered_map
<
std
::
string
/*name*/
,
std
::
unique_ptr
<
framework
::
Tensor
>>
std
::
unordered_map
<
std
::
string
/*name*/
,
std
::
unique_ptr
<
framework
::
Tensor
>>
weight_map
;
weight_map
;
// TODO: (NHZLX)
// In the normal case, the paddle-trt exists bug when runing the googlenet.
// When there are more than two convolutions of 1 * 1 with the same input, the
// paddle-tensorrt will do the merging optimization, which fuse those conv
// into
// one conv, and then trigger bug. So, We should use strategy to avoid this
// optimization for the time being. This bug will be fixed in the future.
std
::
unordered_map
<
std
::
string
/*name*/
,
int
/*ITensor_quote_num*/
>
itensor_quote_num
;
private:
private:
// the max batch size
// the max batch size
int
max_batch_
;
int
max_batch_
;
...
...
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
234a1d92
set
(
INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
)
set
(
INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
)
function
(
download_model install_dir model_name
)
if
(
NOT EXISTS
${
install_dir
}
)
inference_download_and_uncompress
(
${
install_dir
}
${
INFERENCE_URL
}
${
model_name
}
)
endif
()
endfunction
()
function
(
download_model_and_data install_dir model_name data_name
)
function
(
download_model_and_data install_dir model_name data_name
)
if
(
NOT EXISTS
${
install_dir
}
)
if
(
NOT EXISTS
${
install_dir
}
)
inference_download_and_uncompress
(
${
install_dir
}
${
INFERENCE_URL
}
${
model_name
}
)
inference_download_and_uncompress
(
${
install_dir
}
${
INFERENCE_URL
}
${
model_name
}
)
...
@@ -13,6 +19,13 @@ function(inference_analysis_api_test target install_dir filename)
...
@@ -13,6 +19,13 @@ function(inference_analysis_api_test target install_dir filename)
ARGS --infer_model=
${
install_dir
}
/model --infer_data=
${
install_dir
}
/data.txt
)
ARGS --infer_model=
${
install_dir
}
/model --infer_data=
${
install_dir
}
/data.txt
)
endfunction
()
endfunction
()
function
(
inference_analysis_api_test_with_fake_data target install_dir filename model_name
)
download_model
(
${
install_dir
}
${
model_name
}
)
inference_analysis_test
(
${
target
}
SRCS
${
filename
}
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
install_dir
}
/model
)
endfunction
()
# RNN1
# RNN1
if
(
NOT APPLE
)
if
(
NOT APPLE
)
set
(
RNN1_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/rnn1"
)
set
(
RNN1_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/rnn1"
)
...
@@ -61,17 +74,13 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
...
@@ -61,17 +74,13 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
# ocr
# ocr
set
(
OCR_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/ocr"
)
set
(
OCR_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/ocr"
)
if
(
NOT EXISTS
${
OCR_INSTALL_DIR
}
)
if
(
NOT EXISTS
${
OCR_INSTALL_DIR
}
)
inference_download_and_uncompress
(
${
OCR_INSTALL_DIR
}
"http://paddlemodels.cdn.bcebos.com/"
"inference-vis-demos%2Focr.tar.gz"
)
inference_download_and_uncompress
(
${
OCR_INSTALL_DIR
}
"http://paddlemodels.cdn.bcebos.com/"
"inference-vis-demos%2Focr.tar.gz"
)
endif
()
endif
()
inference_analysis_api_test
(
test_analyzer_ocr
${
OCR_INSTALL_DIR
}
analyzer_vis_tester.cc
)
inference_analysis_api_test
(
test_analyzer_ocr
${
OCR_INSTALL_DIR
}
analyzer_vis_tester.cc
)
# resnet50
# resnet50
set
(
RESNET50_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/resnet50"
)
inference_analysis_api_test_with_fake_data
(
test_analyzer_resnet50
if
(
NOT EXISTS
${
RESNET50_INSTALL_DIR
}
)
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/resnet50"
analyzer_resnet50_tester.cc
"resnet50_model.tar.gz"
)
inference_download_and_uncompress
(
${
RESNET50_INSTALL_DIR
}
${
INFERENCE_URL
}
"resnet50_model.tar.gz"
)
endif
()
inference_analysis_test
(
test_analyzer_resnet50 SRCS analyzer_resnet50_tester.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
RESNET50_INSTALL_DIR
}
/model
)
# anakin
# anakin
if
(
WITH_ANAKIN AND WITH_MKL
)
# only needed in CI
if
(
WITH_ANAKIN AND WITH_MKL
)
# only needed in CI
...
...
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
浏览文件 @
234a1d92
...
@@ -30,25 +30,7 @@ void SetConfig(AnalysisConfig *cfg) {
...
@@ -30,25 +30,7 @@ void SetConfig(AnalysisConfig *cfg) {
}
}
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
PADDLE_ENFORCE_EQ
(
FLAGS_test_all_data
,
0
,
"Only have single batch of data."
);
SetFakeImageInput
(
inputs
,
FLAGS_infer_model
);
PaddleTensor
input
;
// channel=3, height/width=318
std
::
vector
<
int
>
shape
({
FLAGS_batch_size
,
3
,
318
,
318
});
input
.
shape
=
shape
;
input
.
dtype
=
PaddleDType
::
FLOAT32
;
// fill input data, for profile easily, do not use random data here.
size_t
size
=
FLAGS_batch_size
*
3
*
318
*
318
;
input
.
data
.
Resize
(
size
*
sizeof
(
float
));
float
*
input_data
=
static_cast
<
float
*>
(
input
.
data
.
data
());
for
(
size_t
i
=
0
;
i
<
size
;
i
++
)
{
*
(
input_data
+
i
)
=
static_cast
<
float
>
(
i
)
/
size
;
}
std
::
vector
<
PaddleTensor
>
input_slots
;
input_slots
.
assign
({
input
});
(
*
inputs
).
emplace_back
(
input_slots
);
}
}
// Easy for profiling independently.
// Easy for profiling independently.
...
@@ -61,13 +43,6 @@ void profile(bool use_mkldnn = false) {
...
@@ -61,13 +43,6 @@ void profile(bool use_mkldnn = false) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
if
(
FLAGS_num_threads
==
1
&&
!
FLAGS_test_all_data
)
{
PADDLE_ENFORCE_EQ
(
outputs
.
size
(),
1UL
);
size_t
size
=
GetSize
(
outputs
[
0
]);
// output is a 512-dimension feature
EXPECT_EQ
(
size
,
512
*
FLAGS_batch_size
);
}
}
}
TEST
(
Analyzer_resnet50
,
profile
)
{
profile
();
}
TEST
(
Analyzer_resnet50
,
profile
)
{
profile
();
}
...
@@ -83,8 +58,7 @@ TEST(Analyzer_resnet50, fuse_statis) {
...
@@ -83,8 +58,7 @@ TEST(Analyzer_resnet50, fuse_statis) {
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
auto
fuse_statis
=
GetFuseStatis
(
auto
fuse_statis
=
GetFuseStatis
(
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
()),
&
num_ops
);
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
()),
&
num_ops
);
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
LOG
(
INFO
)
<<
"num_ops: "
<<
num_ops
;
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
1
);
}
}
// Compare result of NativeConfig and AnalysisConfig
// Compare result of NativeConfig and AnalysisConfig
...
...
paddle/fluid/inference/tests/api/tester_helper.h
浏览文件 @
234a1d92
...
@@ -25,6 +25,7 @@
...
@@ -25,6 +25,7 @@
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string
(
infer_model
,
""
,
"model path"
);
DEFINE_string
(
infer_model
,
""
,
"model path"
);
...
@@ -105,6 +106,34 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
...
@@ -105,6 +106,34 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
return
fuse_statis
;
return
fuse_statis
;
}
}
void
SetFakeImageInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
,
const
std
::
string
&
dirname
)
{
// Set fake_image_data
PADDLE_ENFORCE_EQ
(
FLAGS_test_all_data
,
0
,
"Only have single batch of data."
);
std
::
vector
<
std
::
vector
<
int64_t
>>
feed_target_shapes
=
GetFeedTargetShapes
(
dirname
,
true
,
"model"
,
"params"
);
int
dim1
=
feed_target_shapes
[
0
][
1
];
int
dim2
=
feed_target_shapes
[
0
][
2
];
int
dim3
=
feed_target_shapes
[
0
][
3
];
PaddleTensor
input
;
std
::
vector
<
int
>
shape
({
FLAGS_batch_size
,
dim1
,
dim2
,
dim3
});
input
.
shape
=
shape
;
input
.
dtype
=
PaddleDType
::
FLOAT32
;
// fill input data, for profile easily, do not use random data here.
size_t
size
=
FLAGS_batch_size
*
dim1
*
dim2
*
dim3
;
input
.
data
.
Resize
(
size
*
sizeof
(
float
));
float
*
input_data
=
static_cast
<
float
*>
(
input
.
data
.
data
());
for
(
size_t
i
=
0
;
i
<
size
;
i
++
)
{
*
(
input_data
+
i
)
=
static_cast
<
float
>
(
i
)
/
size
;
}
std
::
vector
<
PaddleTensor
>
input_slots
;
input_slots
.
assign
({
input
});
(
*
inputs
).
emplace_back
(
input_slots
);
}
void
TestOneThreadPrediction
(
void
TestOneThreadPrediction
(
const
AnalysisConfig
&
config
,
const
AnalysisConfig
&
config
,
const
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
&
inputs
,
const
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
&
inputs
,
...
...
paddle/fluid/inference/tests/api/trt_models_tester.cc
浏览文件 @
234a1d92
...
@@ -93,11 +93,16 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
...
@@ -93,11 +93,16 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
}
}
}
}
TEST
(
trt_models_test
,
m
ain
)
{
TEST
(
trt_models_test
,
m
obilenet
)
{
std
::
vector
<
std
::
string
>
infer_models
=
{
"mobilenet"
,
"resnet50"
,
CompareTensorRTWithFluid
(
1
,
FLAGS_dirname
+
"/mobilenet"
);
"resnext50"
};
}
for
(
auto
&
model_dir
:
infer_models
)
{
CompareTensorRTWithFluid
(
1
,
FLAGS_dirname
+
"/"
+
model_dir
);
TEST
(
trt_models_test
,
resnet50
)
{
}
CompareTensorRTWithFluid
(
1
,
FLAGS_dirname
+
"/resnet50"
);
}
}
TEST
(
trt_models_test
,
resnext50
)
{
CompareTensorRTWithFluid
(
1
,
FLAGS_dirname
+
"/resnext50"
);
}
}
// namespace paddle
}
// namespace paddle
paddle/fluid/inference/test.cmake
→
paddle/fluid/inference/test
s/test
.cmake
浏览文件 @
234a1d92
文件已移动
paddle/fluid/inference/tests/test_helper.h
浏览文件 @
234a1d92
...
@@ -18,7 +18,6 @@ limitations under the License. */
...
@@ -18,7 +18,6 @@ limitations under the License. */
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
...
@@ -94,15 +93,15 @@ void CheckError(const paddle::framework::LoDTensor& output1,
...
@@ -94,15 +93,15 @@ void CheckError(const paddle::framework::LoDTensor& output1,
std
::
unique_ptr
<
paddle
::
framework
::
ProgramDesc
>
InitProgram
(
std
::
unique_ptr
<
paddle
::
framework
::
ProgramDesc
>
InitProgram
(
paddle
::
framework
::
Executor
*
executor
,
paddle
::
framework
::
Scope
*
scope
,
paddle
::
framework
::
Executor
*
executor
,
paddle
::
framework
::
Scope
*
scope
,
const
std
::
string
&
dirname
,
const
bool
is_combined
=
false
)
{
const
std
::
string
&
dirname
,
const
bool
is_combined
=
false
,
const
std
::
string
&
prog_filename
=
"__model_combined__"
,
const
std
::
string
&
param_filename
=
"__params_combined__"
)
{
std
::
unique_ptr
<
paddle
::
framework
::
ProgramDesc
>
inference_program
;
std
::
unique_ptr
<
paddle
::
framework
::
ProgramDesc
>
inference_program
;
if
(
is_combined
)
{
if
(
is_combined
)
{
// All parameters are saved in a single file.
// All parameters are saved in a single file.
// Hard-coding the file names of program and parameters in unittest.
// Hard-coding the file names of program and parameters in unittest.
// The file names should be consistent with that used in Python API
// The file names should be consistent with that used in Python API
// `fluid.io.save_inference_model`.
// `fluid.io.save_inference_model`.
std
::
string
prog_filename
=
"__model_combined__"
;
std
::
string
param_filename
=
"__params_combined__"
;
inference_program
=
inference_program
=
paddle
::
inference
::
Load
(
executor
,
scope
,
dirname
+
"/"
+
prog_filename
,
paddle
::
inference
::
Load
(
executor
,
scope
,
dirname
+
"/"
+
prog_filename
,
dirname
+
"/"
+
param_filename
);
dirname
+
"/"
+
param_filename
);
...
@@ -115,12 +114,15 @@ std::unique_ptr<paddle::framework::ProgramDesc> InitProgram(
...
@@ -115,12 +114,15 @@ std::unique_ptr<paddle::framework::ProgramDesc> InitProgram(
}
}
std
::
vector
<
std
::
vector
<
int64_t
>>
GetFeedTargetShapes
(
std
::
vector
<
std
::
vector
<
int64_t
>>
GetFeedTargetShapes
(
const
std
::
string
&
dirname
,
const
bool
is_combined
=
false
)
{
const
std
::
string
&
dirname
,
const
bool
is_combined
=
false
,
const
std
::
string
&
prog_filename
=
"__model_combined__"
,
const
std
::
string
&
param_filename
=
"__params_combined__"
)
{
auto
place
=
paddle
::
platform
::
CPUPlace
();
auto
place
=
paddle
::
platform
::
CPUPlace
();
auto
executor
=
paddle
::
framework
::
Executor
(
place
);
auto
executor
=
paddle
::
framework
::
Executor
(
place
);
auto
*
scope
=
new
paddle
::
framework
::
Scope
();
auto
*
scope
=
new
paddle
::
framework
::
Scope
();
auto
inference_program
=
InitProgram
(
&
executor
,
scope
,
dirname
,
is_combined
);
auto
inference_program
=
InitProgram
(
&
executor
,
scope
,
dirname
,
is_combined
,
prog_filename
,
param_filename
);
auto
&
global_block
=
inference_program
->
Block
(
0
);
auto
&
global_block
=
inference_program
->
Block
(
0
);
const
std
::
vector
<
std
::
string
>&
feed_target_names
=
const
std
::
vector
<
std
::
string
>&
feed_target_names
=
...
@@ -136,15 +138,6 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
...
@@ -136,15 +138,6 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
return
feed_target_shapes
;
return
feed_target_shapes
;
}
}
void
Compile
(
paddle
::
framework
::
ProgramDesc
*
program
)
{
std
::
unique_ptr
<
paddle
::
framework
::
ir
::
Graph
>
g
(
new
paddle
::
framework
::
ir
::
Graph
(
*
program
));
auto
pass
=
paddle
::
framework
::
ir
::
PassRegistry
::
Instance
().
Get
(
"graph_to_program_pass"
);
pass
->
SetNotOwned
<
paddle
::
framework
::
ProgramDesc
>
(
"program"
,
program
);
pass
->
Apply
(
std
::
move
(
g
));
}
template
<
typename
Place
,
bool
CreateVars
=
true
,
bool
PrepareContext
=
false
>
template
<
typename
Place
,
bool
CreateVars
=
true
,
bool
PrepareContext
=
false
>
void
TestInference
(
const
std
::
string
&
dirname
,
void
TestInference
(
const
std
::
string
&
dirname
,
const
std
::
vector
<
paddle
::
framework
::
LoDTensor
*>&
cpu_feeds
,
const
std
::
vector
<
paddle
::
framework
::
LoDTensor
*>&
cpu_feeds
,
...
@@ -182,7 +175,6 @@ void TestInference(const std::string& dirname,
...
@@ -182,7 +175,6 @@ void TestInference(const std::string& dirname,
paddle
::
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
paddle
::
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
inference_program
=
InitProgram
(
&
executor
,
scope
,
dirname
,
is_combined
);
inference_program
=
InitProgram
(
&
executor
,
scope
,
dirname
,
is_combined
);
}
}
Compile
(
inference_program
.
get
());
// Disable the profiler and print the timing information
// Disable the profiler and print the timing information
paddle
::
platform
::
DisableProfiler
(
paddle
::
platform
::
EventSortingKey
::
kDefault
,
paddle
::
platform
::
DisableProfiler
(
paddle
::
platform
::
EventSortingKey
::
kDefault
,
...
@@ -261,5 +253,3 @@ void TestInference(const std::string& dirname,
...
@@ -261,5 +253,3 @@ void TestInference(const std::string& dirname,
delete
scope
;
delete
scope
;
}
}
USE_PASS
(
graph_to_program_pass
);
paddle/fluid/operators/activation_op.cu
浏览文件 @
234a1d92
...
@@ -26,6 +26,8 @@ namespace plat = paddle::platform;
...
@@ -26,6 +26,8 @@ namespace plat = paddle::platform;
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<float>>, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<double>>);
ops::grad_functor<double>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>);
FOR_EACH_KERNEL_FUNCTOR
(
REGISTER_ACTIVATION_CUDA_KERNEL
);
FOR_EACH_KERNEL_FUNCTOR
(
REGISTER_ACTIVATION_CUDA_KERNEL
);
paddle/fluid/operators/activation_op.h
浏览文件 @
234a1d92
...
@@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
...
@@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
const
Out
out_conj
=
Eigen
::
numext
::
conj
(
out
);
dx
.
device
(
d
)
=
static_cast
<
T
>
(
0.5
)
*
dout
/
out
;
dx
.
device
(
d
)
=
static_cast
<
T
>
(
0.5
)
*
dout
/
out_conj
;
}
}
};
};
...
@@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
...
@@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
typename
dX
>
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
dx
.
device
(
d
)
=
dout
*
static_cast
<
T
>
(
factor
)
*
dx
.
device
(
d
)
=
dout
*
static_cast
<
T
>
(
factor
)
*
x
.
pow
(
static_cast
<
T
>
(
factor
-
static_cast
<
T
>
(
1
)
));
x
.
pow
(
static_cast
<
T
>
(
factor
)
-
static_cast
<
T
>
(
1
));
}
}
};
};
...
...
paddle/fluid/operators/adagrad_op.cc
浏览文件 @
234a1d92
...
@@ -119,8 +119,8 @@ struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
...
@@ -119,8 +119,8 @@ struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
auto
*
grad_merge_data
=
grad_merge
.
mutable_value
()
->
template
data
<
T
>();
auto
*
grad_merge_data
=
grad_merge
.
mutable_value
()
->
template
data
<
T
>();
// 2. m += g_m * g_m
// 2. m += g_m * g_m
math
::
scatter
::
Mul
<
platform
::
CPUDeviceContext
,
T
>
sqare_func
;
auto
grad_square
=
auto
grad_square
=
sqare_func
(
context
,
grad_merge
,
grad_merge
);
SquareSelectedRows
<
platform
::
CPUDeviceContext
,
T
>
(
context
,
grad_merge
);
math
::
SelectedRowsAddToTensor
<
platform
::
CPUDeviceContext
,
T
>
functor
;
math
::
SelectedRowsAddToTensor
<
platform
::
CPUDeviceContext
,
T
>
functor
;
functor
(
context
,
grad_square
,
moment
);
functor
(
context
,
grad_square
,
moment
);
...
...
paddle/fluid/operators/adagrad_op.cu
浏览文件 @
234a1d92
...
@@ -84,8 +84,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
...
@@ -84,8 +84,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
auto
*
grad_merge_data
=
grad_merge
.
mutable_value
()
->
template
data
<
T
>();
auto
*
grad_merge_data
=
grad_merge
.
mutable_value
()
->
template
data
<
T
>();
framework
::
Vector
<
int64_t
>
merge_rows
(
grad_merge
.
rows
());
framework
::
Vector
<
int64_t
>
merge_rows
(
grad_merge
.
rows
());
// 2. m += g_m * g_m
// 2. m += g_m * g_m
math
::
scatter
::
Mul
<
platform
::
CUDADeviceContext
,
T
>
sqare_func
;
auto
grad_square
=
auto
grad_square
=
sqare_func
(
context
,
grad_merge
,
grad_merge
);
SquareSelectedRows
<
platform
::
CUDADeviceContext
,
T
>
(
context
,
grad_merge
);
math
::
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
T
>
functor
;
math
::
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
T
>
functor
;
functor
(
context
,
grad_square
,
moment
);
functor
(
context
,
grad_square
,
moment
);
...
...
paddle/fluid/operators/adagrad_op.h
浏览文件 @
234a1d92
...
@@ -28,6 +28,20 @@ struct SparseAdagradFunctor {
...
@@ -28,6 +28,20 @@ struct SparseAdagradFunctor {
framework
::
Tensor
*
moment
,
framework
::
Tensor
*
param
);
framework
::
Tensor
*
moment
,
framework
::
Tensor
*
param
);
};
};
template
<
typename
DeviceContext
,
typename
T
>
framework
::
SelectedRows
SquareSelectedRows
(
const
DeviceContext
&
context
,
const
framework
::
SelectedRows
&
input
)
{
framework
::
SelectedRows
out
;
out
.
set_rows
(
input
.
rows
());
out
.
set_height
(
input
.
height
());
out
.
mutable_value
()
->
mutable_data
<
T
>
(
input
.
value
().
dims
(),
context
.
GetPlace
());
auto
e_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
(
out
.
mutable_value
()));
auto
e_in
=
framework
::
EigenVector
<
T
>::
Flatten
(
input
.
value
());
e_out
.
device
(
*
context
.
eigen_device
())
=
e_in
.
square
();
return
out
;
}
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
class
AdagradOpKernel
:
public
framework
::
OpKernel
<
T
>
{
class
AdagradOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
...
...
paddle/fluid/operators/batch_norm_op.cu.cc
浏览文件 @
234a1d92
...
@@ -219,8 +219,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
...
@@ -219,8 +219,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
auto
*
d_bias
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Bias"
));
auto
*
d_bias
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Bias"
));
d_x
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
d_x
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
d_scale
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
d_scale
->
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
());
d_bias
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
d_bias
->
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
());
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
if
((
N
*
H
*
W
*
D
)
==
1
)
{
if
((
N
*
H
*
W
*
D
)
==
1
)
{
...
@@ -272,8 +272,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
...
@@ -272,8 +272,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
const
auto
*
saved_mean
=
ctx
.
Input
<
Tensor
>
(
"SavedMean"
);
const
auto
*
saved_mean
=
ctx
.
Input
<
Tensor
>
(
"SavedMean"
);
const
auto
*
saved_var
=
ctx
.
Input
<
Tensor
>
(
"SavedVariance"
);
const
auto
*
saved_var
=
ctx
.
Input
<
Tensor
>
(
"SavedVariance"
);
const
void
*
saved_mean_data
=
saved_mean
->
template
data
<
T
>();
const
void
*
saved_mean_data
=
const
void
*
saved_var_data
=
saved_var
->
template
data
<
T
>();
saved_mean
->
template
data
<
BatchNormParamType
<
T
>
>
();
const
void
*
saved_var_data
=
saved_var
->
template
data
<
BatchNormParamType
<
T
>
>
();
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnBatchNormalizationBackward
(
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnBatchNormalizationBackward
(
dev_ctx
.
cudnn_handle
(),
mode_
,
CudnnDataType
<
T
>::
kOne
(),
dev_ctx
.
cudnn_handle
(),
mode_
,
CudnnDataType
<
T
>::
kOne
(),
...
@@ -281,10 +283,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
...
@@ -281,10 +283,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
CudnnDataType
<
T
>::
kZero
(),
data_desc_
,
x
->
template
data
<
T
>(),
CudnnDataType
<
T
>::
kZero
(),
data_desc_
,
x
->
template
data
<
T
>(),
data_desc_
,
d_y
->
template
data
<
T
>(),
data_desc_
,
data_desc_
,
d_y
->
template
data
<
T
>(),
data_desc_
,
d_x
->
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
bn_param_desc_
,
d_x
->
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
bn_param_desc_
,
scale
->
template
data
<
T
>(),
scale
->
template
data
<
BatchNormParamType
<
T
>
>
(),
d_scale
->
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
d_scale
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
d_bias
->
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
epsilon
,
d_bias
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
())
,
saved_mean_data
,
saved_var_data
));
epsilon
,
saved_mean_data
,
saved_var_data
));
// clean when exit.
// clean when exit.
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
data_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
data_desc_
));
...
@@ -304,4 +306,5 @@ REGISTER_OP_CUDA_KERNEL(
...
@@ -304,4 +306,5 @@ REGISTER_OP_CUDA_KERNEL(
ops
::
BatchNormKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
ops
::
BatchNormKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
REGISTER_OP_CUDA_KERNEL
(
batch_norm_grad
,
ops
::
BatchNormGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
batch_norm_grad
,
ops
::
BatchNormGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
BatchNormGradKernel
<
plat
::
CUDADeviceContext
,
double
>
);
ops
::
BatchNormGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
BatchNormGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
234a1d92
...
@@ -143,9 +143,11 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
...
@@ -143,9 +143,11 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc
,
CUDNN_TENSOR_OP_MATH
));
cudnn_conv_desc
,
CUDNN_TENSOR_OP_MATH
));
// Currently tensor core is only enabled using this algo
// Currently tensor core is only enabled using this algo
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
;
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
;
VLOG
(
5
)
<<
"use cudnn_tensor_op_math"
;
}
else
{
}
else
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_DEFAULT_MATH
));
cudnn_conv_desc
,
CUDNN_DEFAULT_MATH
));
VLOG
(
5
)
<<
"NOT use cudnn_tensor_op_math"
;
}
}
#endif
#endif
...
@@ -361,7 +363,8 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
...
@@ -361,7 +363,8 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle
::
operators
::
CUDNNConvOpKernel
<
plat
::
float16
>
);
paddle
::
operators
::
CUDNNConvOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
conv2d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
REGISTER_OP_KERNEL
(
conv2d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
float
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
float
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
);
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
conv3d
,
CUDNN
,
plat
::
CUDAPlace
,
REGISTER_OP_KERNEL
(
conv3d
,
CUDNN
,
plat
::
CUDAPlace
,
paddle
::
operators
::
CUDNNConvOpKernel
<
float
>
,
paddle
::
operators
::
CUDNNConvOpKernel
<
float
>
,
...
...
paddle/fluid/operators/cross_entropy_op.cu
浏览文件 @
234a1d92
...
@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
...
@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h"
#include "paddle/fluid/operators/cross_entropy_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
plat
=
paddle
::
platform
;
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
using
CUDACtx
=
paddle
::
platform
::
CUDADeviceContext
;
using
CUDACtx
=
paddle
::
platform
::
CUDADeviceContext
;
REGISTER_OP_CUDA_KERNEL
(
cross_entropy
,
REGISTER_OP_CUDA_KERNEL
(
cross_entropy
,
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
float
>
,
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
float
>
,
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
double
>
);
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
double
>
,
REGISTER_OP_CUDA_KERNEL
(
cross_entropy_grad
,
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
plat
::
float16
>
);
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
float
>
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
double
>
);
REGISTER_OP_CUDA_KERNEL
(
cross_entropy_grad
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
float
>
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
double
>
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
plat
::
float16
>
);
paddle/fluid/operators/distributed/grpc_variable_response.cc
浏览文件 @
234a1d92
...
@@ -286,10 +286,10 @@ int GRPCVariableResponse::Parse(Source* source) {
...
@@ -286,10 +286,10 @@ int GRPCVariableResponse::Parse(Source* source) {
platform
::
EnableProfiler
(
platform
::
ProfilerState
::
kCPU
);
platform
::
EnableProfiler
(
platform
::
ProfilerState
::
kCPU
);
}
else
if
(
profiling
==
platform
::
kDisableProfiler
&&
}
else
if
(
profiling
==
platform
::
kDisableProfiler
&&
platform
::
IsProfileEnabled
())
{
platform
::
IsProfileEnabled
())
{
// TODO(panyx0718): Should we allow to customize file dir.
platform
::
DisableProfiler
(
platform
::
DisableProfiler
(
platform
::
EventSortingKey
::
kDefault
,
platform
::
EventSortingKey
::
kDefault
,
string
::
Sprintf
(
"/tmp/profile_ps_%lld"
,
listener_id
));
string
::
Sprintf
(
"%s_%lld"
,
FLAGS_rpc_server_profile_path
,
listener_id
));
}
}
break
;
break
;
}
}
...
...
paddle/fluid/operators/distributed/request_handler_impl.cc
浏览文件 @
234a1d92
...
@@ -51,7 +51,6 @@ bool RequestSendHandler::Handle(const std::string& varname,
...
@@ -51,7 +51,6 @@ bool RequestSendHandler::Handle(const std::string& varname,
// Async
// Async
if
(
!
sync_mode_
)
{
if
(
!
sync_mode_
)
{
VLOG
(
3
)
<<
"async process var: "
<<
varname
;
VLOG
(
3
)
<<
"async process var: "
<<
varname
;
rpc_server_
->
Profiler
().
OneStep
();
try
{
try
{
executor_
->
RunPreparedContext
((
*
grad_to_prepared_ctx_
)[
varname
].
get
(),
executor_
->
RunPreparedContext
((
*
grad_to_prepared_ctx_
)[
varname
].
get
(),
scope
);
scope
);
...
...
paddle/fluid/operators/distributed/rpc_server.cc
浏览文件 @
234a1d92
...
@@ -20,42 +20,10 @@
...
@@ -20,42 +20,10 @@
#include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_int32
(
rpc_server_profile_period
,
0
,
"the period of listen_and_serv to do profile"
);
DEFINE_string
(
rpc_server_profile_path
,
"/dev/null"
,
"the profile log file path"
);
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
distributed
{
namespace
distributed
{
RPCServerProfiler
::
RPCServerProfiler
(
int
profile_period
,
const
std
::
string
&
profile_log_path
)
:
profile_period_
(
profile_period
),
profile_log_path_
(
profile_log_path
)
{
step_
=
0
;
}
void
RPCServerProfiler
::
OneStep
()
{
PADDLE_ENFORCE_LE
(
step_
,
profile_period_
,
"step_ should not be larger then "
"profile_period_"
);
if
(
profile_period_
<=
0
)
{
return
;
}
if
(
step_
==
0
)
{
auto
pf_state
=
paddle
::
platform
::
ProfilerState
::
kCPU
;
paddle
::
platform
::
EnableProfiler
(
pf_state
);
}
if
(
step_
==
profile_period_
)
{
paddle
::
platform
::
DisableProfiler
(
paddle
::
platform
::
EventSortingKey
::
kTotal
,
profile_log_path_
);
step_
=
0
;
}
else
{
step_
++
;
}
}
void
RPCServer
::
ShutDown
()
{
void
RPCServer
::
ShutDown
()
{
LOG
(
INFO
)
<<
"RPCServer ShutDown "
;
LOG
(
INFO
)
<<
"RPCServer ShutDown "
;
ShutDownImpl
();
ShutDownImpl
();
...
...
paddle/fluid/operators/distributed/rpc_server.h
浏览文件 @
234a1d92
...
@@ -23,30 +23,14 @@
...
@@ -23,30 +23,14 @@
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
DECLARE_int32
(
rpc_server_profile_period
);
DECLARE_string
(
rpc_server_profile_path
);
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
distributed
{
namespace
distributed
{
class
RPCServerProfiler
{
public:
RPCServerProfiler
(
int
profile_period
,
const
std
::
string
&
profile_log_path
);
void
OneStep
();
private:
const
int
profile_period_
;
std
::
string
profile_log_path_
;
int
step_
;
};
class
RPCServer
{
class
RPCServer
{
public:
public:
explicit
RPCServer
(
const
std
::
string
&
address
,
int
client_num
)
explicit
RPCServer
(
const
std
::
string
&
address
,
int
client_num
)
:
cur_cond_
(
0
),
:
cur_cond_
(
0
),
profiler_
(
FLAGS_rpc_server_profile_period
,
FLAGS_rpc_server_profile_path
),
bind_address_
(
address
),
bind_address_
(
address
),
exit_flag_
(
false
),
exit_flag_
(
false
),
selected_port_
(
0
),
selected_port_
(
0
),
...
@@ -86,7 +70,6 @@ class RPCServer {
...
@@ -86,7 +70,6 @@ class RPCServer {
void
Complete
();
void
Complete
();
void
ResetBarrierCounter
();
void
ResetBarrierCounter
();
RPCServerProfiler
&
Profiler
()
{
return
profiler_
;
}
bool
NeedResetAllVars
();
bool
NeedResetAllVars
();
...
@@ -101,7 +84,6 @@ class RPCServer {
...
@@ -101,7 +84,6 @@ class RPCServer {
std
::
unordered_map
<
std
::
string
,
int
>
rpc_cond_map_
;
std
::
unordered_map
<
std
::
string
,
int
>
rpc_cond_map_
;
std
::
atomic
<
int
>
cur_cond_
;
std
::
atomic
<
int
>
cur_cond_
;
std
::
condition_variable
rpc_cond_
;
std
::
condition_variable
rpc_cond_
;
RPCServerProfiler
profiler_
;
protected:
protected:
std
::
string
bind_address_
;
std
::
string
bind_address_
;
...
...
paddle/fluid/operators/distributed/variable_response.cc
浏览文件 @
234a1d92
...
@@ -16,6 +16,9 @@
...
@@ -16,6 +16,9 @@
#include <vector>
#include <vector>
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
DEFINE_string
(
rpc_server_profile_path
,
"./profile_ps"
,
"the profile log file path"
);
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
distributed
{
namespace
distributed
{
...
...
paddle/fluid/operators/distributed/variable_response.h
浏览文件 @
234a1d92
...
@@ -27,6 +27,8 @@
...
@@ -27,6 +27,8 @@
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
DECLARE_string
(
rpc_server_profile_path
);
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
distributed
{
namespace
distributed
{
...
...
paddle/fluid/operators/elementwise_add_op.cu
浏览文件 @
234a1d92
...
@@ -30,4 +30,5 @@ REGISTER_OP_CUDA_KERNEL(
...
@@ -30,4 +30,5 @@ REGISTER_OP_CUDA_KERNEL(
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
int64_t
>
);
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/elementwise_op_function.h
浏览文件 @
234a1d92
...
@@ -364,7 +364,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
...
@@ -364,7 +364,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
int
j
=
blockIdx
.
x
;
int
j
=
blockIdx
.
x
;
int
i
=
threadIdx
.
x
;
int
i
=
threadIdx
.
x
;
int
tid
=
threadIdx
.
x
;
int
tid
=
threadIdx
.
x
;
T
val
=
0
;
T
val
(
0
)
;
do
{
do
{
int
x_offset
=
i
*
w
+
j
;
int
x_offset
=
i
*
w
+
j
;
...
@@ -432,7 +432,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
...
@@ -432,7 +432,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
int
tid
=
threadIdx
.
x
;
int
tid
=
threadIdx
.
x
;
int
j
=
blockIdx
.
x
;
int
j
=
blockIdx
.
x
;
T
val
=
0
;
T
val
(
0
)
;
int
ttid
=
tid
;
int
ttid
=
tid
;
while
(
true
)
{
while
(
true
)
{
...
...
paddle/fluid/operators/listen_and_serv_op.cc
浏览文件 @
234a1d92
...
@@ -134,7 +134,6 @@ void ListenAndServOp::RunSyncLoop(
...
@@ -134,7 +134,6 @@ void ListenAndServOp::RunSyncLoop(
rpc_service_
->
ResetBarrierCounter
();
rpc_service_
->
ResetBarrierCounter
();
while
(
true
)
{
while
(
true
)
{
rpc_service_
->
Profiler
().
OneStep
();
// Get from multiple trainers, we don't care about the order in which
// Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient.
// the gradients arrives, just add suffix 0~n and merge the gradient.
rpc_service_
->
SetCond
(
distributed
::
kRequestSend
);
rpc_service_
->
SetCond
(
distributed
::
kRequestSend
);
...
...
paddle/fluid/operators/math/cos_sim_functor.cu
浏览文件 @
234a1d92
...
@@ -51,7 +51,7 @@ struct CosSimDyFunctor<platform::CUDADeviceContext, T> {
...
@@ -51,7 +51,7 @@ struct CosSimDyFunctor<platform::CUDADeviceContext, T> {
T
*
dy
)
const
{
T
*
dy
)
const
{
const
int
block_size
=
512
;
const
int
block_size
=
512
;
dim3
threads
(
block_size
,
1
);
dim3
threads
(
block_size
,
1
);
dim3
grid
(
1
,
(
rows
+
block_size
-
1
)
/
block_size
);
dim3
grid
(
(
rows
+
block_size
-
1
)
/
block_size
,
1
);
CosSimDyKernel
<
T
><<<
grid
,
threads
,
0
,
ctx
.
stream
()
>>>
(
CosSimDyKernel
<
T
><<<
grid
,
threads
,
0
,
ctx
.
stream
()
>>>
(
x_norm
,
y_norm
,
x
,
y
,
z
,
dz
,
rows
,
cols
,
dy
);
x_norm
,
y_norm
,
x
,
y
,
z
,
dz
,
rows
,
cols
,
dy
);
}
}
...
...
paddle/fluid/operators/math/cross_entropy.cu
浏览文件 @
234a1d92
...
@@ -21,6 +21,16 @@ namespace operators {
...
@@ -21,6 +21,16 @@ namespace operators {
namespace
math
{
namespace
math
{
namespace
{
namespace
{
__device__
__forceinline__
float
real_log
(
float
x
)
{
return
logf
(
x
);
}
__device__
__forceinline__
double
real_log
(
double
x
)
{
return
log
(
x
);
}
__device__
__forceinline__
platform
::
float16
real_log
(
const
platform
::
float16
&
val
)
{
return
static_cast
<
platform
::
float16
>
(
logf
(
static_cast
<
float
>
(
val
)));
}
template
<
typename
T
>
template
<
typename
T
>
__global__
void
CrossEntropyKernel
(
T
*
Y
,
const
T
*
X
,
const
int64_t
*
label
,
__global__
void
CrossEntropyKernel
(
T
*
Y
,
const
T
*
X
,
const
int64_t
*
label
,
const
int
N
,
const
int
D
,
const
int
N
,
const
int
D
,
...
@@ -29,8 +39,8 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
...
@@ -29,8 +39,8 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
PADDLE_ASSERT
(
label
[
i
]
>=
0
&&
label
[
i
]
<
D
||
label
[
i
]
==
ignore_index
);
PADDLE_ASSERT
(
label
[
i
]
>=
0
&&
label
[
i
]
<
D
||
label
[
i
]
==
ignore_index
);
Y
[
i
]
=
ignore_index
==
label
[
i
]
Y
[
i
]
=
ignore_index
==
label
[
i
]
?
0
?
static_cast
<
T
>
(
0
)
:
-
math
::
TolerableValue
<
T
>
()(
log
(
X
[
i
*
D
+
label
[
i
]]));
:
-
math
::
TolerableValue
<
T
>
()(
real_
log
(
X
[
i
*
D
+
label
[
i
]]));
}
}
}
}
...
@@ -38,12 +48,12 @@ template <typename T>
...
@@ -38,12 +48,12 @@ template <typename T>
__global__
void
SoftCrossEntropyKernel
(
T
*
Y
,
const
T
*
X
,
const
T
*
label
,
__global__
void
SoftCrossEntropyKernel
(
T
*
Y
,
const
T
*
X
,
const
T
*
label
,
const
int
class_num
)
{
const
int
class_num
)
{
int
tid
=
threadIdx
.
x
;
int
tid
=
threadIdx
.
x
;
T
val
=
0
;
T
val
(
0
)
;
int
idx
=
blockIdx
.
x
*
class_num
+
tid
;
int
idx
=
blockIdx
.
x
*
class_num
+
tid
;
int
end
=
blockIdx
.
x
*
class_num
+
class_num
;
int
end
=
blockIdx
.
x
*
class_num
+
class_num
;
for
(;
idx
<
end
;
idx
+=
blockDim
.
x
)
{
for
(;
idx
<
end
;
idx
+=
blockDim
.
x
)
{
val
+=
math
::
TolerableValue
<
T
>
()(
std
::
log
(
X
[
idx
]))
*
label
[
idx
];
val
+=
math
::
TolerableValue
<
T
>
()(
real_
log
(
X
[
idx
]))
*
label
[
idx
];
}
}
val
=
paddle
::
platform
::
reduceSum
(
val
,
tid
,
blockDim
.
x
);
val
=
paddle
::
platform
::
reduceSum
(
val
,
tid
,
blockDim
.
x
);
...
@@ -53,8 +63,6 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
...
@@ -53,8 +63,6 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
}
}
}
// namespace
}
// namespace
using
Tensor
=
framework
::
Tensor
;
template
<
typename
T
>
template
<
typename
T
>
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
T
>
{
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
T
>
{
public:
public:
...
@@ -89,6 +97,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
...
@@ -89,6 +97,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
}
// namespace math
}
// namespace math
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
paddle/fluid/operators/math/cross_entropy.h
浏览文件 @
234a1d92
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#pragma once
#pragma once
#include <limits>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/hostdevice.h"
...
@@ -33,6 +34,26 @@ struct TolerableValue {
...
@@ -33,6 +34,26 @@ struct TolerableValue {
}
}
};
};
// NOTE(dzh): float16 value clip behave different.
// 1. Our ValueClipping has a hardcore threshold 1e20
// for float number. 1e20 will resulting in overflow in float16.
// 2. float16 should expose the the real number overflow to python.
// because mixed-training depends the inf/nan value to determine
// if the scale value will be adjusted.
// Also. In standard implementation of cross entropy, other
// framework not has the ValueClipping.
template
<
>
struct
TolerableValue
<
platform
::
float16
>
{
HOSTDEVICE
platform
::
float16
operator
()(
const
platform
::
float16
&
x
)
const
{
if
(
platform
::
isfinite
(
x
))
return
x
;
else
if
(
x
>
static_cast
<
platform
::
float16
>
(
0
))
return
std
::
numeric_limits
<
platform
::
float16
>::
max
();
else
return
std
::
numeric_limits
<
platform
::
float16
>::
min
();
}
};
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
class
CrossEntropyFunctor
{
class
CrossEntropyFunctor
{
public:
public:
...
...
paddle/fluid/operators/math/fc_compute.h
浏览文件 @
234a1d92
...
@@ -36,7 +36,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
...
@@ -36,7 +36,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
.
template
Get
<
jitkernel
::
VAddReluKernel
<
T
>
>
(
N
);
.
template
Get
<
jitkernel
::
VAddReluKernel
<
T
>
>
(
N
);
for
(
int
i
=
0
;
i
<
M
;
i
++
)
{
for
(
int
i
=
0
;
i
<
M
;
i
++
)
{
T
*
dst
=
Y
+
i
*
N
;
T
*
dst
=
Y
+
i
*
N
;
vaddrelu
->
Compute
(
B
,
dst
,
dst
);
vaddrelu
->
Compute
(
B
,
dst
,
dst
,
N
);
}
}
}
else
{
}
else
{
const
auto
&
vadd
=
jitkernel
::
KernelPool
::
Instance
()
const
auto
&
vadd
=
jitkernel
::
KernelPool
::
Instance
()
...
@@ -47,7 +47,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
...
@@ -47,7 +47,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
#endif
#endif
for
(
int
i
=
0
;
i
<
M
;
i
++
)
{
for
(
int
i
=
0
;
i
<
M
;
i
++
)
{
T
*
dst
=
Y
+
i
*
N
;
T
*
dst
=
Y
+
i
*
N
;
vadd
->
Compute
(
B
,
dst
,
dst
);
vadd
->
Compute
(
B
,
dst
,
dst
,
N
);
}
}
}
}
}
}
...
...
paddle/fluid/operators/math/jit_code.cc
浏览文件 @
234a1d92
...
@@ -24,19 +24,29 @@ namespace gen {
...
@@ -24,19 +24,29 @@ namespace gen {
using
namespace
platform
::
jit
;
// NOLINT
using
namespace
platform
::
jit
;
// NOLINT
bool
V
Mul
JitCode
::
init
(
int
d
)
{
bool
V
VV
JitCode
::
init
(
int
d
)
{
// It's not necessary to use avx512 since it would slow down the frequency
// It's not necessary to use avx512 since it would slow down the frequency
// and this kernel is not compute bound.
// and this kernel is not compute bound.
return
MayIUse
(
avx
);
return
MayIUse
(
avx
);
}
}
void
V
Mul
JitCode
::
generate
()
{
void
V
VV
JitCode
::
generate
()
{
// do not need push stack, and do not need save avx512reg if do not use avx512
// do not need push stack, and do not need save avx512reg if do not use avx512
int
offset
=
0
;
int
offset
=
0
;
if
(
with_relu_
)
{
vxorps
(
ymm_zero
,
ymm_zero
,
ymm_zero
);
}
for
(
int
i
=
0
;
i
<
num_
/
AVX_FLOAT_BLOCK
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_
/
AVX_FLOAT_BLOCK
;
++
i
)
{
vmovups
(
ymm_src1
,
ptr
[
param1
+
offset
]);
vmovups
(
ymm_src1
,
ptr
[
param1
+
offset
]);
vmovups
(
ymm_src2
,
ptr
[
param2
+
offset
]);
vmovups
(
ymm_src2
,
ptr
[
param2
+
offset
]);
vmulps
(
ymm_dst
,
ymm_src1
,
ymm_src2
);
if
(
type_
==
operand_type
::
mul
)
{
vmulps
(
ymm_dst
,
ymm_src1
,
ymm_src2
);
}
else
if
(
type_
==
operand_type
::
add
)
{
vaddps
(
ymm_dst
,
ymm_src1
,
ymm_src2
);
}
if
(
with_relu_
)
{
vmaxps
(
ymm_dst
,
ymm_zero
,
ymm_dst
);
}
vmovups
(
ptr
[
param3
+
offset
],
ymm_dst
);
vmovups
(
ptr
[
param3
+
offset
],
ymm_dst
);
offset
+=
sizeof
(
float
)
*
AVX_FLOAT_BLOCK
;
offset
+=
sizeof
(
float
)
*
AVX_FLOAT_BLOCK
;
}
}
...
@@ -44,7 +54,14 @@ void VMulJitCode::generate() {
...
@@ -44,7 +54,14 @@ void VMulJitCode::generate() {
if
(
rest
>=
4
)
{
if
(
rest
>=
4
)
{
vmovups
(
xmm_src1
,
ptr
[
param1
+
offset
]);
vmovups
(
xmm_src1
,
ptr
[
param1
+
offset
]);
vmovups
(
xmm_src2
,
ptr
[
param2
+
offset
]);
vmovups
(
xmm_src2
,
ptr
[
param2
+
offset
]);
vmulps
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
if
(
type_
==
operand_type
::
mul
)
{
vmulps
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
}
else
if
(
type_
==
operand_type
::
add
)
{
vaddps
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
}
if
(
with_relu_
)
{
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_dst
);
}
vmovups
(
ptr
[
param3
+
offset
],
xmm_dst
);
vmovups
(
ptr
[
param3
+
offset
],
xmm_dst
);
offset
+=
sizeof
(
float
)
*
4
;
offset
+=
sizeof
(
float
)
*
4
;
rest
-=
4
;
rest
-=
4
;
...
@@ -52,7 +69,14 @@ void VMulJitCode::generate() {
...
@@ -52,7 +69,14 @@ void VMulJitCode::generate() {
if
(
rest
>=
2
)
{
if
(
rest
>=
2
)
{
vmovq
(
xmm_src1
,
ptr
[
param1
+
offset
]);
vmovq
(
xmm_src1
,
ptr
[
param1
+
offset
]);
vmovq
(
xmm_src2
,
ptr
[
param2
+
offset
]);
vmovq
(
xmm_src2
,
ptr
[
param2
+
offset
]);
vmulps
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
if
(
type_
==
operand_type
::
mul
)
{
vmulps
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
}
else
if
(
type_
==
operand_type
::
add
)
{
vaddps
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
}
if
(
with_relu_
)
{
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_dst
);
}
vmovq
(
ptr
[
param3
+
offset
],
xmm_dst
);
vmovq
(
ptr
[
param3
+
offset
],
xmm_dst
);
offset
+=
sizeof
(
float
)
*
2
;
offset
+=
sizeof
(
float
)
*
2
;
rest
-=
2
;
rest
-=
2
;
...
@@ -60,12 +84,18 @@ void VMulJitCode::generate() {
...
@@ -60,12 +84,18 @@ void VMulJitCode::generate() {
if
(
rest
>
0
)
{
if
(
rest
>
0
)
{
vmovss
(
xmm_src1
,
ptr
[
param1
+
offset
]);
vmovss
(
xmm_src1
,
ptr
[
param1
+
offset
]);
vmovss
(
xmm_src2
,
ptr
[
param2
+
offset
]);
vmovss
(
xmm_src2
,
ptr
[
param2
+
offset
]);
vmulss
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
if
(
type_
==
operand_type
::
mul
)
{
vmulss
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
}
else
if
(
type_
==
operand_type
::
add
)
{
vaddss
(
xmm_dst
,
xmm_src1
,
xmm_src2
);
}
if
(
with_relu_
)
{
vmaxps
(
xmm_dst
,
xmm_zero
,
xmm_dst
);
}
vmovss
(
ptr
[
param3
+
offset
],
xmm_dst
);
vmovss
(
ptr
[
param3
+
offset
],
xmm_dst
);
}
}
ret
();
ret
();
}
}
}
// namespace gen
}
// namespace gen
}
// namespace jitkernel
}
// namespace jitkernel
}
// namespace math
}
// namespace math
...
...
paddle/fluid/operators/math/jit_code.h
浏览文件 @
234a1d92
...
@@ -14,8 +14,8 @@ limitations under the License. */
...
@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once
#pragma once
#include <string>
#include "paddle/fluid/operators/math/jit_gen.h"
#include "paddle/fluid/operators/math/jit_gen.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
...
@@ -29,28 +29,47 @@ using ymm_t = const Xbyak::Ymm;
...
@@ -29,28 +29,47 @@ using ymm_t = const Xbyak::Ymm;
using
zmm_t
=
const
Xbyak
::
Zmm
;
using
zmm_t
=
const
Xbyak
::
Zmm
;
using
Label
=
Xbyak
::
Label
;
using
Label
=
Xbyak
::
Label
;
class
VMulJitCode
:
public
JitCode
{
// function: vec = Operand(vec, vec) (maybe with relu)
typedef
enum
{
mul
=
0
,
add
}
operand_type
;
class
VVVJitCode
:
public
JitCode
{
public:
public:
DECLARE_JIT_CODE
(
VMulJitCode
);
const
char
*
name
()
const
override
{
explicit
VMulJitCode
(
int
d
,
size_t
code_size
=
256
*
1024
,
std
::
string
base
=
"VVVJitCode"
;
void
*
code_ptr
=
nullptr
)
if
(
type_
==
operand_type
::
mul
)
{
:
JitCode
(
code_size
,
code_ptr
),
num_
(
d
)
{}
base
+=
"_Mul"
;
}
else
if
(
type_
==
operand_type
::
add
)
{
base
+=
"_Add"
;
}
base
+=
(
with_relu_
?
"_relu"
:
""
);
return
base
.
c_str
();
}
explicit
VVVJitCode
(
int
d
,
operand_type
type
,
bool
with_relu
,
size_t
code_size
=
256
*
1024
,
void
*
code_ptr
=
nullptr
)
:
JitCode
(
code_size
,
code_ptr
),
num_
(
d
),
type_
(
type
),
with_relu_
(
with_relu
)
{}
static
bool
init
(
int
d
);
static
bool
init
(
int
d
);
void
generate
()
override
;
void
generate
()
override
;
private:
private:
int
num_
;
int
num_
;
operand_type
type_
;
bool
with_relu_
;
reg64_t
param1
{
abi_param1
};
reg64_t
param1
{
abi_param1
};
reg64_t
param2
{
abi_param2
};
reg64_t
param2
{
abi_param2
};
reg64_t
param3
{
abi_param3
};
reg64_t
param3
{
abi_param3
};
xmm_t
xmm_src1
=
xmm_t
(
0
);
xmm_t
xmm_src1
=
xmm_t
(
0
);
xmm_t
xmm_src2
=
xmm_t
(
1
);
xmm_t
xmm_src2
=
xmm_t
(
1
);
xmm_t
xmm_dst
=
xmm_t
(
2
);
xmm_t
xmm_dst
=
xmm_t
(
1
);
xmm_t
xmm_zero
=
xmm_t
(
2
);
ymm_t
ymm_src1
=
ymm_t
(
0
);
ymm_t
ymm_src1
=
ymm_t
(
0
);
ymm_t
ymm_src2
=
ymm_t
(
1
);
ymm_t
ymm_src2
=
ymm_t
(
1
);
ymm_t
ymm_dst
=
ymm_t
(
2
);
ymm_t
ymm_dst
=
ymm_t
(
1
);
ymm_t
ymm_zero
=
ymm_t
(
2
);
};
};
}
// namespace gen
}
// namespace gen
...
...
paddle/fluid/operators/math/jit_kernel.h
浏览文件 @
234a1d92
...
@@ -71,26 +71,26 @@ class VMulKernel : public Kernel {
...
@@ -71,26 +71,26 @@ class VMulKernel : public Kernel {
template
<
typename
T
>
template
<
typename
T
>
class
VAddKernel
:
public
Kernel
{
class
VAddKernel
:
public
Kernel
{
public:
public:
v
irtual
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
=
0
;
v
oid
(
*
Compute
)(
const
T
*
,
const
T
*
,
T
*
,
int
)
;
};
};
template
<
typename
T
>
template
<
typename
T
>
class
V
Scal
Kernel
:
public
Kernel
{
class
V
AddRelu
Kernel
:
public
Kernel
{
public:
public:
virtual
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
=
0
;
void
(
*
Compute
)(
const
T
*
,
const
T
*
,
T
*
,
int
);
virtual
void
Compute
(
const
T
a
,
T
*
x
)
const
=
0
;
};
};
template
<
typename
T
>
template
<
typename
T
>
class
V
AddBias
Kernel
:
public
Kernel
{
class
V
Scal
Kernel
:
public
Kernel
{
public:
public:
virtual
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
(
const
T
a
,
T
*
x
)
const
=
0
;
};
};
template
<
typename
T
>
template
<
typename
T
>
class
VAdd
Relu
Kernel
:
public
Kernel
{
class
VAdd
Bias
Kernel
:
public
Kernel
{
public:
public:
virtual
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
=
0
;
virtual
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
=
0
;
};
};
template
<
typename
T
>
template
<
typename
T
>
...
...
paddle/fluid/operators/math/jit_kernel_blas.cc
浏览文件 @
234a1d92
...
@@ -38,6 +38,21 @@ void VMulRefer(const T* x, const T* y, T* z, int n) {
...
@@ -38,6 +38,21 @@ void VMulRefer(const T* x, const T* y, T* z, int n) {
}
}
}
}
// Portable reference implementation of element-wise vector addition.
// Writes x[i] + y[i] into z[i] for every i in [0, n); does nothing when
// n <= 0. Each element is read before it is written, so z may be the same
// buffer as x or y.
template <typename T>
void VAddRefer(const T* x, const T* y, T* z, int n) {
  int idx = 0;
  while (idx < n) {
    z[idx] = x[idx] + y[idx];
    ++idx;
  }
}
// Portable reference implementation of fused element-wise add + ReLU.
// For every i in [0, n) stores x[i] + y[i] into z[i] when that sum is
// greater than zero, and 0 otherwise; does nothing when n <= 0. Each element
// is read before it is written, so z may be the same buffer as x or y.
template <typename T>
void VAddReluRefer(const T* x, const T* y, T* z, int n) {
  for (int k = 0; k < n; ++k) {
    // Keep the sum in T so any narrowing matches a store into z[k].
    const T sum = x[k] + y[k];
    z[k] = sum > 0 ? sum : 0;
  }
}
#ifdef PADDLE_WITH_MKLML
#ifdef PADDLE_WITH_MKLML
template
<
typename
T
>
template
<
typename
T
>
void
VMulMKL
(
const
T
*
x
,
const
T
*
y
,
T
*
z
,
int
n
);
void
VMulMKL
(
const
T
*
x
,
const
T
*
y
,
T
*
z
,
int
n
);
...
@@ -46,28 +61,45 @@ template <>
...
@@ -46,28 +61,45 @@ template <>
void
VMulMKL
<
float
>
(
const
float
*
x
,
const
float
*
y
,
float
*
z
,
int
n
)
{
void
VMulMKL
<
float
>
(
const
float
*
x
,
const
float
*
y
,
float
*
z
,
int
n
)
{
platform
::
dynload
::
vsMul
(
n
,
x
,
y
,
z
);
platform
::
dynload
::
vsMul
(
n
,
x
,
y
,
z
);
}
}
template
<
>
template
<
>
void
VMulMKL
<
double
>
(
const
double
*
x
,
const
double
*
y
,
double
*
z
,
int
n
)
{
void
VMulMKL
<
double
>
(
const
double
*
x
,
const
double
*
y
,
double
*
z
,
int
n
)
{
platform
::
dynload
::
vdMul
(
n
,
x
,
y
,
z
);
platform
::
dynload
::
vdMul
(
n
,
x
,
y
,
z
);
}
}
// Element-wise vector add backed by the dynamically loaded math library.
// Only the float and double specializations below are defined; the primary
// template is declared but has no body.
template
<
typename
T
>
void
VAddMKL
(
const
T
*
x
,
const
T
*
y
,
T
*
z
,
int
n
);
// float: forwards to platform::dynload::vsAdd(n, x, y, z).
// NOTE(review): presumably z[i] = x[i] + y[i] for i in [0, n) -- confirm
// against the vsAdd documentation.
template
<
>
void
VAddMKL
<
float
>
(
const
float
*
x
,
const
float
*
y
,
float
*
z
,
int
n
)
{
platform
::
dynload
::
vsAdd
(
n
,
x
,
y
,
z
);
}
// double: forwards to platform::dynload::vdAdd(n, x, y, z).
// NOTE(review): presumably the double-precision counterpart of vsAdd --
// confirm against the vdAdd documentation.
template
<
>
void
VAddMKL
<
double
>
(
const
double
*
x
,
const
double
*
y
,
double
*
z
,
int
n
)
{
platform
::
dynload
::
vdAdd
(
n
,
x
,
y
,
z
);
}
#endif
#endif
// Declares the default static members shared by the kernel implementation
// classes that expand this macro:
//   - name(int d): throws via PADDLE_THROW with a message saying the data
//     type should be float or double;
//   - useJIT(int d) and useMKL(int d): both return false.
// Type-specific behavior is supplied by explicit specializations of these
// members defined after each class (e.g. VMulKernelImpl<float>::useJIT).
#define DECLARE_STATIC_FUNC \
static inline std::string name(int d) { \
PADDLE_THROW("DType should be either float or double"); \
} \
static inline bool useJIT(int d) { return false; } \
static inline bool useMKL(int d) { return false; }
/* VMUL JitKernel */
/* VMUL JitKernel */
template
<
typename
T
>
template
<
typename
T
>
class
VMulKernelImpl
:
public
VMulKernel
<
T
>
{
class
VMulKernelImpl
:
public
VMulKernel
<
T
>
{
public:
public:
static
inline
std
::
string
name
(
int
d
)
{
DECLARE_STATIC_FUNC
;
PADDLE_THROW
(
"DType should be either float or double"
);
}
static
inline
bool
useJIT
(
int
d
)
{
return
false
;
}
static
inline
bool
useMKL
(
int
d
)
{
return
false
;
}
explicit
VMulKernelImpl
(
int
d
)
:
VMulKernel
<
T
>
()
{
explicit
VMulKernelImpl
(
int
d
)
:
VMulKernel
<
T
>
()
{
#ifdef PADDLE_WITH_XBYAK
#ifdef PADDLE_WITH_XBYAK
if
(
useJIT
(
d
))
{
if
(
useJIT
(
d
))
{
// roughly estimate the size of code
// roughly estimate the size of code
size_t
sz
=
96
+
d
/
AVX_FLOAT_BLOCK
*
4
*
8
;
size_t
sz
=
96
+
d
/
AVX_FLOAT_BLOCK
*
4
*
8
;
jitcode_
.
reset
(
new
gen
::
VMulJitCode
(
d
,
sz
>
4096
?
sz
:
4096
));
jitcode_
.
reset
(
new
gen
::
VVVJitCode
(
d
,
gen
::
operand_type
::
mul
,
false
,
sz
>
4096
?
sz
:
4096
));
this
->
Compute
=
this
->
Compute
=
jitcode_
->
getCode
<
void
(
*
)(
const
T
*
,
const
T
*
,
T
*
,
int
)
>
();
jitcode_
->
getCode
<
void
(
*
)(
const
T
*
,
const
T
*
,
T
*
,
int
)
>
();
return
;
return
;
...
@@ -85,14 +117,14 @@ class VMulKernelImpl : public VMulKernel<T> {
...
@@ -85,14 +117,14 @@ class VMulKernelImpl : public VMulKernel<T> {
#ifdef PADDLE_WITH_XBYAK
#ifdef PADDLE_WITH_XBYAK
private:
private:
std
::
unique_ptr
<
gen
::
V
Mul
JitCode
>
jitcode_
{
nullptr
};
std
::
unique_ptr
<
gen
::
V
VV
JitCode
>
jitcode_
{
nullptr
};
#endif
#endif
};
};
#ifdef PADDLE_WITH_XBYAK
#ifdef PADDLE_WITH_XBYAK
template
<
>
template
<
>
bool
VMulKernelImpl
<
float
>::
useJIT
(
int
d
)
{
bool
VMulKernelImpl
<
float
>::
useJIT
(
int
d
)
{
return
gen
::
V
Mul
JitCode
::
init
(
d
);
return
gen
::
V
VV
JitCode
::
init
(
d
);
}
}
#endif
#endif
...
@@ -108,63 +140,93 @@ bool VMulKernelImpl<double>::useMKL(int d) {
...
@@ -108,63 +140,93 @@ bool VMulKernelImpl<double>::useMKL(int d) {
}
}
#endif
#endif
REGISTER_JITKERNEL
(
vmul
,
VMulKernel
);
/* VAdd JitKernel */
template
<
typename
T
>
/* VADD JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VAddKernelImpl
:
public
VAddKernel
<
T
>
{
class
VAddKernelImpl
:
public
VAddKernel
<
T
>
{
public:
public:
explicit
VAddKernelImpl
(
int
d
)
:
VAddKernel
<
T
>
()
{
this
->
num_
=
d
;
}
DECLARE_STATIC_FUNC
;
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
override
{
explicit
VAddKernelImpl
(
int
d
)
:
VAddKernel
<
T
>
()
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
#ifdef PADDLE_WITH_XBYAK
z
[
i
]
=
x
[
i
]
+
y
[
i
];
if
(
useJIT
(
d
))
{
size_t
sz
=
96
+
d
/
AVX_FLOAT_BLOCK
*
4
*
8
;
jitcode_
.
reset
(
new
gen
::
VVVJitCode
(
d
,
gen
::
operand_type
::
add
,
false
,
sz
>
4096
?
sz
:
4096
));
this
->
Compute
=
jitcode_
->
getCode
<
void
(
*
)(
const
T
*
,
const
T
*
,
T
*
,
int
)
>
();
return
;
}
#endif
#ifdef PADDLE_WITH_MKLML
if
(
useMKL
(
d
))
{
this
->
Compute
=
VAddMKL
<
T
>
;
return
;
}
}
#endif
this
->
Compute
=
VAddRefer
<
T
>
;
}
}
#ifdef PADDLE_WITH_XBYAK
private:
std
::
unique_ptr
<
gen
::
VVVJitCode
>
jitcode_
{
nullptr
};
#endif
};
};
#ifdef PADDLE_WITH_MKLML
#ifdef PADDLE_WITH_XBYAK
#define MKL_FLOAT(isa, block) \
template
<
>
template <> \
bool
VAddKernelImpl
<
float
>::
useJIT
(
int
d
)
{
void VAddKernelImpl<float, isa, block>::Compute( \
return
gen
::
VVVJitCode
::
init
(
d
);
const float* x, const float* y, float* z) const { \
}
platform::dynload::vsAdd(this->num_, x, y, z); \
#endif
}
#define MKL_DOUBLE(isa, block) \
#ifdef PADDLE_WITH_MKLML
template <> \
template
<
>
void VAddKernelImpl<double, isa, block>::Compute( \
bool
VAddKernelImpl
<
float
>::
useMKL
(
int
d
)
{
const double* x, const double* y, double* z) const { \
return
d
>
512
;
platform::dynload::vdAdd(this->num_, x, y, z); \
}
}
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT16
);
template
<
>
FOR_EACH_ISA_BLOCK
(
MKL_DOUBLE
);
bool
VAddKernelImpl
<
double
>::
useMKL
(
int
d
)
{
return
true
;
}
#endif
#endif
#define INTRI8_FLOAT(isa) \
/* VAddRelu JitKernel */
template <> \
template
<
typename
T
>
void VAddKernelImpl<float, isa, kEQ8>::Compute( \
class
VAddReluKernelImpl
:
public
VAddReluKernel
<
T
>
{
const float* x, const float* y, float* z) const { \
public:
__m256 tmpx, tmpy; \
DECLARE_STATIC_FUNC
;
tmpx = _mm256_loadu_ps(x); \
explicit
VAddReluKernelImpl
(
int
d
)
:
VAddReluKernel
<
T
>
()
{
tmpy = _mm256_loadu_ps(y); \
#ifdef PADDLE_WITH_XBYAK
tmpx = _mm256_add_ps(tmpx, tmpy); \
if
(
useJIT
(
d
))
{
_mm256_storeu_ps(z, tmpx); \
size_t
sz
=
96
+
d
/
AVX_FLOAT_BLOCK
*
4
*
8
;
}
jitcode_
.
reset
(
new
gen
::
VVVJitCode
(
d
,
gen
::
operand_type
::
add
,
true
,
#ifdef __AVX__
sz
>
4096
?
sz
:
4096
));
INTRI8_FLOAT
(
jit
::
avx
);
this
->
Compute
=
jitcode_
->
getCode
<
void
(
*
)(
const
T
*
,
const
T
*
,
T
*
,
int
)
>
();
return
;
}
#endif
#endif
#ifdef __AVX2__
this
->
Compute
=
VAddReluRefer
<
T
>
;
INTRI8_FLOAT
(
jit
::
avx2
);
}
#ifdef PADDLE_WITH_XBYAK
private:
std
::
unique_ptr
<
gen
::
VVVJitCode
>
jitcode_
{
nullptr
};
#endif
#endif
#ifdef __AVX512F__
};
INTRI8_FLOAT
(
jit
::
avx512f
);
#ifdef PADDLE_WITH_XBYAK
template
<
>
bool
VAddReluKernelImpl
<
float
>::
useJIT
(
int
d
)
{
return
gen
::
VVVJitCode
::
init
(
d
);
}
#endif
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef DECLARE_STATIC_FUNC
#undef MKL_FLOAT
#undef MKL_DOUBLE
REGISTER_JITKERNEL
(
vmul
,
VMulKernel
);
REGISTER_JITKERNEL
(
vadd
,
VAddKernel
);
REGISTER_JITKERNEL
(
vaddrelu
,
VAddReluKernel
);
/* VSCAL JitKernel */
/* VSCAL JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
...
@@ -401,98 +463,9 @@ class VIdentityKernelImpl : public VIdentityKernel<T> {
...
@@ -401,98 +463,9 @@ class VIdentityKernelImpl : public VIdentityKernel<T> {
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{}
};
};
/* VAddRelu JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VAddReluKernelImpl
:
public
VAddReluKernel
<
T
>
{
public:
explicit
VAddReluKernelImpl
(
int
d
)
:
VAddReluKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
z
[
i
]
=
x
[
i
]
+
y
[
i
];
z
[
i
]
=
z
[
i
]
>
0
?
z
[
i
]
:
0
;
}
}
};
#define INTRI8_FLOAT(isa) \
template <> \
void VAddReluKernelImpl<float, isa, kEQ8>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 tmpx = _mm256_loadu_ps(x); \
__m256 tmpy = _mm256_loadu_ps(y); \
tmpy = _mm256_add_ps(tmpx, tmpy); \
tmpy = _mm256_max_ps(tmpy, _mm256_setzero_ps()); \
_mm256_storeu_ps(z, tmpy); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VAddReluKernelImpl<float, isa, kEQ16>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(y); \
tmp0 = _mm256_add_ps(tmp0, tmp1); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_loadu_ps(x + 8); \
__m256 tmp2 = _mm256_loadu_ps(y + 8); \
tmp1 = _mm256_add_ps(tmp1, tmp2); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(z, tmp0); \
_mm256_storeu_ps(z + 8, tmp1); \
}
#define INTRI_COMMON_FLOAT(isa, block) \
template <> \
VAddReluKernelImpl<float, isa, block>::VAddReluKernelImpl(int d) \
: VAddReluKernel<float>() { \
this->num_ = d; \
this->end_ = d - d % AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
} \
template <> \
void VAddReluKernelImpl<float, isa, block>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 zeros = _mm256_setzero_ps(); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmpx = _mm256_loadu_ps(x + i); \
__m256 tmpy = _mm256_loadu_ps(y + i); \
tmpy = _mm256_add_ps(tmpx, tmpy); \
tmpy = _mm256_max_ps(tmpy, zeros); \
_mm256_storeu_ps(z + i, tmpy); \
} \
for (int i = this->end_; i < this->num_; ++i) { \
z[i] = x[i] + y[i]; \
z[i] = z[i] > 0 ? z[i] : 0; \
} \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
INTRI16_FLOAT
(
jit
::
avx
);
INTRI_COMMON_FLOAT
(
jit
::
avx
,
kGT16
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
INTRI16_FLOAT
(
jit
::
avx2
);
INTRI_COMMON_FLOAT
(
jit
::
avx2
,
kGT16
);
#endif
#ifdef __AVX512F__
// TODO(TJ): refine avx512
INTRI8_FLOAT
(
jit
::
avx512f
);
INTRI16_FLOAT
(
jit
::
avx512f
);
INTRI_COMMON_FLOAT
(
jit
::
avx512f
,
kGT16
);
#endif
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef INTRI_COMMON_FLOAT
REGISTER_JITKERNEL_DEPRECATED
(
vadd
,
VAddKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
vscal
,
VScalKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
vscal
,
VScalKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
vaddb
,
VAddBiasKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
vaddb
,
VAddBiasKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
vrelu
,
VReluKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
vrelu
,
VReluKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
vaddrelu
,
VAddReluKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
videntity
,
VIdentityKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
videntity
,
VIdentityKernel
);
}
// namespace jitkernel
}
// namespace jitkernel
...
...
paddle/fluid/operators/math/jit_kernel_rnn.cc
浏览文件 @
234a1d92
...
@@ -177,7 +177,7 @@ class LSTMKernelImpl : public LSTMKernel<T> {
...
@@ -177,7 +177,7 @@ class LSTMKernelImpl : public LSTMKernel<T> {
act_cand_d_
->
Compute
(
gates
,
gates
);
act_cand_d_
->
Compute
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
,
d_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
,
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
,
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d2_
,
ct
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d2_
,
ct
,
d_
);
/* H_t = act_cell(C_t) * ogated */
/* H_t = act_cell(C_t) * ogated */
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
...
@@ -287,16 +287,16 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
...
@@ -287,16 +287,16 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
/* get fgated and igated*/
/* get fgated and igated*/
vmul_d_
->
Compute
(
wp_data
,
ct_1
,
checked
,
d_
);
vmul_d_
->
Compute
(
wp_data
,
ct_1
,
checked
,
d_
);
vmul_d_
->
Compute
(
wp_data
+
d_
,
ct_1
,
checked
+
d_
,
d_
);
vmul_d_
->
Compute
(
wp_data
+
d_
,
ct_1
,
checked
+
d_
,
d_
);
vadd_d2_
->
Compute
(
checked
,
gates
+
d_
,
gates
+
d_
);
vadd_d2_
->
Compute
(
checked
,
gates
+
d_
,
gates
+
d_
,
d2_
);
act_gate_d2_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
act_gate_d2_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
/* C_t = C_t-1 * fgated + cand_gated * igated*/
/* C_t = C_t-1 * fgated + cand_gated * igated*/
act_cand_d_
->
Compute
(
gates
,
gates
);
act_cand_d_
->
Compute
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
,
d_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
,
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
,
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d2_
,
ct
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d2_
,
ct
,
d_
);
/* get ogated*/
/* get ogated*/
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
,
d_
);
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d3_
,
gates
+
d3_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d3_
,
gates
+
d3_
,
d_
);
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
);
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
);
/* H_t = act_cell(C_t) * ogated */
/* H_t = act_cell(C_t) * ogated */
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
...
@@ -310,7 +310,7 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
...
@@ -310,7 +310,7 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
ct
,
d_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
ct
,
d_
);
/* get outgated, put W_oc * C_t on igated */
/* get outgated, put W_oc * C_t on igated */
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
,
d_
);
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d3_
,
gates
+
d3_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d3_
,
gates
+
d3_
,
d_
);
/* H_t = act_cell(C_t) * ogated */
/* H_t = act_cell(C_t) * ogated */
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
);
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
);
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
...
...
paddle/fluid/operators/math/jit_kernel_test.cc
浏览文件 @
234a1d92
...
@@ -371,7 +371,7 @@ void lstm_ctht_better(
...
@@ -371,7 +371,7 @@ void lstm_ctht_better(
vtanh_d
->
Compute
(
gates
,
gates
);
vtanh_d
->
Compute
(
gates
,
gates
);
vmul_d
->
Compute
(
gates
,
gates
+
d
,
gates
+
d
,
d
);
vmul_d
->
Compute
(
gates
,
gates
+
d
,
gates
+
d
,
d
);
vmul_d
->
Compute
(
ct_1
,
gates
+
d2
,
gates
+
d2
,
d
);
vmul_d
->
Compute
(
ct_1
,
gates
+
d2
,
gates
+
d2
,
d
);
vadd_d
->
Compute
(
gates
+
d
,
gates
+
d2
,
ct
);
vadd_d
->
Compute
(
gates
+
d
,
gates
+
d2
,
ct
,
d
);
/* H_t = act_cell(C_t) * ogated */
/* H_t = act_cell(C_t) * ogated */
vtanh_d
->
Compute
(
ct
,
gates
+
d2
);
vtanh_d
->
Compute
(
ct
,
gates
+
d2
);
vmul_d
->
Compute
(
gates
+
d2
,
gates
+
d
*
3
,
ht
,
d
);
vmul_d
->
Compute
(
gates
+
d2
,
gates
+
d
*
3
,
ht
,
d
);
...
@@ -695,7 +695,7 @@ TEST(JitKernel, vadd) {
...
@@ -695,7 +695,7 @@ TEST(JitKernel, vadd) {
auto
ttgts
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
y_data
,
ztgt_data
);
ker
->
Compute
(
x_data
,
y_data
,
ztgt_data
,
d
);
}
}
auto
ttgte
=
GetCurrentUS
();
auto
ttgte
=
GetCurrentUS
();
...
@@ -723,8 +723,8 @@ void vaddrelu_better(
...
@@ -723,8 +723,8 @@ void vaddrelu_better(
const
paddle
::
operators
::
math
::
jitkernel
::
VAddKernel
<
float
>>&
vadd
,
const
paddle
::
operators
::
math
::
jitkernel
::
VAddKernel
<
float
>>&
vadd
,
const
std
::
shared_ptr
<
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VReluKernel
<
float
>>&
vrelu
,
const
paddle
::
operators
::
math
::
jitkernel
::
VReluKernel
<
float
>>&
vrelu
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
const
float
*
x
,
const
float
*
y
,
float
*
z
,
int
d
)
{
vadd
->
Compute
(
x
,
y
,
z
);
vadd
->
Compute
(
x
,
y
,
z
,
d
);
vrelu
->
Compute
(
z
,
z
);
vrelu
->
Compute
(
z
,
z
);
}
}
...
@@ -752,12 +752,12 @@ TEST(JitKernel, vaddrelu) {
...
@@ -752,12 +752,12 @@ TEST(JitKernel, vaddrelu) {
auto
trefe
=
GetCurrentUS
();
auto
trefe
=
GetCurrentUS
();
auto
tmkls
=
GetCurrentUS
();
auto
tmkls
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vaddrelu_better
(
vadd
,
vrelu
,
x_data
,
y_data
,
zref_data
);
vaddrelu_better
(
vadd
,
vrelu
,
x_data
,
y_data
,
zref_data
,
d
);
}
}
auto
tmkle
=
GetCurrentUS
();
auto
tmkle
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
y_data
,
ztgt_data
);
ker
->
Compute
(
x_data
,
y_data
,
ztgt_data
,
d
);
}
}
auto
ttgte
=
GetCurrentUS
();
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
...
@@ -801,7 +801,11 @@ TEST(JitKernel, pool) {
...
@@ -801,7 +801,11 @@ TEST(JitKernel, pool) {
std
::
dynamic_pointer_cast
<
const
jit
::
Kernel
>
(
pvmul_d
));
std
::
dynamic_pointer_cast
<
const
jit
::
Kernel
>
(
pvmul_d
));
const
auto
&
pvmul_from_key
=
jit
::
KernelPool
::
Instance
().
Get
(
"vmulfjit4"
);
const
auto
&
pvmul_from_key
=
jit
::
KernelPool
::
Instance
().
Get
(
"vmulfjit4"
);
EXPECT_EQ
(
pvmul_f
,
pvmul_from_key
);
#if defined(__APPLE__) || defined(__OSX__) || defined(_WIN32)
EXPECT_EQ
(
pvmul_from_key
,
nullptr
);
#else
EXPECT_EQ
(
pvmul_from_key
,
pvmul_f
);
#endif
const
auto
&
pvmul_from_key2
=
jit
::
KernelPool
::
Instance
().
Get
(
"vmulfjit"
);
const
auto
&
pvmul_from_key2
=
jit
::
KernelPool
::
Instance
().
Get
(
"vmulfjit"
);
EXPECT_TRUE
(
pvmul_from_key2
==
nullptr
);
EXPECT_TRUE
(
pvmul_from_key2
==
nullptr
);
}
}
paddle/fluid/operators/math/selected_rows_functor.cu
浏览文件 @
234a1d92
...
@@ -19,6 +19,7 @@ limitations under the License. */
...
@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -81,7 +82,7 @@ template <typename T, int block_size>
...
@@ -81,7 +82,7 @@ template <typename T, int block_size>
__global__
void
SelectedRowsAddTensorKernel
(
const
T
*
selected_rows
,
__global__
void
SelectedRowsAddTensorKernel
(
const
T
*
selected_rows
,
const
int64_t
*
rows
,
T
*
tensor_out
,
const
int64_t
*
rows
,
T
*
tensor_out
,
int64_t
row_numel
)
{
int64_t
row_numel
)
{
const
int
ty
=
blockIdx
.
y
;
const
int
ty
=
blockIdx
.
x
;
int
tid
=
threadIdx
.
x
;
int
tid
=
threadIdx
.
x
;
selected_rows
+=
ty
*
row_numel
;
selected_rows
+=
ty
*
row_numel
;
...
@@ -119,11 +120,11 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
...
@@ -119,11 +120,11 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
auto
*
out_data
=
output
->
data
<
T
>
();
auto
*
out_data
=
output
->
data
<
T
>
();
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
functor
;
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
functor
;
functor
(
context
,
output
,
0.0
);
functor
(
context
,
output
,
static_cast
<
T
>
(
0
)
);
const
int
block_size
=
256
;
const
int
block_size
=
256
;
dim3
threads
(
block_size
,
1
);
dim3
threads
(
block_size
,
1
);
dim3
grid
(
1
,
in1_rows
.
size
()
);
dim3
grid
(
in1_rows
.
size
(),
1
);
SelectedRowsAddTensorKernel
<
SelectedRowsAddTensorKernel
<
T
,
block_size
><<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
T
,
block_size
><<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
in1_data
,
in1_rows
.
CUDAData
(
context
.
GetPlace
()),
out_data
,
in1_data
,
in1_rows
.
CUDAData
(
context
.
GetPlace
()),
out_data
,
...
@@ -137,6 +138,9 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
...
@@ -137,6 +138,9 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
float
>;
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
float
>;
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAdd
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
<
typename
T
>
template
<
typename
T
>
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
T
>
{
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
T
>
{
...
@@ -176,6 +180,8 @@ template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>;
...
@@ -176,6 +180,8 @@ template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
namespace
{
namespace
{
template
<
typename
T
,
int
block_size
>
template
<
typename
T
,
int
block_size
>
...
@@ -183,7 +189,7 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
...
@@ -183,7 +189,7 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
const
int64_t
*
rows
,
const
int64_t
*
rows
,
T
*
tensor_out
,
T
*
tensor_out
,
int64_t
row_numel
)
{
int64_t
row_numel
)
{
const
int
ty
=
blockIdx
.
y
;
const
int
ty
=
blockIdx
.
x
;
int
tid
=
threadIdx
.
x
;
int
tid
=
threadIdx
.
x
;
selected_rows
+=
ty
*
row_numel
;
selected_rows
+=
ty
*
row_numel
;
...
@@ -216,7 +222,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
...
@@ -216,7 +222,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
auto
*
in2_data
=
input2
->
data
<
T
>
();
auto
*
in2_data
=
input2
->
data
<
T
>
();
const
int
block_size
=
256
;
const
int
block_size
=
256
;
dim3
threads
(
block_size
,
1
);
dim3
threads
(
block_size
,
1
);
dim3
grid
(
1
,
in1_rows
.
size
()
);
dim3
grid
(
in1_rows
.
size
(),
1
);
SelectedRowsAddToTensorKernel
<
SelectedRowsAddToTensorKernel
<
T
,
block_size
><<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
T
,
block_size
><<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
in1_data
,
in1_rows
.
CUDAData
(
context
.
GetPlace
()),
in2_data
,
in1_data
,
in1_rows
.
CUDAData
(
context
.
GetPlace
()),
in2_data
,
...
@@ -228,6 +234,8 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
...
@@ -228,6 +234,8 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
namespace
scatter
{
namespace
scatter
{
...
@@ -288,7 +296,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
...
@@ -288,7 +296,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
context
.
GetPlace
());
context
.
GetPlace
());
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
constant_functor
;
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
constant_functor
;
constant_functor
(
context
,
out
.
mutable_value
(),
0.0
);
constant_functor
(
context
,
out
.
mutable_value
(),
static_cast
<
T
>
(
0
)
);
auto
*
out_data
=
out
.
mutable_value
()
->
data
<
T
>
();
auto
*
out_data
=
out
.
mutable_value
()
->
data
<
T
>
();
auto
*
input_data
=
input
.
value
().
data
<
T
>
();
auto
*
input_data
=
input
.
value
().
data
<
T
>
();
...
@@ -348,7 +356,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
...
@@ -348,7 +356,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
context
.
GetPlace
());
context
.
GetPlace
());
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
constant_functor
;
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
constant_functor
;
constant_functor
(
context
,
out
.
mutable_value
(),
0.0
);
constant_functor
(
context
,
out
.
mutable_value
(),
static_cast
<
T
>
(
0
)
);
auto
*
out_data
=
out
.
mutable_value
()
->
data
<
T
>
();
auto
*
out_data
=
out
.
mutable_value
()
->
data
<
T
>
();
...
@@ -375,12 +383,13 @@ template struct MergeAdd<platform::CUDADeviceContext, float>;
...
@@ -375,12 +383,13 @@ template struct MergeAdd<platform::CUDADeviceContext, float>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
<
typename
T
,
int
block_size
>
template
<
typename
T
,
int
block_size
>
__global__
void
UpdateToTensorKernel
(
const
T
*
selected_rows
,
__global__
void
UpdateToTensorKernel
(
const
T
*
selected_rows
,
const
int64_t
*
rows
,
const
ScatterOps
&
op
,
const
int64_t
*
rows
,
const
ScatterOps
&
op
,
T
*
tensor_out
,
int64_t
row_numel
)
{
T
*
tensor_out
,
int64_t
row_numel
)
{
const
int
ty
=
blockIdx
.
y
;
const
int
ty
=
blockIdx
.
x
;
int
tid
=
threadIdx
.
x
;
int
tid
=
threadIdx
.
x
;
selected_rows
+=
ty
*
row_numel
;
selected_rows
+=
ty
*
row_numel
;
...
@@ -449,7 +458,7 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
...
@@ -449,7 +458,7 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
auto
*
in2_data
=
input2
->
data
<
T
>
();
auto
*
in2_data
=
input2
->
data
<
T
>
();
dim3
threads
(
platform
::
PADDLE_CUDA_NUM_THREADS
,
1
);
dim3
threads
(
platform
::
PADDLE_CUDA_NUM_THREADS
,
1
);
dim3
grid
(
1
,
in1_rows
.
size
()
);
dim3
grid
(
in1_rows
.
size
(),
1
);
UpdateToTensorKernel
<
T
,
platform
::
PADDLE_CUDA_NUM_THREADS
><<<
UpdateToTensorKernel
<
T
,
platform
::
PADDLE_CUDA_NUM_THREADS
><<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
in1_data
,
in1_rows
.
cuda_data
(),
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
in1_data
,
in1_rows
.
cuda_data
(),
op
,
in2_data
,
in1_row_numel
);
op
,
in2_data
,
in1_row_numel
);
...
...
paddle/fluid/operators/math/selected_rows_functor.h
浏览文件 @
234a1d92
...
@@ -88,57 +88,6 @@ struct MergeAdd {
...
@@ -88,57 +88,6 @@ struct MergeAdd {
framework
::
SelectedRows
*
output
);
framework
::
SelectedRows
*
output
);
};
};
template
<
typename
DeviceContext
,
typename
T
>
struct
Add
{
framework
::
SelectedRows
operator
()(
const
DeviceContext
&
context
,
const
framework
::
SelectedRows
&
input1
,
const
framework
::
SelectedRows
&
input2
)
{
framework
::
SelectedRows
out
;
out
.
set_rows
(
input1
.
rows
());
out
.
set_height
(
input1
.
height
());
out
.
mutable_value
()
->
mutable_data
<
T
>
(
input1
.
value
().
dims
(),
context
.
GetPlace
());
auto
e_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
(
out
.
mutable_value
()));
auto
e_in1
=
framework
::
EigenVector
<
T
>::
Flatten
(
input1
.
value
());
auto
e_in2
=
framework
::
EigenVector
<
T
>::
Flatten
(
input2
.
value
());
e_out
.
device
(
*
context
.
eigen_device
())
=
e_in1
+
e_in2
;
return
out
;
}
};
template
<
typename
DeviceContext
,
typename
T
>
struct
Mul
{
// multiply two SelectedRows
framework
::
SelectedRows
operator
()(
const
DeviceContext
&
context
,
const
framework
::
SelectedRows
&
input1
,
const
framework
::
SelectedRows
&
input2
)
{
framework
::
SelectedRows
out
;
out
.
set_rows
(
input1
.
rows
());
out
.
set_height
(
input1
.
height
());
out
.
mutable_value
()
->
mutable_data
<
T
>
(
input1
.
value
().
dims
(),
context
.
GetPlace
());
auto
e_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
(
out
.
mutable_value
()));
auto
e_in1
=
framework
::
EigenVector
<
T
>::
Flatten
(
input1
.
value
());
auto
e_in2
=
framework
::
EigenVector
<
T
>::
Flatten
(
input2
.
value
());
e_out
.
device
(
*
context
.
eigen_device
())
=
e_in1
*
e_in2
;
return
out
;
}
// multiply scalar to SelectedRows
framework
::
SelectedRows
operator
()(
const
DeviceContext
&
context
,
const
framework
::
SelectedRows
&
input1
,
const
T
input2
)
{
framework
::
SelectedRows
out
;
out
.
set_rows
(
input1
.
rows
());
out
.
set_height
(
input1
.
height
());
out
.
mutable_value
()
->
mutable_data
<
T
>
(
input1
.
value
().
dims
(),
context
.
GetPlace
());
auto
e_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
(
out
.
mutable_value
()));
auto
e_in1
=
framework
::
EigenVector
<
T
>::
Flatten
(
input1
.
value
());
e_out
.
device
(
*
context
.
eigen_device
())
=
input2
*
e_in1
;
return
out
;
}
};
enum
class
ScatterOps
{
ASSIGN
,
ADD
,
SUB
,
SUBBY
,
MUL
,
DIV
,
DIVBY
};
enum
class
ScatterOps
{
ASSIGN
,
ADD
,
SUB
,
SUBBY
,
MUL
,
DIV
,
DIVBY
};
// out = seleted_rows_in / tensor
// out = seleted_rows_in / tensor
...
...
paddle/fluid/operators/math/softmax.cu
浏览文件 @
234a1d92
...
@@ -96,12 +96,15 @@ template class SoftmaxCUDNNFunctor<float>;
...
@@ -96,12 +96,15 @@ template class SoftmaxCUDNNFunctor<float>;
template
class
SoftmaxCUDNNFunctor
<
double
>;
template
class
SoftmaxCUDNNFunctor
<
double
>;
template
class
SoftmaxGradCUDNNFunctor
<
float
>;
template
class
SoftmaxGradCUDNNFunctor
<
float
>;
template
class
SoftmaxGradCUDNNFunctor
<
double
>;
template
class
SoftmaxGradCUDNNFunctor
<
double
>;
template
class
SoftmaxGradCUDNNFunctor
<
platform
::
float16
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
}
// namespace math
}
// namespace math
}
// namespace operators
}
// namespace operators
...
...
paddle/fluid/operators/mean_op.cu
浏览文件 @
234a1d92
...
@@ -15,11 +15,15 @@ limitations under the License. */
...
@@ -15,11 +15,15 @@ limitations under the License. */
#define EIGEN_USE_GPU
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/mean_op.h"
#include "paddle/fluid/operators/mean_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
REGISTER_OP_CUDA_KERNEL
(
mean
,
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
mean
,
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
REGISTER_OP_CUDA_KERNEL
(
mean_grad
,
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
mean_grad
,
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/mean_op.h
浏览文件 @
234a1d92
...
@@ -55,8 +55,7 @@ class MeanGradKernel : public framework::OpKernel<T> {
...
@@ -55,8 +55,7 @@ class MeanGradKernel : public framework::OpKernel<T> {
IG
->
mutable_data
<
T
>
(
context
.
GetPlace
());
IG
->
mutable_data
<
T
>
(
context
.
GetPlace
());
T
ig_size
=
static_cast
<
T
>
(
IG
->
numel
());
T
ig_size
=
static_cast
<
T
>
(
IG
->
numel
());
Eigen
::
DSizes
<
int
,
1
>
bcast
(
ig_size
);
Eigen
::
DSizes
<
int
,
1
>
bcast
(
static_cast
<
int
>
(
ig_size
));
EigenVector
<
T
>::
Flatten
(
*
IG
).
device
(
EigenVector
<
T
>::
Flatten
(
*
IG
).
device
(
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
())
=
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
())
=
(
EigenVector
<
T
>::
From
(
*
OG
)
/
ig_size
).
broadcast
(
bcast
);
(
EigenVector
<
T
>::
From
(
*
OG
)
/
ig_size
).
broadcast
(
bcast
);
...
...
paddle/fluid/operators/mul_op.cu.cc
浏览文件 @
234a1d92
...
@@ -20,6 +20,7 @@ namespace plat = paddle::platform;
...
@@ -20,6 +20,7 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL
(
mul
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
float
>
,
REGISTER_OP_CUDA_KERNEL
(
mul
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
mul_grad
,
REGISTER_OP_CUDA_KERNEL
(
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
mul_grad
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
double
>
);
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/pool_cudnn_op.cu.cc
浏览文件 @
234a1d92
...
@@ -178,7 +178,8 @@ REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
...
@@ -178,7 +178,8 @@ REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
ops
::
PoolCUDNNOpKernel
<
plat
::
float16
>
);
ops
::
PoolCUDNNOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
pool2d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
REGISTER_OP_KERNEL
(
pool2d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNGradOpKernel
<
float
>
,
ops
::
PoolCUDNNGradOpKernel
<
float
>
,
ops
::
PoolCUDNNGradOpKernel
<
double
>
);
ops
::
PoolCUDNNGradOpKernel
<
double
>
,
ops
::
PoolCUDNNGradOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
pool3d
,
CUDNN
,
plat
::
CUDAPlace
,
REGISTER_OP_KERNEL
(
pool3d
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNOpKernel
<
float
>
,
ops
::
PoolCUDNNOpKernel
<
float
>
,
...
...
paddle/fluid/operators/scale_op.cu
浏览文件 @
234a1d92
...
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
...
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/scale_op.h"
#include "paddle/fluid/operators/scale_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
REGISTER_OP_CUDA_KERNEL
(
scale
,
scale
,
...
@@ -20,4 +22,6 @@ REGISTER_OP_CUDA_KERNEL(
...
@@ -20,4 +22,6 @@ REGISTER_OP_CUDA_KERNEL(
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
int64_t
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/softmax_cudnn_op.cu.cc
浏览文件 @
234a1d92
...
@@ -80,4 +80,5 @@ REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
...
@@ -80,4 +80,5 @@ REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
ops
::
SoftmaxCUDNNKernel
<
plat
::
float16
>
);
ops
::
SoftmaxCUDNNKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
softmax_grad
,
CUDNN
,
plat
::
CUDAPlace
,
REGISTER_OP_KERNEL
(
softmax_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
SoftmaxGradCUDNNKernel
<
float
>
,
ops
::
SoftmaxGradCUDNNKernel
<
float
>
,
ops
::
SoftmaxGradCUDNNKernel
<
double
>
);
ops
::
SoftmaxGradCUDNNKernel
<
double
>
,
ops
::
SoftmaxGradCUDNNKernel
<
plat
::
float16
>
);
paddle/fluid/operators/softmax_op.cu.cc
浏览文件 @
234a1d92
...
@@ -23,4 +23,5 @@ REGISTER_OP_CUDA_KERNEL(
...
@@ -23,4 +23,5 @@ REGISTER_OP_CUDA_KERNEL(
ops
::
SoftmaxKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
ops
::
SoftmaxKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
REGISTER_OP_CUDA_KERNEL
(
softmax_grad
,
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
softmax_grad
,
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
double
>
);
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/sum_op.cu
浏览文件 @
234a1d92
...
@@ -11,10 +11,13 @@ limitations under the License. */
...
@@ -11,10 +11,13 @@ limitations under the License. */
#define EIGEN_USE_GPU
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
REGISTER_OP_CUDA_KERNEL
(
sum
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
sum
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/sum_op.h
浏览文件 @
234a1d92
...
@@ -61,7 +61,7 @@ class SumKernel : public framework::OpKernel<T> {
...
@@ -61,7 +61,7 @@ class SumKernel : public framework::OpKernel<T> {
if
(
start
!=
2
)
{
if
(
start
!=
2
)
{
math
::
SetConstant
<
DeviceContext
,
T
>
constant_functor
;
math
::
SetConstant
<
DeviceContext
,
T
>
constant_functor
;
constant_functor
(
context
.
template
device_context
<
DeviceContext
>(),
constant_functor
(
context
.
template
device_context
<
DeviceContext
>(),
out
,
0.0
);
out
,
static_cast
<
T
>
(
0
)
);
}
}
}
}
...
...
paddle/fluid/operators/tensorrt_engine_op.h
浏览文件 @
234a1d92
...
@@ -223,7 +223,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
...
@@ -223,7 +223,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
// Add outputs
// Add outputs
for
(
auto
&
output
:
output_maps
)
{
for
(
auto
&
output
:
output_maps
)
{
engine
->
DeclareOutput
(
output
);
if
(
!
engine
->
HasDeclared
(
output
))
{
engine
->
DeclareOutput
(
output
);
}
}
}
engine
->
FreezeNetwork
();
engine
->
FreezeNetwork
();
...
...
paddle/fluid/platform/init.cc
浏览文件 @
234a1d92
...
@@ -116,6 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
...
@@ -116,6 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
platform
::
SetNumThreads
(
FLAGS_paddle_num_threads
);
platform
::
SetNumThreads
(
FLAGS_paddle_num_threads
);
#endif
#endif
#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__OSX__)
if
(
platform
::
jit
::
MayIUse
(
platform
::
jit
::
avx
))
{
if
(
platform
::
jit
::
MayIUse
(
platform
::
jit
::
avx
))
{
#ifndef __AVX__
#ifndef __AVX__
LOG
(
WARNING
)
<<
"AVX is available, Please re-compile on local machine"
;
LOG
(
WARNING
)
<<
"AVX is available, Please re-compile on local machine"
;
...
@@ -157,8 +158,9 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
...
@@ -157,8 +158,9 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
AVX_GUIDE
(
AVX
,
NonAVX
);
AVX_GUIDE
(
AVX
,
NonAVX
);
}
}
#endif
#endif
#undef AVX_GUIDE
#undef AVX_GUIDE
#endif
}
}
void
InitGLOG
(
const
std
::
string
&
prog_name
)
{
void
InitGLOG
(
const
std
::
string
&
prog_name
)
{
...
...
paddle/fluid/platform/profiler.cc
浏览文件 @
234a1d92
...
@@ -226,7 +226,7 @@ RecordBlock::~RecordBlock() {
...
@@ -226,7 +226,7 @@ RecordBlock::~RecordBlock() {
void
EnableProfiler
(
ProfilerState
state
)
{
void
EnableProfiler
(
ProfilerState
state
)
{
PADDLE_ENFORCE
(
state
!=
ProfilerState
::
kDisabled
,
PADDLE_ENFORCE
(
state
!=
ProfilerState
::
kDisabled
,
"Can't en
bale prof
ling, since the input state is "
,
"Can't en
able profi
ling, since the input state is "
,
"ProfilerState::kDisabled"
);
"ProfilerState::kDisabled"
);
std
::
lock_guard
<
std
::
mutex
>
l
(
profiler_mu
);
std
::
lock_guard
<
std
::
mutex
>
l
(
profiler_mu
);
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
234a1d92
...
@@ -742,7 +742,12 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -742,7 +742,12 @@ All parameter, weight, gradient are variables in Paddle.
will clean up the temp variables at the end of the current iteration.
will clean up the temp variables at the end of the current iteration.
2. In some NLP model, it may cause the GPU memory is insufficient,
2. In some NLP model, it may cause the GPU memory is insufficient,
in this case, you should reduce `num_iteration_per_drop_scope`.
in this case, you should reduce `num_iteration_per_drop_scope`.
)DOC"
);
)DOC"
)
.
def_property
(
"_dry_run"
,
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
dry_run_
;
},
[](
ExecutionStrategy
&
self
,
bool
dry_run
)
{
self
.
dry_run_
=
dry_run
;
});
exec_strategy
.
def_property
(
exec_strategy
.
def_property
(
"use_experimental_executor"
,
"use_experimental_executor"
,
...
...
python/paddle/fluid/__init__.py
浏览文件 @
234a1d92
...
@@ -118,7 +118,6 @@ def __bootstrap__():
...
@@ -118,7 +118,6 @@ def __bootstrap__():
]
]
if
core
.
is_compiled_with_dist
():
if
core
.
is_compiled_with_dist
():
read_env_flags
.
append
(
'rpc_deadline'
)
read_env_flags
.
append
(
'rpc_deadline'
)
read_env_flags
.
append
(
'rpc_server_profile_period'
)
read_env_flags
.
append
(
'rpc_server_profile_path'
)
read_env_flags
.
append
(
'rpc_server_profile_path'
)
read_env_flags
.
append
(
'enable_rpc_profiler'
)
read_env_flags
.
append
(
'enable_rpc_profiler'
)
read_env_flags
.
append
(
'rpc_send_thread_num'
)
read_env_flags
.
append
(
'rpc_send_thread_num'
)
...
...
python/paddle/fluid/io.py
浏览文件 @
234a1d92
...
@@ -65,7 +65,7 @@ def is_persistable(var):
...
@@ -65,7 +65,7 @@ def is_persistable(var):
Examples:
Examples:
.. code-block:: python
.. code-block:: python
param = fluid.default_main_program().global_block().var('fc.
w
')
param = fluid.default_main_program().global_block().var('fc.
b
')
res = fluid.io.is_persistable(param)
res = fluid.io.is_persistable(param)
"""
"""
if
var
.
desc
.
type
()
==
core
.
VarDesc
.
VarType
.
FEED_MINIBATCH
or
\
if
var
.
desc
.
type
()
==
core
.
VarDesc
.
VarType
.
FEED_MINIBATCH
or
\
...
@@ -625,8 +625,13 @@ def save_inference_model(dirname,
...
@@ -625,8 +625,13 @@ def save_inference_model(dirname,
main_program
.
_distributed_lookup_table
,
main_program
.
_distributed_lookup_table
,
main_program
.
_endpoints
)
main_program
.
_endpoints
)
if
not
os
.
path
.
isdir
(
dirname
):
# when a pserver and a trainer running on the same machine, mkdir may conflict
try
:
os
.
makedirs
(
dirname
)
os
.
makedirs
(
dirname
)
except
OSError
as
e
:
if
e
.
errno
!=
errno
.
EEXIST
:
raise
if
model_filename
is
not
None
:
if
model_filename
is
not
None
:
model_basename
=
os
.
path
.
basename
(
model_filename
)
model_basename
=
os
.
path
.
basename
(
model_filename
)
else
:
else
:
...
...
python/paddle/fluid/layers/io.py
浏览文件 @
234a1d92
...
@@ -60,7 +60,7 @@ def data(name,
...
@@ -60,7 +60,7 @@ def data(name,
For example if shape=[1], the resulting shape is [-1, 1].
For example if shape=[1], the resulting shape is [-1, 1].
2. If shape contains -1, such as shape=[1, -1],
2. If shape contains -1, such as shape=[1, -1],
append_batch_size will be enforced to be be False (ineffective).
append_batch_size will be enforced to be be False (ineffective).
dtype(
int|float
): The type of data : float32, float_16, int etc
dtype(
basestring
): The type of data : float32, float_16, int etc
type(VarType): The output type. By default it is LOD_TENSOR.
type(VarType): The output type. By default it is LOD_TENSOR.
lod_level(int): The LoD Level. 0 means the input data is not a sequence.
lod_level(int): The LoD Level. 0 means the input data is not a sequence.
stop_gradient(bool): A boolean that mentions whether gradient should flow.
stop_gradient(bool): A boolean that mentions whether gradient should flow.
...
...
python/paddle/fluid/recordio_writer.py
浏览文件 @
234a1d92
...
@@ -41,9 +41,6 @@ def convert_reader_to_recordio_file(
...
@@ -41,9 +41,6 @@ def convert_reader_to_recordio_file(
"""
"""
Convert a Python Reader to a recordio file.
Convert a Python Reader to a recordio file.
Please see :ref:`api_guide_python_reader` and :ref:`api_guide_reader_op` for
details.
Examples:
Examples:
>>> import paddle.fluid as fluid
>>> import paddle.fluid as fluid
...
...
python/paddle/fluid/tests/unittests/op_test.py
浏览文件 @
234a1d92
...
@@ -54,14 +54,6 @@ def get_numeric_gradient(place,
...
@@ -54,14 +54,6 @@ def get_numeric_gradient(place,
def
product
(
dim
):
def
product
(
dim
):
return
six
.
moves
.
reduce
(
lambda
a
,
b
:
a
*
b
,
dim
,
1
)
return
six
.
moves
.
reduce
(
lambda
a
,
b
:
a
*
b
,
dim
,
1
)
def
get_output
():
sum
=
[]
op
.
run
(
scope
,
place
)
for
output_name
in
output_names
:
sum
.
append
(
np
.
array
(
scope
.
find_var
(
output_name
).
get_tensor
()).
mean
())
return
np
.
array
(
sum
).
sum
()
/
len
(
output_names
)
tensor_to_check
=
scope
.
find_var
(
input_to_check
).
get_tensor
()
tensor_to_check
=
scope
.
find_var
(
input_to_check
).
get_tensor
()
tensor_size
=
product
(
tensor_to_check
.
shape
())
tensor_size
=
product
(
tensor_to_check
.
shape
())
tensor_to_check_dtype
=
tensor_to_check
.
_dtype
()
tensor_to_check_dtype
=
tensor_to_check
.
_dtype
()
...
@@ -77,6 +69,15 @@ def get_numeric_gradient(place,
...
@@ -77,6 +69,15 @@ def get_numeric_gradient(place,
raise
ValueError
(
"Not supported data type "
+
str
(
raise
ValueError
(
"Not supported data type "
+
str
(
tensor_to_check_dtype
))
tensor_to_check_dtype
))
def
get_output
():
sum
=
[]
op
.
run
(
scope
,
place
)
for
output_name
in
output_names
:
sum
.
append
(
np
.
array
(
scope
.
find_var
(
output_name
).
get_tensor
()).
astype
(
tensor_to_check_dtype
).
mean
())
return
tensor_to_check_dtype
(
np
.
array
(
sum
).
sum
()
/
len
(
output_names
))
gradient_flat
=
np
.
zeros
(
shape
=
(
tensor_size
,
),
dtype
=
tensor_to_check_dtype
)
gradient_flat
=
np
.
zeros
(
shape
=
(
tensor_size
,
),
dtype
=
tensor_to_check_dtype
)
def
__get_elem__
(
tensor
,
i
):
def
__get_elem__
(
tensor
,
i
):
...
...
python/paddle/fluid/tests/unittests/test_activation_op.py
浏览文件 @
234a1d92
此差异已折叠。
点击以展开。
python/paddle/fluid/tests/unittests/test_conv2d_op.py
浏览文件 @
234a1d92
...
@@ -223,106 +223,81 @@ class TestWithInput1x1Filter1x1(TestConv2dOp):
...
@@ -223,106 +223,81 @@ class TestWithInput1x1Filter1x1(TestConv2dOp):
#----------------Conv2dCUDNN----------------
#----------------Conv2dCUDNN----------------
class
TestCUDNN
(
TestConv2dOp
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
class
TestFP16CUDNN
(
TestConv2dOp
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
self
.
dtype
=
np
.
float16
def
test_check_output
(
self
):
def
create_test_cudnn_class
(
parent
,
cls_name
):
if
core
.
is_compiled_with_cuda
():
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
place
=
core
.
CUDAPlace
(
0
)
"core is not compiled with CUDA"
)
if
core
.
is_float16_supported
(
place
):
class
TestCUDNNCase
(
parent
):
self
.
check_output_with_place
(
place
,
atol
=
2e-2
)
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
cls_name
=
"{0}"
.
format
(
cls_name
)
TestCUDNNCase
.
__name__
=
cls_name
globals
()[
cls_name
]
=
TestCUDNNCase
class
TestCUDNNWithPad
(
TestWithPad
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
class
TestFP16CUDNNWithPad
(
TestWithPad
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
self
.
dtype
=
np
.
float16
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_output_with_place
(
place
,
atol
=
2e-2
)
class
TestCUDNNWithStride
(
TestWithStride
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
class
TestFP16CUDNNWithStride
(
TestWithStride
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
self
.
dtype
=
np
.
float16
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_output_with_place
(
place
,
atol
=
2e-2
)
create_test_cudnn_class
(
TestConv2dOp
,
"TestPool2DCUDNNOp"
)
create_test_cudnn_class
(
TestWithPad
,
"TestPool2DCUDNNOpCase1"
)
create_test_cudnn_class
(
TestWithStride
,
"TestPool2DCUDNNOpCase2"
)
create_test_cudnn_class
(
TestWithGroup
,
"TestPool2DCUDNNOpCase3"
)
create_test_cudnn_class
(
TestWith1x1
,
"TestPool2DCUDNNOpCase4"
)
create_test_cudnn_class
(
TestWithInput1x1Filter1x1
,
"TestPool2DCUDNNOpCase4"
)
class
TestCUDNNWithGroup
(
TestWithGroup
):
#----------------Conv2dCUDNN----------------
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
class
TestFP16CUDNNWithGroup
(
TestWithGroup
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
self
.
dtype
=
np
.
float16
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_output_with_place
(
place
,
atol
=
2e-2
)
class
TestCUDNNWith1x1
(
TestWith1x1
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
def
create_test_cudnn_fp16_class
(
parent
,
cls_name
,
grad_check
=
True
):
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
"core is not compiled with CUDA"
)
class
TestConv2DCUDNNFp16
(
parent
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
self
.
dtype
=
np
.
float16
class
TestFP16CUDNNWith1x1
(
TestWith1x1
):
def
test_check_output
(
self
):
def
init_kernel_type
(
self
):
if
core
.
is_compiled_with_cuda
():
self
.
use_cudnn
=
True
place
=
core
.
CUDAPlace
(
0
)
self
.
dtype
=
np
.
float16
if
core
.
is_float16_supported
(
place
):
self
.
check_output_with_place
(
place
,
atol
=
2e-2
)
def
test_check_output
(
self
):
def
test_check_grad_no_filter
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
if
core
.
is_float16_supported
(
place
)
and
grad_check
:
self
.
check_output_with_place
(
place
,
atol
=
2e-2
)
self
.
check_grad_with_place
(
place
,
[
'Input'
],
'Output'
,
class
TestCUDNNWithInput1x1Filter1x1
(
TestWithInput1x1Filter1x1
):
max_relative_error
=
0.02
,
def
init_kernel_type
(
self
):
no_grad_set
=
set
([
'Filter'
]))
self
.
use_cudnn
=
True
def
test_check_grad_no_input
(
self
):
class
TestFP16CUDNNWithInput1x1Filter1x1
(
TestWithInput1x1Filter1x1
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
self
.
dtype
=
np
.
float16
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
if
core
.
is_float16_supported
(
place
)
and
grad_check
:
self
.
check_output_with_place
(
place
,
atol
=
2e-2
)
self
.
check_grad_with_place
(
place
,
[
'Filter'
],
'Output'
,
max_relative_error
=
0.02
,
no_grad_set
=
set
([
'Input'
]))
cls_name
=
"{0}"
.
format
(
cls_name
)
TestConv2DCUDNNFp16
.
__name__
=
cls_name
globals
()[
cls_name
]
=
TestConv2DCUDNNFp16
create_test_cudnn_fp16_class
(
TestConv2dOp
,
"TestPool2DCUDNNFp16Op"
,
grad_check
=
False
)
create_test_cudnn_fp16_class
(
TestWithPad
,
"TestPool2DCUDNNFp16OpCase1"
,
grad_check
=
False
)
create_test_cudnn_fp16_class
(
TestWithStride
,
"TestPool2DCUDNNFp16OpCase2"
,
grad_check
=
False
)
create_test_cudnn_fp16_class
(
TestWithGroup
,
"TestPool2DCUDNNFp16OpCase3"
,
grad_check
=
False
)
create_test_cudnn_fp16_class
(
TestWith1x1
,
"TestPool2DCUDNNFp16OpCase4"
,
grad_check
=
False
)
create_test_cudnn_fp16_class
(
TestWithInput1x1Filter1x1
,
"TestPool2DCUDNNFp16OpCase4"
,
grad_check
=
False
)
# -------TestDepthwiseConv
class
TestDepthwiseConv
(
TestConv2dOp
):
class
TestDepthwiseConv
(
TestConv2dOp
):
...
...
python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
浏览文件 @
234a1d92
...
@@ -16,28 +16,58 @@ from __future__ import print_function
...
@@ -16,28 +16,58 @@ from __future__ import print_function
import
unittest
import
unittest
import
numpy
as
np
import
numpy
as
np
import
paddle.fluid.core
as
core
from
op_test
import
OpTest
,
randomize_probability
from
op_test
import
OpTest
,
randomize_probability
class
TestCrossEntropyOp
1
(
OpTest
):
class
TestCrossEntropyOp
(
OpTest
):
"""Test cross-entropy with discrete one-hot labels.
"""Test cross-entropy with discrete one-hot labels.
"""
"""
def
setUp
(
self
):
def
setUp
(
self
):
self
.
op_type
=
"cross_entropy"
self
.
op_type
=
"cross_entropy"
batch_size
=
30
self
.
soft_label
=
False
class_num
=
10
self
.
ignore_index
=
-
100
self
.
dtype
=
np
.
float64
self
.
batch_size
=
30
self
.
class_num
=
10
self
.
init_dtype_type
()
self
.
init_attr_type
()
self
.
init_bs_class_num
()
self
.
init_x
()
self
.
init_label
()
self
.
get_cross_entropy
()
self
.
inputs
=
{
"X"
:
self
.
x
,
"Label"
:
self
.
label
}
self
.
outputs
=
{
"Y"
:
self
.
cross_entropy
}
self
.
attrs
=
{
"soft_label"
:
self
.
soft_label
,
"ignore_index"
:
self
.
ignore_index
}
def
init_x
(
self
):
self
.
x
=
randomize_probability
(
self
.
batch_size
,
self
.
class_num
,
dtype
=
self
.
dtype
)
def
init_label
(
self
):
self
.
label
=
np
.
random
.
randint
(
0
,
self
.
class_num
,
(
self
.
batch_size
,
1
),
dtype
=
"int64"
)
def
get_cross_entropy
(
self
):
self
.
cross_entropy
=
np
.
asmatrix
(
[[
-
np
.
log
(
self
.
x
[
i
][
self
.
label
[
i
][
0
]])]
for
i
in
range
(
self
.
x
.
shape
[
0
])],
dtype
=
"float64"
)
X
=
randomize_probability
(
batch_size
,
class_num
,
dtype
=
'float64'
)
def
init_attr_type
(
self
):
pass
label
=
np
.
random
.
randint
(
0
,
class_num
,
(
batch_size
,
1
),
dtype
=
"int64"
)
def
init_dtype_type
(
self
):
cross_entropy
=
np
.
asmatrix
(
pass
[[
-
np
.
log
(
X
[
i
][
label
[
i
][
0
]])]
for
i
in
range
(
X
.
shape
[
0
])],
dtype
=
"float64"
)
self
.
inputs
=
{
"X"
:
X
,
"Label"
:
label
}
def
init_bs_class_num
(
self
):
self
.
outputs
=
{
"Y"
:
cross_entropy
}
pass
self
.
attrs
=
{
"soft_label"
:
False
}
def
test_check_output
(
self
):
def
test_check_output
(
self
):
self
.
check_output
()
self
.
check_output
()
...
@@ -46,197 +76,231 @@ class TestCrossEntropyOp1(OpTest):
...
@@ -46,197 +76,231 @@ class TestCrossEntropyOp1(OpTest):
self
.
check_grad
([
"X"
],
"Y"
,
numeric_grad_delta
=
0.001
)
self
.
check_grad
([
"X"
],
"Y"
,
numeric_grad_delta
=
0.001
)
class
TestCrossEntropyOp2
(
OpTest
):
class
TestCrossEntropyOp2
(
TestCrossEntropyOp
):
"""Test cross-entropy with vectorized soft labels.
"""Test cross-entropy with vectorized soft labels.
"""
"""
def
setUp
(
self
):
def
init_label
(
self
):
self
.
op_type
=
"cross_entropy"
self
.
label
=
np
.
random
.
uniform
(
batch_size
=
5
0.1
,
1.0
,
[
self
.
batch_size
,
self
.
class_num
]).
astype
(
self
.
dtype
)
class_num
=
37
self
.
label
/=
self
.
label
.
sum
(
axis
=
1
,
keepdims
=
True
)
X
=
randomize_probability
(
batch_size
,
class_num
)
def
get_cross_entropy
(
self
):
label
=
np
.
random
.
uniform
(
0.1
,
1.0
,
self
.
cross_entropy
=
(
-
self
.
label
*
np
.
log
(
self
.
x
)).
sum
(
[
batch_size
,
class_num
]).
astype
(
"float32"
)
axis
=
1
,
keepdims
=
True
).
astype
(
self
.
dtype
)
label
/=
label
.
sum
(
axis
=
1
,
keepdims
=
True
)
cross_entropy
=
(
-
label
*
np
.
log
(
X
)).
sum
(
axis
=
1
,
keepdims
=
True
).
astype
(
"float32"
)
self
.
inputs
=
{
"X"
:
X
,
"Label"
:
label
}
def
init_attr_type
(
self
):
self
.
outputs
=
{
"Y"
:
cross_entropy
}
self
.
soft_label
=
True
self
.
attrs
=
{
"soft_label"
:
True
}
def
test_check_output
(
self
):
def
init_dtype_type
(
self
):
self
.
check_output
()
self
.
dtype
=
np
.
float32
def
init_bs_class_num
(
self
):
self
.
batch_size
=
5
self
.
class_num
=
37
def
test_check_grad
(
self
):
def
test_check_grad
(
self
):
self
.
check_grad
(
self
.
check_grad
(
[
"X"
],
"Y"
,
max_relative_error
=
0.05
,
numeric_grad_delta
=
0.001
)
[
"X"
],
"Y"
,
max_relative_error
=
0.05
,
numeric_grad_delta
=
0.001
)
class
TestCrossEntropyOp3
(
OpTest
):
class
TestCrossEntropyOp3
(
TestCrossEntropyOp
):
"""Test cross-entropy with vectorized one-hot representation of labels.
"""Test cross-entropy with vectorized one-hot representation of labels.
"""
"""
def
setUp
(
self
):
def
init_label
(
self
):
self
.
op_type
=
"cross_entropy"
self
.
label_index
=
np
.
random
.
randint
(
0
,
self
.
class_num
,
batch_size
=
5
(
self
.
batch_size
))
class_num
=
17
self
.
label
=
np
.
zeros
(
self
.
x
.
shape
).
astype
(
self
.
dtype
)
self
.
label
[
np
.
arange
(
self
.
batch_size
),
self
.
label_index
]
=
1
X
=
randomize_probability
(
batch_size
,
class_num
)
def
get_cross_entropy
(
self
):
label_index
=
np
.
random
.
randint
(
self
.
cross_entropy
=
np
.
asmatrix
(
0
,
class_num
,
(
batch_size
),
dtype
=
"int32"
)
[[
-
np
.
log
(
self
.
x
[
i
][
self
.
label_index
[
i
]])]
label
=
np
.
zeros
(
X
.
shape
)
for
i
in
range
(
self
.
x
.
shape
[
0
])]).
astype
(
self
.
dtype
)
label
[
np
.
arange
(
batch_size
),
label_index
]
=
1
cross_entropy
=
np
.
asmatrix
(
def
init_attr_type
(
self
):
[[
-
np
.
log
(
X
[
i
][
label_index
[
i
]])]
for
i
in
range
(
X
.
shape
[
0
])],
self
.
soft_label
=
True
dtype
=
"float32"
)
cross_entropy2
=
(
-
label
*
np
.
log
(
X
)).
sum
(
axis
=
1
,
keepdims
=
True
).
astype
(
"float32"
)
self
.
inputs
=
{
"X"
:
X
,
"Label"
:
label
.
astype
(
np
.
float32
)}
def
init_dtype_type
(
self
):
self
.
outputs
=
{
"Y"
:
cross_entropy
}
self
.
dtype
=
np
.
float32
self
.
attrs
=
{
"soft_label"
:
True
}
def
test_check_output
(
self
):
def
init_bs_class_num
(
self
):
self
.
check_output
()
self
.
batch_size
=
5
self
.
class_num
=
17
def
test_check_grad
(
self
):
def
test_check_grad
(
self
):
self
.
check_grad
(
self
.
check_grad
(
[
"X"
],
"Y"
,
max_relative_error
=
0.05
,
numeric_grad_delta
=
0.001
)
[
"X"
],
"Y"
,
max_relative_error
=
0.05
,
numeric_grad_delta
=
0.001
)
class
TestCrossEntropyOp4
(
OpTest
):
class
TestCrossEntropyOp4
(
TestCrossEntropyOp
):
"""Test high rank tensor cross-entropy with discrete one-hot labels.
"""Test high rank tensor cross-entropy with discrete one-hot labels.
"""
"""
def
setUp
(
self
):
def
init_x
(
self
):
self
.
op_type
=
"cross_entropy"
self
.
shape
=
[
10
,
2
,
4
]
shape
=
[
10
,
2
,
4
]
self
.
ins_num
=
np
.
prod
(
np
.
array
(
self
.
shape
))
ins_num
=
np
.
prod
(
np
.
array
(
shape
))
self
.
X_2d
=
randomize_probability
(
self
.
ins_num
,
class_num
=
10
self
.
class_num
).
astype
(
self
.
dtype
)
self
.
x
=
self
.
X_2d
.
reshape
(
self
.
shape
+
[
self
.
class_num
])
X_2d
=
randomize_probability
(
ins_num
,
class_num
,
dtype
=
'float64'
)
def
init_label
(
self
):
self
.
label_2d
=
np
.
random
.
randint
(
0
,
self
.
class_num
,
(
self
.
ins_num
,
1
),
dtype
=
"int64"
)
self
.
label
=
self
.
label_2d
.
reshape
(
self
.
shape
+
[
1
])
label_2d
=
np
.
random
.
randint
(
0
,
class_num
,
(
ins_num
,
1
),
dtype
=
"int64"
)
def
get_cross_entropy
(
self
):
cross_entropy_2d
=
np
.
asmatrix
(
cross_entropy_2d
=
np
.
asmatrix
(
[[
-
np
.
log
(
X_2d
[
i
][
label_2d
[
i
][
0
]])]
for
i
in
range
(
X_2d
.
shape
[
0
])],
[[
-
np
.
log
(
self
.
X_2d
[
i
][
self
.
label_2d
[
i
][
0
]])]
dtype
=
"float64"
)
for
i
in
range
(
self
.
X_2d
.
shape
[
0
])]).
astype
(
self
.
dtype
)
self
.
cross_entropy
=
np
.
array
(
cross_entropy_2d
).
reshape
(
self
.
shape
+
[
1
])
X
=
X_2d
.
reshape
(
shape
+
[
class_num
])
def
init_attr_type
(
self
):
label
=
label_2d
.
reshape
(
shape
+
[
1
])
self
.
soft_label
=
False
cross_entropy
=
np
.
array
(
cross_entropy_2d
).
reshape
(
shape
+
[
1
])
self
.
inputs
=
{
"X"
:
X
,
"Label"
:
label
}
def
init_dtype_type
(
self
):
self
.
outputs
=
{
"Y"
:
cross_entropy
}
self
.
dtype
=
np
.
float64
self
.
attrs
=
{
"soft_label"
:
False
}
def
test_check_output
(
self
):
self
.
check_output
()
def
test_check_grad
(
self
):
def
init_bs_class_num
(
self
):
self
.
c
heck_grad
([
"X"
],
"Y"
,
numeric_grad_delta
=
0.001
)
self
.
c
lass_num
=
10
class
TestCrossEntropyOp5
(
OpTest
):
class
TestCrossEntropyOp5
(
TestCrossEntropyOp
):
"""Test high rank tensor cross-entropy with vectorized soft labels.
"""Test high rank tensor cross-entropy with vectorized soft labels.
"""
"""
def
setUp
(
self
):
def
init_x
(
self
):
self
.
op_type
=
"cross_entropy"
self
.
shape
=
[
4
,
3
]
shape
=
[
4
,
3
]
self
.
ins_num
=
np
.
prod
(
np
.
array
(
self
.
shape
))
ins_num
=
np
.
prod
(
np
.
array
(
shape
))
self
.
X_2d
=
randomize_probability
(
self
.
ins_num
,
class_num
=
37
self
.
class_num
).
astype
(
self
.
dtype
)
self
.
x
=
self
.
X_2d
.
reshape
(
self
.
shape
+
[
self
.
class_num
])
X_2d
=
randomize_probability
(
ins_num
,
class_num
)
def
init_label
(
self
):
label_2d
=
np
.
random
.
uniform
(
0.1
,
1.0
,
self
.
label_2d
=
np
.
random
.
uniform
(
[
ins_num
,
class_num
]).
astype
(
"float32"
)
0.1
,
1.0
,
[
self
.
ins_num
,
self
.
class_num
]).
astype
(
self
.
dtype
)
label_2d
/=
label_2d
.
sum
(
axis
=
1
,
keepdims
=
True
)
self
.
label_2d
/=
self
.
label_2d
.
sum
(
axis
=
1
,
keepdims
=
True
)
cross_entropy_2d
=
(
-
label_2d
*
np
.
log
(
X_2d
)).
sum
(
self
.
label
=
self
.
label_2d
.
reshape
(
self
.
shape
+
[
self
.
class_num
])
axis
=
1
,
keepdims
=
True
).
astype
(
"float32"
)
X
=
X_2d
.
reshape
(
shape
+
[
class_num
])
def
get_cross_entropy
(
self
):
label
=
label_2d
.
reshape
(
shape
+
[
class_num
])
cross_entropy_2d
=
(
-
self
.
label_2d
*
np
.
log
(
self
.
X_2d
)).
sum
(
cross_entropy
=
np
.
array
(
cross_entropy_2d
).
reshape
(
shape
+
[
1
])
axis
=
1
,
keepdims
=
True
).
astype
(
self
.
dtype
)
self
.
cross_entropy
=
np
.
array
(
cross_entropy_2d
).
reshape
(
self
.
shape
+
[
1
])
self
.
inputs
=
{
"X"
:
X
,
"Label"
:
label
}
def
init_attr_type
(
self
):
self
.
outputs
=
{
"Y"
:
cross_entropy
}
self
.
soft_label
=
True
self
.
attrs
=
{
"soft_label"
:
True
}
def
test_check_output
(
self
):
def
init_dtype_type
(
self
):
self
.
check_output
()
self
.
dtype
=
np
.
float32
def
init_bs_class_num
(
self
):
self
.
class_num
=
37
def
test_check_grad
(
self
):
def
test_check_grad
(
self
):
self
.
check_grad
(
self
.
check_grad
(
[
"X"
],
"Y"
,
max_relative_error
=
0.05
,
numeric_grad_delta
=
0.001
)
[
"X"
],
"Y"
,
max_relative_error
=
0.05
,
numeric_grad_delta
=
0.001
)
class
TestCrossEntropyOp6
(
OpTest
):
class
TestCrossEntropyOp6
(
TestCrossEntropyOp
):
"""Test high rank tensor cross-entropy with vectorized one-hot representation of labels.
"""Test high rank tensor cross-entropy with vectorized one-hot representation of labels.
"""
"""
def
setUp
(
self
):
def
init_x
(
self
):
self
.
op_type
=
"cross_entropy"
self
.
shape
=
[
4
,
3
,
2
]
shape
=
[
4
,
3
,
2
]
self
.
ins_num
=
np
.
prod
(
np
.
array
(
self
.
shape
))
ins_num
=
np
.
prod
(
np
.
array
(
shape
))
self
.
X_2d
=
randomize_probability
(
self
.
ins_num
,
class_num
=
17
self
.
class_num
).
astype
(
self
.
dtype
)
self
.
x
=
self
.
X_2d
.
reshape
(
self
.
shape
+
[
self
.
class_num
])
X_2d
=
randomize_probability
(
ins_num
,
class_num
)
label_index_2d
=
np
.
random
.
randint
(
def
init_label
(
self
):
0
,
class_num
,
(
ins_num
),
dtype
=
"int32"
)
self
.
label_index_2d
=
np
.
random
.
randint
(
label_2d
=
np
.
zeros
(
X_2d
.
shape
)
0
,
self
.
class_num
,
(
self
.
ins_num
),
dtype
=
"int64"
)
label_2d
[
np
.
arange
(
ins_num
),
label_index_2d
]
=
1
label_2d
=
np
.
zeros
(
self
.
X_2d
.
shape
)
label_2d
[
np
.
arange
(
self
.
ins_num
),
self
.
label_index_2d
]
=
1
self
.
label
=
label_2d
.
reshape
(
self
.
shape
+
[
self
.
class_num
]).
astype
(
self
.
dtype
)
def
get_cross_entropy
(
self
):
cross_entropy_2d
=
np
.
asmatrix
(
cross_entropy_2d
=
np
.
asmatrix
(
[[
-
np
.
log
(
X_2d
[
i
][
label_index_2d
[
i
]])]
[[
-
np
.
log
(
self
.
X_2d
[
i
][
self
.
label_index_2d
[
i
]])]
for
i
in
range
(
X_2d
.
shape
[
0
])],
for
i
in
range
(
self
.
X_2d
.
shape
[
0
])])
dtype
=
"float32"
)
self
.
cross_entropy
=
np
.
array
(
cross_entropy_2d
).
reshape
(
self
.
shape
+
[
1
]).
astype
(
self
.
dtype
)
X
=
X_2d
.
reshape
(
shape
+
[
class_num
])
def
init_attr_type
(
self
):
label
=
label_2d
.
reshape
(
shape
+
[
class_num
])
self
.
soft_label
=
True
cross_entropy
=
np
.
array
(
cross_entropy_2d
).
reshape
(
shape
+
[
1
])
self
.
inputs
=
{
"X"
:
X
,
"Label"
:
label
.
astype
(
np
.
float32
)}
def
init_dtype_type
(
self
):
self
.
outputs
=
{
"Y"
:
cross_entropy
}
self
.
dtype
=
np
.
float32
self
.
attrs
=
{
"soft_label"
:
True
}
def
test_check_output
(
self
):
def
init_bs_class_num
(
self
):
self
.
c
heck_output
()
self
.
c
lass_num
=
17
def
test_check_grad
(
self
):
def
test_check_grad
(
self
):
self
.
check_grad
(
self
.
check_grad
(
[
"X"
],
"Y"
,
max_relative_error
=
0.05
,
numeric_grad_delta
=
0.001
)
[
"X"
],
"Y"
,
max_relative_error
=
0.05
,
numeric_grad_delta
=
0.001
)
class
TestCrossEntropyOp7
(
OpTest
):
class
TestCrossEntropyOp7
(
TestCrossEntropyOp
):
"""Test cross-entropy with ignore index.
"""Test cross-entropy with ignore index.
"""
"""
def
setUp
(
self
):
def
init_label
(
self
):
self
.
op_type
=
"cross_entropy"
self
.
label
=
np
.
random
.
randint
(
batch_size
=
30
0
,
self
.
class_num
,
(
self
.
batch_size
,
1
),
dtype
=
"int64"
)
class_num
=
10
ignore_index
=
3
def
get_cross_entropy
(
self
):
self
.
cross_entropy
=
np
.
asmatrix
(
X
=
randomize_probability
(
batch_size
,
class_num
,
dtype
=
'float64'
)
[[
-
np
.
log
(
self
.
x
[
i
][
self
.
label
[
i
][
0
]])]
if
self
.
label
[
i
][
0
]
!=
self
.
ignore_index
else
[
0
]
label
=
np
.
random
.
randint
(
0
,
class_num
,
(
batch_size
,
1
),
dtype
=
"int64"
)
for
i
in
range
(
self
.
x
.
shape
[
0
])]).
astype
(
self
.
dtype
)
cross_entropy
=
np
.
asmatrix
(
[[
-
np
.
log
(
X
[
i
][
label
[
i
][
0
]])]
def
init_attr_type
(
self
):
if
label
[
i
][
0
]
!=
ignore_index
else
[
0
]
self
.
soft_label
=
False
for
i
in
range
(
X
.
shape
[
0
])],
self
.
ignore_index
=
3
dtype
=
"float64"
)
self
.
inputs
=
{
"X"
:
X
,
"Label"
:
label
}
def
init_dtype_type
(
self
):
self
.
outputs
=
{
"Y"
:
cross_entropy
}
self
.
dtype
=
np
.
float64
self
.
attrs
=
{
"soft_label"
:
False
,
"ignore_index"
:
ignore_index
}
def
init_bs_class_num
(
self
):
def
test_check_output
(
self
):
self
.
batch_size
=
30
self
.
check_output
()
self
.
class_num
=
10
def
test_check_grad
(
self
):
self
.
check_grad
([
"X"
],
"Y"
,
numeric_grad_delta
=
0.001
)
# Add Fp16 test
def create_test_class(parent, cls_name):
    """Create a float16 variant of ``parent`` and register it in this module.

    A subclass of ``parent`` is defined that switches the test dtype to
    ``np.float16`` and runs the output/gradient checks on ``CUDAPlace(0)``
    when that place reports float16 support. The subclass is renamed to
    ``cls_name`` and stored in ``globals()`` under that name.

    Args:
        parent: Test case class to derive from; it is expected to call
            ``init_dtype_type`` and read ``self.dtype`` while setting up.
        cls_name: Name under which the generated class is published.
    """

    @unittest.skipIf(not core.is_compiled_with_cuda(),
                     "core is not compiled with CUDA")
    class TestCrossEntropyFP16Op(parent):
        def init_dtype_type(self):
            # This hook is invoked for its side effect (its return value is
            # ignored by TestCrossEntropyOp.setUp), so the dtype has to be
            # stored on the instance; returning it left the parent's dtype
            # in place and the "FP16" tests never used float16.
            self.dtype = np.float16

        def test_check_output(self):
            place = core.CUDAPlace(0)
            if core.is_float16_supported(place):
                self.check_output_with_place(place, atol=2e-1)

        def test_check_grad(self):
            place = core.CUDAPlace(0)
            if core.is_float16_supported(place):
                self.check_grad_with_place(
                    place, ['X'], 'Y', max_relative_error=0.9)

    cls_name = "{0}".format(cls_name)
    TestCrossEntropyFP16Op.__name__ = cls_name
    # Publish under the requested name so the generated class is visible as
    # a module-level attribute.
    globals()[cls_name] = TestCrossEntropyFP16Op
create_test_class
(
TestCrossEntropyOp
,
"TestCrossEntropyF16Op"
)
#create_test_class(TestCrossEntropyOp2, "TestCrossEntropyF16Op2")
create_test_class
(
TestCrossEntropyOp3
,
"TestCrossEntropyF16Op3"
)
create_test_class
(
TestCrossEntropyOp4
,
"TestCrossEntropyF16Op4"
)
#create_test_class(TestCrossEntropyOp5, "TestCrossEntropyF16Op5")
create_test_class
(
TestCrossEntropyOp6
,
"TestCrossEntropyF16Op6"
)
create_test_class
(
TestCrossEntropyOp7
,
"TestCrossEntropyF16Op7"
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_mean_op.py
浏览文件 @
234a1d92
...
@@ -17,14 +17,20 @@ from __future__ import print_function
...
@@ -17,14 +17,20 @@ from __future__ import print_function
import
unittest
import
unittest
import
numpy
as
np
import
numpy
as
np
from
op_test
import
OpTest
from
op_test
import
OpTest
import
paddle.fluid.core
as
core
class
TestMeanOp
(
OpTest
):
class
TestMeanOp
(
OpTest
):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
op_type
=
"mean"
self
.
op_type
=
"mean"
self
.
inputs
=
{
'X'
:
np
.
random
.
random
((
10
,
10
)).
astype
(
"float32"
)}
self
.
dtype
=
np
.
float32
self
.
init_dtype_type
()
self
.
inputs
=
{
'X'
:
np
.
random
.
random
((
10
,
10
)).
astype
(
self
.
dtype
)}
self
.
outputs
=
{
'Out'
:
np
.
mean
(
self
.
inputs
[
"X"
])}
self
.
outputs
=
{
'Out'
:
np
.
mean
(
self
.
inputs
[
"X"
])}
def
init_dtype_type
(
self
):
pass
def
test_check_output
(
self
):
def
test_check_output
(
self
):
self
.
check_output
()
self
.
check_output
()
...
@@ -32,5 +38,23 @@ class TestMeanOp(OpTest):
...
@@ -32,5 +38,23 @@ class TestMeanOp(OpTest):
self
.
check_grad
([
'X'
],
'Out'
)
self
.
check_grad
([
'X'
],
'Out'
)
@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestFP16MeanOp(TestMeanOp):
    """float16 variant of TestMeanOp, checked on CUDAPlace(0) only."""

    def init_dtype_type(self):
        # Hook called from TestMeanOp.setUp before the inputs are created.
        self.dtype = np.float16

    def test_check_output(self):
        gpu = core.CUDAPlace(0)
        if not core.is_float16_supported(gpu):
            return
        self.check_output_with_place(gpu, atol=2e-3)

    # NOTE(review): the name is "test_checkout_grad"; if the base class
    # defines "test_check_grad" (its def line is not visible here), this adds
    # a test rather than overriding it — confirm that is intended.
    def test_checkout_grad(self):
        gpu = core.CUDAPlace(0)
        if not core.is_float16_supported(gpu):
            return
        self.check_grad_with_place(gpu, ['X'], 'Out', max_relative_error=0.8)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_mul_op.py
浏览文件 @
234a1d92
...
@@ -23,12 +23,17 @@ from op_test import OpTest
...
@@ -23,12 +23,17 @@ from op_test import OpTest
class
TestMulOp
(
OpTest
):
class
TestMulOp
(
OpTest
):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
op_type
=
"mul"
self
.
op_type
=
"mul"
self
.
dtype
=
np
.
float32
self
.
init_dtype_type
()
self
.
inputs
=
{
self
.
inputs
=
{
'X'
:
np
.
random
.
random
((
2
,
5
)).
astype
(
"float32"
),
'X'
:
np
.
random
.
random
((
2
,
5
)).
astype
(
self
.
dtype
),
'Y'
:
np
.
random
.
random
((
5
,
3
)).
astype
(
"float32"
)
'Y'
:
np
.
random
.
random
((
5
,
3
)).
astype
(
self
.
dtype
)
}
}
self
.
outputs
=
{
'Out'
:
np
.
dot
(
self
.
inputs
[
'X'
],
self
.
inputs
[
'Y'
])}
self
.
outputs
=
{
'Out'
:
np
.
dot
(
self
.
inputs
[
'X'
],
self
.
inputs
[
'Y'
])}
def
init_dtype_type
(
self
):
pass
def
test_check_output
(
self
):
def
test_check_output
(
self
):
self
.
check_output
()
self
.
check_output
()
...
@@ -47,9 +52,11 @@ class TestMulOp(OpTest):
...
@@ -47,9 +52,11 @@ class TestMulOp(OpTest):
class
TestMulOp2
(
OpTest
):
class
TestMulOp2
(
OpTest
):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
op_type
=
"mul"
self
.
op_type
=
"mul"
self
.
dtype
=
np
.
float32
self
.
init_dtype_type
()
self
.
inputs
=
{
self
.
inputs
=
{
'X'
:
np
.
random
.
random
((
3
,
4
,
4
,
3
)).
astype
(
"float32"
),
'X'
:
np
.
random
.
random
((
3
,
4
,
4
,
3
)).
astype
(
self
.
dtype
),
'Y'
:
np
.
random
.
random
((
2
,
6
,
1
,
2
,
3
)).
astype
(
"float32"
)
'Y'
:
np
.
random
.
random
((
2
,
6
,
1
,
2
,
3
)).
astype
(
self
.
dtype
)
}
}
self
.
attrs
=
{
self
.
attrs
=
{
'x_num_col_dims'
:
2
,
'x_num_col_dims'
:
2
,
...
@@ -60,6 +67,9 @@ class TestMulOp2(OpTest):
...
@@ -60,6 +67,9 @@ class TestMulOp2(OpTest):
result
=
result
.
reshape
(
3
,
4
,
1
,
2
,
3
)
result
=
result
.
reshape
(
3
,
4
,
1
,
2
,
3
)
self
.
outputs
=
{
'Out'
:
result
}
self
.
outputs
=
{
'Out'
:
result
}
def
init_dtype_type
(
self
):
pass
def
test_check_output
(
self
):
def
test_check_output
(
self
):
self
.
check_output
()
self
.
check_output
()
...
@@ -75,40 +85,76 @@ class TestMulOp2(OpTest):
...
@@ -75,40 +85,76 @@ class TestMulOp2(OpTest):
[
'X'
],
'Out'
,
max_relative_error
=
0.5
,
no_grad_set
=
set
(
'Y'
))
[
'X'
],
'Out'
,
max_relative_error
=
0.5
,
no_grad_set
=
set
(
'Y'
))
class
TestFP16MulOp1
(
OpTest
):
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
def
setUp
(
self
):
"core is not compiled with CUDA"
)
self
.
op_type
=
"mul"
class
TestFP16MulOp1
(
TestMulOp
):
x
=
np
.
random
.
random
((
3
,
5
)).
astype
(
"float16"
)
def
init_dtype_type
(
self
):
y
=
np
.
random
.
random
((
5
,
4
)).
astype
(
"float16"
)
self
.
dtype
=
np
.
float16
self
.
inputs
=
{
'X'
:
x
.
view
(
np
.
float16
),
'Y'
:
y
.
view
(
np
.
float16
)}
self
.
outputs
=
{
'Out'
:
np
.
dot
(
x
,
y
)}
def
test_check_output
(
self
):
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
if
core
.
is_float16_supported
(
place
):
self
.
check_output_with_place
(
place
,
atol
=
1e-1
)
self
.
check_output_with_place
(
place
,
atol
=
1e-1
)
def
test_check_grad_normal
(
self
):
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_grad_with_place
(
place
,
[
'X'
,
'Y'
],
'Out'
,
max_relative_error
=
0.5
)
class
TestFP16MulOp2
(
OpTest
):
def
test_check_grad_ingore_x
(
self
):
def
setUp
(
self
):
place
=
core
.
CUDAPlace
(
0
)
self
.
op_type
=
"mul"
if
core
.
is_float16_supported
(
place
):
x
=
np
.
random
.
random
((
3
,
4
,
4
,
3
)).
astype
(
"float16"
)
self
.
check_grad_with_place
(
y
=
np
.
random
.
random
((
2
,
6
,
1
,
2
,
3
)).
astype
(
"float16"
)
place
,
[
'Y'
],
self
.
inputs
=
{
'X'
:
x
.
view
(
np
.
float16
),
'Y'
:
y
.
view
(
np
.
float16
)}
'Out'
,
self
.
attrs
=
{
max_relative_error
=
0.5
,
'x_num_col_dims'
:
2
,
no_grad_set
=
set
(
"X"
))
'y_num_col_dims'
:
2
,
}
def
test_check_grad_ingore_y
(
self
):
result
=
np
.
dot
(
x
.
reshape
(
3
*
4
,
4
*
3
),
y
.
reshape
(
2
*
6
,
1
*
2
*
3
))
place
=
core
.
CUDAPlace
(
0
)
result
=
result
.
reshape
(
3
,
4
,
1
,
2
,
3
)
if
core
.
is_float16_supported
(
place
):
self
.
outputs
=
{
'Out'
:
result
}
self
.
check_grad_with_place
(
place
,
[
'X'
],
'Out'
,
max_relative_error
=
0.5
,
no_grad_set
=
set
(
'Y'
))
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
"core is not compiled with CUDA"
)
class
TestFP16MulOp2
(
TestMulOp2
):
def
init_dtype_type
(
self
):
self
.
dtype
=
np
.
float16
def
test_check_output
(
self
):
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
if
core
.
is_float16_supported
(
place
):
self
.
check_output_with_place
(
place
,
atol
=
2e-1
)
self
.
check_output_with_place
(
place
,
atol
=
2e-1
)
def
test_check_grad_normal
(
self
):
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_grad_with_place
(
place
,
[
'X'
,
'Y'
],
'Out'
,
max_relative_error
=
0.9
)
def
test_check_grad_ingore_x
(
self
):
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_grad_with_place
(
place
,
[
'Y'
],
'Out'
,
max_relative_error
=
0.5
,
no_grad_set
=
set
(
"X"
))
def
test_check_grad_ingore_y
(
self
):
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_grad_with_place
(
place
,
[
'X'
],
'Out'
,
max_relative_error
=
0.9
,
no_grad_set
=
set
(
'Y'
))
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
浏览文件 @
234a1d92
...
@@ -16,6 +16,7 @@ from __future__ import print_function
...
@@ -16,6 +16,7 @@ from __future__ import print_function
import
paddle.dataset.conll05
as
conll05
import
paddle.dataset.conll05
as
conll05
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
import
unittest
import
unittest
import
paddle
import
paddle
import
numpy
as
np
import
numpy
as
np
...
@@ -177,32 +178,36 @@ class TestCRFModel(unittest.TestCase):
...
@@ -177,32 +178,36 @@ class TestCRFModel(unittest.TestCase):
def
test_update_sparse_parameter_all_reduce
(
self
):
def
test_update_sparse_parameter_all_reduce
(
self
):
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
self
.
check_network_convergence
(
if
core
.
is_compiled_with_cuda
():
is_sparse
=
True
,
build_strategy
=
build_strategy
,
use_cuda
=
True
)
self
.
check_network_convergence
(
is_sparse
=
True
,
build_strategy
=
build_strategy
,
use_cuda
=
True
)
self
.
check_network_convergence
(
self
.
check_network_convergence
(
is_sparse
=
True
,
build_strategy
=
build_strategy
,
use_cuda
=
False
)
is_sparse
=
True
,
build_strategy
=
build_strategy
,
use_cuda
=
False
)
def
test_update_dense_parameter_all_reduce
(
self
):
def
test_update_dense_parameter_all_reduce
(
self
):
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
self
.
check_network_convergence
(
if
core
.
is_compiled_with_cuda
():
is_sparse
=
False
,
build_strategy
=
build_strategy
,
use_cuda
=
True
)
self
.
check_network_convergence
(
is_sparse
=
False
,
build_strategy
=
build_strategy
,
use_cuda
=
True
)
self
.
check_network_convergence
(
self
.
check_network_convergence
(
is_sparse
=
False
,
build_strategy
=
build_strategy
,
use_cuda
=
False
)
is_sparse
=
False
,
build_strategy
=
build_strategy
,
use_cuda
=
False
)
def
test_update_sparse_parameter_reduce
(
self
):
def
test_update_sparse_parameter_reduce
(
self
):
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
self
.
check_network_convergence
(
if
core
.
is_compiled_with_cuda
():
is_sparse
=
True
,
build_strategy
=
build_strategy
,
use_cuda
=
True
)
self
.
check_network_convergence
(
is_sparse
=
True
,
build_strategy
=
build_strategy
,
use_cuda
=
True
)
self
.
check_network_convergence
(
self
.
check_network_convergence
(
is_sparse
=
True
,
build_strategy
=
build_strategy
,
use_cuda
=
False
)
is_sparse
=
True
,
build_strategy
=
build_strategy
,
use_cuda
=
False
)
def
test_update_dense_parameter_reduce
(
self
):
def
test_update_dense_parameter_reduce
(
self
):
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
self
.
check_network_convergence
(
if
core
.
is_compiled_with_cuda
():
is_sparse
=
False
,
build_strategy
=
build_strategy
,
use_cuda
=
True
)
self
.
check_network_convergence
(
is_sparse
=
False
,
build_strategy
=
build_strategy
,
use_cuda
=
True
)
self
.
check_network_convergence
(
self
.
check_network_convergence
(
is_sparse
=
False
,
build_strategy
=
build_strategy
,
use_cuda
=
False
)
is_sparse
=
False
,
build_strategy
=
build_strategy
,
use_cuda
=
False
)
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
0 → 100644
浏览文件 @
234a1d92
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle.fluid
as
fluid
import
unittest
import
logging
import
six
class TestBase(unittest.TestCase):
    """Base test case that builds a network and repeatedly runs it through
    ``fluid.ParallelExecutor`` with the ``_dry_run`` strategy flag set."""

    def main(self,
             network_func,
             iter=100,
             iter_per_pe=100,
             use_gpu=True,
             use_experimental_executor=False):
        """Build ``network_func`` and exercise ParallelExecutor creation/run.

        Args:
            network_func: Zero-argument callable that defines the network in
                the current program and returns the loss variable.
            iter: Number of ParallelExecutor instances to create, one after
                another.
            iter_per_pe: Number of ``run([])`` calls issued on each executor.
            use_gpu: Run on ``CUDAPlace(0)`` when True, ``CPUPlace()``
                otherwise. GPU runs are skipped with a warning when Paddle
                is not compiled with CUDA.
            use_experimental_executor: Value assigned to
                ``ExecutionStrategy.use_experimental_executor``.
        """
        if use_gpu and not fluid.core.is_compiled_with_cuda():
            logging.warning(
                "Paddle is not compiled with CUDA, skip GPU unittests")
            return

        main_prog = fluid.Program()
        startup_prog = fluid.Program()
        scope = fluid.Scope()
        # NOTE(review): block nesting below is reconstructed from a listing
        # that had lost its indentation — confirm against the repository.
        with fluid.program_guard(main_prog, startup_prog):
            with fluid.scope_guard(scope):
                loss = network_func()
                fluid.Executor(
                    fluid.CUDAPlace(0)
                    if use_gpu else fluid.CPUPlace()).run(startup_prog)

                for _ in six.moves.xrange(iter):
                    exe_strategy = fluid.ExecutionStrategy()
                    exe_strategy._dry_run = True
                    exe_strategy.use_experimental_executor = \
                        use_experimental_executor
                    # Follow the requested device: this used to be hard-coded
                    # to use_cuda=True, so the use_gpu=False case still asked
                    # for CUDA after initialising on CPUPlace.
                    pe = fluid.ParallelExecutor(
                        use_cuda=use_gpu,
                        loss_name=loss.name,
                        main_program=main_prog,
                        exec_strategy=exe_strategy)
                    for _ in six.moves.xrange(iter_per_pe):
                        pe.run([])
class TestMNISTDryRun(TestBase):
    """Dry-runs an MNIST-sized fully connected network on CPU and GPU, with
    and without the experimental executor."""

    def test_mnist_dry_run(self):
        # Cover every (device, executor) combination; TestBase.main skips the
        # GPU cases itself when CUDA is unavailable.
        for use_gpu in (False, True):
            for use_experimental_executor in (False, True):
                self.main(
                    network_func=TestMNISTDryRun.network_func,
                    use_gpu=use_gpu,
                    use_experimental_executor=use_experimental_executor)

    @staticmethod
    def network_func():
        """Define a 10-hidden-layer tanh MLP with a softmax head and an Adam
        optimizer; return the mean cross-entropy loss variable."""
        img = fluid.layers.data(name='img', shape=[784], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        hidden = img
        for _ in six.moves.xrange(10):
            # Chain each layer onto the previous one. Feeding `img` here (as
            # before) left the first nine layers disconnected from the loss.
            hidden = fluid.layers.fc(input=hidden, size=200, act='tanh')
        prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
        loss = fluid.layers.cross_entropy(input=prediction, label=label)
        avg_loss = fluid.layers.mean(loss)
        fluid.optimizer.Adam().minimize(avg_loss)
        return avg_loss
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
浏览文件 @
234a1d92
...
@@ -14,30 +14,18 @@
...
@@ -14,30 +14,18 @@
from
__future__
import
print_function
from
__future__
import
print_function
from
parallel_executor_test_base
import
TestParallelExecutorBase
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
import
numpy
as
np
import
paddle
import
paddle.dataset.mnist
as
mnist
import
unittest
import
unittest
import
os
MNIST_RECORDIO_FILE
=
"./mnist_test_pe.recordio"
import
numpy
as
np
import
paddle.fluid.core
as
core
import
os
import
paddle.fluid
as
fluid
from
parallel_executor_test_base
import
TestParallelExecutorBase
def
simple_fc_net
(
use_feed
):
def
simple_fc_net
(
use_feed
):
if
use_feed
:
img
=
fluid
.
layers
.
data
(
name
=
'image'
,
shape
=
[
784
],
dtype
=
'float32'
)
img
=
fluid
.
layers
.
data
(
name
=
'image'
,
shape
=
[
784
],
dtype
=
'float32'
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
)
else
:
reader
=
fluid
.
layers
.
open_files
(
filenames
=
[
MNIST_RECORDIO_FILE
],
shapes
=
[[
-
1
,
784
],
[
-
1
,
1
]],
lod_levels
=
[
0
,
0
],
dtypes
=
[
'float32'
,
'int64'
])
reader
=
fluid
.
layers
.
io
.
double_buffer
(
reader
)
img
,
label
=
fluid
.
layers
.
read_file
(
reader
)
hidden
=
img
hidden
=
img
for
_
in
range
(
4
):
for
_
in
range
(
4
):
hidden
=
fluid
.
layers
.
fc
(
hidden
=
fluid
.
layers
.
fc
(
...
@@ -53,17 +41,8 @@ def simple_fc_net(use_feed):
...
@@ -53,17 +41,8 @@ def simple_fc_net(use_feed):
def
fc_with_batchnorm
(
use_feed
):
def
fc_with_batchnorm
(
use_feed
):
if
use_feed
:
img
=
fluid
.
layers
.
data
(
name
=
'image'
,
shape
=
[
784
],
dtype
=
'float32'
)
img
=
fluid
.
layers
.
data
(
name
=
'image'
,
shape
=
[
784
],
dtype
=
'float32'
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
)
else
:
reader
=
fluid
.
layers
.
open_files
(
filenames
=
[
MNIST_RECORDIO_FILE
],
shapes
=
[[
-
1
,
784
],
[
-
1
,
1
]],
lod_levels
=
[
0
,
0
],
dtypes
=
[
'float32'
,
'int64'
])
reader
=
fluid
.
layers
.
io
.
double_buffer
(
reader
)
img
,
label
=
fluid
.
layers
.
read_file
(
reader
)
hidden
=
img
hidden
=
img
for
_
in
range
(
1
):
for
_
in
range
(
1
):
...
@@ -88,19 +67,6 @@ class TestMNIST(TestParallelExecutorBase):
...
@@ -88,19 +67,6 @@ class TestMNIST(TestParallelExecutorBase):
@
classmethod
@
classmethod
def
setUpClass
(
cls
):
def
setUpClass
(
cls
):
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
# Convert mnist to recordio file
with
fluid
.
program_guard
(
fluid
.
Program
(),
fluid
.
Program
()):
reader
=
paddle
.
batch
(
mnist
.
train
(),
batch_size
=
4
)
feeder
=
fluid
.
DataFeeder
(
feed_list
=
[
# order is image and label
fluid
.
layers
.
data
(
name
=
'image'
,
shape
=
[
784
]),
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
),
],
place
=
fluid
.
CPUPlace
())
fluid
.
recordio_writer
.
convert_reader_to_recordio_file
(
MNIST_RECORDIO_FILE
,
reader
,
feeder
)
def
_init_data
(
self
):
def
_init_data
(
self
):
np
.
random
.
seed
(
5
)
np
.
random
.
seed
(
5
)
...
@@ -111,10 +77,6 @@ class TestMNIST(TestParallelExecutorBase):
...
@@ -111,10 +77,6 @@ class TestMNIST(TestParallelExecutorBase):
def
_compare_reduce_and_allreduce
(
self
,
model
,
use_cuda
):
def
_compare_reduce_and_allreduce
(
self
,
model
,
use_cuda
):
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
return
return
self
.
check_network_convergence
(
model
,
use_cuda
=
use_cuda
,
use_reduce
=
True
)
self
.
check_network_convergence
(
model
,
use_cuda
=
use_cuda
,
allow_op_delay
=
True
,
use_reduce
=
True
)
img
,
label
=
self
.
_init_data
()
img
,
label
=
self
.
_init_data
()
...
@@ -140,9 +102,6 @@ class TestMNIST(TestParallelExecutorBase):
...
@@ -140,9 +102,6 @@ class TestMNIST(TestParallelExecutorBase):
def
check_simple_fc_convergence
(
self
,
use_cuda
,
use_reduce
=
False
):
def
check_simple_fc_convergence
(
self
,
use_cuda
,
use_reduce
=
False
):
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
return
return
self
.
check_network_convergence
(
simple_fc_net
,
use_cuda
=
use_cuda
)
self
.
check_network_convergence
(
simple_fc_net
,
use_cuda
=
use_cuda
,
allow_op_delay
=
True
)
img
,
label
=
self
.
_init_data
()
img
,
label
=
self
.
_init_data
()
...
@@ -199,8 +158,6 @@ class TestMNIST(TestParallelExecutorBase):
...
@@ -199,8 +158,6 @@ class TestMNIST(TestParallelExecutorBase):
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
return
return
self
.
check_network_convergence
(
fc_with_batchnorm
,
use_cuda
=
use_cuda
)
img
,
label
=
self
.
_init_data
()
img
,
label
=
self
.
_init_data
()
self
.
check_network_convergence
(
self
.
check_network_convergence
(
...
...
python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
浏览文件 @
234a1d92
...
@@ -15,10 +15,10 @@
...
@@ -15,10 +15,10 @@
from
__future__
import
print_function
from
__future__
import
print_function
import
unittest
import
unittest
from
test_pool2d_op
import
TestPool2
d
_Op
,
TestCase1
,
TestCase2
,
TestCase3
,
TestCase4
,
TestCase5
from
test_pool2d_op
import
TestPool2
D
_Op
,
TestCase1
,
TestCase2
,
TestCase3
,
TestCase4
,
TestCase5
class
TestMKLDNNCase1
(
TestPool2
d
_Op
):
class
TestMKLDNNCase1
(
TestPool2
D
_Op
):
def
init_kernel_type
(
self
):
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
self
.
use_mkldnn
=
True
...
...
python/paddle/fluid/tests/unittests/test_pool2d_op.py
浏览文件 @
234a1d92
...
@@ -81,7 +81,7 @@ def avg_pool2D_forward_naive(x,
...
@@ -81,7 +81,7 @@ def avg_pool2D_forward_naive(x,
return
out
return
out
class
TestPool2
d
_Op
(
OpTest
):
class
TestPool2
D
_Op
(
OpTest
):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
op_type
=
"pool2d"
self
.
op_type
=
"pool2d"
self
.
use_cudnn
=
False
self
.
use_cudnn
=
False
...
@@ -160,7 +160,7 @@ class TestPool2d_Op(OpTest):
...
@@ -160,7 +160,7 @@ class TestPool2d_Op(OpTest):
self
.
exclusive
=
True
self
.
exclusive
=
True
class
TestCase1
(
TestPool2
d
_Op
):
class
TestCase1
(
TestPool2
D
_Op
):
def
init_test_case
(
self
):
def
init_test_case
(
self
):
self
.
shape
=
[
2
,
3
,
7
,
7
]
self
.
shape
=
[
2
,
3
,
7
,
7
]
self
.
ksize
=
[
3
,
3
]
self
.
ksize
=
[
3
,
3
]
...
@@ -175,7 +175,7 @@ class TestCase1(TestPool2d_Op):
...
@@ -175,7 +175,7 @@ class TestCase1(TestPool2d_Op):
self
.
global_pool
=
False
self
.
global_pool
=
False
class
TestCase2
(
TestPool2
d
_Op
):
class
TestCase2
(
TestPool2
D
_Op
):
def
init_test_case
(
self
):
def
init_test_case
(
self
):
self
.
shape
=
[
2
,
3
,
7
,
7
]
self
.
shape
=
[
2
,
3
,
7
,
7
]
self
.
ksize
=
[
3
,
3
]
self
.
ksize
=
[
3
,
3
]
...
@@ -190,7 +190,7 @@ class TestCase2(TestPool2d_Op):
...
@@ -190,7 +190,7 @@ class TestCase2(TestPool2d_Op):
self
.
global_pool
=
False
self
.
global_pool
=
False
class
TestCase3
(
TestPool2
d
_Op
):
class
TestCase3
(
TestPool2
D
_Op
):
def
init_pool_type
(
self
):
def
init_pool_type
(
self
):
self
.
pool_type
=
"max"
self
.
pool_type
=
"max"
self
.
pool2D_forward_naive
=
max_pool2D_forward_naive
self
.
pool2D_forward_naive
=
max_pool2D_forward_naive
...
@@ -208,127 +208,98 @@ class TestCase5(TestCase2):
...
@@ -208,127 +208,98 @@ class TestCase5(TestCase2):
self
.
pool2D_forward_naive
=
max_pool2D_forward_naive
self
.
pool2D_forward_naive
=
max_pool2D_forward_naive
#--------------------test pool2d--------------------
#--------------------test pool2d cudnn--------------------
class
TestCUDNNCase1
(
TestPool2d_Op
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
class
TestFP16CUDNNCase1
(
TestPool2d_Op
):
def
create_test_cudnn_class
(
parent
):
def
init_kernel_type
(
self
):
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
self
.
use_cudnn
=
True
"core is not compiled with CUDA"
)
self
.
dtype
=
np
.
float16
class
TestCUDNNCase
(
parent
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
def
test_check_output
(
self
):
cls_name
=
"{0}_{1}"
.
format
(
parent
.
__name__
,
"CUDNNOp"
)
if
core
.
is_compiled_with_cuda
():
TestCUDNNCase
.
__name__
=
cls_name
place
=
core
.
CUDAPlace
(
0
)
globals
()[
cls_name
]
=
TestCUDNNCase
if
core
.
is_float16_supported
(
place
):
self
.
check_output_with_place
(
place
,
atol
=
1e-3
)
class
TestCUDNNCase2
(
TestCase1
):
create_test_cudnn_class
(
TestPool2D_Op
)
def
init_kernel_type
(
self
):
create_test_cudnn_class
(
TestCase1
)
self
.
use_cudnn
=
True
create_test_cudnn_class
(
TestCase2
)
create_test_cudnn_class
(
TestCase3
)
create_test_cudnn_class
(
TestCase4
)
create_test_cudnn_class
(
TestCase5
)
#--------------------test pool2d cudnn_fp16--------------------
class
TestFP16CUDNNCase2
(
TestCase1
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
self
.
dtype
=
np
.
float16
def
test_check_output
(
self
):
def
create_test_cudnn_fp16_class
(
parent
,
check_grad
=
True
):
if
core
.
is_compiled_with_cuda
():
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
place
=
core
.
CUDAPlace
(
0
)
"core is not compiled with CUDA"
)
if
core
.
is_float16_supported
(
place
):
class
TestCUDNNFp16Case
(
parent
):
self
.
check_output_with_place
(
place
,
atol
=
1e-3
)
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
self
.
dtype
=
np
.
float16
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_output_with_place
(
place
,
atol
=
1e-3
)
class
TestCUDNNCase3
(
TestCase2
):
def
test_check_grad
(
self
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
class
TestFP16CUDNNCase3
(
TestCase2
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
self
.
dtype
=
np
.
float16
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
if
core
.
is_float16_supported
(
self
.
check_output_with_place
(
place
,
atol
=
1e-3
)
place
)
and
self
.
pool_type
!=
"max"
and
check_grad
:
self
.
check_grad_with_place
(
place
,
set
([
'X'
]),
'Out'
,
max_relative_error
=
0.07
)
cls_name
=
"{0}_{1}"
.
format
(
parent
.
__name__
,
"CUDNNFp16Op"
)
TestCUDNNFp16Case
.
__name__
=
cls_name
globals
()[
cls_name
]
=
TestCUDNNFp16Case
class
TestCUDNNCase4
(
TestCase3
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
create_test_cudnn_fp16_class
(
TestPool2D_Op
)
create_test_cudnn_fp16_class
(
TestCase1
,
check_grad
=
False
)
create_test_cudnn_fp16_class
(
TestCase2
)
create_test_cudnn_fp16_class
(
TestCase3
)
create_test_cudnn_fp16_class
(
TestCase4
)
create_test_cudnn_fp16_class
(
TestCase5
)
class
TestFP16CUDNNCase4
(
TestCase3
):
#--------------------test pool2d use ceil mode--------------------
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
self
.
dtype
=
np
.
float16
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_output_with_place
(
place
,
atol
=
1e-3
)
def
create_test_cudnn_use_ceil_class
(
parent
):
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
"core is not compiled with CUDA"
)
class
TestPool2DUseCeilCase
(
parent
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
class
TestCUDNNCase5
(
TestCase4
):
def
init_ceil_mode
(
self
):
def
init_kernel_type
(
self
):
self
.
ceil_mode
=
True
self
.
use_cudnn
=
True
class
TestFP16CUDNNCase5
(
TestCase4
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
self
.
dtype
=
np
.
float16
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_output_with_place
(
place
,
atol
=
1e-3
)
class
TestCUDNNCase6
(
TestCase5
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
class
TestFP16CUDNNCase6
(
TestCase5
):
cls_name
=
"{0}_{1}"
.
format
(
parent
.
__name__
,
"CUDNNOpCeilMode"
)
def
init_kernel_type
(
self
):
TestPool2DUseCeilCase
.
__name__
=
cls_name
self
.
use_cudnn
=
True
globals
()[
cls_name
]
=
TestPool2DUseCeilCase
self
.
dtype
=
np
.
float16
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_output_with_place
(
place
,
atol
=
1e-3
)
create_test_cudnn_use_ceil_class
(
TestPool2D_Op
)
create_test_cudnn_use_ceil_class
(
TestCase1
)
class
TestCeilModeCase1
(
TestCUDNNCase1
):
def
init_ceil_mode
(
self
):
self
.
ceil_mode
=
True
def
create_test_use_ceil_class
(
parent
):
class
TestPool2DUseCeilCase
(
parent
):
def
init_ceil_mode
(
self
):
self
.
ceil_mode
=
True
class
TestCeilModeCase2
(
TestCUDNNCase2
):
cls_name
=
"{0}_{1}"
.
format
(
parent
.
__name__
,
"CeilModeCast"
)
def
init_ceil_mode
(
self
):
TestPool2DUseCeilCase
.
__name__
=
cls_name
self
.
ceil_mode
=
Tru
e
globals
()[
cls_name
]
=
TestPool2DUseCeilCas
e
class
TestCeilModeCase3
(
TestCase1
):
create_test_use_ceil_class
(
TestCase1
)
def
init_ceil_mode
(
self
):
create_test_use_ceil_class
(
TestCase2
)
self
.
ceil_mode
=
True
class
TestCeilModeCase4
(
TestCase2
):
def
init_ceil_mode
(
self
):
self
.
ceil_mode
=
True
class
TestAvgInclude
(
TestCase2
):
class
TestAvgInclude
(
TestCase2
):
...
@@ -336,7 +307,10 @@ class TestAvgInclude(TestCase2):
...
@@ -336,7 +307,10 @@ class TestAvgInclude(TestCase2):
self
.
exclusive
=
False
self
.
exclusive
=
False
class
TestCUDNNAvgInclude
(
TestCUDNNCase3
):
class
TestCUDNNAvgInclude
(
TestCase2
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
def
init_exclusive
(
self
):
def
init_exclusive
(
self
):
self
.
exclusive
=
False
self
.
exclusive
=
False
...
...
python/paddle/fluid/tests/unittests/test_scale_op.py
浏览文件 @
234a1d92
...
@@ -24,9 +24,16 @@ from paddle.fluid.op import Operator
...
@@ -24,9 +24,16 @@ from paddle.fluid.op import Operator
class
TestScaleOp
(
OpTest
):
class
TestScaleOp
(
OpTest
):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
op_type
=
"scale"
self
.
op_type
=
"scale"
self
.
inputs
=
{
'X'
:
np
.
random
.
random
((
10
,
10
)).
astype
(
"float32"
)}
self
.
dtype
=
np
.
float32
self
.
init_dtype_type
()
self
.
inputs
=
{
'X'
:
np
.
random
.
random
((
10
,
10
)).
astype
(
self
.
dtype
)}
self
.
attrs
=
{
'scale'
:
-
2.3
}
self
.
attrs
=
{
'scale'
:
-
2.3
}
self
.
outputs
=
{
'Out'
:
self
.
inputs
[
'X'
]
*
self
.
attrs
[
'scale'
]}
self
.
outputs
=
{
'Out'
:
self
.
inputs
[
'X'
]
*
self
.
dtype
(
self
.
attrs
[
'scale'
])
}
def
init_dtype_type
(
self
):
pass
def
test_check_output
(
self
):
def
test_check_output
(
self
):
self
.
check_output
()
self
.
check_output
()
...
@@ -36,9 +43,15 @@ class TestScaleOp(OpTest):
...
@@ -36,9 +43,15 @@ class TestScaleOp(OpTest):
class
TestScaleOpSelectedRows
(
unittest
.
TestCase
):
class
TestScaleOpSelectedRows
(
unittest
.
TestCase
):
def
init_dtype_type
(
self
):
pass
def
check_with_place
(
self
,
place
,
in_name
,
out_name
):
def
check_with_place
(
self
,
place
,
in_name
,
out_name
):
scope
=
core
.
Scope
()
scope
=
core
.
Scope
()
self
.
dtype
=
np
.
float32
self
.
init_dtype_type
()
# create and initialize Grad Variable
# create and initialize Grad Variable
in_height
=
10
in_height
=
10
in_rows
=
[
0
,
4
,
7
]
in_rows
=
[
0
,
4
,
7
]
...
@@ -49,7 +62,7 @@ class TestScaleOpSelectedRows(unittest.TestCase):
...
@@ -49,7 +62,7 @@ class TestScaleOpSelectedRows(unittest.TestCase):
in_selected_rows
.
set_height
(
in_height
)
in_selected_rows
.
set_height
(
in_height
)
in_selected_rows
.
set_rows
(
in_rows
)
in_selected_rows
.
set_rows
(
in_rows
)
in_array
=
np
.
random
.
random
(
in_array
=
np
.
random
.
random
(
(
len
(
in_rows
),
in_row_numel
)).
astype
(
"float32"
)
(
len
(
in_rows
),
in_row_numel
)).
astype
(
self
.
dtype
)
in_tensor
=
in_selected_rows
.
get_tensor
()
in_tensor
=
in_selected_rows
.
get_tensor
()
in_tensor
.
set
(
in_array
,
place
)
in_tensor
.
set
(
in_array
,
place
)
...
@@ -87,5 +100,41 @@ class TestScaleOpSelectedRows(unittest.TestCase):
...
@@ -87,5 +100,41 @@ class TestScaleOpSelectedRows(unittest.TestCase):
self
.
check_with_place
(
place
,
'in'
,
'in'
)
self
.
check_with_place
(
place
,
'in'
,
'in'
)
# Add FP16 test
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
"core is not compiled with CUDA"
)
class
TestScaleFp16Op
(
TestScaleOp
):
def
init_dtype_type
(
self
):
self
.
dtype
=
np
.
float16
def
test_check_output
(
self
):
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_output_with_place
(
place
,
atol
=
0.002
)
def
test_check_grad
(
self
):
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_grad_with_place
(
place
,
[
"X"
],
"Out"
,
max_relative_error
=
0.05
)
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
"core is not compiled with CUDA"
)
class
TestScaleFp16OpSelectedRows
(
TestScaleOpSelectedRows
):
def
init_dtype_type
(
self
):
self
.
dtype
=
np
.
float16
def
test_scale_selected_rows
(
self
):
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_with_place
(
place
,
'in'
,
'out'
)
def
test_scale_selected_rows_inplace
(
self
):
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_with_place
(
place
,
'in'
,
'in'
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_softmax_op.py
浏览文件 @
234a1d92
...
@@ -62,12 +62,11 @@ class TestSoftmaxOp(OpTest):
...
@@ -62,12 +62,11 @@ class TestSoftmaxOp(OpTest):
self
.
check_output
()
self
.
check_output
()
def
test_check_grad
(
self
):
def
test_check_grad
(
self
):
if
self
.
dtype
==
np
.
float16
:
if
self
.
use_cudnn
or
self
.
dtype
==
np
.
float16
:
return
if
self
.
use_cudnn
:
place
=
core
.
CUDAPlace
(
0
)
place
=
core
.
CUDAPlace
(
0
)
self
.
check_grad_with_place
(
if
core
.
is_float16_supported
(
place
):
place
,
[
"X"
],
"Out"
,
max_relative_error
=
0.01
)
self
.
check_grad_with_place
(
place
,
[
"X"
],
"Out"
,
max_relative_error
=
0.01
)
else
:
else
:
self
.
check_grad
([
"X"
],
"Out"
,
max_relative_error
=
0.01
)
self
.
check_grad
([
"X"
],
"Out"
,
max_relative_error
=
0.01
)
...
@@ -103,10 +102,23 @@ class TestSoftmaxFP16Op(TestSoftmaxOp):
...
@@ -103,10 +102,23 @@ class TestSoftmaxFP16Op(TestSoftmaxOp):
if
core
.
is_float16_supported
(
place
):
if
core
.
is_float16_supported
(
place
):
self
.
check_output_with_place
(
place
,
atol
=
1e-3
)
self
.
check_output_with_place
(
place
,
atol
=
1e-3
)
# FIXME: If the x_shape is [10, 10], gradient failed.
def
test_check_grad
(
self
):
pass
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
"core is not compiled with CUDA"
)
"core is not compiled with CUDA"
)
class
TestSoftmaxFP16Op2
(
TestSoftmaxFP16Op
):
class
TestSoftmaxFP16Op2
(
TestSoftmaxOp
):
def
init_kernel_type
(
self
):
self
.
dtype
=
np
.
float16
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_output_with_place
(
place
,
atol
=
1e-3
)
def
get_x_shape
(
self
):
def
get_x_shape
(
self
):
return
[
2
,
3
,
4
,
5
]
return
[
2
,
3
,
4
,
5
]
...
...
python/paddle/fluid/tests/unittests/test_sum_op.py
浏览文件 @
234a1d92
...
@@ -24,16 +24,20 @@ from paddle.fluid.op import Operator
...
@@ -24,16 +24,20 @@ from paddle.fluid.op import Operator
class
TestSumOp
(
OpTest
):
class
TestSumOp
(
OpTest
):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
op_type
=
"sum"
self
.
op_type
=
"sum"
self
.
init_kernel_type
()
self
.
use_mkldnn
=
False
self
.
use_mkldnn
=
False
self
.
init_kernel_type
()
self
.
init_kernel_type
()
x0
=
np
.
random
.
random
((
3
,
4
)).
astype
(
'float32'
)
x0
=
np
.
random
.
random
((
3
,
4
)).
astype
(
self
.
dtype
)
x1
=
np
.
random
.
random
((
3
,
4
)).
astype
(
'float32'
)
x1
=
np
.
random
.
random
((
3
,
4
)).
astype
(
self
.
dtype
)
x2
=
np
.
random
.
random
((
3
,
4
)).
astype
(
'float32'
)
x2
=
np
.
random
.
random
((
3
,
4
)).
astype
(
self
.
dtype
)
self
.
inputs
=
{
"X"
:
[(
"x0"
,
x0
),
(
"x1"
,
x1
),
(
"x2"
,
x2
)]}
self
.
inputs
=
{
"X"
:
[(
"x0"
,
x0
),
(
"x1"
,
x1
),
(
"x2"
,
x2
)]}
y
=
x0
+
x1
+
x2
y
=
x0
+
x1
+
x2
self
.
outputs
=
{
'Out'
:
y
}
self
.
outputs
=
{
'Out'
:
y
}
self
.
attrs
=
{
'use_mkldnn'
:
self
.
use_mkldnn
}
self
.
attrs
=
{
'use_mkldnn'
:
self
.
use_mkldnn
}
def
init_kernel_type
(
self
):
self
.
dtype
=
np
.
float32
def
test_check_output
(
self
):
def
test_check_output
(
self
):
self
.
check_output
()
self
.
check_output
()
...
@@ -59,8 +63,11 @@ class TestSelectedRowsSumOp(OpTest):
...
@@ -59,8 +63,11 @@ class TestSelectedRowsSumOp(OpTest):
self
.
check_input_and_optput
(
core
.
Scope
(),
place
,
inplace
,
False
,
False
,
self
.
check_input_and_optput
(
core
.
Scope
(),
place
,
inplace
,
False
,
False
,
False
)
False
)
def
init_kernel_type
(
self
):
self
.
dtype
=
np
.
float32
def
_get_array
(
self
,
row_num
,
row_numel
):
def
_get_array
(
self
,
row_num
,
row_numel
):
array
=
np
.
ones
((
row_num
,
row_numel
)).
astype
(
"float32"
)
array
=
np
.
ones
((
row_num
,
row_numel
)).
astype
(
self
.
dtype
)
for
i
in
range
(
row_num
):
for
i
in
range
(
row_num
):
array
[
i
]
*=
i
array
[
i
]
*=
i
return
array
return
array
...
@@ -129,5 +136,36 @@ class TestSelectedRowsSumOp(OpTest):
...
@@ -129,5 +136,36 @@ class TestSelectedRowsSumOp(OpTest):
self
.
check_with_place
(
place
,
inplace
)
self
.
check_with_place
(
place
,
inplace
)
class
TestFP16SumOp
(
TestSumOp
):
def
init_kernel_type
(
self
):
self
.
dtype
=
np
.
float16
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_output_with_place
(
place
,
atol
=
2e-2
)
# FIXME: Because of the precision fp16, max_relative_error
# should be 0.15 here.
def
test_check_grad
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_grad
([
'x0'
],
'Out'
,
max_relative_error
=
0.15
)
class
TestFP16SelectedRowsSumOp
(
TestSelectedRowsSumOp
):
def
init_kernel_type
(
self
):
self
.
dtype
=
np
.
float16
def
test_w_is_selected_rows
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
for
inplace
in
[
True
,
False
]:
self
.
check_with_place
(
place
,
inplace
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
unittest
.
main
()
unittest
.
main
()
python/setup.py.in
浏览文件 @
234a1d92
...
@@ -14,7 +14,8 @@ RC = 0
...
@@ -14,7 +14,8 @@ RC = 0
def git_commit():
def git_commit():
try:
try:
cmd = ['git', 'rev-parse', 'HEAD']
cmd = ['git', 'rev-parse', 'HEAD']
git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip()
git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE,
cwd="@PADDLE_SOURCE_DIR@").communicate()[0].strip()
except:
except:
git_commit = 'Unknown'
git_commit = 'Unknown'
git_commit = git_commit.decode()
git_commit = git_commit.decode()
...
@@ -44,7 +45,7 @@ def get_patch():
...
@@ -44,7 +45,7 @@ def get_patch():
def is_taged():
def is_taged():
try:
try:
cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null']
cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null']
git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip()
git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE
, cwd="@PADDLE_SOURCE_DIR@"
).communicate()[0].strip()
git_tag = git_tag.decode()
git_tag = git_tag.decode()
except:
except:
return False
return False
...
@@ -55,8 +56,7 @@ def is_taged():
...
@@ -55,8 +56,7 @@ def is_taged():
return False
return False
def write_version_py(filename='paddle/version.py'):
def write_version_py(filename='paddle/version.py'):
cnt = '''
cnt = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
#
#
full_version = '%(major)d.%(minor)d.%(patch)s'
full_version = '%(major)d.%(minor)d.%(patch)s'
major = '%(major)d'
major = '%(major)d'
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录