Commit 814a7590 — BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Authored on Mar 05, 2019 by sneaxiy

    merge develop
    test=develop

Parents: 597dc65e, caadd058

Showing 25 changed files with 319 additions and 211 deletions (+319 −211):

paddle/fluid/API.spec                                                          +1   −1
paddle/fluid/framework/CMakeLists.txt                                          +2   −2
paddle/fluid/framework/details/build_strategy.h                                +3   −2
paddle/fluid/framework/tensor_util.cc                                          +0   −5
paddle/fluid/memory/CMakeLists.txt                                             +1   −1
paddle/fluid/memory/memcpy.cc                                                  +0   −20
paddle/fluid/operators/conv_transpose_op.cc                                    +6   −0
paddle/fluid/operators/ngraph/ops/activation_op.h                              +1   −1
paddle/fluid/operators/reader/buffered_reader.cc                               +9   −13
paddle/fluid/operators/sequence_ops/sequence_erase_op.cu                       +10  −9
paddle/fluid/operators/sequence_ops/sequence_erase_op.h                        +10  −8
paddle/fluid/platform/device_tracer.cc                                         +9   −54
paddle/fluid/platform/device_tracer.h                                          +1   −12
python/paddle/fluid/__init__.py                                                +1   −1
python/paddle/fluid/compiler.py                                                +6   −6
python/paddle/fluid/executor.py                                                +34  −34
python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py  +66  −40
python/paddle/fluid/tests/unittests/test_dist_base.py                          +3   −0
python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py          +5   −0
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py       +123 −0
python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py       +5   −1
python/paddle/fluid/tests/unittests/test_pass_builder.py                       +3   −0
python/paddle/fluid/tests/unittests/test_py_func_op.py                         +4   −0
python/paddle/fluid/tests/unittests/test_sequence_erase_op.py                  +15  −0
tools/timeline.py                                                              +1   −1

paddle/fluid/API.spec
@@ -11,7 +11,7 @@ paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None,
 paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b54f403e57825a1592aece03afe3afb6'))
 paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2'))
 paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '78e512cabeda9c7f42cb7c7e88967ae7'))
+paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03'))
 paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'aba8093edebf2d5c869b735b92811e45'))
 paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0'))
 paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2'))

paddle/fluid/framework/CMakeLists.txt
@@ -38,10 +38,10 @@ if(WITH_GPU)
     nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
     add_dependencies(tensor tensor_util)
   else()
-    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
+    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
   endif(WIN32)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler)
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
 endif()
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)

paddle/fluid/framework/details/build_strategy.h
@@ -14,6 +14,7 @@
 #pragma once

+#include <memory>
 #include <string>
 #include <vector>
@@ -76,11 +77,11 @@ struct BuildStrategy {
   bool fuse_relu_depthwise_conv_{false};

-  bool memory_optimize_{false};
+  bool memory_optimize_{true};
   // TODO(dzhwinter):
   // make enable_inplace, memory_optimize_
   // memory_early_delete_ true by default
-  bool enable_inplace_{false};
+  bool enable_inplace_{true};

   bool enable_sequential_execution_{false};

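Note: with memory_optimize_ and enable_inplace_ now defaulting to true, callers that relied on the old defaults must opt out explicitly. A minimal sketch against the fluid 1.x Python API — the same opt-out several of the updated unit tests below apply:

import paddle.fluid as fluid

# Opt out of the defaults flipped by this change; before this commit
# both flags started out as False.
build_strategy = fluid.BuildStrategy()
build_strategy.memory_optimize = False
build_strategy.enable_inplace = False
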
paddle/fluid/framework/tensor_util.cc
@@ -18,7 +18,6 @@
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace framework {
@@ -138,19 +137,16 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
 #ifdef PADDLE_WITH_CUDA
   else if (platform::is_gpu_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
-    platform::RecordEvent record_event("TensorCopy:GPU->CPU");
     auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
     auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
     memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
   } else if (platform::is_cpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
-    platform::RecordEvent record_event("TensorCopy:CPU->GPU");
     auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
   } else if (platform::is_gpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
-    platform::RecordEvent record_event("TensorCopy:GPU->GPU");
     if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
       VLOG(3) << "Skip copy the same data from " << src_place << " to "
               << dst_place;
@@ -161,7 +157,6 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
     memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
   } else if (platform::is_cuda_pinned_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
-    platform::RecordEvent record_event("TensorCopy:CUDAPinned->GPU");
     auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,

paddle/fluid/memory/CMakeLists.txt
 add_subdirectory(detail)
 add_subdirectory(allocation)
-cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade profiler)
+cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade)
 cc_library(memcpy SRCS memcpy.cc DEPS place)

 cc_library(memory

paddle/fluid/memory/memcpy.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"

 #include <cstring>  // for memcpy
-#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace memory {
@@ -30,23 +29,14 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
 #ifdef PADDLE_WITH_CUDA
 static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K

-// NOTE(zcd): Do not use GpuMemcpySync as much as possible.
-// because GpuMemcpySync issues the copying command to the default stream,
-// which will make two commands from different streams cannot run concurrently.
-// Reference:
-// https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
-
 template <>
 void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
     const void* src, size_t num, cudaStream_t stream) {
   platform::SetDeviceId(src_place.device);
   if (stream) {
-    platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU");
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
   } else {
-    platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU");
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
     // FIXME(zjl): do we really need it?
     if (num <= kMaxGpuAsyncCopyBytes) {
@@ -61,10 +51,8 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
     const void* src, size_t num, cudaStream_t stream) {
   platform::SetDeviceId(dst_place.device);
   if (stream) {
-    platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU");
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
   } else {
-    platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU");
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
     // FIXME(zjl): do we really need it?
     if (num <= kMaxGpuAsyncCopyBytes) {
@@ -80,19 +68,15 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
   if (dst_place == src_place) {
     platform::SetDeviceId(src_place.device);
     if (stream) {
-      platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU");
       platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
     } else {
-      platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU");
       platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
     }
   } else {
     if (stream) {
-      platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU");
       platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
                                    num, stream);
     } else {
-      platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU");
       platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
                                   num);
     }
   }
@@ -127,10 +111,8 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
     cudaStream_t stream) {
   platform::SetDeviceId(src_place.device);
   if (stream) {
-    platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned");
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
   } else {
-    platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned");
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
   }
 }
@@ -142,10 +124,8 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
     cudaStream_t stream) {
   platform::SetDeviceId(dst_place.device);
   if (stream) {
-    platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU");
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
   } else {
-    platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU");
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
   }
 }

paddle/fluid/operators/conv_transpose_op.cc
@@ -127,6 +127,12 @@ void Conv2DTransposeOpMaker::Make() {
            "output feature channels,"
            "H is the height of the filter, and W is the width of the filter. "
            "We enforce groups number == 1 in the convolution transpose scenario.");
+  AddInput("Bias",
+           "(Tensor) Bias to be added to each output of filter application."
+           "The format of output tensor is X (one-dimensional) of size equal"
+           "to the number of output channels. Only used with MKL-DNN.")
+      .AsDispensable();
+
   AddOutput("Output",
             "(Tensor) The output tensor of convolution transpose operator. "
             "The format of output tensor is also NCHW.");

paddle/fluid/operators/ngraph/ops/activation_op.h
@@ -55,4 +55,4 @@ void BuildTanhGradNode(
 }  // namespace paddle

 REGISTER_NG_OP(relu_grad, BuildReluGradNode);
-REGISTER_NG_OP(than_grad, BuildTanhGradNode);
+REGISTER_NG_OP(tanh_grad, BuildTanhGradNode);

paddle/fluid/operators/reader/buffered_reader.cc
@@ -17,7 +17,6 @@
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace operators {
 namespace reader {
@@ -51,10 +50,9 @@ BufferedReader::BufferedReader(
                       .Get(place_)))
                       ->stream();
     events.resize(buffer_size);
-    PADDLE_ENFORCE(cudaStreamCreate(&stream));
-    for (auto &event : events)
+    for (auto &event : events) {
       PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+    }
+    PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
   }
 #endif
   cpu_buffer_.resize(buffer_size);
@@ -86,15 +84,12 @@ void BufferedReader::ReadAsync(size_t i) {
 #ifdef PADDLE_WITH_CUDA
   // NOTE(liangdun): using async copy instead of TensorCopySync
-  // TensorCopySync would block other stream, because TensorCopySync
-  // issues the copying command to the default stream, it will make two
-  // commands from different streams cannot run concurrently.
+  // TensorCopySync would block other stream
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
     PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0));
     TensorVec &gpu = gpu_buffer_[i];
     gpu.resize(cpu.size());
-    platform::RecordEvent record_event("BufferedReader:MemoryCopy");
     for (size_t i = 0; i < cpu.size(); ++i) {
       gpu[i].Resize(cpu[i].dims());
       gpu[i].set_layout(cpu[i].layout());
@@ -103,19 +98,20 @@ void BufferedReader::ReadAsync(size_t i) {
       auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type());
       auto size =
           cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
-      if (platform::is_cuda_pinned_place(cpu_place)) {
+      if (platform::is_cuda_pinned_place(cpu_place))
         memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                      boost::get<platform::CUDAPinnedPlace>(cpu_place),
                      cpu_ptr, size, stream);
-      } else if ((platform::is_gpu_place(cpu_place))) {
+      else if ((platform::is_gpu_place(cpu_place)))
        memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                      boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
                      size, stream);
-      } else {
+      else
+        // if cpu place is not pinned, async copy is slower than sync copy,
+        // so we use sync copy instead.
         memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                      boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
-                     stream);
-      }
+                     0);
       gpu[i].set_lod(cpu[i].lod());
     }
     PADDLE_ENFORCE(cudaStreamSynchronize(stream));

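Note: BufferedReader is what backs double buffering on the Python side, so the copy rules above (async for CUDA-pinned sources, a plain sync copy on the default stream otherwise) are what a double-buffered reader exercises. A hedged usage sketch; the capacity and shapes are illustrative, not taken from this diff:

import paddle.fluid as fluid

# Batches fed through this reader are prefetched onto the device by
# BufferedReader using the copy paths chosen in the hunk above.
reader = fluid.layers.py_reader(
    capacity=64,
    shapes=[(-1, 3, 224, 224), (-1, 1)],
    dtypes=['float32', 'int64'],
    use_double_buffer=True)
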
paddle/fluid/operators/sequence_ops/sequence_erase_op.cu
@@ -64,8 +64,7 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
     auto* out = ctx.Output<LoDTensor>("Out");
     auto lod = in->lod();
-    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
-    PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(),
+    PADDLE_ENFORCE_EQ(lod[lod.size() - 1].back(), (size_t)in->numel(),
                       "The actual size mismatches with the LoD information.");
     auto tokens = ctx.Attr<std::vector<int>>("tokens");
     auto in_len = in->numel();
@@ -85,10 +84,9 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
                            num_erased.begin() + 1);
     // Copy LoD to GPU
-    auto lod0 = lod[0];
-    auto lod_len = lod0.size();
-    const size_t* dev_in_lod_ptr = lod0.CUDAData(ctx.GetPlace());
+    auto last_lod = lod[lod.size() - 1];
+    auto lod_len = last_lod.size();
+    const size_t* dev_in_lod_ptr = last_lod.CUDAData(ctx.GetPlace());
     // Calc output LoD
     thrust::device_vector<size_t> dev_out_lod(lod_len);
     size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data());
@@ -96,13 +94,16 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
                       PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
         num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr);
     // Set LoD for output
-    std::vector<size_t> out_lod0(dev_out_lod.begin(), dev_out_lod.end());
+    std::vector<size_t> out_last_lod(dev_out_lod.begin(), dev_out_lod.end());
     framework::LoD out_lod;
-    out_lod.push_back(out_lod0);
+    for (size_t i = 0; i < lod.size() - 1; ++i) {
+      out_lod.push_back(lod[i]);
+    }
+    out_lod.push_back(out_last_lod);
     out->set_lod(out_lod);
     // Set output
-    out->Resize({static_cast<int64_t>(out_lod0.back()), 1});
+    out->Resize({static_cast<int64_t>(out_last_lod.back()), 1});
     auto out_dat = out->mutable_data<T>(ctx.GetPlace());
     SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
                 PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len,

paddle/fluid/operators/sequence_ops/sequence_erase_op.h
@@ -28,19 +28,18 @@ class SequenceEraseKernel : public framework::OpKernel<T> {
     auto* out = ctx.Output<framework::LoDTensor>("Out");

     auto lod = in->lod();
-    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
-    PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(),
+    PADDLE_ENFORCE_EQ(lod[lod.size() - 1].back(), (size_t)in->numel(),
                       "The actual size mismatches with the LoD information.");
     auto tokens = ctx.Attr<std::vector<int>>("tokens");
     auto in_len = in->numel();
     auto in_dat = in->data<T>();
-    auto lod0 = lod[0];
+    auto last_lod = lod[lod.size() - 1];

     std::vector<size_t> num_erased(in_len + 1, 0);
-    std::vector<size_t> out_lod0(1, 0);
-    for (size_t i = 0; i < lod0.size() - 1; ++i) {
+    std::vector<size_t> out_last_lod(1, 0);
+    for (size_t i = 0; i < last_lod.size() - 1; ++i) {
       size_t num_out = 0;
-      for (auto j = lod0[i] + 1; j <= lod0[i + 1]; ++j) {
+      for (auto j = last_lod[i] + 1; j <= last_lod[i + 1]; ++j) {
         num_erased[j] = num_erased[j - 1];
         if (std::find(tokens.begin(), tokens.end(), in_dat[j - 1]) !=
             tokens.end()) {
@@ -49,7 +48,7 @@ class SequenceEraseKernel : public framework::OpKernel<T> {
           num_out += 1;
         }
       }
-      out_lod0.push_back(out_lod0.back() + num_out);
+      out_last_lod.push_back(out_last_lod.back() + num_out);
     }

     auto out_len = in_len - num_erased[in_len];
@@ -62,7 +61,10 @@ class SequenceEraseKernel : public framework::OpKernel<T> {
       }
     }
     framework::LoD out_lod;
-    out_lod.push_back(out_lod0);
+    for (size_t i = 0; i < lod.size() - 1; ++i) {
+      out_lod.push_back(lod[i]);
+    }
+    out_lod.push_back(out_last_lod);
     out->set_lod(out_lod);
   }
 };

paddle/fluid/platform/device_tracer.cc
@@ -30,6 +30,7 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "google/protobuf/text_format.h"
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/printf.h"

 namespace paddle {
@@ -221,24 +222,19 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
     }
     case CUPTI_ACTIVITY_KIND_DRIVER: {
       auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
-      if (api->start != 0 && api->end != 0) {
-        // -1 device id represents ActiveKind api call
-        tracer->AddActiveKindRecords(
+      if (api->start != 0 && api->end != 0)
+        // -1 device id represents CUDA api call
+        tracer->AddCPURecords(
             DriverKind(api->cbid), api->start, api->end, -1,
-            GetThreadIdFromSystemThreadId(api->threadId),
-            api->correlationId);
-      }
+            GetThreadIdFromSystemThreadId(api->threadId));
       break;
     }
     case CUPTI_ACTIVITY_KIND_RUNTIME: {
       auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
-      if (api->start != 0 && api->end != 0) {
-        // -1 device id represents ActiveKind api call
-        tracer->AddActiveKindRecords(
+      if (api->start != 0 && api->end != 0)
+        tracer->AddCPURecords(
            RuntimeKind(api->cbid), api->start, api->end, -1,
-            GetThreadIdFromSystemThreadId(api->threadId),
-            api->correlationId);
-      }
+            GetThreadIdFromSystemThreadId(api->threadId));
       break;
     }
     default: { break; }
@@ -317,25 +313,6 @@ class DeviceTracerImpl : public DeviceTracer {
                                stream_id, correlation_id, bytes});
   }

-  void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
-                            uint64_t end_ns, int64_t device_id,
-                            int64_t thread_id, uint32_t correlation_id) {
-    if (anno.empty()) {
-      VLOG(1) << "Empty timeline annotation.";
-      return;
-    }
-    thread_local std::forward_list<ActiveKindRecord>
-        *local_active_kind_records = nullptr;
-    if (local_active_kind_records == nullptr) {
-      std::lock_guard<std::mutex> l(trace_mu_);
-      active_kind_records_.emplace_front();
-      local_active_kind_records = &active_kind_records_.front();
-    }
-    //  lock is not needed, only one thread call this function.
-    local_active_kind_records->push_front(ActiveKindRecord{
-        anno, start_ns, end_ns, device_id, thread_id, correlation_id});
-  }
-
   void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
                         int64_t device_id, int64_t stream_id,
                         uint32_t correlation_id) {
@@ -378,7 +355,6 @@ class DeviceTracerImpl : public DeviceTracer {
   }
   const std::vector<int> cbids {
       CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
-      CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020,
      CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
       CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020,
       CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020,
@@ -409,7 +385,6 @@ class DeviceTracerImpl : public DeviceTracer {
     correlations_.clear();
     for (auto &tmp : correlations_pairs) tmp.clear();
     for (auto &tmp : cpu_records_) tmp.clear();
-    for (auto &tmp : active_kind_records_) tmp.clear();
   }

   void GenEventKernelCudaElapsedTime() {
@@ -462,7 +437,7 @@ class DeviceTracerImpl : public DeviceTracer {
       event->set_device_id(r.device_id);
     }
     VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
-    for (auto &tmp : cpu_records_) {
+    for (auto &tmp : cpu_records_)
       for (const CPURecord &r : tmp) {
         auto *event = profile_pb.add_events();
         event->set_type(proto::Event::CPU);
@@ -472,24 +447,6 @@ class DeviceTracerImpl : public DeviceTracer {
         event->set_sub_device_id(r.thread_id);
         event->set_device_id(r.device_id);
       }
-    }
-    for (auto &tmp : active_kind_records_) {
-      for (const ActiveKindRecord &r : tmp) {
-        auto *event = profile_pb.add_events();
-        event->set_type(proto::Event::CPU);
-        auto c = correlations_.find(r.correlation_id);
-        if (c != correlations_.end() && c->second != nullptr) {
-          event->set_name(c->second->name());
-          event->set_detail_info(r.name);
-        } else {
-          event->set_name(r.name);
-        }
-        event->set_start_ns(r.start_ns);
-        event->set_end_ns(r.end_ns);
-        event->set_sub_device_id(r.thread_id);
-        event->set_device_id(r.device_id);
-      }
-    }
     miss = find = 0;
     for (const MemRecord &r : mem_records_) {
       auto *event = profile_pb.add_events();
@@ -553,7 +510,6 @@ class DeviceTracerImpl : public DeviceTracer {
   std::forward_list<KernelRecord> kernel_records_;
   std::forward_list<MemRecord> mem_records_;
   std::forward_list<std::forward_list<CPURecord>> cpu_records_;
-  std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
   std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
       correlations_pairs;
   std::unordered_map<uint32_t, Event *> correlations_;
@@ -657,7 +613,6 @@ void initCuptiCbidStr() {
   REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
   REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
   REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010);
 #if CUDA_VERSION >= 9000
   REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
   REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);

paddle/fluid/platform/device_tracer.h
@@ -63,14 +63,7 @@ class DeviceTracer {
     uint32_t correlation_id;
     uint64_t bytes;
   };
-  struct ActiveKindRecord {
-    std::string name;
-    uint64_t start_ns;
-    uint64_t end_ns;
-    int64_t device_id;
-    int64_t thread_id;
-    uint32_t correlation_id;
-  };

   virtual ~DeviceTracer() {}
   // Needs to be called once before use.
   virtual void Enable() = 0;
@@ -92,10 +85,6 @@ class DeviceTracer {
   virtual void AddCPURecords(const std::string& anno, uint64_t start_ns,
                              uint64_t end_ns, int64_t device_id,
                              int64_t thread_id) = 0;
-  virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns,
-                                    uint64_t end_ns, int64_t device_id,
-                                    int64_t thread_id,
-                                    uint32_t correlation_id) = 0;

   // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
   // added before for human readability.

python/paddle/fluid/__init__.py
@@ -132,7 +132,7 @@ def __bootstrap__():
         'allocator_strategy', 'reader_queue_speed_test_mode',
         'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
         'inner_op_parallelism', 'enable_parallel_graph',
-        'multiple_of_cupti_buffer_size'
+        'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')

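Note: each name in read_env_flags becomes settable through a FLAGS_-prefixed environment variable that __bootstrap__() reads at import time. A sketch for the newly listed flag; the value is illustrative:

import os

# Must be set before `import paddle.fluid`; the flag name comes from the
# read_env_flags list above.
os.environ['FLAGS_enable_subgraph_optimize'] = '1'

import paddle.fluid as fluid
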
python/paddle/fluid/compiler.py
@@ -206,12 +206,12 @@ class CompiledProgram(object):
             # FIXME(dzhwinter): enable_inplace should be after memory_optimize
             # if turn on python memory optimize, turn off the inplace_pass.
-            if self._build_strategy.memory_optimize is None:
-                self._build_strategy.memory_optimize = False \
-                    if self._program and self._program._is_mem_optimized else True
-            if self._build_strategy.enable_inplace is None:
-                self._build_strategy.enable_inplace = False \
-                    if self._program and self._program._is_mem_optimized else True
+            # memory_optimize and enable_inplace default are True, but we can disable them on purpose
+            if self._program and self._program._is_mem_optimized:
+                self._build_strategy.memory_optimize = False
+                self._build_strategy.enable_inplace = False

             # TODO(wuyi): trainer endpoings should be passed in through
             # build_strategy, not program.xxx.

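Note: the practical consequence of the rewritten block is that the legacy Python-side optimizer and the new IR passes no longer stack. A sketch of the interaction, assuming the fluid 1.x API used throughout this diff:

import paddle.fluid as fluid
from paddle.fluid import compiler

# Running the legacy Python memory optimizer marks the program as
# mem-optimized; CompiledProgram then forces the IR-level memory_optimize
# and inplace passes off (per the change above) instead of leaving them
# at their new True defaults.
fluid.memory_optimize(fluid.default_main_program())
prog = compiler.CompiledProgram(fluid.default_main_program())
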
python/paddle/fluid/executor.py
@@ -261,45 +261,42 @@ def _as_lodtensor(data, place):
 class Executor(object):
     """
-    An Executor in Python, only support the single-GPU running. For multi-cards, please refer to
-    ParallelExecutor.
-    Python executor takes a program, add feed operators and fetch operators to this program according
+    An Executor in Python, supports single/multiple-GPU running, and single/multiple-CPU running.
+    Python executor takes a program, adds feed operators and fetch operators to this program according
     to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
-    the variables(or names) that user want to get after program run. Note: the executor will run all
+    the variables(or names) that user wants to get after program runs. Note: the executor will run all
     operators in the program but not only the operators dependent by the fetch_list.
-    It store the global variables into the global scope, and create a local scope for the temporary
-    variables. The local scope contents will be discarded after every minibatch forward/backward finished.
-    But the global scope variables will be persistent through different runs.
-    All of ops in program will be running in sequence.
+    It stores the global variables into the global scope, and creates a local scope for the temporary
+    variables. The contents in local scope may be discarded after every minibatch forward/backward
+    finished. But the global scope variables will be persistent through different runs.

     Example:
+
         .. code-block:: python

             # First create the Executor.
             place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
             exe = fluid.Executor(place)

             # Run the startup program once and only once.
             # Not need to optimize/compile the startup program.
             exe.run(fluid.default_startup_program())

             # Run the main program directly without compile.
             loss, = exe.run(fluid.default_main_program(),
                             feed=feed_dict,
                             fetch_list=[loss.name])

             # Or, compiled the program and run. See `CompiledProgram` for more detail.
             compiled_prog = compiler.CompiledProgram(
                 fluid.default_main_program()).with_data_parallel(
                     loss_name=loss.name)
             loss, = exe.run(compiled_prog,
                             feed=feed_dict,
                             fetch_list=[loss.name])

     Args:
         place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
-
-    Note: For debugging complicated network in parallel-GPUs, you can test it on the executor.
-    They has the exactly same arguments, and expected the same results.
     """

     def __init__(self, place):
@@ -382,6 +379,12 @@ class Executor(object):
         ]
         return outs

+    '''
+    TODO(typhoonzero): Define "no longer use" meaning? Can user create
+    a new Executor for the same program and run?
+    TODO(panyx0718): Why ParallelExecutor doesn't have close?
+    '''
+
     def close(self):
         """
         Close this executor.
@@ -389,9 +392,6 @@ class Executor(object):
         You can no longer use this executor after calling this method.
         For the distributed training, this method would free the resource on PServers related to
        the current Trainer.
-        TODO(typhoonzero): Define "no longer use" meaning? Can user create
-        a new Executor for the same program and run?
-        TODO(panyx0718): Why ParallelExecutor doesn't have close?

         Example:
             >>> cpu = core.CPUPlace()

python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
@@ -15,36 +15,22 @@
 from __future__ import print_function

 import unittest
+import numpy as np

-from paddle.fluid.tests.unittests.test_conv2d_transpose_op import TestConv2dTransposeOp, TestWithPad, TestWithStride
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.test_conv2d_transpose_op import conv2dtranspose_forward_naive, TestConv2dTransposeOp


-class TestMKLDNN(TestConv2dTransposeOp):
-    def init_op_type(self):
-        self.is_test = True
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
-        self.op_type = "conv2d_transpose"
-        self._cpu_only = True
-
-    def test_check_grad(self):
-        return
-
-    def test_check_grad_no_input(self):
-        return
-
-    def test_check_grad_no_filter(self):
-        return
+def conv2d_bias_naive(out, bias):
+    _, out_c, _, _ = out.shape
+
+    for l in range(out_c):
+        out[:, l, :, :] = out[:, l, :, :] + bias[l]
+    return out


-class TestMKLDNNWithPad(TestWithPad):
-    def init_op_type(self):
-        self.is_test = True
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
-        self.op_type = "conv2d_transpose"
-        self._cpu_only = True
-
+class TestConv2dTransposeMKLDNNOp(TestConv2dTransposeOp):
     def test_check_grad(self):
         return

@@ -54,24 +40,64 @@ class TestMKLDNNWithPad(TestWithPad):
     def test_check_grad_no_input(self):
         return

     def test_check_grad_no_filter(self):
         return

-
-class TestMKLDNNWithStride(TestWithStride):
-    def init_op_type(self):
-        self.is_test = True
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
-        self.op_type = "conv2d_transpose"
-        self._cpu_only = True
-
-    def test_check_grad(self):
-        return
-
-    def test_check_grad_no_input(self):
-        return
-
-    def test_check_grad_no_filter(self):
-        return
-
-
-if __name__ == '__main__':
-    unittest.main()
+    def init_op_type(self):
+        self.data_format = "NCHW"
+        self.op_type = "conv2d_transpose"
+        self._cpu_only = True
+
+    def init_test_case(self):
+        self.use_mkldnn = True
+        self.is_test = True
+        self.pad = [0, 0]
+        self.fuse_bias = False
+        self.bias_size = None
+        self.fuse_relu = False
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+        self.groups = 1
+
+    def setUp(self):
+        TestConv2dTransposeOp.setUp(self)
+
+        output = self.outputs['Output']
+
+        if self.fuse_bias and self.bias_size is not None:
+            bias = np.random.random(self.bias_size).astype(self.dtype)
+            output = conv2d_bias_naive(output, bias)
+            output = output.astype(self.dtype)
+            self.attrs['fuse_bias'] = self.fuse_bias
+            self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
+
+        if self.fuse_relu:
+            output = np.maximum(output, 0).astype(self.dtype)
+
+        self.attrs['fuse_bias'] = self.fuse_bias
+        self.attrs['fuse_relu'] = self.fuse_relu
+
+        self.outputs['Output'] = output
+
+
+class TestMKLDNNFuseBias(TestConv2dTransposeMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dTransposeMKLDNNOp.init_test_case(self)
+        self.pad = [1, 1]
+        self.fuse_bias = True
+        self.bias_size = [6]
+
+
+class TestMKLDNNWithPad(TestConv2dTransposeMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dTransposeMKLDNNOp.init_test_case(self)
+        self.pad = [1, 1]
+        self.input_size = [2, 3, 10, 10]
+
+
+class TestMKLDNNWithStride(TestConv2dTransposeMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dTransposeMKLDNNOp.init_test_case(self)
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 6, 6]  # NCHW
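Note: numerically, the fused attributes exercised by these tests amount to a per-channel bias add plus an optional ReLU on the reference transposed-convolution output. A small reference sketch — the function name and explicit broadcasting are ours, not the test file's:

import numpy as np

def fused_conv2d_transpose_reference(conv_out, bias=None, fuse_relu=False):
    # conv_out: NCHW output of the plain conv2d_transpose reference.
    out = conv_out
    if bias is not None:
        out = out + bias.reshape(1, -1, 1, 1)  # one bias per output channel
    if fuse_relu:
        out = np.maximum(out, 0)
    return out
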
python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -115,6 +115,9 @@ class TestDistRunnerBase(object):
         strategy.allow_op_delay = False
         build_stra = fluid.BuildStrategy()
+        # FIXME force disable enable_inplace and memory_optimize
+        build_stra.enable_inplace = False
+        build_stra.memory_optimize = False

         if args.use_reduce:
             build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce

python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
@@ -123,6 +123,9 @@ class TestMNIST(TestParallelExecutorBase):
         # NOTE(dzh):
         # need to make it compatible with elewise fuse act
+        # FIXME (liuwei12)
+        # the new memory optimize strategy will crash this unittest
+        # add enable_inplace=False here to force pass the unittest
         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
             model,
             feed_dict={"image": img,
@@ -131,6 +134,7 @@ class TestMNIST(TestParallelExecutorBase):
             fuse_elewise_add_act_ops=False,
             memory_opt=False,
             use_ir_memory_optimize=False,
+            enable_inplace=False,
             optimizer=_optimizer)
         fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
             model,
@@ -140,6 +144,7 @@ class TestMNIST(TestParallelExecutorBase):
             fuse_elewise_add_act_ops=True,
             memory_opt=False,
             use_ir_memory_optimize=False,
+            enable_inplace=False,
             optimizer=_optimizer)

         for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):

python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
new file mode 100644 (+123)

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# nlp model stack of op operate on lod. It's a classical test case in optimize pass.

from __future__ import print_function

import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import unittest
import paddle.fluid.core as core

from paddle.fluid import compiler, Program, program_guard
from paddle.fluid.executor import Executor
from paddle.fluid.backward import append_backward
from paddle.fluid.optimizer import MomentumOptimizer
from ir_memory_optimize_net_base import TestIrMemOptBase


class TestIrMemoryOptimizeIfElseOp(unittest.TestCase):
    def check_network_convergence(self, use_cuda=True, py_opt=False,
                                  iter_num=5):
        prog = Program()
        startup_prog = Program()
        prog.random_seed = 100
        startup_prog.random_seed = 100
        with program_guard(prog, startup_prog):
            image = layers.data(name='x', shape=[784], dtype='float32')
            label = layers.data(name='y', shape=[1], dtype='int64')

            limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
            cond = layers.less_than(x=label, y=limit)
            ie = layers.IfElse(cond)

            with ie.true_block():
                true_image = ie.input(image)
                hidden = layers.fc(input=true_image, size=100, act='tanh')
                prob = layers.fc(input=hidden, size=10, act='softmax')
                ie.output(prob)

            with ie.false_block():
                false_image = ie.input(image)
                hidden = layers.fc(input=false_image, size=200, act='tanh')
                prob = layers.fc(input=hidden, size=10, act='softmax')
                ie.output(prob)

            prob = ie()
            loss = layers.cross_entropy(input=prob[0], label=label)
            avg_loss = layers.mean(loss)

            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
            optimizer.minimize(avg_loss, startup_prog)
            train_reader = paddle.batch(
                paddle.dataset.mnist.train(), batch_size=200)

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = Executor(place)

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.use_cuda = use_cuda

            if py_opt:
                fluid.memory_optimize(fluid.default_main_program())
            train_cp = compiler.CompiledProgram(fluid.default_main_program())
            train_cp = train_cp.with_data_parallel(
                loss_name=avg_loss.name, exec_strategy=exec_strategy)
            fetch_list = [avg_loss.name]

            exe.run(startup_prog)
            PASS_NUM = 100
            loop = 0
            ret = []
            for pass_id in range(PASS_NUM):
                for data in train_reader():
                    x_data = np.array([x[0] for x in data]).astype("float32")
                    y_data = np.array([x[1] for x in data]).astype("int64")
                    y_data = y_data.reshape((y_data.shape[0], 1))

                    outs = exe.run(train_cp,
                                   feed={'x': x_data,
                                         'y': y_data},
                                   fetch_list=[avg_loss])

                    loop += 1
                    ret.append(outs[0])
                    if iter_num == loop:
                        return ret
            return ret

    def test_ifelse(self):
        ret1 = self.check_network_convergence(False, True)
        print(ret1)
        ret2 = self.check_network_convergence(False, False)
        print(ret2)
        self.assertTrue(np.allclose(ret1, ret2))

        if fluid.core.is_compiled_with_cuda():
            ret1 = self.check_network_convergence(True, True)
            print(ret1)
            ret2 = self.check_network_convergence(True, False)
            print(ret2)
            self.assertTrue(np.allclose(ret1, ret2))
            #self.assertEqual(ret1, ret2)


if __name__ == "__main__":
    unittest.main()

python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -59,8 +59,12 @@ class TestFetchAndFeed(unittest.TestCase):
         exe = fluid.Executor(place)
         exe.run(startup)

+        #FIXME force disable enable_inplace and memory_optimize to pass the unittest
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.enable_inplace = False
+        build_strategy.memory_optimize = False
         train_cp = compiler.CompiledProgram(main_program).with_data_parallel(
-            loss_name=loss.name)
+            loss_name=loss.name, build_strategy=build_strategy)

         run_parallel_exe(train_cp, exe, use_cuda, data, label, loss)

python/paddle/fluid/tests/unittests/test_pass_builder.py
@@ -96,6 +96,9 @@ class TestPassBuilder(unittest.TestCase):
         build_strategy = fluid.BuildStrategy()
         self.assertFalse(build_strategy.fuse_elewise_add_act_ops)
         build_strategy.fuse_elewise_add_act_ops = True
+        #FIXME: currently fuse_elewise_add_act_ops not compatible with below options
+        build_strategy.enable_inplace = False
+        build_strategy.memory_optimize = False
         pass_builder = build_strategy._finalize_strategy_and_create_passes()
         self.assertTrue("fuse_elewise_add_act_pass" in
                         [p.type() for p in pass_builder.all_passes()])

python/paddle/fluid/tests/unittests/test_py_func_op.py
@@ -142,6 +142,10 @@ def test_main(use_cuda, use_py_func_op, use_parallel_executor):
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())

+        #FIXME force use old memory optimzie strategy here to pass the unittest
+        #since open the new strategy will crash the unittest
+        fluid.memory_optimize(fluid.default_main_program())
+
         train_cp = compiler.CompiledProgram(fluid.default_main_program())
         if use_parallel_executor:
             train_cp = train_cp.with_data_parallel(loss_name=loss.name)

python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
@@ -49,6 +49,21 @@ class TestSequenceEraseOpInt32(OpTest):
         self.check_output()


+class TestSequenceEraseOpInt32LoD2(OpTest):
+    def setUp(self):
+        self.op_type = "sequence_erase"
+        in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        lod = [[1, 3], [9, 4, 11, 6]]
+        tokens = [2, 3, 5]
+        out_seq, new_lod0 = sequence_erase(in_seq, lod[-1], tokens)
+        self.attrs = {'tokens': tokens}
+        self.inputs = {'X': (in_seq, lod)}
+        self.outputs = {'Out': (out_seq, lod[:-1] + [new_lod0])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestSequenceEraseOpInt64(OpTest):
     def setUp(self):
         self.op_type = "sequence_erase"

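Note: the sequence_erase helper called in the new test is defined earlier in this file, outside the hunk. For orientation, a plausible reference implementation of the op on the last LoD level — our sketch, not the file's exact helper:

import numpy as np

def sequence_erase(in_seq, lod0, tokens):
    # Drop every value listed in `tokens` from each sequence delimited by
    # the last-level LoD lengths, returning the shrunken lengths as well.
    out, new_lod0, offset = [], [], 0
    for length in lod0:
        kept = [x for x in in_seq[offset:offset + length].flatten()
                if x not in tokens]
        out.extend(kept)
        new_lod0.append(len(kept))
        offset += length
    return np.array(out).reshape(-1, 1), new_lod0
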
tools/timeline.py
@@ -131,7 +131,7 @@ class Timeline(object):
             if (k, event.device_id, "CPU") not in self._devices:
                 pid = self._allocate_pid()
                 self._devices[(k, event.device_id, "CPU")] = pid
-                # -1 device id represents CUDA API(RunTime) call.(e.g. cudaLaunch, cudaMemcpy)
+                # -1 device id represents CUDA api call
                 if event.device_id == -1:
                     self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
                 else:

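Note: timeline.py post-processes profiles written by the fluid profiler into Chrome tracing JSON, which is where the device_id == -1 convention above surfaces. A hedged end-to-end sketch; the CLI flags reflect our reading of the tool's argparse options:

import paddle.fluid.profiler as profiler

# 1) Capture a profile while running a program (executor setup elided):
with profiler.profiler('All', 'total', '/tmp/profile'):
    pass  # exe.run(...) iterations go here

# 2) Convert it offline and open the result in chrome://tracing:
#    python tools/timeline.py --profile_path=/tmp/profile \
#        --timeline_path=/tmp/timeline.json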