Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
d8a939d8
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
d8a939d8
编写于
3月 04, 2019
作者:
L
Liu Yiqun
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into core_opt_choose_kernel
上级
d4674dab
cae6614c
变更
38
展开全部
显示空白变更内容
内联
并排
Showing
38 changed file
with
911 addition
and
863 deletion
+911
-863
paddle/fluid/API.spec
paddle/fluid/API.spec
+500
-500
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+2
-2
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
...uid/framework/details/fast_threaded_ssa_graph_executor.cc
+3
-1
paddle/fluid/framework/tensor_util.cc
paddle/fluid/framework/tensor_util.cc
+7
-0
paddle/fluid/memory/CMakeLists.txt
paddle/fluid/memory/CMakeLists.txt
+1
-1
paddle/fluid/memory/memcpy.cc
paddle/fluid/memory/memcpy.cc
+20
-0
paddle/fluid/operators/ngraph/ngraph_bridge.cc
paddle/fluid/operators/ngraph/ngraph_bridge.cc
+1
-0
paddle/fluid/operators/ngraph/ngraph_bridge.h
paddle/fluid/operators/ngraph/ngraph_bridge.h
+1
-0
paddle/fluid/operators/ngraph/ops/accuracy_op.h
paddle/fluid/operators/ngraph/ops/accuracy_op.h
+2
-0
paddle/fluid/operators/ngraph/ops/activation_op.h
paddle/fluid/operators/ngraph/ops/activation_op.h
+2
-0
paddle/fluid/operators/ngraph/ops/batch_norm_op.h
paddle/fluid/operators/ngraph/ops/batch_norm_op.h
+2
-0
paddle/fluid/operators/ngraph/ops/binary_unary_op.h
paddle/fluid/operators/ngraph/ops/binary_unary_op.h
+2
-0
paddle/fluid/operators/ngraph/ops/conv2d_op.h
paddle/fluid/operators/ngraph/ops/conv2d_op.h
+2
-0
paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
+2
-0
paddle/fluid/operators/ngraph/ops/elementwise_add_op.h
paddle/fluid/operators/ngraph/ops/elementwise_add_op.h
+2
-0
paddle/fluid/operators/ngraph/ops/fill_constant_op.h
paddle/fluid/operators/ngraph/ops/fill_constant_op.h
+2
-0
paddle/fluid/operators/ngraph/ops/mean_op.h
paddle/fluid/operators/ngraph/ops/mean_op.h
+2
-0
paddle/fluid/operators/ngraph/ops/momentum_op.h
paddle/fluid/operators/ngraph/ops/momentum_op.h
+2
-0
paddle/fluid/operators/ngraph/ops/mul_op.h
paddle/fluid/operators/ngraph/ops/mul_op.h
+2
-0
paddle/fluid/operators/ngraph/ops/pool2d_op.h
paddle/fluid/operators/ngraph/ops/pool2d_op.h
+2
-0
paddle/fluid/operators/ngraph/ops/scale_op.h
paddle/fluid/operators/ngraph/ops/scale_op.h
+2
-0
paddle/fluid/operators/ngraph/ops/softmax_op.h
paddle/fluid/operators/ngraph/ops/softmax_op.h
+2
-0
paddle/fluid/operators/ngraph/ops/top_k_op.h
paddle/fluid/operators/ngraph/ops/top_k_op.h
+2
-0
paddle/fluid/operators/reader/buffered_reader.cc
paddle/fluid/operators/reader/buffered_reader.cc
+14
-9
paddle/fluid/platform/device_tracer.cc
paddle/fluid/platform/device_tracer.cc
+54
-9
paddle/fluid/platform/device_tracer.h
paddle/fluid/platform/device_tracer.h
+12
-1
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+14
-17
python/paddle/fluid/compiler.py
python/paddle/fluid/compiler.py
+43
-29
python/paddle/fluid/executor.py
python/paddle/fluid/executor.py
+34
-34
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+0
-9
python/paddle/fluid/io.py
python/paddle/fluid/io.py
+2
-1
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+19
-140
python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
...dle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
+118
-23
python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
...dle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
+18
-0
tools/check_doc_approval.py
tools/check_doc_approval.py
+0
-85
tools/diff_api.py
tools/diff_api.py
+6
-0
tools/print_signatures.py
tools/print_signatures.py
+11
-1
tools/timeline.py
tools/timeline.py
+1
-1
未找到文件。
paddle/fluid/API.spec
浏览文件 @
d8a939d8
此差异已折叠。
点击以展开。
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
d8a939d8
...
...
@@ -38,10 +38,10 @@ if(WITH_GPU)
nv_library
(
tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context
)
add_dependencies
(
tensor tensor_util
)
else
()
nv_library
(
tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context
)
nv_library
(
tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context
profiler
)
endif
(
WIN32
)
else
()
cc_library
(
tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context
)
cc_library
(
tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context
profiler
)
endif
()
cc_test
(
tensor_test SRCS tensor_test.cc DEPS tensor
)
...
...
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
浏览文件 @
d8a939d8
...
...
@@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
...
...
@@ -55,7 +57,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
std
::
vector
<
FetchOpHandle
*>
fetch_ops
;
for
(
auto
&
fetch_var_name
:
fetch_tensors
)
{
for
(
auto
&
var_map
:
graph_
->
Get
<
details
::
GraphVars
>
(
"vars"
))
{
for
(
auto
&
var_map
:
graph_
->
Get
<
details
::
GraphVars
>
(
details
::
kGraphVars
))
{
auto
it
=
var_map
.
find
(
fetch_var_name
);
if
(
it
!=
var_map
.
end
())
{
fetched_vars
[
fetch_var_name
].
push_back
(
*
it
->
second
.
rbegin
());
...
...
paddle/fluid/framework/tensor_util.cc
浏览文件 @
d8a939d8
...
...
@@ -14,8 +14,11 @@
#include "paddle/fluid/framework/tensor_util.h"
#include <algorithm>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/profiler.h"
namespace
paddle
{
namespace
framework
{
...
...
@@ -135,16 +138,19 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
#ifdef PADDLE_WITH_CUDA
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
// NOLINT
platform
::
is_cpu_place
(
dst_place
))
{
platform
::
RecordEvent
record_event
(
"TensorCopy:GPU->CPU"
);
auto
src_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
src_place
);
auto
dst_cpu_place
=
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
);
memory
::
Copy
(
dst_cpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
nullptr
);
}
else
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_gpu_place
(
dst_place
))
{
platform
::
RecordEvent
record_event
(
"TensorCopy:CPU->GPU"
);
auto
src_cpu_place
=
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
);
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dst_place
);
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_cpu_place
,
src_ptr
,
size
,
nullptr
);
}
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
platform
::
is_gpu_place
(
dst_place
))
{
platform
::
RecordEvent
record_event
(
"TensorCopy:GPU->GPU"
);
if
(
src_ptr
==
dst_ptr
&&
platform
::
is_same_place
(
src_place
,
dst_place
))
{
VLOG
(
3
)
<<
"Skip copy the same data from "
<<
src_place
<<
" to "
<<
dst_place
;
...
...
@@ -155,6 +161,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
nullptr
);
}
else
if
(
platform
::
is_cuda_pinned_place
(
src_place
)
&&
platform
::
is_gpu_place
(
dst_place
))
{
platform
::
RecordEvent
record_event
(
"TensorCopy:CUDAPinned->GPU"
);
auto
src_pinned_place
=
boost
::
get
<
platform
::
CUDAPinnedPlace
>
(
src_place
);
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dst_place
);
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_pinned_place
,
src_ptr
,
size
,
...
...
paddle/fluid/memory/CMakeLists.txt
浏览文件 @
d8a939d8
add_subdirectory
(
detail
)
add_subdirectory
(
allocation
)
cc_library
(
malloc SRCS malloc.cc DEPS place enforce allocator_facade
)
cc_library
(
malloc SRCS malloc.cc DEPS place enforce allocator_facade
profiler
)
cc_library
(
memcpy SRCS memcpy.cc DEPS place
)
cc_library
(
memory
...
...
paddle/fluid/memory/memcpy.cc
浏览文件 @
d8a939d8
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include <cstring> // for memcpy
#include "paddle/fluid/platform/profiler.h"
namespace
paddle
{
namespace
memory
{
...
...
@@ -29,14 +30,23 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
#ifdef PADDLE_WITH_CUDA
static
constexpr
size_t
kMaxGpuAsyncCopyBytes
=
64
*
1024
;
// 64K
// NOTE(zcd): Do not use GpuMemcpySync as much as possible.
// because GpuMemcpySync issues the copying command to the default stream,
// which will make two commands from different streams cannot run concurrently.
// Reference:
// https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
template
<
>
void
Copy
<
platform
::
CPUPlace
,
platform
::
CUDAPlace
>
(
platform
::
CPUPlace
dst_place
,
void
*
dst
,
platform
::
CUDAPlace
src_place
,
const
void
*
src
,
size_t
num
,
cudaStream_t
stream
)
{
platform
::
SetDeviceId
(
src_place
.
device
);
if
(
stream
)
{
platform
::
RecordEvent
record_event
(
"GpuMemcpyAsync:GPU->CPU"
);
platform
::
GpuMemcpyAsync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToHost
,
stream
);
}
else
{
platform
::
RecordEvent
record_event
(
"GpuMemcpySync:GPU->CPU"
);
platform
::
GpuMemcpySync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToHost
);
// FIXME(zjl): do we really need it?
if
(
num
<=
kMaxGpuAsyncCopyBytes
)
{
...
...
@@ -51,8 +61,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
const
void
*
src
,
size_t
num
,
cudaStream_t
stream
)
{
platform
::
SetDeviceId
(
dst_place
.
device
);
if
(
stream
)
{
platform
::
RecordEvent
record_event
(
"GpuMemcpyAsync:CPU->GPU"
);
platform
::
GpuMemcpyAsync
(
dst
,
src
,
num
,
cudaMemcpyHostToDevice
,
stream
);
}
else
{
platform
::
RecordEvent
record_event
(
"GpuMemcpySync:CPU->GPU"
);
platform
::
GpuMemcpySync
(
dst
,
src
,
num
,
cudaMemcpyHostToDevice
);
// FIXME(zjl): do we really need it?
if
(
num
<=
kMaxGpuAsyncCopyBytes
)
{
...
...
@@ -68,15 +80,19 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
if
(
dst_place
==
src_place
)
{
platform
::
SetDeviceId
(
src_place
.
device
);
if
(
stream
)
{
platform
::
RecordEvent
record_event
(
"GpuMemcpyAsync(same_gpu):GPU->GPU"
);
platform
::
GpuMemcpyAsync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToDevice
,
stream
);
}
else
{
platform
::
RecordEvent
record_event
(
"GpuMemcpySync(same_gpu):GPU->GPU"
);
platform
::
GpuMemcpySync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToDevice
);
}
}
else
{
if
(
stream
)
{
platform
::
RecordEvent
record_event
(
"GpuMemcpyPeerAsync:GPU->GPU"
);
platform
::
GpuMemcpyPeerAsync
(
dst
,
dst_place
.
device
,
src
,
src_place
.
device
,
num
,
stream
);
}
else
{
platform
::
RecordEvent
record_event
(
"GpuMemcpyPeerSync:GPU->GPU"
);
platform
::
GpuMemcpyPeerSync
(
dst
,
dst_place
.
device
,
src
,
src_place
.
device
,
num
);
}
...
...
@@ -111,8 +127,10 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
cudaStream_t
stream
)
{
platform
::
SetDeviceId
(
src_place
.
device
);
if
(
stream
)
{
platform
::
RecordEvent
record_event
(
"GpuMemcpyAsync:GPU->CUDAPinned"
);
platform
::
GpuMemcpyAsync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToHost
,
stream
);
}
else
{
platform
::
RecordEvent
record_event
(
"GpuMemcpySync:GPU->CUDAPinned"
);
platform
::
GpuMemcpySync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToHost
);
}
}
...
...
@@ -124,8 +142,10 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
cudaStream_t
stream
)
{
platform
::
SetDeviceId
(
dst_place
.
device
);
if
(
stream
)
{
platform
::
RecordEvent
record_event
(
"GpuMemcpyAsync:CUDAPinned->GPU"
);
platform
::
GpuMemcpyAsync
(
dst
,
src
,
num
,
cudaMemcpyHostToDevice
,
stream
);
}
else
{
platform
::
RecordEvent
record_event
(
"GpuMemcpySync:CUDAPinned->GPU"
);
platform
::
GpuMemcpySync
(
dst
,
src
,
num
,
cudaMemcpyHostToDevice
);
}
}
...
...
paddle/fluid/operators/ngraph/ngraph_bridge.cc
浏览文件 @
d8a939d8
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#include <algorithm>
#include <functional>
#include <memory>
#include <vector>
#include "ngraph/ngraph.hpp"
...
...
paddle/fluid/operators/ngraph/ngraph_bridge.h
浏览文件 @
d8a939d8
...
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include <algorithm>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
...
...
paddle/fluid/operators/ngraph/ops/accuracy_op.h
浏览文件 @
d8a939d8
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
...
...
paddle/fluid/operators/ngraph/ops/activation_op.h
浏览文件 @
d8a939d8
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
...
...
paddle/fluid/operators/ngraph/ops/batch_norm_op.h
浏览文件 @
d8a939d8
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
...
...
paddle/fluid/operators/ngraph/ops/binary_unary_op.h
浏览文件 @
d8a939d8
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
...
...
paddle/fluid/operators/ngraph/ops/conv2d_op.h
浏览文件 @
d8a939d8
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
...
...
paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
浏览文件 @
d8a939d8
...
...
@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
...
...
paddle/fluid/operators/ngraph/ops/elementwise_add_op.h
浏览文件 @
d8a939d8
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
...
...
paddle/fluid/operators/ngraph/ops/fill_constant_op.h
浏览文件 @
d8a939d8
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
...
...
paddle/fluid/operators/ngraph/ops/mean_op.h
浏览文件 @
d8a939d8
...
...
@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
...
...
paddle/fluid/operators/ngraph/ops/momentum_op.h
浏览文件 @
d8a939d8
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
...
...
paddle/fluid/operators/ngraph/ops/mul_op.h
浏览文件 @
d8a939d8
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
...
...
paddle/fluid/operators/ngraph/ops/pool2d_op.h
浏览文件 @
d8a939d8
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
...
...
paddle/fluid/operators/ngraph/ops/scale_op.h
浏览文件 @
d8a939d8
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
...
...
paddle/fluid/operators/ngraph/ops/softmax_op.h
浏览文件 @
d8a939d8
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
...
...
paddle/fluid/operators/ngraph/ops/top_k_op.h
浏览文件 @
d8a939d8
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
...
...
paddle/fluid/operators/reader/buffered_reader.cc
浏览文件 @
d8a939d8
...
...
@@ -13,9 +13,11 @@
// limitations under the License.
#include "paddle/fluid/operators/reader/buffered_reader.h"
#include <memory>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/profiler.h"
namespace
paddle
{
namespace
operators
{
namespace
reader
{
...
...
@@ -49,9 +51,10 @@ BufferedReader::BufferedReader(
.
Get
(
place_
)))
->
stream
();
events
.
resize
(
buffer_size
);
for
(
auto
&
event
:
events
)
PADDLE_ENFORCE
(
cudaStreamCreate
(
&
stream
));
for
(
auto
&
event
:
events
)
{
PADDLE_ENFORCE
(
cudaEventCreateWithFlags
(
&
event
,
cudaEventDisableTiming
));
PADDLE_ENFORCE
(
cudaStreamCreateWithFlags
(
&
stream
,
cudaStreamNonBlocking
));
}
}
#endif
cpu_buffer_
.
resize
(
buffer_size
);
...
...
@@ -83,12 +86,15 @@ void BufferedReader::ReadAsync(size_t i) {
#ifdef PADDLE_WITH_CUDA
// NOTE(liangdun): using async copy instead of TensorCopySync
// TensorCopySync would block other stream
// TensorCopySync would block other stream, because TensorCopySync
// issues the copying command to the default stream, it will make two
// commands from different streams cannot run concurrently.
if
(
platform
::
is_gpu_place
(
place_
))
{
platform
::
SetDeviceId
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
).
device
);
PADDLE_ENFORCE
(
cudaStreamWaitEvent
(
stream
,
events
[
i
],
0
));
TensorVec
&
gpu
=
gpu_buffer_
[
i
];
gpu
.
resize
(
cpu
.
size
());
platform
::
RecordEvent
record_event
(
"BufferedReader:MemoryCopy"
);
for
(
size_t
i
=
0
;
i
<
cpu
.
size
();
++
i
)
{
gpu
[
i
].
Resize
(
cpu
[
i
].
dims
());
gpu
[
i
].
set_layout
(
cpu
[
i
].
layout
());
...
...
@@ -97,20 +103,19 @@ void BufferedReader::ReadAsync(size_t i) {
auto
gpu_ptr
=
gpu
[
i
].
mutable_data
(
place_
,
cpu
[
i
].
type
());
auto
size
=
cpu
[
i
].
numel
()
*
paddle
::
framework
::
SizeOfType
(
cpu
[
i
].
type
());
if
(
platform
::
is_cuda_pinned_place
(
cpu_place
))
if
(
platform
::
is_cuda_pinned_place
(
cpu_place
))
{
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
),
gpu_ptr
,
boost
::
get
<
platform
::
CUDAPinnedPlace
>
(
cpu_place
),
cpu_ptr
,
size
,
stream
);
else
if
((
platform
::
is_gpu_place
(
cpu_place
)))
}
else
if
((
platform
::
is_gpu_place
(
cpu_place
)))
{
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
),
gpu_ptr
,
boost
::
get
<
platform
::
CUDAPlace
>
(
cpu_place
),
cpu_ptr
,
size
,
stream
);
else
// if cpu place is not pinned, async copy is slower than sync copy,
// so we use sync copy instead.
}
else
{
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
),
gpu_ptr
,
boost
::
get
<
platform
::
CPUPlace
>
(
cpu_place
),
cpu_ptr
,
size
,
0
);
stream
);
}
gpu
[
i
].
set_lod
(
cpu
[
i
].
lod
());
}
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
...
...
paddle/fluid/platform/device_tracer.cc
浏览文件 @
d8a939d8
...
...
@@ -30,7 +30,6 @@ limitations under the License. */
#include "glog/logging.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
namespace
paddle
{
...
...
@@ -222,19 +221,24 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
}
case
CUPTI_ACTIVITY_KIND_DRIVER
:
{
auto
*
api
=
reinterpret_cast
<
const
CUpti_ActivityAPI
*>
(
record
);
if
(
api
->
start
!=
0
&&
api
->
end
!=
0
)
// -1 device id represents
CUDA
api call
tracer
->
Add
CPU
Records
(
if
(
api
->
start
!=
0
&&
api
->
end
!=
0
)
{
// -1 device id represents
ActiveKind
api call
tracer
->
Add
ActiveKind
Records
(
DriverKind
(
api
->
cbid
),
api
->
start
,
api
->
end
,
-
1
,
GetThreadIdFromSystemThreadId
(
api
->
threadId
));
GetThreadIdFromSystemThreadId
(
api
->
threadId
),
api
->
correlationId
);
}
break
;
}
case
CUPTI_ACTIVITY_KIND_RUNTIME
:
{
auto
*
api
=
reinterpret_cast
<
const
CUpti_ActivityAPI
*>
(
record
);
if
(
api
->
start
!=
0
&&
api
->
end
!=
0
)
tracer
->
AddCPURecords
(
if
(
api
->
start
!=
0
&&
api
->
end
!=
0
)
{
// -1 device id represents ActiveKind api call
tracer
->
AddActiveKindRecords
(
RuntimeKind
(
api
->
cbid
),
api
->
start
,
api
->
end
,
-
1
,
GetThreadIdFromSystemThreadId
(
api
->
threadId
));
GetThreadIdFromSystemThreadId
(
api
->
threadId
),
api
->
correlationId
);
}
break
;
}
default:
{
break
;
}
...
...
@@ -313,6 +317,25 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id
,
correlation_id
,
bytes
});
}
void
AddActiveKindRecords
(
const
std
::
string
&
anno
,
uint64_t
start_ns
,
uint64_t
end_ns
,
int64_t
device_id
,
int64_t
thread_id
,
uint32_t
correlation_id
)
{
if
(
anno
.
empty
())
{
VLOG
(
1
)
<<
"Empty timeline annotation."
;
return
;
}
thread_local
std
::
forward_list
<
ActiveKindRecord
>
*
local_active_kind_records
=
nullptr
;
if
(
local_active_kind_records
==
nullptr
)
{
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
active_kind_records_
.
emplace_front
();
local_active_kind_records
=
&
active_kind_records_
.
front
();
}
// lock is not needed, only one thread call this function.
local_active_kind_records
->
push_front
(
ActiveKindRecord
{
anno
,
start_ns
,
end_ns
,
device_id
,
thread_id
,
correlation_id
});
}
void
AddKernelRecords
(
std
::
string
name
,
uint64_t
start
,
uint64_t
end
,
int64_t
device_id
,
int64_t
stream_id
,
uint32_t
correlation_id
)
{
...
...
@@ -355,6 +378,7 @@ class DeviceTracerImpl : public DeviceTracer {
}
const
std
::
vector
<
int
>
cbids
{
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020
,
...
...
@@ -385,6 +409,7 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_
.
clear
();
for
(
auto
&
tmp
:
correlations_pairs
)
tmp
.
clear
();
for
(
auto
&
tmp
:
cpu_records_
)
tmp
.
clear
();
for
(
auto
&
tmp
:
active_kind_records_
)
tmp
.
clear
();
}
void
GenEventKernelCudaElapsedTime
()
{
...
...
@@ -437,7 +462,7 @@ class DeviceTracerImpl : public DeviceTracer {
event
->
set_device_id
(
r
.
device_id
);
}
VLOG
(
1
)
<<
"KernelRecord event miss: "
<<
miss
<<
" find: "
<<
find
;
for
(
auto
&
tmp
:
cpu_records_
)
for
(
auto
&
tmp
:
cpu_records_
)
{
for
(
const
CPURecord
&
r
:
tmp
)
{
auto
*
event
=
profile_pb
.
add_events
();
event
->
set_type
(
proto
::
Event
::
CPU
);
...
...
@@ -447,6 +472,24 @@ class DeviceTracerImpl : public DeviceTracer {
event
->
set_sub_device_id
(
r
.
thread_id
);
event
->
set_device_id
(
r
.
device_id
);
}
}
for
(
auto
&
tmp
:
active_kind_records_
)
{
for
(
const
ActiveKindRecord
&
r
:
tmp
)
{
auto
*
event
=
profile_pb
.
add_events
();
event
->
set_type
(
proto
::
Event
::
CPU
);
auto
c
=
correlations_
.
find
(
r
.
correlation_id
);
if
(
c
!=
correlations_
.
end
()
&&
c
->
second
!=
nullptr
)
{
event
->
set_name
(
c
->
second
->
name
());
event
->
set_detail_info
(
r
.
name
);
}
else
{
event
->
set_name
(
r
.
name
);
}
event
->
set_start_ns
(
r
.
start_ns
);
event
->
set_end_ns
(
r
.
end_ns
);
event
->
set_sub_device_id
(
r
.
thread_id
);
event
->
set_device_id
(
r
.
device_id
);
}
}
miss
=
find
=
0
;
for
(
const
MemRecord
&
r
:
mem_records_
)
{
auto
*
event
=
profile_pb
.
add_events
();
...
...
@@ -510,6 +553,7 @@ class DeviceTracerImpl : public DeviceTracer {
std
::
forward_list
<
KernelRecord
>
kernel_records_
;
std
::
forward_list
<
MemRecord
>
mem_records_
;
std
::
forward_list
<
std
::
forward_list
<
CPURecord
>>
cpu_records_
;
std
::
forward_list
<
std
::
forward_list
<
ActiveKindRecord
>>
active_kind_records_
;
std
::
forward_list
<
std
::
forward_list
<
std
::
pair
<
uint32_t
,
Event
*>>>
correlations_pairs
;
std
::
unordered_map
<
uint32_t
,
Event
*>
correlations_
;
...
...
@@ -613,6 +657,7 @@ void initCuptiCbidStr() {
REGISTER_RUNTIME_CBID_STR
(
cudaUnbindTexture_v3020
);
REGISTER_RUNTIME_CBID_STR
(
cudaSetupArgument_v3020
);
REGISTER_RUNTIME_CBID_STR
(
cudaLaunch_v3020
);
REGISTER_RUNTIME_CBID_STR
(
cudaDeviceGetPCIBusId_v4010
);
#if CUDA_VERSION >= 9000
REGISTER_RUNTIME_CBID_STR
(
cudaLaunchCooperativeKernel_v9000
);
REGISTER_RUNTIME_CBID_STR
(
cudaLaunchCooperativeKernelMultiDevice_v9000
);
...
...
paddle/fluid/platform/device_tracer.h
浏览文件 @
d8a939d8
...
...
@@ -63,7 +63,14 @@ class DeviceTracer {
uint32_t
correlation_id
;
uint64_t
bytes
;
};
struct
ActiveKindRecord
{
std
::
string
name
;
uint64_t
start_ns
;
uint64_t
end_ns
;
int64_t
device_id
;
int64_t
thread_id
;
uint32_t
correlation_id
;
};
virtual
~
DeviceTracer
()
{}
// Needs to be called once before use.
virtual
void
Enable
()
=
0
;
...
...
@@ -85,6 +92,10 @@ class DeviceTracer {
virtual
void
AddCPURecords
(
const
std
::
string
&
anno
,
uint64_t
start_ns
,
uint64_t
end_ns
,
int64_t
device_id
,
int64_t
thread_id
)
=
0
;
virtual
void
AddActiveKindRecords
(
const
std
::
string
&
anno
,
uint64_t
start_ns
,
uint64_t
end_ns
,
int64_t
device_id
,
int64_t
thread_id
,
uint32_t
correlation_id
)
=
0
;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability.
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
d8a939d8
...
...
@@ -415,10 +415,11 @@ function assert_api_not_changed() {
source
.env/bin/activate
pip
install
${
PADDLE_ROOT
}
/build/python/dist/
*
whl
python
${
PADDLE_ROOT
}
/tools/print_signatures.py paddle.fluid,paddle.reader
>
new.spec
if
[
"
$1
"
==
"cp35-cp35m"
]
||
[
"
$1
"
==
"cp36-cp36m"
]
||
[
"
$1
"
==
"cp37-cp37m"
]
;
then
# Use sed to make python2 and python3 sepc keeps the same
sed
-i
's/arg0: str/arg0: unicode/g'
new.spec
sed
-i
"s/
\(
.*Transpiler.*
\)
.__init__ ArgSpec(args=
\[
'self'].*/
\1
.__init__ /g"
new.spec
sed
-i
"s/
\(
.*Transpiler.*
\)
.__init__
(
ArgSpec(args=
\[
'self'].*/
\1
.__init__ /g"
new.spec
fi
# ComposeNotAligned has significant difference between py2 and py3
sed
-i
'/.*ComposeNotAligned.*/d'
new.spec
...
...
@@ -452,11 +453,20 @@ function assert_api_spec_approvals() {
echo
"checking
${
API_FILE
}
change, PR:
${
GIT_PR_ID
}
, changes:
${
API_CHANGE
}
"
if
[
${
API_CHANGE
}
]
&&
[
"
${
GIT_PR_ID
}
"
!=
""
]
;
then
# NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
if
[
"
$API_FILE
"
==
"paddle/fluid/API.spec"
]
;
then
APPROVALS
=
`
curl
-H
"Authorization: token
${
GITHUB_API_TOKEN
}
"
https://api.github.com/repos/PaddlePaddle/Paddle/pulls/
${
GIT_PR_ID
}
/reviews?per_page
=
10000 |
\
python
${
PADDLE_ROOT
}
/tools/check_pr_approval.py 2 2887803 35982308
`
else
APPROVALS
=
`
curl
-H
"Authorization: token
${
GITHUB_API_TOKEN
}
"
https://api.github.com/repos/PaddlePaddle/Paddle/pulls/
${
GIT_PR_ID
}
/reviews?per_page
=
10000 |
\
python
${
PADDLE_ROOT
}
/tools/check_pr_approval.py 1 2887803
`
fi
echo
"current pr
${
GIT_PR_ID
}
got approvals:
${
APPROVALS
}
"
if
[
"
${
APPROVALS
}
"
==
"FALSE"
]
;
then
if
[
"
$API_FILE
"
==
"paddle/fluid/API.spec"
]
;
then
echo
"You must have panyx0718 and shanyi15 approval for the api change!
${
API_FILE
}
"
else
echo
"You must have panyx0718 approval for the api change!
${
API_FILE
}
"
fi
exit
1
fi
fi
...
...
@@ -472,19 +482,6 @@ function assert_api_spec_approvals() {
exit
1
fi
fi
pip
install
${
PADDLE_ROOT
}
/build/opt/paddle/share/wheels/
*
.whl
CHECK_DOCK_MD5
=
`
python
${
PADDLE_ROOT
}
/tools/check_doc_approval.py
`
if
[
"True"
!=
${
CHECK_DOCK_MD5
}
]
;
then
APPROVALS
=
`
curl
-H
"Authorization: token
${
GITHUB_API_TOKEN
}
"
https://api.github.com/repos/PaddlePaddle/Paddle/pulls/
${
GIT_PR_ID
}
/reviews?per_page
=
10000 |
\
python
${
PADDLE_ROOT
}
/tools/check_pr_approval.py 1 35982308
`
echo
"current pr
${
GIT_PR_ID
}
got approvals:
${
APPROVALS
}
"
if
[
"
${
APPROVALS
}
"
==
"FALSE"
]
;
then
echo
"You must have shanyi15 approval for the api doc change! "
exit
1
fi
echo
${
CHECK_DOCK_MD5
}
>
/root/.cache/doc_md5.txt
fi
}
...
...
python/paddle/fluid/compiler.py
浏览文件 @
d8a939d8
...
...
@@ -17,7 +17,6 @@ import os
import
six
import
sys
from
..
import
compat
as
cpt
from
.
import
framework
from
.
import
core
from
.
import
framework
...
...
@@ -36,6 +35,30 @@ def _place_obj(place):
return
p
def
_is_pserver_mode
(
main_program
):
main
=
main_program
if
main_program
\
else
default_main_program
()
for
op
in
main
.
global_block
().
ops
:
if
op
.
type
in
[
"send"
,
"recv"
]:
return
True
return
False
def
get_available_places
(
use_cuda
):
if
use_cuda
:
gpus_env
=
os
.
getenv
(
"FLAGS_selected_gpus"
)
if
gpus_env
:
gpus
=
[
int
(
s
)
for
s
in
gpus_env
.
split
(
","
)]
else
:
gpus
=
[
i
for
i
in
six
.
moves
.
range
(
core
.
get_cuda_device_count
())]
places
=
[
core
.
CUDAPlace
(
i
)
for
i
in
gpus
]
else
:
cpu_num
=
int
(
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
places
=
[
core
.
CPUPlace
()
for
_
in
six
.
moves
.
range
(
cpu_num
)]
assert
places
,
"no place for execution"
return
places
class
CompiledProgram
(
object
):
"""
Compiles to Graph for execution.
...
...
@@ -127,8 +150,7 @@ class CompiledProgram(object):
self
.
_exec_strategy
=
ExecutionStrategy
()
if
self
.
_build_strategy
is
None
:
self
.
_build_strategy
=
BuildStrategy
()
self
.
_build_strategy
.
is_distribution
=
framework
.
is_pserver_mode
(
self
.
_program
)
self
.
_build_strategy
.
is_distribution
=
_is_pserver_mode
(
self
.
_program
)
return
self
def
with_inference_optimize
(
self
,
config
):
...
...
@@ -153,9 +175,9 @@ class CompiledProgram(object):
def
_with_distributed
(
self
):
raise
NotImplementedError
()
def
_compile_data_parallel
(
self
):
def
_compile_data_parallel
(
self
,
use_cuda
=
False
,
scope
=
None
):
if
self
.
_share_vars_from
:
if
s
elf
.
_s
cope
:
if
scope
:
sys
.
stderr
.
write
(
"share_vars_from is set, scope is ignored.
\n
"
)
if
not
self
.
_share_vars_from
.
_is_data_parallel
:
raise
ValueError
(
"share_vars_from is not data parallel. Cannot "
...
...
@@ -166,23 +188,11 @@ class CompiledProgram(object):
"var to share."
)
self
.
_local_scopes
=
self
.
_share_vars_from
.
_executor
.
local_scopes
()
else
:
assert
scope
is
not
None
,
""
self
.
_local_scopes
=
[]
self
.
_exec_strategy
.
use_cuda
=
isinstance
(
self
.
_place
,
core
.
CUDAPlace
)
if
self
.
_exec_strategy
.
use_cuda
:
gpus_env
=
os
.
getenv
(
"FLAGS_selected_gpus"
)
if
gpus_env
:
gpus
=
[
int
(
s
)
for
s
in
gpus_env
.
split
(
","
)]
else
:
gpus
=
[
i
for
i
in
six
.
moves
.
range
(
core
.
get_cuda_device_count
())
]
self
.
_places
=
[
core
.
CUDAPlace
(
i
)
for
i
in
gpus
]
else
:
cpu_num
=
int
(
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
self
.
_places
=
[
core
.
CPUPlace
()
for
_
in
six
.
moves
.
range
(
cpu_num
)]
assert
self
.
_places
,
"no place for execution"
self
.
_exec_strategy
.
use_cuda
=
use_cuda
self
.
_places
=
get_available_places
(
self
.
_exec_strategy
.
use_cuda
)
if
self
.
_exec_strategy
.
num_threads
==
0
:
if
self
.
_exec_strategy
.
use_cuda
:
...
...
@@ -197,9 +207,11 @@ class CompiledProgram(object):
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass.
if
self
.
_build_strategy
.
memory_optimize
is
None
:
self
.
_build_strategy
.
memory_optimize
=
False
if
self
.
_program
and
self
.
_program
.
_is_mem_optimized
else
True
self
.
_build_strategy
.
memory_optimize
=
False
\
if
self
.
_program
and
self
.
_program
.
_is_mem_optimized
else
True
if
self
.
_build_strategy
.
enable_inplace
is
None
:
self
.
_build_strategy
.
enable_inplace
=
False
if
self
.
_program
and
self
.
_program
.
_is_mem_optimized
else
True
self
.
_build_strategy
.
enable_inplace
=
False
\
if
self
.
_program
and
self
.
_program
.
_is_mem_optimized
else
True
# TODO(wuyi): trainer endpoings should be passed in through
# build_strategy, not program.xxx.
...
...
@@ -221,12 +233,12 @@ class CompiledProgram(object):
places
=
list
(
map
(
_place_obj
,
self
.
_places
))
return
core
.
ParallelExecutor
(
places
,
return
core
.
ParallelExecutor
(
places
,
set
(
self
.
_persistable_vars
),
cpt
.
to_text
(
self
.
_loss_name
)
if
self
.
_loss_name
else
six
.
u
(
''
),
self
.
_scope
,
self
.
_local_scopes
,
self
.
_exec_strategy
,
self
.
_build_strategy
,
self
.
_graph
)
if
self
.
_loss_name
else
six
.
u
(
''
),
scope
,
self
.
_local_scopes
,
self
.
_exec_strategy
,
self
.
_build_strategy
,
self
.
_graph
)
def
_compile_inference
(
self
):
return
core
.
create_paddle_predictor
(
self
.
_infer_config
)
...
...
@@ -253,7 +265,9 @@ class CompiledProgram(object):
self
.
_scope
=
scope
self
.
_place
=
place
if
self
.
_is_data_parallel
:
self
.
_executor
=
self
.
_compile_data_parallel
()
self
.
_executor
=
self
.
_compile_data_parallel
(
use_cuda
=
isinstance
(
self
.
_place
,
core
.
CUDAPlace
),
scope
=
self
.
_scope
)
elif
self
.
_is_inference
:
self
.
_executor
=
self
.
_compile_inference
()
else
:
...
...
python/paddle/fluid/executor.py
浏览文件 @
d8a939d8
...
...
@@ -261,20 +261,20 @@ def _as_lodtensor(data, place):
class
Executor
(
object
):
"""
An Executor in Python, only support the single-GPU running. For multi-cards, please refer to
ParallelExecutor.
Python executor takes a program, add feed operators and fetch operators to this program according
An Executor in Python, supports single/multiple-GPU running, and single/multiple-CPU running.
Python executor takes a program, adds feed operators and fetch operators to this program according
to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
the variables(or names) that user want
to get after program run
. Note: the executor will run all
the variables(or names) that user want
s to get after program runs
. Note: the executor will run all
operators in the program but not only the operators dependent by the fetch_list.
It store the global variables into the global scope, and create a local scope for the temporary
variables. The local scope contents will be discarded after every minibatch forward/backward finished.
But the global scope variables will be persistent through different runs.
All of ops in program will be running in sequence.
It stores the global variables into the global scope, and creates a local scope for the temporary
variables. The contents in local scope may be discarded after every minibatch forward/backward
finished. But the global scope variables will be persistent through different runs.
Example:
.. code-block:: python
# First create the Executor.
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
...
...
@@ -297,9 +297,6 @@ class Executor(object):
Args:
place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
Note: For debugging complicated network in parallel-GPUs, you can test it on the executor.
They has the exactly same arguments, and expected the same results.
"""
def
__init__
(
self
,
place
):
...
...
@@ -382,6 +379,12 @@ class Executor(object):
]
return
outs
'''
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
'''
def
close
(
self
):
"""
Close this executor.
...
...
@@ -389,9 +392,6 @@ class Executor(object):
You can no longer use this executor after calling this method.
For the distributed training, this method would free the resource on PServers related to
the current Trainer.
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
Example:
>>> cpu = core.CPUPlace()
...
...
python/paddle/fluid/framework.py
浏览文件 @
d8a939d8
...
...
@@ -87,15 +87,6 @@ def _current_expected_place():
return
_imperative_current_expected_place_
def
is_pserver_mode
(
main_program
):
main
=
main_program
if
main_program
\
else
default_main_program
()
for
op
in
main
.
global_block
().
ops
:
if
op
.
type
in
[
"send"
,
"recv"
]:
return
True
return
False
class
NameScope
(
object
):
def
__init__
(
self
,
name
=
""
,
parent
=
None
):
self
.
_children
=
dict
()
...
...
python/paddle/fluid/io.py
浏览文件 @
d8a939d8
...
...
@@ -468,9 +468,10 @@ def save_persistables(executor, dirname, main_program=None, filename=None):
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
# `prog` can be a program defined by the user
prog = fluid.default_main_program()
fluid.io.save_persistables(executor=exe, dirname=param_path,
main_program=
None
)
main_program=
prog
)
"""
if
main_program
and
main_program
.
_is_distributed
:
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
d8a939d8
...
...
@@ -13,15 +13,11 @@
# limitations under the License.
from
__future__
import
print_function
import
multiprocessing
from
.
import
core
from
.
import
framework
from
.
import
executor
from
..
import
compat
as
cpt
import
warnings
from
.
import
compiler
import
sys
import
six
import
os
__all__
=
[
'ParallelExecutor'
]
...
...
@@ -97,99 +93,27 @@ class ParallelExecutor(object):
'Please use CompiledProgram and Executor. CompiledProgram '
'is a central place for optimization and Executor is the '
'unified executor. Example can be found in compiler.py.
\n
'
)
# step1: get places, the places are used in run too.
self
.
_places
=
[]
if
use_cuda
:
gpus_env
=
os
.
getenv
(
"FLAGS_selected_gpus"
)
if
gpus_env
:
gpus
=
[
int
(
s
)
for
s
in
gpus_env
.
split
(
","
)]
else
:
gpus
=
[
i
for
i
in
six
.
moves
.
range
(
core
.
get_cuda_device_count
())
]
self
.
_places
=
[
core
.
CUDAPlace
(
i
)
for
i
in
gpus
]
else
:
cpu_num
=
int
(
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
self
.
_places
=
[
core
.
CPUPlace
()
for
_
in
six
.
moves
.
range
(
cpu_num
)]
assert
self
.
_places
,
"no place for execution"
# step2: init exec_strategy
if
exec_strategy
is
None
:
exec_strategy
=
ExecutionStrategy
()
exec_strategy
.
use_cuda
=
use_cuda
if
exec_strategy
.
num_threads
==
0
:
if
use_cuda
:
# Experiments on se-resnext shows that too many threads hurt
# performance. Worth tunning for other models in the future.
exec_strategy
.
num_threads
=
len
(
self
.
_places
)
*
4
else
:
cpu_num
=
int
(
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
exec_strategy
.
num_threads
=
cpu_num
*
2
# step3: init build_strategy
if
build_strategy
is
None
:
build_strategy
=
BuildStrategy
()
build_strategy
.
num_trainers
=
num_trainers
build_strategy
.
trainer_id
=
trainer_id
# FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
# num_trainers is 1, so the current fields of build_strategy doesn't tell if
# it's distributed model.
build_strategy
.
is_distribution
=
framework
.
is_pserver_mode
(
main_program
)
or
num_trainers
>
1
# step4: get main_program, scope, local_scopes
main
=
main_program
if
main_program
\
else
framework
.
default_main_program
()
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass.
if
build_strategy
.
memory_optimize
is
None
:
build_strategy
.
memory_optimize
=
False
if
main
.
_is_mem_optimized
else
True
if
build_strategy
.
enable_inplace
is
None
:
build_strategy
.
enable_inplace
=
False
if
main
.
_is_mem_optimized
else
True
scope
=
scope
if
scope
is
not
None
else
executor
.
global_scope
()
if
share_vars_from
and
not
isinstance
(
share_vars_from
,
ParallelExecutor
):
raise
TypeError
(
"share_vars_from must be ParallelExecutor."
)
local_scopes
=
share_vars_from
.
executor
.
local_scopes
()
\
if
share_vars_from
else
[]
# step5: check trainers_endpoints, it is used for distribution.
trainers_endpoints
=
main
.
_trainers_endpoints
if
num_trainers
>
1
and
trainers_endpoints
:
assert
num_trainers
==
len
(
trainers_endpoints
),
"num_trainers == len(endpoints)"
build_strategy
.
trainers_endpoints
=
trainers_endpoints
# step6: get persistable_vars, places. persistable_vars
# need be broadcast to other local_scope.
persistable_vars
=
set
([
cpt
.
to_text
(
v
.
name
)
for
v
in
[
var
for
var
in
main
.
list_vars
()
if
var
.
persistable
and
var
.
type
!=
core
.
VarDesc
.
VarType
.
RAW
]
])
def
place_obj
(
place
):
p
=
core
.
Place
()
p
.
set_place
(
place
)
return
p
places
=
list
(
map
(
place_obj
,
self
.
_places
))
# step7: init ParallelExecutor
# ParallelExecutor API will be deprecated, don't support parallel graph.
self
.
_graph
=
core
.
Graph
(
main
.
desc
)
self
.
_places
=
compiler
.
get_available_places
(
use_cuda
)
self
.
_scope
=
scope
if
scope
is
not
None
else
executor
.
global_scope
()
self
.
executor
=
core
.
ParallelExecutor
(
places
,
persistable_vars
,
cpt
.
to_text
(
loss_name
)
if
loss_name
else
six
.
u
(
''
),
scope
,
local_scopes
,
exec_strategy
,
build_strategy
,
self
.
_graph
)
main_program
=
main_program
if
main_program
is
not
None
\
else
framework
.
default_main_program
()
self
.
scope
=
scope
self
.
_compiled_program
=
compiler
.
CompiledProgram
(
main_program
)
self
.
_compiled_program
.
with_data_parallel
(
loss_name
=
loss_name
,
build_strategy
=
build_strategy
,
exec_strategy
=
exec_strategy
,
share_vars_from
=
share_vars_from
)
self
.
_place
=
core
.
CUDAPlace
(
0
)
if
use_cuda
else
core
.
CPUPlace
()
self
.
_executor
=
executor
.
Executor
(
self
.
_place
)
self
.
_compiled_program
.
_compile
(
place
=
self
.
_place
,
scope
=
self
.
_scope
)
def
run
(
self
,
fetch_list
,
feed
=
None
,
feed_dict
=
None
,
return_numpy
=
True
):
"""
...
...
@@ -256,56 +180,11 @@ class ParallelExecutor(object):
loss = pe.run(feed=feeder.feed(cur_batch),
fetch_list=[avg_cost.name]))
"""
if
feed
is
None
and
feed_dict
is
not
None
:
feed
=
feed_dict
print
(
"`feed_dict` is deprecated. Please use `feed=`"
,
file
=
sys
.
stderr
)
if
isinstance
(
feed
,
dict
):
feed_tensor_dict
=
dict
()
for
feed_name
in
feed
:
feed_tensor
=
feed
[
feed_name
]
if
not
isinstance
(
feed_tensor
,
core
.
LoDTensor
):
feed_tensor
=
core
.
LoDTensor
()
# always set to CPU place, since the tensor need to be splitted
# it is fast in CPU
feed_tensor
.
set
(
feed
[
feed_name
],
core
.
CPUPlace
())
feed_tensor_dict
[
feed_name
]
=
feed_tensor
self
.
executor
.
feed_and_split_tensor_into_local_scopes
(
feed_tensor_dict
)
elif
isinstance
(
feed
,
list
)
or
isinstance
(
feed
,
tuple
):
if
len
(
feed
)
!=
len
(
self
.
_places
):
raise
ValueError
(
"Feed a list of tensor, the list should be the same size as places"
)
res
=
list
()
for
i
,
each
in
enumerate
(
feed
):
if
not
isinstance
(
each
,
dict
):
raise
TypeError
(
"Each element of feed list should be a dict"
)
res_dict
=
dict
()
for
feed_name
in
each
:
tensor
=
each
[
feed_name
]
if
not
isinstance
(
tensor
,
core
.
LoDTensor
):
tmp
=
core
.
LoDTensor
()
tmp
.
set
(
tensor
,
self
.
_places
[
i
])
tensor
=
tmp
res_dict
[
feed_name
]
=
tensor
res
.
append
(
res_dict
)
self
.
executor
.
feed_tensors_into_local_scopes
(
res
)
fetch_var_name
=
'fetch'
self
.
executor
.
run
(
fetch_list
,
fetch_var_name
)
arr
=
self
.
scope
.
find_var
(
fetch_var_name
).
get_lod_tensor_array
()
if
return_numpy
:
return
executor
.
as_numpy
(
arr
)
return
[
arr
[
i
]
for
i
in
range
(
len
(
arr
))]
return
self
.
_executor
.
run
(
program
=
self
.
_compiled_program
,
scope
=
self
.
_scope
,
feed
=
feed
,
fetch_list
=
fetch_list
,
return_numpy
=
return_numpy
)
@
property
def
device_count
(
self
):
...
...
python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
浏览文件 @
d8a939d8
...
...
@@ -15,44 +15,139 @@
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
paddle.fluid.tests.unittests.test_conv2d_op
import
TestConv2dOp
,
TestWithPad
,
TestWithStride
,
TestWithGroup
,
TestWith1x1
,
TestWithInput1x1Filter1x1
import
paddle.fluid.core
as
core
from
paddle.fluid.tests.unittests.op_test
import
OpTest
from
paddle.fluid.tests.unittests.test_conv2d_op
import
TestConv2dOp
class
TestMKLDNN
(
TestConv2dOp
):
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
self
.
data_format
=
"NCHW"
def
conv2d_bias_naive
(
out
,
bias
):
_
,
out_c
,
_
,
_
=
out
.
shape
for
l
in
range
(
out_c
):
out
[:,
l
,
:,
:]
=
out
[:,
l
,
:,
:]
+
bias
[
l
]
return
out
class
TestMKLDNNWithPad
(
TestWithPad
):
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
self
.
data_format
=
"NCHW"
def
conv2d_residual_naive
(
out
,
residual
):
assert
out
.
shape
==
residual
.
shape
out
=
np
.
add
(
out
,
residual
)
return
out
class
TestMKLDNNWithStride
(
TestWithStride
):
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
self
.
data_format
=
"NCHW"
class
TestConv2dMKLDNNOp
(
TestConv2dOp
):
def
init_group
(
self
):
self
.
groups
=
1
class
TestMKLDNNWithGroup
(
TestWithGroup
):
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
self
.
data_format
=
"NCHW"
self
.
use_mkldnn
=
True
self
.
_cpu_only
=
True
def
init_test_case
(
self
):
self
.
pad
=
[
0
,
0
]
self
.
stride
=
[
1
,
1
]
self
.
input_size
=
[
2
,
3
,
5
,
5
]
# NCHW
assert
np
.
mod
(
self
.
input_size
[
1
],
self
.
groups
)
==
0
f_c
=
self
.
input_size
[
1
]
//
self
.
groups
self
.
filter_size
=
[
6
,
f_c
,
3
,
3
]
class
TestMKLDNNWith1x1
(
TestWith1x1
):
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
self
.
data_format
=
"NCHW"
def
setUp
(
self
):
self
.
fuse_bias
=
False
self
.
bias_size
=
None
self
.
fuse_relu
=
False
self
.
fuse_residual_connection
=
False
self
.
input_residual_size
=
None
TestConv2dOp
.
setUp
(
self
)
output
=
self
.
outputs
[
'Output'
]
class
TestMKLDNNWithInput1x1Filter1x1
(
TestWithInput1x1Filter1x1
):
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
self
.
data_format
=
"NCHW"
#mkldnn only support either conv-sum-relu, or conv-relu.
if
self
.
fuse_bias
and
self
.
bias_size
is
not
None
:
bias
=
np
.
random
.
random
(
self
.
bias_size
).
astype
(
self
.
dtype
)
output
=
conv2d_bias_naive
(
output
,
bias
)
output
=
output
.
astype
(
self
.
dtype
)
self
.
attrs
[
'fuse_bias'
]
=
self
.
fuse_bias
self
.
inputs
[
'Bias'
]
=
OpTest
.
np_dtype_to_fluid_dtype
(
bias
)
if
self
.
fuse_residual_connection
and
self
.
input_residual_size
is
not
None
:
input_residual
=
np
.
random
.
random
(
self
.
input_residual_size
).
astype
(
self
.
dtype
)
output
=
conv2d_residual_naive
(
output
,
input_residual
)
self
.
attrs
[
'fuse_residual_connection'
]
=
self
.
fuse_residual_connection
self
.
inputs
[
'ResidualData'
]
=
OpTest
.
np_dtype_to_fluid_dtype
(
input_residual
)
if
self
.
fuse_relu
:
output
=
np
.
maximum
(
output
,
0
).
astype
(
self
.
dsttype
)
output
=
output
.
astype
(
self
.
dtype
)
self
.
attrs
[
'fuse_bias'
]
=
self
.
fuse_bias
self
.
attrs
[
'fuse_relu'
]
=
self
.
fuse_relu
self
.
attrs
[
'fuse_residual_connection'
]
=
self
.
fuse_residual_connection
self
.
outputs
[
'Output'
]
=
output
class
TestWithFuse
(
TestConv2dMKLDNNOp
):
def
init_test_case
(
self
):
TestConv2dMKLDNNOp
.
init_test_case
(
self
)
self
.
pad
=
[
1
,
1
]
self
.
fuse_bias
=
True
self
.
bias_size
=
[
6
]
self
.
fuse_residual_connection
=
True
self
.
input_residual_size
=
[
2
,
6
,
5
,
5
]
def
test_check_grad
(
self
):
pass
def
test_check_grad_no_filter
(
self
):
pass
def
test_check_grad_no_input
(
self
):
pass
class
TestWithPadWithBias
(
TestConv2dMKLDNNOp
):
def
init_test_case
(
self
):
TestConv2dMKLDNNOp
.
init_test_case
(
self
)
self
.
pad
=
[
1
,
1
]
self
.
input_size
=
[
2
,
3
,
6
,
6
]
class
TestWithStride
(
TestConv2dMKLDNNOp
):
def
init_test_case
(
self
):
TestConv2dMKLDNNOp
.
init_test_case
(
self
)
self
.
pad
=
[
1
,
1
]
self
.
stride
=
[
2
,
2
]
self
.
input_size
=
[
2
,
3
,
6
,
6
]
class
TestWithGroup
(
TestConv2dMKLDNNOp
):
def
init_group
(
self
):
self
.
groups
=
3
class
TestWith1x1
(
TestConv2dMKLDNNOp
):
def
init_test_case
(
self
):
TestConv2dMKLDNNOp
.
init_test_case
(
self
)
self
.
filter_size
=
[
6
,
3
,
1
,
1
]
class
TestWithInput1x1Filter1x1
(
TestConv2dMKLDNNOp
):
def
init_test_case
(
self
):
TestConv2dMKLDNNOp
.
init_test_case
(
self
)
self
.
input_size
=
[
2
,
3
,
1
,
1
]
# NCHW
assert
np
.
mod
(
self
.
input_size
[
1
],
self
.
groups
)
==
0
f_c
=
self
.
input_size
[
1
]
//
self
.
groups
self
.
filter_size
=
[
6
,
f_c
,
1
,
1
]
def
init_group
(
self
):
self
.
groups
=
3
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
浏览文件 @
d8a939d8
...
...
@@ -18,6 +18,24 @@ import unittest
from
paddle.fluid.tests.unittests.test_pool2d_op
import
TestPool2D_Op
,
TestCase1
,
TestCase2
,
TestCase3
,
TestCase4
,
TestCase5
def
create_test_mkldnn_use_ceil_class
(
parent
):
class
TestMKLDNNPool2DUseCeilCase
(
parent
):
def
init_kernel_type
(
self
):
self
.
use_mkldnn
=
True
def
init_ceil_mode
(
self
):
self
.
ceil_mode
=
True
cls_name
=
"{0}_{1}"
.
format
(
parent
.
__name__
,
"MKLDNNCeilModeCast"
)
TestMKLDNNPool2DUseCeilCase
.
__name__
=
cls_name
globals
()[
cls_name
]
=
TestMKLDNNPool2DUseCeilCase
create_test_mkldnn_use_ceil_class
(
TestPool2D_Op
)
create_test_mkldnn_use_ceil_class
(
TestCase1
)
create_test_mkldnn_use_ceil_class
(
TestCase2
)
def
create_test_mkldnn_class
(
parent
):
class
TestMKLDNNCase
(
parent
):
def
init_kernel_type
(
self
):
...
...
tools/check_doc_approval.py
已删除
100644 → 0
浏览文件 @
d4674dab
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
sys
import
ast
import
hashlib
import
importlib
import
paddle.fluid
files
=
[
"paddle.fluid"
,
"paddle.fluid.average"
,
"paddle.fluid.backward"
,
"paddle.fluid.clip"
,
"paddle.fluid.data_feeder"
,
"paddle.fluid.executor"
,
"paddle.fluid.initializer"
,
"paddle.fluid.io"
,
"paddle.fluid.layers"
,
"paddle.fluid.metrics"
,
"paddle.fluid.nets"
,
"paddle.fluid.optimizer"
,
"paddle.fluid.profiler"
,
"paddle.fluid.recordio_writer"
,
"paddle.fluid.regularizer"
,
"paddle.fluid.transpiler"
]
def
md5
(
doc
):
hash
=
hashlib
.
md5
()
hash
.
update
(
str
(
doc
))
return
hash
.
hexdigest
()
def
get_module
():
for
fi
in
files
:
fi_lib
=
importlib
.
import_module
(
fi
)
doc_function
=
getattr
(
fi_lib
,
"__all__"
)
for
api
in
doc_function
:
api_name
=
fi
+
"."
+
api
try
:
doc_module
=
getattr
(
eval
(
api_name
),
"__doc__"
)
except
:
pass
doc_md5_code
=
md5
(
doc_module
)
doc_dict
[
api_name
]
=
doc_md5_code
def
doc_md5_dict
(
doc_md5_path
):
with
open
(
doc_md5_path
,
"rb"
)
as
f
:
doc_md5
=
f
.
read
()
doc_md5_dict
=
ast
.
literal_eval
(
doc_md5
)
return
doc_md5_dict
def
check_doc_md5
():
for
k
,
v
in
doc_dict
.
items
():
try
:
if
doc_ci_dict
[
k
]
!=
v
:
return
doc_dict
except
:
return
doc_dict
return
True
if
__name__
==
"__main__"
:
doc_dict
=
{}
doc_ci_dict
=
{}
doc_md5_file
=
"/root/.cache/doc_md5.txt"
if
not
os
.
path
.
exists
(
doc_md5_file
):
os
.
mknod
(
doc_md5_file
)
else
:
doc_ci_dict
=
doc_md5_dict
(
doc_md5_file
)
get_module
()
if
not
os
.
path
.
getsize
(
doc_md5_file
):
with
open
(
doc_md5_file
,
'w'
)
as
f
:
f
.
write
(
str
(
doc_dict
))
check_dic
=
True
print
(
check_dic
)
else
:
check_dic
=
check_doc_md5
()
print
(
check_dic
)
tools/diff_api.py
浏览文件 @
d8a939d8
...
...
@@ -26,4 +26,10 @@ for each_diff in result:
print
(
each_diff
)
if
error
:
print
(
'''If you modify/add/delete the API files, including code and comment, please follow these steps in order to pass the CI:
1. cd ${paddle_path}, compile paddle;
2. pip install build/python/dist/(build whl package);
3. run "python tools/print_signatures.py paddle.fluid, paddle.reader > paddle/fluid/API.spec"'''
)
sys
.
exit
(
1
)
tools/print_signatures.py
浏览文件 @
d8a939d8
...
...
@@ -24,12 +24,19 @@ import inspect
import
collections
import
sys
import
pydoc
import
hashlib
member_dict
=
collections
.
OrderedDict
()
experimental_namespace
=
{
"paddle.fluid.imperative"
}
def
md5
(
doc
):
hash
=
hashlib
.
md5
()
hash
.
update
(
str
(
doc
).
encode
(
'utf-8'
))
return
hash
.
hexdigest
()
def
visit_member
(
parent_name
,
member
):
cur_name
=
"."
.
join
([
parent_name
,
member
.
__name__
])
if
inspect
.
isclass
(
member
):
...
...
@@ -39,7 +46,10 @@ def visit_member(parent_name, member):
visit_member
(
cur_name
,
value
)
elif
callable
(
member
):
try
:
member_dict
[
cur_name
]
=
inspect
.
getargspec
(
member
)
doc
=
(
'document'
,
md5
(
member
.
__doc__
))
args
=
inspect
.
getargspec
(
member
)
all
=
(
args
,
doc
)
member_dict
[
cur_name
]
=
all
except
TypeError
:
# special for PyBind method
member_dict
[
cur_name
]
=
" "
.
join
([
line
.
strip
()
for
line
in
pydoc
.
render_doc
(
member
).
split
(
'
\n
'
)
...
...
tools/timeline.py
浏览文件 @
d8a939d8
...
...
@@ -131,7 +131,7 @@ class Timeline(object):
if
(
k
,
event
.
device_id
,
"CPU"
)
not
in
self
.
_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_devices
[(
k
,
event
.
device_id
,
"CPU"
)]
=
pid
# -1 device id represents CUDA
api call
# -1 device id represents CUDA
API(RunTime) call.(e.g. cudaLaunch, cudaMemcpy)
if
event
.
device_id
==
-
1
:
self
.
_chrome_trace
.
emit_pid
(
"%s:cuda_api"
%
k
,
pid
)
else
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录