Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
b9ec24c6
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b9ec24c6
编写于
2月 24, 2018
作者:
X
Xin Pan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Extend current profiler for timeline and more features.
上级
2c89d975
变更
24
隐藏空白更改
内联
并排
Showing
24 changed file
with
699 addition
and
38 deletion
+699
-38
CMakeLists.txt
CMakeLists.txt
+1
-0
cmake/configure.cmake
cmake/configure.cmake
+9
-1
cmake/cuda.cmake
cmake/cuda.cmake
+2
-1
cmake/cupti.cmake
cmake/cupti.cmake
+41
-0
paddle/fluid/framework/executor.cc
paddle/fluid/framework/executor.cc
+3
-1
paddle/fluid/framework/framework.proto
paddle/fluid/framework/framework.proto
+2
-0
paddle/fluid/framework/op_desc.h
paddle/fluid/framework/op_desc.h
+2
-0
paddle/fluid/platform/CMakeLists.txt
paddle/fluid/platform/CMakeLists.txt
+4
-1
paddle/fluid/platform/device_tracer.cc
paddle/fluid/platform/device_tracer.cc
+285
-0
paddle/fluid/platform/device_tracer.h
paddle/fluid/platform/device_tracer.h
+72
-0
paddle/fluid/platform/dynload/CMakeLists.txt
paddle/fluid/platform/dynload/CMakeLists.txt
+6
-2
paddle/fluid/platform/dynload/cupti.cc
paddle/fluid/platform/dynload/cupti.cc
+35
-0
paddle/fluid/platform/dynload/cupti.h
paddle/fluid/platform/dynload/cupti.h
+86
-0
paddle/fluid/platform/dynload/dynamic_loader.cc
paddle/fluid/platform/dynload/dynamic_loader.cc
+16
-0
paddle/fluid/platform/dynload/dynamic_loader.h
paddle/fluid/platform/dynload/dynamic_loader.h
+2
-0
paddle/fluid/platform/profiler.cc
paddle/fluid/platform/profiler.cc
+32
-6
paddle/fluid/platform/profiler.h
paddle/fluid/platform/profiler.h
+8
-2
paddle/fluid/platform/profiler.proto
paddle/fluid/platform/profiler.proto
+30
-0
paddle/fluid/platform/profiler_test.cc
paddle/fluid/platform/profiler_test.cc
+1
-1
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+1
-0
python/paddle/fluid/profiler.py
python/paddle/fluid/profiler.py
+8
-3
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+2
-0
python/paddle/fluid/tests/unittests/test_profiler.py
python/paddle/fluid/tests/unittests/test_profiler.py
+5
-20
python/paddle/v2/fluid/tests/unittests/test_nvprof.py
python/paddle/v2/fluid/tests/unittests/test_nvprof.py
+46
-0
未找到文件。
CMakeLists.txt
浏览文件 @
b9ec24c6
...
@@ -146,6 +146,7 @@ include(external/cares)
...
@@ -146,6 +146,7 @@ include(external/cares)
include
(
external/grpc
)
include
(
external/grpc
)
include
(
cudnn
)
# set cudnn libraries, must before configure
include
(
cudnn
)
# set cudnn libraries, must before configure
include
(
cupti
)
include
(
configure
)
# add paddle env configuration
include
(
configure
)
# add paddle env configuration
include
(
generic
)
# simplify cmake module
include
(
generic
)
# simplify cmake module
include
(
package
)
# set paddle packages
include
(
package
)
# set paddle packages
...
...
cmake/configure.cmake
浏览文件 @
b9ec24c6
...
@@ -59,6 +59,7 @@ endif(NOT WITH_GOLANG)
...
@@ -59,6 +59,7 @@ endif(NOT WITH_GOLANG)
if
(
NOT WITH_GPU
)
if
(
NOT WITH_GPU
)
add_definitions
(
-DHPPL_STUB_FUNC
)
add_definitions
(
-DHPPL_STUB_FUNC
)
add_definitions
(
"-DCUPTI_LIB_PATH=
\"\"
"
)
list
(
APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu
)
list
(
APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu
)
else
()
else
()
...
@@ -73,7 +74,14 @@ else()
...
@@ -73,7 +74,14 @@ else()
if
(
NOT CUDNN_FOUND
)
if
(
NOT CUDNN_FOUND
)
message
(
FATAL_ERROR
"Paddle needs cudnn to compile"
)
message
(
FATAL_ERROR
"Paddle needs cudnn to compile"
)
endif
()
endif
()
if
(
CUPTI_FOUND
)
include_directories
(
${
CUPTI_INCLUDE_DIR
}
)
add_definitions
(
-DPADDLE_WITH_CUPTI
)
add_definitions
(
"-DCUPTI_LIB_PATH=
\"
${
CUPTI_LIBRARY_PATH
}
\"
"
)
else
()
add_definitions
(
"-DCUPTI_LIB_PATH=
\"\"
"
)
message
(
STATUS
"Cannot find CUPTI, GPU Profiling is incorrect."
)
endif
()
set
(
CUDA_NVCC_FLAGS
${
CUDA_NVCC_FLAGS
}
"-Xcompiler
${
SIMD_FLAG
}
"
)
set
(
CUDA_NVCC_FLAGS
${
CUDA_NVCC_FLAGS
}
"-Xcompiler
${
SIMD_FLAG
}
"
)
# Include cuda and cudnn
# Include cuda and cudnn
...
...
cmake/cuda.cmake
浏览文件 @
b9ec24c6
...
@@ -155,7 +155,8 @@ endif()
...
@@ -155,7 +155,8 @@ endif()
include_directories
(
${
CUDA_INCLUDE_DIRS
}
)
include_directories
(
${
CUDA_INCLUDE_DIRS
}
)
list
(
APPEND EXTERNAL_LIBS
${
CUDA_LIBRARIES
}
${
CUDA_rt_LIBRARY
}
)
list
(
APPEND EXTERNAL_LIBS
${
CUDA_LIBRARIES
}
${
CUDA_rt_LIBRARY
}
)
if
(
NOT WITH_DSO
)
if
(
NOT WITH_DSO
)
list
(
APPEND EXTERNAL_LIBS
${
CUDNN_LIBRARY
}
${
CUDA_CUBLAS_LIBRARIES
}
${
CUDA_curand_LIBRARY
}
${
NCCL_LIBRARY
}
)
# TODO(panyx0718): CUPTI only allows DSO?
list
(
APPEND EXTERNAL_LIBS
${
CUDNN_LIBRARY
}
${
CUPTI_LIBRARY
}
${
CUDA_CUBLAS_LIBRARIES
}
${
CUDA_curand_LIBRARY
}
${
NCCL_LIBRARY
}
)
endif
(
NOT WITH_DSO
)
endif
(
NOT WITH_DSO
)
# setting nvcc arch flags
# setting nvcc arch flags
...
...
cmake/cupti.cmake
0 → 100644
浏览文件 @
b9ec24c6
# Locates the CUPTI (CUDA Profiling Tools Interface) headers and library.
#
# Inputs read by this module:
#   WITH_GPU              - the module returns immediately when this is false.
#   CUPTI_ROOT            - cache PATH hint (default "/usr"); the environment
#                           variable of the same name is searched as well.
#   CUDA_TOOLKIT_ROOT_DIR - its extras/CUPTI subtree is searched.
#   CUDA_CUDART_LIBRARY   - its directory is added to the library search path.
#
# Variables set by this module:
#   CUPTI_INCLUDE_DIR  - directory containing cupti.h
#   CUPTI_LIBRARY      - full path of the CUPTI shared library
#   CUPTI_LIBRARY_PATH - directory part of CUPTI_LIBRARY
#   CUPTI_FOUND        - ON when both the header and the library were found
if(NOT WITH_GPU)
  return()
endif()

set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT")

find_path(CUPTI_INCLUDE_DIR cupti.h
  PATHS
    ${CUPTI_ROOT}
    ${CUPTI_ROOT}/include
    $ENV{CUPTI_ROOT}
    $ENV{CUPTI_ROOT}/include
    ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include
  NO_DEFAULT_PATH
)

# Quoted so that an empty/unset CUDA_CUDART_LIBRARY still yields a valid
# three-argument call instead of a configure-time error.
get_filename_component(__libpath_hist "${CUDA_CUDART_LIBRARY}" PATH)

# Default to x86_64 and override with the detected processor when one is set.
# The variable NAME is tested on purpose: `if(NOT ${CMAKE_SYSTEM_PROCESSOR})`
# expands the value first, so if() then treats that value as another variable
# name, and an empty value leaves a bare `if(NOT)`, which is an error.
set(TARGET_ARCH "x86_64")
if(CMAKE_SYSTEM_PROCESSOR)
  set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
endif()

list(APPEND CUPTI_CHECK_LIBRARY_DIRS
  ${CUPTI_ROOT}
  ${CUPTI_ROOT}/lib64
  ${CUPTI_ROOT}/lib
  ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu
  $ENV{CUPTI_ROOT}
  $ENV{CUPTI_ROOT}/lib64
  $ENV{CUPTI_ROOT}/lib
  /usr/lib
  ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64
)

# Only the shared library is searched for; libcupti_static.a is deliberately
# not in the NAMES list.
find_library(CUPTI_LIBRARY
  NAMES libcupti.so libcupti.dylib
  PATHS
    ${CUPTI_CHECK_LIBRARY_DIRS}
    ${CUPTI_INCLUDE_DIR}
    ${__libpath_hist}
  NO_DEFAULT_PATH
  DOC "Path to cuPTI library."
)

# Quoted for the same reason as above; when the library was not found this
# simply yields an empty directory.
get_filename_component(CUPTI_LIBRARY_PATH "${CUPTI_LIBRARY}" DIRECTORY)

if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
  set(CUPTI_FOUND ON)
else()
  set(CUPTI_FOUND OFF)
endif()
paddle/fluid/framework/executor.cc
浏览文件 @
b9ec24c6
...
@@ -127,7 +127,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
...
@@ -127,7 +127,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
auto
op
=
paddle
::
framework
::
OpRegistry
::
CreateOp
(
*
op_desc
);
auto
op
=
paddle
::
framework
::
OpRegistry
::
CreateOp
(
*
op_desc
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
platform
::
RecordEvent
record_event
(
op
->
Type
(),
pool
.
Get
(
place_
));
// TODO(panyx0718): Need a program id to distinguish programs.
platform
::
RecordEvent
record_event
(
op
->
Type
(),
pool
.
Get
(
place_
),
op_desc
->
Block
()
->
ID
());
VLOG
(
3
)
<<
place_
<<
" "
<<
op
->
DebugStringEx
(
local_scope
);
VLOG
(
3
)
<<
place_
<<
" "
<<
op
->
DebugStringEx
(
local_scope
);
op
->
Run
(
*
local_scope
,
place_
);
op
->
Run
(
*
local_scope
,
place_
);
...
...
paddle/fluid/framework/framework.proto
浏览文件 @
b9ec24c6
...
@@ -167,4 +167,6 @@ message BlockDesc {
...
@@ -167,4 +167,6 @@ message BlockDesc {
// Please refer to
// Please refer to
// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
// for more details.
// for more details.
// TODO(panyx0718): A model can have multiple programs. Need a
// way to distinguish them. Maybe ID or name?
message
ProgramDesc
{
repeated
BlockDesc
blocks
=
1
;
}
message
ProgramDesc
{
repeated
BlockDesc
blocks
=
1
;
}
paddle/fluid/framework/op_desc.h
浏览文件 @
b9ec24c6
...
@@ -125,6 +125,8 @@ class OpDesc {
...
@@ -125,6 +125,8 @@ class OpDesc {
BlockDesc
*
Block
()
{
return
this
->
block_
;
}
BlockDesc
*
Block
()
{
return
this
->
block_
;
}
// Read-only access to the block this op is attached to. Dereferences the
// stored pointer without a null check.
const BlockDesc &BlockRef() const {
  const BlockDesc *owner = block_;
  return *owner;
}
void
SetBlock
(
BlockDesc
*
block
)
{
this
->
block_
=
block
;
}
void
SetBlock
(
BlockDesc
*
block
)
{
this
->
block_
=
block
;
}
private:
private:
...
...
paddle/fluid/platform/CMakeLists.txt
浏览文件 @
b9ec24c6
proto_library
(
profiler_proto SRCS profiler.proto
)
if
(
WITH_GPU
)
if
(
WITH_GPU
)
cc_library
(
enforce SRCS enforce.cc DEPS
)
cc_library
(
enforce SRCS enforce.cc DEPS
)
else
()
else
()
...
@@ -37,7 +39,8 @@ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
...
@@ -37,7 +39,8 @@ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test
(
transform_test SRCS transform_test.cu DEPS paddle_memory place device_context
)
nv_test
(
transform_test SRCS transform_test.cu DEPS paddle_memory place device_context
)
nv_test
(
nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context
)
nv_test
(
nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context
)
cc_library
(
profiler SRCS profiler.cc DEPS device_context
)
cc_library
(
device_tracer SRCS device_tracer.cc DEPS profiler_proto
${
GPU_CTX_DEPS
}
)
cc_library
(
profiler SRCS profiler.cc DEPS device_context device_tracer
)
cc_test
(
profiler_test SRCS profiler_test.cc DEPS profiler
)
cc_test
(
profiler_test SRCS profiler_test.cc DEPS profiler
)
nv_test
(
float16_gpu_test SRCS float16_test.cu
)
nv_test
(
float16_gpu_test SRCS float16_test.cu
)
...
...
paddle/fluid/platform/device_tracer.cc
0 → 100644
浏览文件 @
b9ec24c6
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h"
#include <stdlib.h>
#include <map>
#include <mutex>
#include <numeric>
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/string/printf.h"
namespace
paddle
{
namespace
platform
{
// File-local state shared by the tracer entry points defined in this file.
namespace
{
// Annotation most recently stored by SetCurAnnotation() on the calling
// thread (thread_local), or nullptr when none is set. Only the pointer is
// kept; the characters it points to are not copied here.
thread_local
const
char
*
cur_annotation
=
nullptr
;
// Guards the one-time CreateTracer() call made from GetDeviceTracer().
std
::
once_flag
tracer_once_flag
;
// Process-wide tracer instance; stays nullptr until GetDeviceTracer() runs.
DeviceTracer
*
tracer
=
nullptr
;
}
// namespace
#ifdef PADDLE_WITH_CUPTI
namespace
{
// TODO(panyx0718): Revisit the buffer size here.
// Size in bytes of every activity buffer handed to CUPTI by
// bufferRequested(). Declared constexpr: nothing in this file assigns to it.
constexpr uint64_t kBufSize = 32 * 1024;
// Alignment in bytes applied to every activity buffer. Must stay a power of
// two because ALIGN_BUFFER uses (align - 1) as a bit mask.
constexpr uint64_t kAlignSize = 8;
// Rounds the pointer `buffer` up to the next multiple of `align` and yields
// `buffer` unchanged when it is already aligned. (align - 1) is used as a bit
// mask, so `align` has to be a power of two.
#define ALIGN_BUFFER(buffer, align) \
(((uintptr_t)(buffer) & ((align)-1)) \
? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) \
: (buffer))
// Evaluates `call` once. If the result is not CUPTI_SUCCESS, prints the file,
// line, the text of `call` and CUPTI's error string to stderr and terminates
// the process with exit(-1).
#define CUPTI_CALL(call) \
do { \
CUptiResult _status = call; \
if (_status != CUPTI_SUCCESS) { \
const char *errstr; \
dynload::cuptiGetResultString(_status, &errstr); \
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
__FILE__, __LINE__, #call, errstr); \
exit(-1); \
} \
} while (0)
// Switches on collection of the CUPTI activity kinds this tracer records:
// MEMCPY, KERNEL, DEVICE, MEMSET and OVERHEAD. Any CUPTI failure terminates
// the process through CUPTI_CALL.
void EnableActivity() {
  // Device activity record is created when CUDA initializes, so we
  // want to enable it before cuInit() or any CUDA runtime call.
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
  // Not tracked for now, and therefore intentionally left disabled:
  // CUPTI_ACTIVITY_KIND_CONTEXT, CUPTI_ACTIVITY_KIND_DRIVER,
  // CUPTI_ACTIVITY_KIND_RUNTIME, CUPTI_ACTIVITY_KIND_NAME and
  // CUPTI_ACTIVITY_KIND_MARKER.
}
// Switches off CUPTI activity collection. Besides the kinds turned on by
// EnableActivity(), the kinds that were never enabled are disabled as well.
// Any CUPTI failure terminates the process through CUPTI_CALL.
void DisableActivity() {
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL));
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE));

  // Disable all other activity record kinds.
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER));
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME));
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
}
// CUPTI buffer-request callback: hands CUPTI a fresh buffer of kBufSize
// bytes, aligned to kAlignSize, in which to store activity records.
//
// bufferCompleted() releases the buffer with free(), so the pointer given to
// CUPTI must be exactly the pointer the allocator returned. The previous
// malloc() + ALIGN_BUFFER combination could hand out a pointer into the
// middle of the allocation, which is undefined behavior to free().
// posix_memalign() returns storage that is aligned and may be passed to
// free() directly.
//
// buffer:        receives the new buffer, or NULL if the allocation failed.
// size:          receives the usable size in bytes (0 on failure).
// maxNumRecords: always set to 0, i.e. no explicit cap on the record count.
void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
                              size_t *maxNumRecords) {
  void *storage = NULL;
  *maxNumRecords = 0;
  if (posix_memalign(&storage, kAlignSize, kBufSize) != 0) {
    // Allocation failed: report "no buffer" instead of a bogus size.
    *buffer = NULL;
    *size = 0;
    return;
  }
  *buffer = static_cast<uint8_t *>(storage);
  *size = kBufSize;
}
// CUPTI buffer-completed callback. Walks every activity record stored in
// `buffer`, forwards kernel records to the file-level `tracer`, reports the
// number of dropped records on stderr, and finally frees the buffer.
//
// ctx, streamId: only used to query the dropped-record count.
// buffer:        buffer previously handed out by bufferRequested().
// size:          unused.
// validSize:     number of bytes in `buffer` that hold records; when it is 0
//                the buffer is freed without being inspected.
void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId,
                              uint8_t *buffer, size_t size,
                              size_t validSize) {
  if (validSize > 0) {
    CUpti_Activity *record = NULL;
    for (;;) {
      CUptiResult status =
          dynload::cuptiActivityGetNextRecord(buffer, validSize, &record);
      if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
        // Seems not an error in this case.
        break;
      }
      if (status != CUPTI_SUCCESS) {
        // Reports the failure and exits the process.
        CUPTI_CALL(status);
        continue;
      }

      const bool is_kernel =
          record->kind == CUPTI_ACTIVITY_KIND_KERNEL ||
          record->kind == CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL;
      if (is_kernel) {
        auto *kernel = reinterpret_cast<const CUpti_ActivityKernel3 *>(record);
        tracer->AddKernelRecords(kernel->start, kernel->end, kernel->deviceId,
                                 kernel->streamId, kernel->correlationId);
      }
      // Every other record kind is ignored.
    }

    size_t dropped;
    CUPTI_CALL(
        dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
    if (dropped != 0) {
      fprintf(stderr, "Dropped %u activity records\n", (unsigned int)dropped);
    }
  }
  free(buffer);
}
}
// namespace
class
DeviceTracerImpl
:
public
DeviceTracer
{
public:
DeviceTracerImpl
()
:
enabled_
(
false
)
{}
void
AddAnnotation
(
uint64_t
id
,
const
std
::
string
&
anno
)
{
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
correlations_
[
id
]
=
anno
;
}
void
AddKernelRecords
(
uint64_t
start
,
uint64_t
end
,
uint32_t
device_id
,
uint32_t
stream_id
,
uint32_t
correlation_id
)
{
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
kernel_records_
.
push_back
(
KernelRecord
{
start
,
end
,
device_id
,
stream_id
,
correlation_id
});
}
bool
IsEnabled
()
{
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
return
enabled_
;
}
void
Enable
()
{
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
if
(
enabled_
)
{
fprintf
(
stderr
,
"DeviceTracer already enabled
\n
"
);
return
;
}
EnableActivity
();
// Register callbacks for buffer requests and completed by CUPTI.
CUPTI_CALL
(
dynload
::
cuptiActivityRegisterCallbacks
(
bufferRequested
,
bufferCompleted
));
CUptiResult
ret
;
ret
=
dynload
::
cuptiSubscribe
(
&
subscriber_
,
static_cast
<
CUpti_CallbackFunc
>
(
ApiCallback
),
this
);
if
(
ret
==
CUPTI_ERROR_MAX_LIMIT_REACHED
)
{
fprintf
(
stderr
,
"CUPTI subcriber limit reached.
\n
"
);
}
else
if
(
ret
!=
CUPTI_SUCCESS
)
{
fprintf
(
stderr
,
"Failed to create CUPTI subscriber.
\n
"
);
}
CUPTI_CALL
(
dynload
::
cuptiEnableCallback
(
1
,
subscriber_
,
CUPTI_CB_DOMAIN_DRIVER_API
,
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel
));
CUPTI_CALL
(
dynload
::
cuptiGetTimestamp
(
&
start_ns_
));
enabled_
=
true
;
}
proto
::
Profile
GenProfile
()
{
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
proto
::
Profile
profile_pb
;
profile_pb
.
set_start_ns
(
start_ns_
);
profile_pb
.
set_end_ns
(
end_ns_
);
std
::
map
<
std
::
string
,
std
::
vector
<
uint64_t
>>
event_times
;
for
(
const
KernelRecord
&
r
:
kernel_records_
)
{
if
(
correlations_
.
find
(
r
.
correlation_id
)
==
correlations_
.
end
())
{
fprintf
(
stderr
,
"cannot relate a kernel activity
\n
"
);
continue
;
}
auto
*
event
=
profile_pb
.
add_events
();
event
->
set_name
(
correlations_
.
at
(
r
.
correlation_id
));
event
->
set_start_ns
(
r
.
start_ns
);
event
->
set_end_ns
(
r
.
end_ns
);
event
->
set_stream_id
(
r
.
stream_id
);
event
->
set_device_id
(
r
.
device_id
);
event_times
[
event
->
name
()].
push_back
(
r
.
end_ns
-
r
.
start_ns
);
}
for
(
const
auto
&
et
:
event_times
)
{
fprintf
(
stderr
,
"%s: total: %fms invoked cuda kernels: %lu
\n
"
,
et
.
first
.
c_str
(),
std
::
accumulate
(
et
.
second
.
begin
(),
et
.
second
.
end
(),
0
)
/
1000000.0
,
et
.
second
.
size
());
}
return
profile_pb
;
}
void
Disable
()
{
// flush might cause additional calls to DeviceTracker.
dynload
::
cuptiActivityFlushAll
(
CUPTI_ACTIVITY_FLAG_FLUSH_FORCED
);
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
DisableActivity
();
dynload
::
cuptiUnsubscribe
(
subscriber_
);
CUPTI_CALL
(
dynload
::
cuptiGetTimestamp
(
&
end_ns_
));
PADDLE_ENFORCE
(
dynload
::
cuptiFinalize
());
enabled_
=
false
;
}
private:
static
void
CUPTIAPI
ApiCallback
(
void
*
userdata
,
CUpti_CallbackDomain
domain
,
CUpti_CallbackId
cbid
,
const
void
*
cbdata
)
{
auto
*
cbInfo
=
reinterpret_cast
<
const
CUpti_CallbackData
*>
(
cbdata
);
DeviceTracer
*
tracer
=
reinterpret_cast
<
DeviceTracer
*>
(
userdata
);
if
((
domain
==
CUPTI_CB_DOMAIN_DRIVER_API
)
&&
(
cbid
==
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel
))
{
if
(
cbInfo
->
callbackSite
==
CUPTI_API_ENTER
)
{
const
std
::
string
anno
=
cur_annotation
?
cur_annotation
:
cbInfo
->
symbolName
;
tracer
->
AddAnnotation
(
cbInfo
->
correlationId
,
anno
);
}
}
else
{
VLOG
(
1
)
<<
"Unhandled API Callback for "
<<
domain
<<
" "
<<
cbid
;
}
}
std
::
mutex
trace_mu_
;
bool
enabled_
;
uint64_t
start_ns_
;
uint64_t
end_ns_
;
std
::
vector
<
KernelRecord
>
kernel_records_
;
std
::
unordered_map
<
uint32_t
,
std
::
string
>
correlations_
;
CUpti_SubscriberHandle
subscriber_
;
};
#endif // PADDLE_WITH_CUPTI
class
DeviceTracerDummy
:
public
DeviceTracer
{
public:
DeviceTracerDummy
()
{}
void
AddAnnotation
(
uint64_t
id
,
const
std
::
string
&
anno
)
{}
void
AddKernelRecords
(
uint64_t
start
,
uint64_t
end
,
uint32_t
device_id
,
uint32_t
stream_id
,
uint32_t
correlation_id
)
{}
bool
IsEnabled
()
{
return
false
;
}
void
Enable
()
{}
proto
::
Profile
GenProfile
()
{
return
proto
::
Profile
();
}
void
Disable
()
{}
};
// Allocates the tracer implementation chosen at compile time and stores it in
// *t: DeviceTracerImpl when PADDLE_WITH_CUPTI is defined, otherwise the no-op
// DeviceTracerDummy. The object is allocated with `new`; this function does
// not release it.
void CreateTracer(DeviceTracer **t) {
#ifdef PADDLE_WITH_CUPTI
  DeviceTracer *created = new DeviceTracerImpl();
#else
  DeviceTracer *created = new DeviceTracerDummy();
#endif  // PADDLE_WITH_CUPTI
  *t = created;
}
// Returns the process-wide tracer, creating it through CreateTracer() on the
// first call. std::call_once ensures the creation runs only once.
DeviceTracer *GetDeviceTracer() {
  DeviceTracer **slot = &tracer;
  std::call_once(tracer_once_flag, CreateTracer, slot);
  return *slot;
}
// Stores `anno` as the calling thread's current annotation. Only the pointer
// is kept (no copy is made), so the string must outlive its use as the
// current annotation.
void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
void
ClearCurAnnotation
()
{
cur_annotation
=
nullptr
;
}
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/device_tracer.h
0 → 100644
浏览文件 @
b9ec24c6
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/profiler.pb.h"
namespace
paddle
{
namespace
platform
{
///////////////////////
// WARN: Under Development. Don't depend on it yet.
//////////////////////
// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc.
// 3. Generate a protobuf for further analysis.
// Abstract interface of a device tracer. Implementations collect CUDA kernel
// statistics and turn them into a proto::Profile.
class DeviceTracer {
 public:
  // One CUDA kernel execution as stored by AddKernelRecords().
  struct KernelRecord {
    uint64_t start_ns;        // start timestamp
    uint64_t end_ns;          // end timestamp
    uint32_t device_id;       // device the kernel ran on
    uint32_t stream_id;       // stream the kernel ran on
    uint32_t correlation_id;  // key used to look up the annotation
  };

  virtual ~DeviceTracer() = default;

  // Starts tracing. Needs to be called once before use.
  virtual void Enable() = 0;

  // Stops tracing. Needs to be called once after use.
  virtual void Disable() = 0;

  // Associates the internal cuda id `id` with the high level annotation
  // `anno`, so that cuda statistics can be reported under a human-readable
  // name.
  virtual void AddAnnotation(uint64_t id, const std::string &anno) = 0;

  // Adds the statistics of one cuda kernel. `correlation_id` is later mapped
  // to an annotation added before, for human readability.
  virtual void AddKernelRecords(uint64_t start, uint64_t end,
                                uint32_t device_id, uint32_t stream_id,
                                uint32_t correlation_id) = 0;

  // Generates a proto once tracing is done (i.e. after Disable()).
  virtual proto::Profile GenProfile() = 0;

  // Reports whether the tracer is currently enabled.
  virtual bool IsEnabled() = 0;
};
// Get a DeviceTracer.
DeviceTracer
*
GetDeviceTracer
();
// Set a name for the cuda kernel operation being launched by the thread.
void
SetCurAnnotation
(
const
char
*
anno
);
// Clear the name after the operation is done.
void
ClearCurAnnotation
();
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/dynload/CMakeLists.txt
浏览文件 @
b9ec24c6
cc_library
(
dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce
)
cc_library
(
dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce
)
nv_library
(
dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc
DEPS dynamic_loader
)
list
(
APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc
)
if
(
CUPTI_FOUND
)
list
(
APPEND CUDA_SRCS cupti.cc
)
endif
(
CUPTI_FOUND
)
nv_library
(
dynload_cuda SRCS
${
CUDA_SRCS
}
DEPS dynamic_loader
)
cc_library
(
dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc
)
cc_library
(
dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc
)
paddle/fluid/platform/dynload/cupti.cc
0 → 100644
浏览文件 @
b9ec24c6
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_CUPTI
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
platform
{
namespace
dynload
{
// Guards the one-time load of the CUPTI shared library.
std
::
once_flag
cupti_dso_flag
;
// Handle of the loaded CUPTI shared library; nullptr until it is loaded.
void
*
cupti_dso_handle
=
nullptr
;
// Defines the wrapper object `__name` of type DynLoad__<__name>; applied
// below to every routine listed in CUPTI_ROUTINE_EACH.
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUPTI_ROUTINE_EACH
(
DEFINE_WRAP
);
}
// namespace dynload
}
// namespace platform
}
// namespace paddle
#endif // PADDLE_WITH_CUPTI
paddle/fluid/platform/dynload/cupti.h
0 → 100644
浏览文件 @
b9ec24c6
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUPTI
#include <cuda.h>
#include <cupti.h>
#include <dlfcn.h>
#include <mutex>
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace
paddle
{
namespace
platform
{
namespace
dynload
{
extern
std
::
once_flag
cupti_dso_flag
;
extern
void
*
cupti_dso_handle
;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cupti routine
* via operator overloading.
*
* note: default dynamic linked libs
*/
#ifdef PADDLE_USE_DSO
// DSO build: declares struct DynLoad__<name> whose call operator loads the
// CUPTI shared library once (std::call_once), resolves <name> with dlsym()
// and forwards the arguments to it; it also declares the extern wrapper
// object <name>.
//
// The resolved address is kept in a function-local static, so dlsym() runs
// once per instantiation instead of on every call. The dlsym() result is not
// checked; a missing symbol still ends in a call through a null pointer.
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)                            \
  struct DynLoad__##__name {                                               \
    template <typename... Args>                                            \
    inline CUptiResult CUPTIAPI operator()(Args... args) {                 \
      typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...);                  \
      std::call_once(cupti_dso_flag,                                       \
                     paddle::platform::dynload::GetCUPTIDsoHandle,         \
                     &cupti_dso_handle);                                   \
      static void *p_##__name = dlsym(cupti_dso_handle, #__name);          \
      return reinterpret_cast<cuptiFunc>(p_##__name)(args...);             \
    }                                                                      \
  };                                                                       \
  extern DynLoad__##__name __name
#else
// Non-DSO build: the wrapper forwards directly to the CUPTI function of the
// same name.
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)                            \
  struct DynLoad__##__name {                                               \
    template <typename... Args>                                            \
    inline CUptiResult CUPTIAPI operator()(Args... args) {                 \
      return __name(args...);                                              \
    }                                                                      \
  };                                                                       \
  extern DynLoad__##__name __name
#endif
// Applies `__macro` to the name of every CUPTI routine wrapped by this
// header. The invocation that follows declares one DynLoad__<name> struct and
// one extern wrapper object per routine listed here.
#define CUPTI_ROUTINE_EACH(__macro) \
__macro(cuptiActivityEnable); \
__macro(cuptiActivityDisable); \
__macro(cuptiActivityRegisterCallbacks); \
__macro(cuptiActivityGetAttribute); \
__macro(cuptiActivitySetAttribute); \
__macro(cuptiGetTimestamp); \
__macro(cuptiActivityGetNextRecord); \
__macro(cuptiGetResultString); \
__macro(cuptiActivityGetNumDroppedRecords); \
__macro(cuptiActivityFlushAll); \
__macro(cuptiFinalize); \
__macro(cuptiSubscribe); \
__macro(cuptiUnsubscribe); \
__macro(cuptiEnableCallback);
CUPTI_ROUTINE_EACH
(
DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
);
#undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
}
// namespace dynload
}
// namespace platform
}
// namespace paddle
#endif // PADDLE_WITH_CUPTI
paddle/fluid/platform/dynload/dynamic_loader.cc
浏览文件 @
b9ec24c6
...
@@ -40,10 +40,14 @@ DEFINE_string(nccl_dir, "",
...
@@ -40,10 +40,14 @@ DEFINE_string(nccl_dir, "",
"libcurand. For instance, /usr/local/cuda/lib64. If default, "
"libcurand. For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH"
);
"dlopen will search cuda from LD_LIBRARY_PATH"
);
DEFINE_string
(
cupti_dir
,
""
,
"Specify path for loading cupti.so."
);
namespace
paddle
{
namespace
paddle
{
namespace
platform
{
namespace
platform
{
namespace
dynload
{
namespace
dynload
{
static
const
char
*
cupti_lib_path
=
CUPTI_LIB_PATH
;
static
inline
std
::
string
join
(
const
std
::
string
&
part1
,
static
inline
std
::
string
join
(
const
std
::
string
&
part1
,
const
std
::
string
&
part2
)
{
const
std
::
string
&
part2
)
{
// directory separator
// directory separator
...
@@ -143,6 +147,18 @@ void GetCUDNNDsoHandle(void** dso_handle) {
...
@@ -143,6 +147,18 @@ void GetCUDNNDsoHandle(void** dso_handle) {
#endif
#endif
}
}
void
GetCUPTIDsoHandle
(
void
**
dso_handle
)
{
std
::
string
cupti_path
=
cupti_lib_path
;
if
(
!
FLAGS_cupti_dir
.
empty
())
{
cupti_path
=
FLAGS_cupti_dir
;
}
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath
(
cupti_path
,
"libcupti.dylib"
,
dso_handle
,
false
);
#else
GetDsoHandleFromSearchPath
(
cupti_path
,
"libcupti.so"
,
dso_handle
,
false
);
#endif
}
void
GetCurandDsoHandle
(
void
**
dso_handle
)
{
void
GetCurandDsoHandle
(
void
**
dso_handle
)
{
#if defined(__APPLE__) || defined(__OSX__)
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath
(
FLAGS_cuda_dir
,
"libcurand.dylib"
,
dso_handle
);
GetDsoHandleFromSearchPath
(
FLAGS_cuda_dir
,
"libcurand.dylib"
,
dso_handle
);
...
...
paddle/fluid/platform/dynload/dynamic_loader.h
浏览文件 @
b9ec24c6
...
@@ -34,6 +34,8 @@ void GetCublasDsoHandle(void** dso_handle);
...
@@ -34,6 +34,8 @@ void GetCublasDsoHandle(void** dso_handle);
*/
*/
void
GetCUDNNDsoHandle
(
void
**
dso_handle
);
void
GetCUDNNDsoHandle
(
void
**
dso_handle
);
void
GetCUPTIDsoHandle
(
void
**
dso_handle
);
/**
/**
* @brief load the DSO of CURAND
* @brief load the DSO of CURAND
*
*
...
...
paddle/fluid/platform/profiler.cc
浏览文件 @
b9ec24c6
...
@@ -15,7 +15,13 @@ limitations under the License. */
...
@@ -15,7 +15,13 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
#include <iomanip>
#include <iomanip>
#include <map>
#include <map>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif // PADDLE_WITH_CUDA
#include "glog/logging.h"
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/string/printf.h"
namespace
paddle
{
namespace
paddle
{
namespace
platform
{
namespace
platform
{
...
@@ -126,15 +132,20 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
...
@@ -126,15 +132,20 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList
().
Record
(
EventKind
::
kPopRange
,
name
,
g_thread_id
,
dev_ctx
);
GetEventList
().
Record
(
EventKind
::
kPopRange
,
name
,
g_thread_id
,
dev_ctx
);
}
}
RecordEvent
::
RecordEvent
(
const
std
::
string
&
name
,
RecordEvent
::
RecordEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
,
const
DeviceContext
*
dev_ctx
)
{
int32_t
block_id
)
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
dev_ctx_
=
dev_ctx
;
dev_ctx_
=
dev_ctx
;
name_
=
name
;
name_
=
name
;
PushEvent
(
name_
,
dev_ctx_
);
PushEvent
(
name_
,
dev_ctx_
);
full_name_
=
string
::
Sprintf
(
"%s_b%d"
,
name
,
block_id
);
// Maybe need the same push/pop behavior.
SetCurAnnotation
(
full_name_
.
c_str
());
}
}
RecordEvent
::~
RecordEvent
()
{
RecordEvent
::~
RecordEvent
()
{
ClearCurAnnotation
();
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
PopEvent
(
name_
,
dev_ctx_
);
PopEvent
(
name_
,
dev_ctx_
);
}
}
...
@@ -147,7 +158,14 @@ void EnableProfiler(ProfilerState state) {
...
@@ -147,7 +158,14 @@ void EnableProfiler(ProfilerState state) {
"The profiling state should be disabled when calling "
,
"The profiling state should be disabled when calling "
,
"EnableProfiler."
);
"EnableProfiler."
);
g_state
=
state
;
g_state
=
state
;
g_profiler_place
=
(
g_state
==
ProfilerState
::
kCUDA
)
?
"CUDA"
:
"CPU"
;
if
(
g_state
==
ProfilerState
::
kCUDA
)
{
g_profiler_place
=
"CUDA"
;
}
else
if
(
g_state
==
ProfilerState
::
kCPU
)
{
g_profiler_place
=
"CPU"
;
}
else
{
g_profiler_place
=
"All"
;
GetDeviceTracer
()
->
Enable
();
}
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
if
(
g_state
==
ProfilerState
::
kCUDA
)
{
if
(
g_state
==
ProfilerState
::
kCUDA
)
{
// Generate some dummy events first to reduce the startup overhead.
// Generate some dummy events first to reduce the startup overhead.
...
@@ -190,6 +208,12 @@ void DisableProfiler(EventSortingKey sorted_key) {
...
@@ -190,6 +208,12 @@ void DisableProfiler(EventSortingKey sorted_key) {
Mark
(
"_stop_profiler_"
,
nullptr
);
Mark
(
"_stop_profiler_"
,
nullptr
);
g_state
=
ProfilerState
::
kDisabled
;
g_state
=
ProfilerState
::
kDisabled
;
DeviceTracer
*
tracer
=
GetDeviceTracer
();
if
(
g_profiler_place
==
"All"
&&
tracer
&&
tracer
->
IsEnabled
())
{
tracer
->
Disable
();
tracer
->
GenProfile
();
}
std
::
vector
<
std
::
vector
<
Event
>>
all_events
=
GetAllEvents
();
std
::
vector
<
std
::
vector
<
Event
>>
all_events
=
GetAllEvents
();
ParseEvents
(
all_events
,
sorted_key
);
ParseEvents
(
all_events
,
sorted_key
);
ResetProfiler
();
ResetProfiler
();
...
@@ -254,9 +278,11 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
...
@@ -254,9 +278,11 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
}
}
if
(
rit
!=
pushed_events
.
rend
())
{
if
(
rit
!=
pushed_events
.
rend
())
{
double
event_time
=
(
g_profiler_place
==
"CUDA"
)
double
event_time
=
?
rit
->
CudaElapsedMs
(
events
[
i
][
j
])
(
g_profiler_place
==
"CUDA"
||
g_profiler_place
==
"All"
)
:
rit
->
CpuElapsedMs
(
events
[
i
][
j
]);
?
rit
->
CudaElapsedMs
(
events
[
i
][
j
])
:
rit
->
CpuElapsedMs
(
events
[
i
][
j
]);
std
::
string
event_name
=
std
::
string
event_name
=
"thread"
+
std
::
to_string
(
rit
->
thread_id
())
+
"::"
+
rit
->
name
();
"thread"
+
std
::
to_string
(
rit
->
thread_id
())
+
"::"
+
rit
->
name
();
max_name_width
=
std
::
max
(
max_name_width
,
event_name
.
size
());
max_name_width
=
std
::
max
(
max_name_width
,
event_name
.
size
());
...
...
paddle/fluid/platform/profiler.h
浏览文件 @
b9ec24c6
...
@@ -18,6 +18,7 @@ limitations under the License. */
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <mutex>
#include <mutex>
#include <vector>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.pb.h"
namespace
paddle
{
namespace
paddle
{
namespace
platform
{
namespace
platform
{
...
@@ -93,6 +94,7 @@ enum ProfilerState {
...
@@ -93,6 +94,7 @@ enum ProfilerState {
kDisabled
,
// disabled state
kDisabled
,
// disabled state
kCPU
,
// CPU profiling state
kCPU
,
// CPU profiling state
kCUDA
,
// GPU profiling state
kCUDA
,
// GPU profiling state
kAll
,
// Profile both CPU and GPU. (Currently experimental).
};
};
void
Mark
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
void
Mark
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
...
@@ -102,7 +104,8 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
...
@@ -102,7 +104,8 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
void
PopEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
void
PopEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
struct
RecordEvent
{
struct
RecordEvent
{
explicit
RecordEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
RecordEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
,
int32_t
block_id
);
~
RecordEvent
();
~
RecordEvent
();
...
@@ -110,9 +113,12 @@ struct RecordEvent {
...
@@ -110,9 +113,12 @@ struct RecordEvent {
const
DeviceContext
*
dev_ctx_
;
const
DeviceContext
*
dev_ctx_
;
// Event name
// Event name
std
::
string
name_
;
std
::
string
name_
;
// Need to distinguish name by op type, block_id, program_id and perhaps
// different kernel invocations within an op.
std
::
string
full_name_
;
};
};
// Return the event list of all threads. As
um
med the returned value calls
// Return the event list of all threads. Assuming the returned value is called
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std
::
vector
<
std
::
vector
<
Event
>>
GetAllEvents
();
std
::
vector
<
std
::
vector
<
Event
>>
GetAllEvents
();
...
...
paddle/fluid/platform/profiler.proto
0 → 100644
浏览文件 @
b9ec24c6
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax
=
"proto2"
;
package
paddle
.
platform.proto
;
// One named, timed entry of a profile.
message Event {
  // Label of the event.
  optional string name = 1;

  // Start and end of the event. The `_ns` suffix suggests nanosecond
  // timestamps -- TODO confirm the clock/epoch against the code that fills
  // these in.
  optional uint64 start_ns = 2;
  optional uint64 end_ns = 3;

  // NOTE: field number 4 is not used by this message.

  // Presumably the device and stream the event ran on -- verify against the
  // producer of this message.
  optional uint32 device_id = 5;
  optional uint32 stream_id = 6;
}
// A collection of Event entries together with an overall start/end pair.
message Profile {
  // All events that belong to this profile.
  repeated Event events = 1;

  // Start and end of the profile as a whole. The `_ns` suffix suggests
  // nanosecond timestamps -- TODO confirm against the producer.
  optional uint64 start_ns = 2;
  optional uint64 end_ns = 3;
}
\ No newline at end of file
paddle/fluid/platform/profiler_test.cc
浏览文件 @
b9ec24c6
...
@@ -95,7 +95,7 @@ TEST(RecordEvent, RecordEvent) {
...
@@ -95,7 +95,7 @@ TEST(RecordEvent, RecordEvent) {
*/
*/
for
(
int
i
=
1
;
i
<
5
;
++
i
)
{
for
(
int
i
=
1
;
i
<
5
;
++
i
)
{
std
::
string
name
=
"evs_op_"
+
std
::
to_string
(
i
);
std
::
string
name
=
"evs_op_"
+
std
::
to_string
(
i
);
RecordEvent
record_event
(
name
,
dev_ctx
);
RecordEvent
record_event
(
name
,
dev_ctx
,
0
);
int
counter
=
1
;
int
counter
=
1
;
while
(
counter
!=
i
*
1000
)
counter
++
;
while
(
counter
!=
i
*
1000
)
counter
++
;
}
}
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
b9ec24c6
...
@@ -459,6 +459,7 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -459,6 +459,7 @@ All parameter, weight, gradient are variables in Paddle.
.
value
(
"kDisabled"
,
platform
::
ProfilerState
::
kDisabled
)
.
value
(
"kDisabled"
,
platform
::
ProfilerState
::
kDisabled
)
.
value
(
"kCPU"
,
platform
::
ProfilerState
::
kCPU
)
.
value
(
"kCPU"
,
platform
::
ProfilerState
::
kCPU
)
.
value
(
"kCUDA"
,
platform
::
ProfilerState
::
kCUDA
)
.
value
(
"kCUDA"
,
platform
::
ProfilerState
::
kCUDA
)
.
value
(
"kAll"
,
platform
::
ProfilerState
::
kAll
)
.
export_values
();
.
export_values
();
py
::
enum_
<
platform
::
EventSortingKey
>
(
m
,
"EventSortingKey"
,
py
::
arithmetic
())
py
::
enum_
<
platform
::
EventSortingKey
>
(
m
,
"EventSortingKey"
,
py
::
arithmetic
())
...
...
python/paddle/fluid/profiler.py
浏览文件 @
b9ec24c6
...
@@ -97,9 +97,14 @@ def profiler(state, sorted_key=None):
...
@@ -97,9 +97,14 @@ def profiler(state, sorted_key=None):
The `ave` means sorting by the average execution time.
The `ave` means sorting by the average execution time.
"""
"""
if
state
not
in
[
'CPU'
,
'GPU'
]:
if
state
not
in
[
'CPU'
,
'GPU'
,
"All"
]:
raise
ValueError
(
"The state must be 'CPU' or 'GPU'."
)
raise
ValueError
(
"The state must be 'CPU' or 'GPU' or 'All'."
)
prof_state
=
core
.
ProfilerState
.
kCUDA
if
state
==
"GPU"
else
core
.
ProfilerState
.
kCPU
if
state
==
"GPU"
:
prof_state
=
core
.
ProfilerState
.
kCUDA
elif
state
==
"CPU"
:
prof_state
=
core
.
ProfilerState
.
kCPU
else
:
prof_state
=
core
.
ProfilerState
.
kAll
core
.
enable_profiler
(
prof_state
)
core
.
enable_profiler
(
prof_state
)
yield
yield
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
b9ec24c6
...
@@ -41,6 +41,7 @@ list(REMOVE_ITEM TEST_OPS test_while_op)
...
@@ -41,6 +41,7 @@ list(REMOVE_ITEM TEST_OPS test_while_op)
list
(
REMOVE_ITEM TEST_OPS test_lod_array_length_op
)
list
(
REMOVE_ITEM TEST_OPS test_lod_array_length_op
)
list
(
REMOVE_ITEM TEST_OPS test_reorder_lod_tensor
)
list
(
REMOVE_ITEM TEST_OPS test_reorder_lod_tensor
)
list
(
REMOVE_ITEM TEST_OPS test_profiler
)
list
(
REMOVE_ITEM TEST_OPS test_profiler
)
list
(
REMOVE_ITEM TEST_OPS test_nvprof
)
list
(
REMOVE_ITEM TEST_OPS test_normalization_wrapper
)
list
(
REMOVE_ITEM TEST_OPS test_normalization_wrapper
)
list
(
REMOVE_ITEM TEST_OPS test_executor_and_mul
)
list
(
REMOVE_ITEM TEST_OPS test_executor_and_mul
)
list
(
REMOVE_ITEM TEST_OPS test_assign_value_op
)
list
(
REMOVE_ITEM TEST_OPS test_assign_value_op
)
...
@@ -75,6 +76,7 @@ py_test_modules(test_while_op MODULES test_while_op)
...
@@ -75,6 +76,7 @@ py_test_modules(test_while_op MODULES test_while_op)
py_test_modules
(
test_lod_array_length_op MODULES test_lod_array_length_op
)
py_test_modules
(
test_lod_array_length_op MODULES test_lod_array_length_op
)
py_test_modules
(
test_reorder_lod_tensor MODULES test_reorder_lod_tensor
)
py_test_modules
(
test_reorder_lod_tensor MODULES test_reorder_lod_tensor
)
py_test_modules
(
test_profiler MODULES test_profiler
)
py_test_modules
(
test_profiler MODULES test_profiler
)
py_test_modules
(
test_nvprof MODULES test_nvprof
)
py_test_modules
(
test_normalization_wrapper MODULES test_normalization_wrapper
)
py_test_modules
(
test_normalization_wrapper MODULES test_normalization_wrapper
)
py_test_modules
(
test_executor_and_mul MODULES test_executor_and_mul
)
py_test_modules
(
test_executor_and_mul MODULES test_executor_and_mul
)
py_test_modules
(
test_assign_value_op MODULES test_assign_value_op
)
py_test_modules
(
test_assign_value_op MODULES test_assign_value_op
)
...
...
python/paddle/fluid/tests/unittests/test_profiler.py
浏览文件 @
b9ec24c6
...
@@ -22,27 +22,9 @@ import paddle.fluid.core as core
...
@@ -22,27 +22,9 @@ import paddle.fluid.core as core
class
TestProfiler
(
unittest
.
TestCase
):
class
TestProfiler
(
unittest
.
TestCase
):
def
test_nvprof
(
self
):
if
not
fluid
.
core
.
is_compiled_with_cuda
():
return
epoc
=
8
dshape
=
[
4
,
3
,
28
,
28
]
data
=
layers
.
data
(
name
=
'data'
,
shape
=
[
3
,
28
,
28
],
dtype
=
'float32'
)
conv
=
layers
.
conv2d
(
data
,
20
,
3
,
stride
=
[
1
,
1
],
padding
=
[
1
,
1
])
place
=
fluid
.
CUDAPlace
(
0
)
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
fluid
.
default_startup_program
())
output_file
=
'cuda_profiler.txt'
with
profiler
.
cuda_profiler
(
output_file
,
'csv'
)
as
nvprof
:
for
i
in
range
(
epoc
):
input
=
np
.
random
.
random
(
dshape
).
astype
(
'float32'
)
exe
.
run
(
fluid
.
default_main_program
(),
feed
=
{
'data'
:
input
})
os
.
remove
(
output_file
)
def
net_profiler
(
self
,
state
):
def
net_profiler
(
self
,
state
):
if
state
==
'GPU'
and
not
core
.
is_compiled_with_cuda
():
enable_if_gpu
=
state
==
'GPU'
or
state
==
"All"
if
enable_if_gpu
and
not
core
.
is_compiled_with_cuda
():
return
return
startup_program
=
fluid
.
Program
()
startup_program
=
fluid
.
Program
()
main_program
=
fluid
.
Program
()
main_program
=
fluid
.
Program
()
...
@@ -85,6 +67,9 @@ class TestProfiler(unittest.TestCase):
...
@@ -85,6 +67,9 @@ class TestProfiler(unittest.TestCase):
def
test_cuda_profiler
(
self
):
def
test_cuda_profiler
(
self
):
self
.
net_profiler
(
'GPU'
)
self
.
net_profiler
(
'GPU'
)
def
test_all_profiler
(
self
):
self
.
net_profiler
(
'All'
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
unittest
.
main
()
unittest
.
main
()
python/paddle/v2/fluid/tests/unittests/test_nvprof.py
0 → 100644
浏览文件 @
b9ec24c6
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
os
import
numpy
as
np
import
paddle.v2.fluid
as
fluid
import
paddle.v2.fluid.profiler
as
profiler
import
paddle.v2.fluid.layers
as
layers
import
paddle.v2.fluid.core
as
core
class TestNVProf(unittest.TestCase):
    """Smoke test for the ``profiler.cuda_profiler`` context manager."""

    def test_nvprof(self):
        """Run a small conv2d program under ``cuda_profiler``.

        The test is reported as skipped (instead of silently passing) when
        Paddle was not compiled with CUDA, and the profiler output file is
        removed even if the profiled run raises.
        """
        if not fluid.core.is_compiled_with_cuda():
            self.skipTest("Paddle is not compiled with CUDA.")

        epoc = 8
        dshape = [4, 3, 28, 28]
        data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
        # The returned variable is not used afterwards; presumably the call
        # registers the conv2d op in the default main program that is executed
        # below -- TODO confirm.
        layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])

        place = fluid.CUDAPlace(0)
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())

        output_file = 'cuda_profiler.txt'
        try:
            with profiler.cuda_profiler(output_file, 'csv'):
                for _ in range(epoc):
                    # One random batch per iteration, shaped like `dshape`.
                    batch = np.random.random(dshape).astype('float32')
                    exe.run(fluid.default_main_program(),
                            feed={'data': batch})
            # The profiler is expected to have written its output once the
            # context manager has exited.
            self.assertTrue(os.path.exists(output_file))
        finally:
            # Never leave the profiler output behind, whether or not the
            # profiled run succeeded.
            if os.path.exists(output_file):
                os.remove(output_file)
# Run the tests of this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录