Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
e135069d
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2299
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
e135069d
编写于
5月 22, 2023
作者:
Z
zhupengyang
提交者:
GitHub
5月 22, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[xpu][infer] support runtime configs (#53595)
上级
d327d3e1
变更
17
隐藏空白更改
内联
并排
Showing
17 changed file
with
648 addition
and
307 deletion
+648
-307
paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc
paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc
+3
-6
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+56
-50
paddle/fluid/inference/api/analysis_predictor.h
paddle/fluid/inference/api/analysis_predictor.h
+3
-0
paddle/fluid/inference/api/infer_context.cc
paddle/fluid/inference/api/infer_context.cc
+126
-2
paddle/fluid/inference/api/infer_context.h
paddle/fluid/inference/api/infer_context.h
+27
-5
paddle/fluid/inference/api/paddle_api.h
paddle/fluid/inference/api/paddle_api.h
+9
-2
paddle/fluid/inference/api/resource_manager.cc
paddle/fluid/inference/api/resource_manager.cc
+0
-121
paddle/fluid/inference/api/resource_manager.h
paddle/fluid/inference/api/resource_manager.h
+1
-51
paddle/phi/backends/CMakeLists.txt
paddle/phi/backends/CMakeLists.txt
+1
-1
paddle/phi/backends/xpu/xpu_context.cc
paddle/phi/backends/xpu/xpu_context.cc
+6
-4
paddle/phi/backends/xpu/xpu_l3_strategy.cc
paddle/phi/backends/xpu/xpu_l3_strategy.cc
+153
-0
paddle/phi/backends/xpu/xpu_l3_strategy.h
paddle/phi/backends/xpu/xpu_l3_strategy.h
+53
-0
paddle/phi/core/device_context.cc
paddle/phi/core/device_context.cc
+2
-5
paddle/phi/core/device_context.h
paddle/phi/core/device_context.h
+5
-5
test/cpp/inference/api/CMakeLists.txt
test/cpp/inference/api/CMakeLists.txt
+11
-0
test/cpp/inference/api/analysis_predictor_tester.cc
test/cpp/inference/api/analysis_predictor_tester.cc
+0
-55
test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc
test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc
+192
-0
未找到文件。
paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc
浏览文件 @
e135069d
...
@@ -566,12 +566,9 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph,
...
@@ -566,12 +566,9 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph,
}
else
{
}
else
{
conv_bias
.
push_back
(
0
);
conv_bias
.
push_back
(
0
);
}
}
if
(
conv
->
Op
()
->
HasAttr
(
"padding_algorithm"
))
{
conv2d_xpu_op_desc
.
SetAttr
(
conv2d_xpu_op_desc
.
SetAttr
(
"padding_algorithm"
,
"padding_algorithm"
,
conv
->
Op
()
->
GetAttrIfExists
<
std
::
string
>
(
"padding_algorithm"
));
PADDLE_GET_CONST
(
std
::
string
,
conv
->
Op
()
->
GetAttr
(
"padding_algorithm"
)));
}
auto
conv_paddings
=
auto
conv_paddings
=
PADDLE_GET_CONST
(
std
::
vector
<
int
>
,
conv
->
Op
()
->
GetAttr
(
"paddings"
));
PADDLE_GET_CONST
(
std
::
vector
<
int
>
,
conv
->
Op
()
->
GetAttr
(
"paddings"
));
if
(
conv_paddings
.
size
()
==
2
)
{
if
(
conv_paddings
.
size
()
==
2
)
{
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
e135069d
...
@@ -389,25 +389,21 @@ bool AnalysisPredictor::Init(
...
@@ -389,25 +389,21 @@ bool AnalysisPredictor::Init(
}
}
#endif
#endif
#if defined(PADDLE_WITH_XPU)
#if defined(PADDLE_WITH_XPU)
if
(
config_
.
use_xpu_
&&
config_
.
use_external_stream_
)
{
if
(
config_
.
use_xpu_
)
{
private_context_
=
true
;
private_context_
=
true
;
}
if
(
!
status_is_cloned_
&&
config_
.
external_stream_enabled
())
{
if
(
private_context_
)
{
if
(
!
status_is_cloned_
)
{
predictor_stream_
=
config_
.
GetExecStream
();
predictor_stream_
=
config_
.
GetExecStream
();
}
}
// NOTE: If the external_stream equals to global_device_contexts's stream,
auto
*
global_context
=
static_cast
<
phi
::
XPUContext
*>
(
// then fallback.
platform
::
DeviceContextPool
::
Instance
().
Get
(
place_
));
auto
global_stream
=
auto
global_stream
=
global_context
->
stream
();
static_cast
<
phi
::
XPUContext
*>
(
if
(
predictor_stream_
==
nullptr
)
{
platform
::
DeviceContextPool
::
Instance
().
Get
(
place_
))
predictor_stream_
=
global_stream
;
->
stream
();
if
(
predictor_stream_
!=
global_stream
)
{
InitResourceManager
(
predictor_stream_
);
InitDeviceContexts
();
}
}
InitDeviceContexts
();
}
}
#endif
#endif
inference
::
DisplayMemoryInfo
(
place_
,
"Init predictor"
);
inference
::
DisplayMemoryInfo
(
place_
,
"Init predictor"
);
return
true
;
return
true
;
}
}
...
@@ -492,15 +488,12 @@ void AnalysisPredictor::InitResourceManager(void *stream) {
...
@@ -492,15 +488,12 @@ void AnalysisPredictor::InitResourceManager(void *stream) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
predictor_stream_
=
predictor_stream_
=
ResourceManager
::
Instance
().
InitGPUResource
(
place_
,
stream
);
ResourceManager
::
Instance
().
InitGPUResource
(
place_
,
stream
);
#elif defined(PADDLE_WITH_XPU)
predictor_stream_
=
ResourceManager
::
Instance
().
InitXPUResource
(
place_
,
stream
);
#endif
#endif
}
}
void
AnalysisPredictor
::
InitDeviceContexts
()
{
void
AnalysisPredictor
::
InitDeviceContexts
()
{
// Init GPUContext.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Init GPUContext.
if
(
place_
.
GetType
()
==
phi
::
AllocationType
::
GPU
)
{
if
(
place_
.
GetType
()
==
phi
::
AllocationType
::
GPU
)
{
device_contexts_
.
emplace
(
device_contexts_
.
emplace
(
place_
,
std
::
async
(
std
::
launch
::
deferred
,
[
=
]
{
place_
,
std
::
async
(
std
::
launch
::
deferred
,
[
=
]
{
...
@@ -512,12 +505,10 @@ void AnalysisPredictor::InitDeviceContexts() {
...
@@ -512,12 +505,10 @@ void AnalysisPredictor::InitDeviceContexts() {
}));
}));
}
}
#endif
#endif
#if
defined(PADDLE_WITH_XPU)
#if
def PADDLE_WITH_XPU
if
(
place_
.
GetType
()
==
phi
::
AllocationType
::
XPU
)
{
if
(
place_
.
GetType
()
==
phi
::
AllocationType
::
XPU
)
{
device_contexts_
.
emplace
(
device_contexts_
.
emplace
(
place_
,
std
::
async
(
std
::
launch
::
deferred
,
[
=
]
{
place_
,
std
::
async
(
std
::
launch
::
deferred
,
[
=
]
{
auto
*
xpu_resource
=
ResourceManager
::
Instance
().
GetXPUResource
(
predictor_stream_
);
auto
&
instance
=
memory
::
allocation
::
AllocatorFacade
::
Instance
();
auto
&
instance
=
memory
::
allocation
::
AllocatorFacade
::
Instance
();
auto
*
xpu_context
=
new
InferXPUContext
(
place_
);
auto
*
xpu_context
=
new
InferXPUContext
(
place_
);
xpu_context
->
SetAllocator
(
instance
.
GetAllocator
(
place_
).
get
());
xpu_context
->
SetAllocator
(
instance
.
GetAllocator
(
place_
).
get
());
...
@@ -530,15 +521,11 @@ void AnalysisPredictor::InitDeviceContexts() {
...
@@ -530,15 +521,11 @@ void AnalysisPredictor::InitDeviceContexts() {
instance
.
GetZeroAllocator
(
place_
).
get
());
instance
.
GetZeroAllocator
(
place_
).
get
());
xpu_context
->
SetHostZeroAllocator
(
xpu_context
->
SetHostZeroAllocator
(
instance
.
GetZeroAllocator
(
platform
::
CPUPlace
()).
get
());
instance
.
GetZeroAllocator
(
platform
::
CPUPlace
()).
get
());
xpu_context
->
SetStream
(
xpu_resource
->
GetStream
());
xpu_context
->
SetStream
(
predictor_stream_
);
xpu_context
->
SetDriverVersion
(
xpu_resource
->
GetDriverVersion
());
xpu_context
->
SetRuntimeVersion
(
xpu_resource
->
GetRuntimeVersion
());
xpu_context
->
SetXpuVersion
(
xpu_resource
->
GetXpuVersion
());
return
std
::
unique_ptr
<
phi
::
DeviceContext
>
(
xpu_context
);
return
std
::
unique_ptr
<
phi
::
DeviceContext
>
(
xpu_context
);
}));
}));
}
}
#endif
#endif
// TODO(Inference): Support other backends.
}
}
void
*
AnalysisPredictor
::
GetExecStream
()
const
{
void
*
AnalysisPredictor
::
GetExecStream
()
const
{
...
@@ -591,6 +578,11 @@ const void *AnalysisPredictor::GetDeviceContexts() const {
...
@@ -591,6 +578,11 @@ const void *AnalysisPredictor::GetDeviceContexts() const {
bool
AnalysisPredictor
::
PrepareScope
(
bool
AnalysisPredictor
::
PrepareScope
(
const
std
::
shared_ptr
<
framework
::
Scope
>
&
parent_scope
)
{
const
std
::
shared_ptr
<
framework
::
Scope
>
&
parent_scope
)
{
#ifdef PADDLE_WITH_XPU
// Set "XPU_PADDLE_L3_SIZE" to "0" to avoid malloc l3 cache when xpu_context
// init.
setenv
(
"XPU_PADDLE_L3_SIZE"
,
"0"
,
0
);
#endif
if
(
parent_scope
)
{
if
(
parent_scope
)
{
PADDLE_ENFORCE_NOT_NULL
(
PADDLE_ENFORCE_NOT_NULL
(
parent_scope
,
parent_scope
,
...
@@ -1513,6 +1505,7 @@ void AnalysisPredictor::PrepareArgument() {
...
@@ -1513,6 +1505,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_
->
SetCustomDeviceId
(
config_
.
custom_device_id
());
argument_
->
SetCustomDeviceId
(
config_
.
custom_device_id
());
}
}
#endif
#endif
#ifdef PADDLE_WITH_XPU
#ifdef PADDLE_WITH_XPU
argument_
->
SetUseXpu
(
config_
.
use_xpu_
);
argument_
->
SetUseXpu
(
config_
.
use_xpu_
);
argument_
->
SetXpuL3WorkspaceSize
(
config_
.
xpu_l3_workspace_size_
);
argument_
->
SetXpuL3WorkspaceSize
(
config_
.
xpu_l3_workspace_size_
);
...
@@ -2153,29 +2146,45 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
...
@@ -2153,29 +2146,45 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
}
}
#endif
#endif
bool
AnalysisPredictor
::
ExpRunWithExternalStream
(
void
*
stream
)
{
bool
AnalysisPredictor
::
ExpRunWithRuntimeConfig
(
void
*
config
)
{
#if defined(PADDLE_WITH_XPU)
#ifdef PADDLE_WITH_XPU
if
(
!
private_context_
)
{
PADDLE_ENFORCE
(
PADDLE_THROW
(
platform
::
errors
::
Fatal
(
private_context_
,
"Please use config.SetExecStream to init resources, and then we "
paddle
::
platform
::
errors
::
Fatal
(
"will bind resources to execution stream."
));
"Must use private context if run predictor with external config."
));
}
if
(
stream
!=
predictor_stream_
)
{
auto
*
dev_ctxs
=
reinterpret_cast
<
const
std
::
map
<
phi
::
Place
,
std
::
shared_future
<
std
::
unique_ptr
<
phi
::
DeviceContext
>>>
*>
(
this
->
GetDeviceContexts
());
auto
*
dev_ctx
=
static_cast
<
InferXPUContext
*>
(
dev_ctxs
->
at
(
place_
).
get
().
get
());
auto
xpu_runtime_config
=
reinterpret_cast
<
paddle_infer
::
experimental
::
XpuRuntimeConfig
*>
(
config
);
auto
*
stream
=
xpu_runtime_config
->
stream
;
if
(
stream
!=
nullptr
&&
stream
!=
predictor_stream_
)
{
paddle
::
platform
::
XPUStreamSync
(
paddle
::
platform
::
XPUStreamSync
(
static_cast
<
paddle
::
xpuStream
>
(
predictor_stream_
));
static_cast
<
paddle
::
xpuStream
>
(
predictor_stream_
));
ResourceManager
::
Instance
().
XpuResourceReBindStream
(
predictor_stream_
,
stream
);
predictor_stream_
=
stream
;
predictor_stream_
=
stream
;
auto
*
dev_ctxs
=
reinterpret_cast
<
const
std
::
map
<
phi
::
Place
,
std
::
shared_future
<
std
::
unique_ptr
<
phi
::
DeviceContext
>>>
*>
(
this
->
GetDeviceContexts
());
auto
*
dev_ctx
=
static_cast
<
InferXPUContext
*>
(
dev_ctxs
->
at
(
place_
).
get
().
get
());
dev_ctx
->
SetStream
(
stream
);
dev_ctx
->
SetStream
(
stream
);
}
}
return
ZeroCopyRun
();
size_t
l3_size
=
xpu_runtime_config
->
l3_size
;
void
*
l3_ptr
=
xpu_runtime_config
->
l3_ptr
;
size_t
l3_autotune_size
=
xpu_runtime_config
->
l3_autotune_size
;
PADDLE_ENFORCE_LE
(
l3_autotune_size
,
l3_size
,
phi
::
errors
::
InvalidArgument
(
"l3_autotune_size(%zu) should be less than or equal to l3_size(%zu)."
,
l3_autotune_size
,
l3_size
));
dev_ctx
->
SetL3Info
(
l3_size
,
l3_ptr
,
l3_autotune_size
);
bool
ret
=
ZeroCopyRun
();
dev_ctx
->
L3CacheAutotune
();
return
ret
;
#endif
#endif
return
false
;
return
false
;
}
}
...
@@ -2543,10 +2552,6 @@ AnalysisPredictor::~AnalysisPredictor() {
...
@@ -2543,10 +2552,6 @@ AnalysisPredictor::~AnalysisPredictor() {
if
(
predictor_stream_
!=
nullptr
)
{
if
(
predictor_stream_
!=
nullptr
)
{
ResourceManager
::
Instance
().
DestroyGPUResource
(
predictor_stream_
);
ResourceManager
::
Instance
().
DestroyGPUResource
(
predictor_stream_
);
}
}
#elif defined(PADDLE_WITH_XPU)
if
(
predictor_stream_
!=
nullptr
)
{
ResourceManager
::
Instance
().
DestroyXPUResource
(
predictor_stream_
);
}
#endif
#endif
if
(
place_
.
GetType
()
!=
phi
::
AllocationType
::
UNDEFINED
)
{
if
(
place_
.
GetType
()
!=
phi
::
AllocationType
::
UNDEFINED
)
{
...
@@ -3057,10 +3062,11 @@ bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p,
...
@@ -3057,10 +3062,11 @@ bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p,
#endif
#endif
return
false
;
return
false
;
}
}
bool
InternalUtils
::
RunWithExternalStream
(
paddle_infer
::
Predictor
*
p
,
void
*
stream
)
{
bool
InternalUtils
::
RunWithRuntimeConfig
(
paddle_infer
::
Predictor
*
p
,
void
*
config
)
{
auto
pred
=
dynamic_cast
<
paddle
::
AnalysisPredictor
*>
(
p
->
predictor_
.
get
());
auto
pred
=
dynamic_cast
<
paddle
::
AnalysisPredictor
*>
(
p
->
predictor_
.
get
());
return
pred
->
ExpRunWith
ExternalStream
(
stream
);
return
pred
->
ExpRunWith
RuntimeConfig
(
config
);
}
}
void
InternalUtils
::
UpdateConfigInterleaved
(
paddle_infer
::
Config
*
c
,
void
InternalUtils
::
UpdateConfigInterleaved
(
paddle_infer
::
Config
*
c
,
...
...
paddle/fluid/inference/api/analysis_predictor.h
浏览文件 @
e135069d
...
@@ -228,6 +228,9 @@ class AnalysisPredictor : public PaddlePredictor {
...
@@ -228,6 +228,9 @@ class AnalysisPredictor : public PaddlePredictor {
// Note: Can only be used under thread_local semantics.
// Note: Can only be used under thread_local semantics.
bool
ExpRunWithExternalStream
(
void
*
stream
);
bool
ExpRunWithExternalStream
(
void
*
stream
);
// Note: Can only be used under thread_local semantics.
bool
ExpRunWithRuntimeConfig
(
void
*
config
);
///
///
/// \brief Get the execution stream on devices with a concept of stream,
/// \brief Get the execution stream on devices with a concept of stream,
/// otherwise returns nullptr.
/// otherwise returns nullptr.
...
...
paddle/fluid/inference/api/infer_context.cc
浏览文件 @
e135069d
...
@@ -13,7 +13,11 @@
...
@@ -13,7 +13,11 @@
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/inference/api/infer_context.h"
#include "paddle/fluid/inference/api/infer_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#ifdef PADDLE_WITH_XPU
#include "xpu/runtime.h"
#endif
#include "glog/logging.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -22,9 +26,129 @@ InferGPUContext::InferGPUContext(const phi::Place& place)
...
@@ -22,9 +26,129 @@ InferGPUContext::InferGPUContext(const phi::Place& place)
:
phi
::
GPUContext
(
place
,
false
)
{}
:
phi
::
GPUContext
(
place
,
false
)
{}
#endif
#endif
#if
defined(PADDLE_WITH_XPU)
#if
def PADDLE_WITH_XPU
InferXPUContext
::
InferXPUContext
(
const
phi
::
Place
&
place
)
InferXPUContext
::
InferXPUContext
(
const
phi
::
Place
&
place
)
:
phi
::
XPUContext
(
place
)
{}
:
phi
::
XPUContext
(
place
)
{}
void
*
InferXPUContext
::
Alloc
(
phi
::
TensorBase
*
tensor
,
phi
::
DataType
dtype
,
size_t
requested_size
,
bool
pinned
,
bool
fake_alloc
)
const
{
size_t
size
=
tensor
->
numel
()
*
phi
::
SizeOf
(
tensor
->
dtype
());
if
(
l3_autotune_size_
>
0
&&
holder_map_
.
empty
())
{
void
*
data_ptr
=
DeviceContext
::
Alloc
(
tensor
,
dtype
,
requested_size
,
pinned
,
fake_alloc
);
phi
::
XPUL3CacheBlock
*
l3_block
=
nullptr
;
phi
::
Allocation
*
holder
=
reinterpret_cast
<
phi
::
DenseTensor
*>
(
tensor
)
->
Holder
().
get
();
if
(
holder_l3_blocks_
.
count
(
holder
)
==
0
)
{
l3_block
=
new
phi
::
XPUL3CacheBlock
();
holder_l3_blocks_
[
holder
]
=
l3_block
;
l3_blocks_
.
push_back
(
l3_block
);
}
else
{
l3_block
=
holder_l3_blocks_
[
holder
];
}
l3_block
->
Record
(
size
);
return
data_ptr
;
}
else
if
(
l3_autotune_size_
>
0
&&
!
holder_map_
.
empty
())
{
phi
::
Allocation
*
holder
=
reinterpret_cast
<
phi
::
DenseTensor
*>
(
tensor
)
->
Holder
().
get
();
auto
holder_iter
=
holder_map_
.
find
(
holder
);
if
(
holder_iter
!=
holder_map_
.
end
())
{
auto
&
holder_pair
=
holder_iter
->
second
;
auto
*
swap_holder
=
holder_pair
.
first
;
bool
&
swap_holder_is_l3
=
holder_pair
.
second
;
if
(
swap_holder_is_l3
&&
swap_holder
->
size
()
>=
size
)
{
swap
(
*
holder
,
*
swap_holder
);
swap_holder_is_l3
=
false
;
}
else
if
(
!
swap_holder_is_l3
&&
holder
->
size
()
<
size
)
{
swap
(
*
holder
,
*
swap_holder
);
swap_holder_is_l3
=
true
;
}
}
return
DeviceContext
::
Alloc
(
tensor
,
dtype
,
requested_size
,
pinned
,
fake_alloc
);
}
else
{
return
DeviceContext
::
Alloc
(
tensor
,
dtype
,
requested_size
,
pinned
,
fake_alloc
);
}
}
void
InferXPUContext
::
SetL3Info
(
size_t
l3_size
,
void
*
l3_ptr
,
size_t
l3_autotune_size
)
{
if
(
l3_ptr
==
nullptr
)
{
if
(
l3_size_
!=
l3_size
)
{
if
(
l3_owned_
)
{
xpu_free
(
l3_ptr_
);
}
if
(
l3_size
>
0
)
{
xpu_malloc
(
&
l3_ptr_
,
l3_size
,
XPU_MEM_L3
);
if
(
l3_ptr_
!=
nullptr
)
{
VLOG
(
3
)
<<
"remalloc l3("
<<
l3_size
<<
") success."
;
l3_size_
=
l3_size
;
l3_owned_
=
true
;
l3_autotune_size_
=
l3_autotune_size
;
}
else
{
VLOG
(
3
)
<<
"malloc l3("
<<
l3_size
<<
") failed. No l3 will be used."
;
l3_size_
=
0
;
l3_owned_
=
false
;
l3_autotune_size_
=
0
;
}
}
}
}
else
{
if
(
l3_owned_
)
{
xpu_free
(
l3_ptr_
);
}
l3_ptr_
=
l3_ptr
;
l3_size_
=
l3_size
;
l3_autotune_size_
=
l3_autotune_size
;
}
if
(
l3_autotune_size_
==
0
)
{
x_context
()
->
_l3_mgr
.
set
(
l3_ptr_
,
l3_size_
);
}
}
void
InferXPUContext
::
L3CacheAutotune
()
{
if
(
l3_autotune_size_
==
0
)
return
;
if
(
holder_map_
.
empty
())
{
l3_plan_
.
RunAutotune
(
l3_blocks_
,
l3_size_
);
auto
*
plan
=
l3_plan_
.
plan
();
int8_t
*
cur_l3_ptr
=
reinterpret_cast
<
int8_t
*>
(
l3_ptr_
);
for
(
size_t
i
=
0
;
i
<
l3_blocks_
.
size
();
i
++
)
{
size_t
block_size
=
plan
->
at
(
i
);
if
(
block_size
>
0
)
{
l3_blocks_
[
i
]
->
Set
(
cur_l3_ptr
,
block_size
);
cur_l3_ptr
+=
block_size
;
}
}
x_context
()
->
_l3_mgr
.
set
(
reinterpret_cast
<
int8_t
*>
(
l3_ptr_
)
+
l3_size_
-
plan
->
back
(),
plan
->
back
());
for
(
auto
holder_l3_block
:
holder_l3_blocks_
)
{
auto
*
l3_block
=
holder_l3_block
.
second
;
if
(
l3_block
->
size
()
>
0
)
{
auto
*
holder
=
holder_l3_block
.
first
;
auto
place
=
holder
->
place
();
phi
::
Allocation
*
l3_holder
=
new
phi
::
Allocation
(
l3_block
->
data
(),
l3_block
->
size
(),
place
);
holder_map_
[
holder
]
=
std
::
make_pair
(
l3_holder
,
true
);
}
}
}
else
{
for
(
auto
&
holders
:
holder_map_
)
{
auto
*
holder
=
holders
.
first
;
auto
&
holder_pair
=
holders
.
second
;
if
(
!
holder_pair
.
second
)
{
swap
(
*
holder
,
*
(
holder_pair
.
first
));
holder_pair
.
second
=
true
;
}
}
}
}
#endif
#endif
}
// namespace paddle
}
// namespace paddle
paddle/fluid/inference/api/infer_context.h
浏览文件 @
e135069d
...
@@ -15,6 +15,9 @@
...
@@ -15,6 +15,9 @@
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/common/place.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/xpu_l3_strategy.h"
#endif
namespace
paddle
{
namespace
paddle
{
...
@@ -46,14 +49,33 @@ class InferGPUContext : public phi::GPUContext {
...
@@ -46,14 +49,33 @@ class InferGPUContext : public phi::GPUContext {
};
};
#endif
#endif
#if
defined(PADDLE_WITH_XPU)
#if
def PADDLE_WITH_XPU
class
InferXPUContext
:
public
phi
::
XPUContext
{
class
InferXPUContext
:
public
phi
::
XPUContext
{
public:
public:
explicit
InferXPUContext
(
const
phi
::
Place
&
place
);
explicit
InferXPUContext
(
const
phi
::
Place
&
place
);
using
phi
::
XPUContext
::
SetDriverVersion
;
using
phi
::
XPUContext
::
SetRuntimeVersion
;
void
*
Alloc
(
phi
::
TensorBase
*
tensor
,
using
phi
::
XPUContext
::
SetStream
;
phi
::
DataType
dtype
,
using
phi
::
XPUContext
::
SetXpuVersion
;
size_t
requested_size
=
0
,
bool
pinned
=
false
,
bool
fake_alloc
=
false
)
const
override
;
void
SetL3Info
(
size_t
l3_size
,
void
*
l3_ptr
,
size_t
l3_autotune_size
);
void
L3CacheAutotune
();
private:
size_t
l3_size_
{
0
};
void
*
l3_ptr_
{
nullptr
};
bool
l3_owned_
{
false
};
size_t
l3_autotune_size_
{
0
};
mutable
std
::
vector
<
phi
::
XPUL3CacheBlock
*>
l3_blocks_
;
mutable
std
::
unordered_map
<
phi
::
Allocation
*
,
phi
::
XPUL3CacheBlock
*>
holder_l3_blocks_
;
mutable
std
::
unordered_map
<
phi
::
Allocation
*
,
std
::
pair
<
phi
::
Allocation
*
,
bool
>>
holder_map_
;
phi
::
XPUL3Planner
l3_plan_
;
};
};
#endif
#endif
}
// namespace paddle
}
// namespace paddle
paddle/fluid/inference/api/paddle_api.h
浏览文件 @
e135069d
...
@@ -471,6 +471,13 @@ class Predictor;
...
@@ -471,6 +471,13 @@ class Predictor;
class
Tensor
;
class
Tensor
;
using
Config
=
paddle
::
AnalysisConfig
;
using
Config
=
paddle
::
AnalysisConfig
;
namespace
experimental
{
namespace
experimental
{
struct
XpuRuntimeConfig
{
void
*
stream
{
nullptr
};
size_t
l3_size
{
16773120
};
void
*
l3_ptr
{
nullptr
};
size_t
l3_autotune_size
{
0
};
};
// Unstable interface, may be modified or deleted in the future.
// Unstable interface, may be modified or deleted in the future.
class
PD_INFER_DECL
InternalUtils
{
class
PD_INFER_DECL
InternalUtils
{
public:
public:
...
@@ -479,8 +486,8 @@ class PD_INFER_DECL InternalUtils {
...
@@ -479,8 +486,8 @@ class PD_INFER_DECL InternalUtils {
cudaStream_t
stream
);
cudaStream_t
stream
);
static
bool
RunWithExternalStream
(
paddle_infer
::
Predictor
*
pred
,
static
bool
RunWithExternalStream
(
paddle_infer
::
Predictor
*
pred
,
hipStream_t
stream
);
hipStream_t
stream
);
static
bool
RunWith
ExternalStream
(
paddle_infer
::
Predictor
*
pred
,
static
bool
RunWith
RuntimeConfig
(
paddle_infer
::
Predictor
*
pred
,
void
*
config
);
void
*
stream
);
static
void
UpdateConfigInterleaved
(
paddle_infer
::
Config
*
c
,
static
void
UpdateConfigInterleaved
(
paddle_infer
::
Config
*
c
,
bool
with_interleaved
);
bool
with_interleaved
);
...
...
paddle/fluid/inference/api/resource_manager.cc
浏览文件 @
e135069d
...
@@ -41,9 +41,6 @@
...
@@ -41,9 +41,6 @@
#include "paddle/phi/backends/dynload/cusparse.h"
#include "paddle/phi/backends/dynload/cusparse.h"
#endif // PADDLE_WITH_CUDA
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/xpu_info.h"
#endif
namespace
paddle
{
namespace
paddle
{
namespace
internal
{
namespace
internal
{
...
@@ -451,123 +448,5 @@ int ResourceManager::RefCount(void* stream) const {
...
@@ -451,123 +448,5 @@ int ResourceManager::RefCount(void* stream) const {
if
(
ref_count_
.
count
(
stream
)
==
0
)
return
0
;
if
(
ref_count_
.
count
(
stream
)
==
0
)
return
0
;
return
ref_count_
.
at
(
stream
);
return
ref_count_
.
at
(
stream
);
}
}
#endif
#if defined(PADDLE_WITH_XPU)
// XPUContextResource
XPUContextResource
::
XPUContextResource
(
const
phi
::
Place
&
place
,
void
*
stream
)
:
place_
(
place
)
{
InitXPUResource
(
stream
);
}
XPUContextResource
::~
XPUContextResource
()
{}
void
XPUContextResource
::
InitXPUResource
(
void
*
stream
)
{
phi
::
backends
::
xpu
::
XPUDeviceGuard
guard
(
place_
.
device
);
if
(
stream
)
{
owned_stream_
=
false
;
stream_
=
stream
;
}
InitXpuProperties
();
}
void
XPUContextResource
::
InitXpuProperties
()
{
phi
::
backends
::
xpu
::
XPUDeviceGuard
guard
(
place_
.
device
);
driver_version_
=
phi
::
backends
::
xpu
::
GetDriverVersion
();
runtime_version_
=
phi
::
backends
::
xpu
::
GetRuntimeVersion
();
xpu_version_
=
static_cast
<
int
>
(
phi
::
backends
::
xpu
::
get_xpu_version
(
place_
.
device
));
}
void
*
XPUContextResource
::
GetStream
()
const
{
return
stream_
;
}
int
XPUContextResource
::
GetDriverVersion
()
const
{
return
driver_version_
;
}
int
XPUContextResource
::
GetRuntimeVersion
()
const
{
return
runtime_version_
;
}
int
XPUContextResource
::
GetXpuVersion
()
const
{
return
xpu_version_
;
}
void
XPUContextResource
::
ReBindStream
(
void
*
stream
)
{
owned_stream_
=
false
;
stream_
=
stream
;
}
// XPUContextResource End.
// Resource Manager
void
*
ResourceManager
::
InitXPUResource
(
const
phi
::
Place
&
place
,
void
*
stream
)
{
std
::
lock_guard
<
std
::
mutex
>
lock_gurad
(
xpu_mutex_
);
if
(
xpu_resources_
.
count
(
stream
))
{
Increase
(
stream
);
return
stream
;
}
else
{
std
::
unique_ptr
<
XPUContextResource
>
resource
{
new
XPUContextResource
(
place
,
stream
)};
void
*
s
=
resource
->
GetStream
();
ref_count_
[
s
]
=
1
;
xpu_resources_
.
emplace
(
s
,
std
::
move
(
resource
));
return
s
;
}
}
XPUContextResource
*
ResourceManager
::
GetXPUResource
(
void
*
stream
)
const
{
PADDLE_ENFORCE_EQ
(
xpu_resources_
.
count
(
stream
),
true
,
platform
::
errors
::
InvalidArgument
(
"The stream[%p] not found in xpu_resources."
,
stream
));
return
xpu_resources_
.
at
(
stream
).
get
();
}
void
ResourceManager
::
XpuResourceReBindStream
(
void
*
old_stream
,
void
*
new_stream
)
{
PADDLE_ENFORCE_EQ
(
xpu_resources_
.
count
(
old_stream
),
true
,
platform
::
errors
::
InvalidArgument
(
"The stream[%p] not found in xpu_resources."
,
old_stream
));
auto
xpu_resource
=
std
::
move
(
xpu_resources_
.
at
(
old_stream
));
DestroyXPUResource
(
old_stream
);
PADDLE_ENFORCE_EQ
(
ref_count_
.
count
(
old_stream
),
0
,
platform
::
errors
::
Fatal
(
"xpu resources rebind stream failed."
));
xpu_resource
->
ReBindStream
(
new_stream
);
ref_count_
[
new_stream
]
++
;
xpu_resources_
.
emplace
(
new_stream
,
std
::
move
(
xpu_resource
));
}
void
ResourceManager
::
DestroyXPUResource
(
void
*
stream
)
{
PADDLE_ENFORCE_EQ
(
xpu_resources_
.
count
(
stream
),
true
,
platform
::
errors
::
InvalidArgument
(
"The stream[%p] not found in xpu_resources."
,
stream
));
Decrease
(
stream
);
}
void
ResourceManager
::
Decrease
(
void
*
stream
)
{
PADDLE_ENFORCE_EQ
(
ref_count_
.
count
(
stream
),
true
,
platform
::
errors
::
InvalidArgument
(
"The stream[%p] not found in ref_count."
,
stream
));
--
ref_count_
[
stream
];
if
(
ref_count_
[
stream
]
==
0
)
{
ref_count_
.
erase
(
stream
);
xpu_resources_
.
erase
(
stream
);
}
}
void
ResourceManager
::
Increase
(
void
*
stream
)
{
PADDLE_ENFORCE_EQ
(
ref_count_
.
count
(
stream
),
true
,
platform
::
errors
::
InvalidArgument
(
"The stream[%p] not found in ref_count."
,
stream
));
++
ref_count_
[
stream
];
}
int
ResourceManager
::
RefCount
(
void
*
stream
)
const
{
if
(
ref_count_
.
count
(
stream
)
==
0
)
return
0
;
return
ref_count_
.
at
(
stream
);
}
// Resource Manager End.
#endif
#endif
}
// namespace paddle
}
// namespace paddle
paddle/fluid/inference/api/resource_manager.h
浏览文件 @
e135069d
...
@@ -124,33 +124,6 @@ class GPUContextResource {
...
@@ -124,33 +124,6 @@ class GPUContextResource {
};
};
#endif
#endif
#if defined(PADDLE_WITH_XPU)
class
XPUContextResource
{
public:
explicit
XPUContextResource
(
const
phi
::
Place
&
place
,
void
*
stream
);
~
XPUContextResource
();
phi
::
Place
Place
()
const
;
void
*
GetStream
()
const
;
int
GetDriverVersion
()
const
;
int
GetRuntimeVersion
()
const
;
int
GetXpuVersion
()
const
;
void
ReBindStream
(
void
*
stream
);
private:
void
InitXPUResource
(
void
*
stream
);
void
InitXpuProperties
();
private:
bool
owned_stream_
{
true
};
void
*
stream_
;
phi
::
Place
place_
;
int
driver_version_
;
int
runtime_version_
;
int
xpu_version_
;
};
// class XPUContextResource
#endif
class
ResourceManager
{
class
ResourceManager
{
public:
public:
ResourceManager
()
=
default
;
ResourceManager
()
=
default
;
...
@@ -168,9 +141,8 @@ class ResourceManager {
...
@@ -168,9 +141,8 @@ class ResourceManager {
std
::
mutex
cpu_mutex_
;
std
::
mutex
cpu_mutex_
;
std
::
unique_ptr
<
CPUContextResource
>
cpu_resource_
{
nullptr
};
std
::
unique_ptr
<
CPUContextResource
>
cpu_resource_
{
nullptr
};
// GPU Resource
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// GPU Resource
public:
public:
void
*
InitGPUResource
(
const
phi
::
Place
&
place
,
void
*
stream
);
void
*
InitGPUResource
(
const
phi
::
Place
&
place
,
void
*
stream
);
void
DestroyGPUResource
(
void
*
stream
);
void
DestroyGPUResource
(
void
*
stream
);
...
@@ -190,28 +162,6 @@ class ResourceManager {
...
@@ -190,28 +162,6 @@ class ResourceManager {
gpu_resources_
;
gpu_resources_
;
#endif
#endif
// XPU Resource
#if defined(PADDLE_WITH_XPU)
public:
void
*
InitXPUResource
(
const
phi
::
Place
&
place
,
void
*
stream
);
void
DestroyXPUResource
(
void
*
stream
);
XPUContextResource
*
GetXPUResource
(
void
*
stream
)
const
;
int
RefCount
(
void
*
stream
)
const
;
void
XpuResourceReBindStream
(
void
*
old_stream
,
void
*
new_stream
);
private:
void
Decrease
(
void
*
stream
);
void
Increase
(
void
*
stream
);
private:
std
::
mutex
xpu_mutex_
;
// a stream corresponding to a series of resource.
std
::
map
<
void
*
/*stream*/
,
std
::
atomic
<
int
>>
ref_count_
;
std
::
map
<
void
*
/*stream*/
,
std
::
unique_ptr
<
XPUContextResource
>>
xpu_resources_
;
#endif
private:
private:
DISABLE_COPY_AND_ASSIGN
(
ResourceManager
);
DISABLE_COPY_AND_ASSIGN
(
ResourceManager
);
};
};
...
...
paddle/phi/backends/CMakeLists.txt
浏览文件 @
e135069d
...
@@ -38,7 +38,7 @@ endif()
...
@@ -38,7 +38,7 @@ endif()
if
(
WITH_XPU
)
if
(
WITH_XPU
)
list
(
APPEND BACKENDS_SRCS xpu/xpu_context.cc xpu/xpu_info.cc
)
list
(
APPEND BACKENDS_SRCS xpu/xpu_context.cc xpu/xpu_info.cc
)
list
(
APPEND BACKENDS_SRCS xpu/xpu_op_list.cc xpu/xpu1_op_list.cc
list
(
APPEND BACKENDS_SRCS xpu/xpu_op_list.cc xpu/xpu1_op_list.cc
xpu/xpu2_op_list.cc
)
xpu/xpu2_op_list.cc
xpu/xpu_l3_strategy.cc
)
endif
()
endif
()
if
(
WITH_MKLDNN
)
if
(
WITH_MKLDNN
)
...
...
paddle/phi/backends/xpu/xpu_context.cc
浏览文件 @
e135069d
...
@@ -42,11 +42,13 @@ struct XPUContext::Impl {
...
@@ -42,11 +42,13 @@ struct XPUContext::Impl {
auto
selected_xpus
=
backends
::
xpu
::
GetXPUSelectedDevices
();
auto
selected_xpus
=
backends
::
xpu
::
GetXPUSelectedDevices
();
for
(
unsigned
int
i
=
0
;
i
<
selected_xpus
.
size
();
i
++
)
{
for
(
unsigned
int
i
=
0
;
i
<
selected_xpus
.
size
();
i
++
)
{
if
(
place_
.
GetDeviceId
()
==
selected_xpus
[
i
])
{
if
(
place_
.
GetDeviceId
()
==
selected_xpus
[
i
])
{
if
(
l3ptrs
[
place_
.
GetDeviceId
()]
==
nullptr
)
{
if
(
l3ptrs
[
place_
.
GetDeviceId
()]
!=
nullptr
)
{
xpu_malloc
(
static_cast
<
void
**>
(
&
l3ptrs
[
place_
.
GetDeviceId
()]),
xpu_free
(
l3ptrs
[
place_
.
GetDeviceId
()]);
l3_size
,
l3ptrs
[
place_
.
GetDeviceId
()]
=
nullptr
;
XPU_MEM_L3
);
}
}
xpu_malloc
(
static_cast
<
void
**>
(
&
l3ptrs
[
place_
.
GetDeviceId
()]),
l3_size
,
XPU_MEM_L3
);
if
(
l3ptrs
[
place_
.
GetDeviceId
()]
!=
nullptr
)
{
if
(
l3ptrs
[
place_
.
GetDeviceId
()]
!=
nullptr
)
{
context_
->
_l3_mgr
.
set
(
l3ptrs
[
place_
.
GetDeviceId
()],
l3_size
);
context_
->
_l3_mgr
.
set
(
l3ptrs
[
place_
.
GetDeviceId
()],
l3_size
);
VLOG
(
3
)
<<
"xpu place "
<<
static_cast
<
int
>
(
place_
.
GetDeviceId
())
VLOG
(
3
)
<<
"xpu place "
<<
static_cast
<
int
>
(
place_
.
GetDeviceId
())
...
...
paddle/phi/backends/xpu/xpu_l3_strategy.cc
0 → 100644
浏览文件 @
e135069d
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/backends/xpu/xpu_l3_strategy.h"
#include "glog/logging.h"
namespace
phi
{
// Stores the given address and size in this cache block.
// A null address or a zero size is reported through LOG(FATAL).
void XPUL3CacheBlock::Set(void* addr, size_t size) {
  const bool usable = (addr != nullptr) && (size != 0);
  if (!usable) {
    LOG(FATAL) << "Set XPUL3CacheBlock Size as Zero";
  }
  size_ = size;
  addr_ = addr;
}
// Computes the L3 allocation plan from the size history recorded on each
// cache block and stores it in plan_.
//
// l3_block_dict: cache blocks to plan for. Each block's history_ is sorted in
//                place. Blocks with fewer than two history entries are not
//                planned and keep a plan entry of 0.
// l3_size:       total L3 budget shared by all blocks.
//
// The call is a no-op when there are no blocks, the budget is zero, or a plan
// already exists. On success plan_ has l3_block_dict.size() + 1 entries:
// plan_[i] is the size chosen for block i, and the last entry is the budget
// left over, rounded down to a multiple of 64.
void XPUL3Planner::RunAutotune(
    const std::vector<XPUL3CacheBlock*>& l3_block_dict, size_t l3_size) {
  if (l3_block_dict.empty() || l3_size == 0 || !plan_.empty()) {
    return;
  }
  VLOG(3) << "AutoTune XPU L3 Cache Block Start.";

  // One candidate allocation: `weights` is the L3 space consumed, `scores`
  // the summed history sizes it covers, `choices` the size picked per block.
  struct node {
    size_t weights = 0;
    size_t scores = 0;
    std::vector<size_t> choices{0};
  };

  std::vector<std::vector<node>> records;
  // record_map[k] is the index in l3_block_dict of the block behind records[k].
  std::vector<size_t> record_map;
  size_t total_scores = 0;

  // Step 1: build the candidate list of every block from its size history.
  for (size_t block_idx = 0; block_idx < l3_block_dict.size(); block_idx++) {
    XPUL3CacheBlock* cur_block = l3_block_dict[block_idx];
    std::vector<size_t>& history = cur_block->history_;
    const size_t history_size = history.size();
    size_t score = 0;
    VLOG(3) << "Block Idx is " << block_idx;
    if (history_size > 1) {
      // The default node is the "give this block no L3" candidate.
      std::vector<node> block_nodes{node()};
      std::sort(history.begin(), history.end());
      for (size_t i = 0; i < history_size; i++) {
        VLOG(3) << "Size History : " << i << " is " << history[i];
        if (history[i] > l3_size) {
          break;
        }
        score += history[i];
        // Emit one candidate per distinct size, at its last occurrence, so
        // that `score` already includes every entry up to that size.
        if (i == history_size - 1 || history[i + 1] != history[i]) {
          node cur_node;
          cur_node.weights = history[i];
          cur_node.choices = {history[i]};
          cur_node.scores = score;
          block_nodes.push_back(cur_node);
          VLOG(3) << "Node Weights is:" << cur_node.weights
                  << ", Node Scores is: " << score;
        }
      }
      total_scores += score;
      records.push_back(block_nodes);
      record_map.push_back(block_idx);
    }
  }
  if (records.empty()) {
    return;
  }

  // Step 2: merge the blocks one at a time, keeping only combinations that
  // fit in the budget and are not dominated by a lighter combination.
  std::vector<node> res(records[0]);
  for (size_t block_idx = 1; block_idx < records.size(); block_idx++) {
    const std::vector<node>& block_nodes = records[block_idx];
    std::vector<node> new_nodes;
    for (size_t node_idx = 0; node_idx < block_nodes.size(); node_idx++) {
      for (size_t res_idx = 0; res_idx < res.size(); res_idx++) {
        const size_t cur_weights =
            block_nodes[node_idx].weights + res[res_idx].weights;
        // res is ordered by ascending weight, so later entries overflow too.
        if (cur_weights > l3_size) {
          break;
        }
        node cur_node;
        cur_node.scores = block_nodes[node_idx].scores + res[res_idx].scores;
        cur_node.weights = cur_weights;
        cur_node.choices = res[res_idx].choices;
        cur_node.choices.push_back(block_nodes[node_idx].choices[0]);
        new_nodes.push_back(cur_node);
      }
    }

    // Ascending weight; for equal weight the higher score comes first.
    // Taking the nodes by reference avoids copying `choices` per comparison.
    auto lighter_then_better = [](const node& a, const node& b) -> bool {
      if (a.weights != b.weights) {
        return a.weights < b.weights;
      }
      return a.scores > b.scores;
    };
    std::sort(new_nodes.begin(), new_nodes.end(), lighter_then_better);

    // A candidate survives only if it scores strictly higher than every
    // candidate sorted before it; one pass with a running maximum suffices.
    res.clear();
    bool has_best = false;
    size_t best_score = 0;
    for (const node& candidate : new_nodes) {
      if (!has_best || candidate.scores > best_score) {
        res.push_back(candidate);
        best_score = candidate.scores;
        has_best = true;
      }
    }
    VLOG(3) << "XPU L3 Block IDX is " << block_idx
            << ", Choices before filter are " << new_nodes.size()
            << ", Choices after filter are " << res.size();
  }

  // final result: res.back().choices
  // std::vector<size_t> record_map;
  const node& best = res.back();
  for (size_t i = 0; i < best.choices.size(); i++) {
    VLOG(3) << "BLOCK IDX is " << i << ", Acquired L3 Size is "
            << best.choices[i];
  }
  double l3_global_ratio = static_cast<double>(best.scores) /
                           static_cast<double>(total_scores);
  VLOG(3) << "Tensor Space in L3 / Tensor Space in Global :"
          << l3_global_ratio * 100 << " %";

  // The accumulator type follows the initial value, so seed it with size_t
  // rather than int to avoid narrowing the summed sizes.
  size_t block_l3_size = std::accumulate(
      best.choices.begin(), best.choices.end(), static_cast<size_t>(0));
  size_t xdnn_ctx_l3_size = (l3_size - block_l3_size) / 64 * 64;
  VLOG(3) << "Block L3 Size : " << block_l3_size
          << ", XDNN Ctx L3 Size : " << xdnn_ctx_l3_size;

  plan_.resize(l3_block_dict.size() + 1, 0);
  for (size_t i = 0; i < best.choices.size(); i++) {
    plan_[record_map[i]] = best.choices[i];
  }
  plan_[l3_block_dict.size()] = xdnn_ctx_l3_size;
  VLOG(3) << "AutoTune XPU L3 Cache Block End.";
}
}
// namespace phi
paddle/phi/backends/xpu/xpu_l3_strategy.h
0 → 100644
浏览文件 @
e135069d
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <numeric>
#include <vector>
namespace
phi
{
// Bookkeeping for one block of XPU L3 cache: the address and size currently
// assigned to it, plus the list of sizes recorded for it.
struct XPUL3CacheBlock {
 public:
  // Drops the assigned address and size and forgets all recorded sizes.
  void Clear() {
    addr_ = nullptr;
    size_ = 0;
    history_.clear();
  }

  // Assigns an address and size to this block (defined out of line).
  void Set(void* addr, size_t size);

  // Appends one size to the recorded history.
  void Record(size_t size) { history_.push_back(size); }

  // Accessors are const-qualified so they can be used through const
  // references and pointers as well.
  void* data() const { return addr_; }
  size_t size() const { return size_; }

 private:
  void* addr_{nullptr};
  size_t size_{0};

 public:
  // Sizes passed to Record(); kept public because it is accessed directly
  // from outside the struct.
  std::vector<size_t> history_;
};
// Owns the L3 allocation plan produced by RunAutotune().
class XPUL3Planner {
 public:
  // Fills the plan from the given cache blocks and total L3 size
  // (defined out of line).
  void RunAutotune(const std::vector<XPUL3CacheBlock*>& l3_block_dict,
                   size_t l3_size);

  // Returns a pointer to the stored plan; the planner keeps ownership.
  std::vector<size_t>* plan() {
    std::vector<size_t>* stored_plan = &plan_;
    return stored_plan;
  }

 private:
  std::vector<size_t> plan_;
};
}
// namespace phi
paddle/phi/core/device_context.cc
浏览文件 @
e135069d
...
@@ -393,11 +393,8 @@ template <typename T>
...
@@ -393,11 +393,8 @@ template <typename T>
T
*
DeviceContext
::
Alloc
(
TensorBase
*
tensor
,
T
*
DeviceContext
::
Alloc
(
TensorBase
*
tensor
,
size_t
requested_size
,
size_t
requested_size
,
bool
pinned
)
const
{
bool
pinned
)
const
{
if
(
pinned
)
{
DataType
dtype
=
phi
::
CppTypeToDataType
<
T
>::
Type
();
return
impl_
->
Alloc
<
T
>
(
return
static_cast
<
T
*>
(
this
->
Alloc
(
tensor
,
dtype
,
requested_size
,
pinned
));
tensor
,
GetPinnedPlace
(
GetPlace
()),
requested_size
,
pinned
);
}
return
impl_
->
Alloc
<
T
>
(
tensor
,
GetPlace
(),
requested_size
,
pinned
);
}
}
void
*
DeviceContext
::
HostAlloc
(
TensorBase
*
tensor
,
void
*
DeviceContext
::
HostAlloc
(
TensorBase
*
tensor
,
...
...
paddle/phi/core/device_context.h
浏览文件 @
e135069d
...
@@ -145,11 +145,11 @@ class PADDLE_API DeviceContext {
...
@@ -145,11 +145,11 @@ class PADDLE_API DeviceContext {
/**
/**
* @brief Allocate device memory for tensor.
* @brief Allocate device memory for tensor.
*/
*/
void
*
Alloc
(
TensorBase
*
,
v
irtual
v
oid
*
Alloc
(
TensorBase
*
,
DataType
dtype
,
DataType
dtype
,
size_t
requested_size
=
0
,
size_t
requested_size
=
0
,
bool
pinned
=
false
,
bool
pinned
=
false
,
bool
fake_alloc
=
false
)
const
;
bool
fake_alloc
=
false
)
const
;
template
<
typename
T
>
template
<
typename
T
>
T
*
Alloc
(
TensorBase
*
tensor
,
T
*
Alloc
(
TensorBase
*
tensor
,
...
...
test/cpp/inference/api/CMakeLists.txt
浏览文件 @
e135069d
...
@@ -1461,6 +1461,17 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
...
@@ -1461,6 +1461,17 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
--repeat=10
)
--repeat=10
)
endif
()
endif
()
if
(
WITH_XPU
)
inference_analysis_test
(
xpu_runtime_config_resnet50_test
SRCS
xpu_runtime_config_resnet50_test.cc
EXTRA_DEPS
paddle_inference_shared
ARGS
--infer_model=
${
RESNET50_MODEL_DIR
}
)
endif
()
set
(
inference_deps
${
analysis_deps
}
paddle_inference_api analysis
set
(
inference_deps
${
analysis_deps
}
paddle_inference_api analysis
naive_executor
${
GLOB_PASS_LIB
}
)
naive_executor
${
GLOB_PASS_LIB
}
)
...
...
test/cpp/inference/api/analysis_predictor_tester.cc
浏览文件 @
e135069d
...
@@ -17,10 +17,6 @@
...
@@ -17,10 +17,6 @@
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
#include <cuda_runtime.h>
#endif
#endif
#if defined(PADDLE_WITH_XPU)
#include "xpu/runtime.h"
#include "xpu/xdnn.h"
#endif
#include <glog/logging.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <gtest/gtest.h>
...
@@ -671,57 +667,6 @@ TEST(Predictor, Streams) {
...
@@ -671,57 +667,6 @@ TEST(Predictor, Streams) {
}
}
#endif
#endif
#if defined(PADDLE_WITH_XPU)
TEST
(
Predictor
,
XPUStreams
)
{
// external stream
{
auto
context
=
baidu
::
xpu
::
api
::
create_context
();
xpu_stream_create
(
&
context
->
xpu_stream
);
Config
config
;
config
.
SetModel
(
FLAGS_dirname
);
config
.
EnableXpu
();
config
.
SetExecStream
(
static_cast
<
void
*>
(
context
->
xpu_stream
));
CHECK_EQ
(
config
.
external_stream_enabled
(),
true
);
auto
predictor
=
CreatePredictor
(
config
);
auto
stream
=
predictor
->
GetExecStream
();
CHECK_EQ
(
static_cast
<
void
*>
(
context
->
xpu_stream
),
stream
);
CHECK_NOTNULL
(
paddle
::
ResourceManager
::
Instance
().
GetXPUResource
(
stream
));
CHECK_EQ
(
paddle
::
ResourceManager
::
Instance
().
RefCount
(
stream
),
1
);
}
// 2 predictor on 2 stream
{
auto
context1
=
baidu
::
xpu
::
api
::
create_context
();
xpu_stream_create
(
&
context1
->
xpu_stream
);
Config
config
;
config
.
SetModel
(
FLAGS_dirname
);
config
.
EnableXpu
();
config
.
SetExecStream
(
static_cast
<
void
*>
(
context1
->
xpu_stream
));
auto
predictor
=
CreatePredictor
(
config
);
auto
stream1
=
predictor
->
GetExecStream
();
CHECK_NOTNULL
(
paddle
::
ResourceManager
::
Instance
().
GetXPUResource
(
stream1
));
CHECK_EQ
(
paddle
::
ResourceManager
::
Instance
().
RefCount
(
stream1
),
1
);
auto
context2
=
baidu
::
xpu
::
api
::
create_context
();
xpu_stream_create
(
&
context2
->
xpu_stream
);
Config
config2
;
config2
.
SetModel
(
FLAGS_dirname
);
config2
.
EnableXpu
();
config2
.
SetExecStream
(
static_cast
<
void
*>
(
context2
->
xpu_stream
));
auto
predictor2
=
CreatePredictor
(
config2
);
auto
stream2
=
predictor2
->
GetExecStream
();
CHECK_NOTNULL
(
paddle
::
ResourceManager
::
Instance
().
GetXPUResource
(
stream2
));
CHECK_EQ
(
paddle
::
ResourceManager
::
Instance
().
RefCount
(
stream2
),
1
);
CHECK_NE
(
stream1
,
stream2
);
}
}
#endif
TEST
(
AnalysisPredictor
,
OutputHookFunc
)
{
TEST
(
AnalysisPredictor
,
OutputHookFunc
)
{
auto
hookfunc
=
[](
const
std
::
string
&
type
,
auto
hookfunc
=
[](
const
std
::
string
&
type
,
const
std
::
string
&
var_name
,
const
std
::
string
&
var_name
,
...
...
test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc
0 → 100644
浏览文件 @
e135069d
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cmath>
#include "gflags/gflags.h"
#include "test/cpp/inference/api/tester_helper.h"
#include "xpu/runtime.h"
#include "xpu/xdnn.h"
namespace
paddle_infer
{
// Reference output values; CompareOutput() checks every 10th output element
// against the matching entry of this table.
static const std::vector<float> TRUTH_VALUES = {
    127.779f,  738.165f,  1013.22f,  -438.17f,  366.401f,  927.659f,
    736.222f,  -633.684f, -329.927f, -430.155f, -633.062f, -146.548f,
    -1324.28f, -1349.36f, -242.675f, 117.448f,  -801.723f, -391.514f,
    -404.818f, 454.16f,   515.48f,   -133.031f, 69.293f,   590.096f,
    -1434.69f, -1070.89f, 307.074f,  400.525f,  -316.12f,  -587.125f,
    -161.056f, 800.363f,  -96.4708f, 748.706f,  868.174f,  -447.938f,
    112.737f,  1127.2f,   47.4355f,  677.72f,   593.186f,  -336.4f,
    551.362f,  397.823f,  78.3979f,  -715.398f, 405.969f,  404.256f,
    246.019f,  -8.42969f, 131.365f,  -648.051f};
// Fills the predictor's first input with a 1x3x318x318 float tensor whose
// elements are all 1.
void PrepareInput(std::shared_ptr<Predictor> predictor) {
  constexpr int kBatch = 1;
  constexpr int kChannel = 3;
  constexpr int kHeight = 318;
  constexpr int kWidth = 318;
  constexpr int kElementCount = kBatch * kChannel * kHeight * kWidth;

  std::vector<float> ones(kElementCount, 1);

  // Only the first declared input is fed.
  auto first_input =
      predictor->GetInputHandle(predictor->GetInputNames()[0]);
  first_input->Reshape({kBatch, kChannel, kHeight, kWidth});
  first_input->CopyFromCpu(ones.data());
}
void
CompareOutput
(
std
::
shared_ptr
<
Predictor
>
predictor
)
{
auto
output_names
=
predictor
->
GetOutputNames
();
auto
output_t
=
predictor
->
GetOutputHandle
(
output_names
[
0
]);
std
::
vector
<
int
>
output_shape
=
output_t
->
shape
();
size_t
out_num
=
std
::
accumulate
(
output_shape
.
begin
(),
output_shape
.
end
(),
1
,
std
::
multiplies
<
int
>
());
std
::
vector
<
float
>
out_data
;
out_data
.
resize
(
out_num
);
output_t
->
CopyToCpu
(
out_data
.
data
());
float
*
data_o
=
out_data
.
data
();
for
(
size_t
j
=
0
;
j
<
out_num
;
j
+=
10
)
{
EXPECT_NEAR
(
(
data_o
[
j
]
-
TRUTH_VALUES
[
j
/
10
])
/
TRUTH_VALUES
[
j
/
10
],
0.
,
10e-3
);
}
}
// Builds a Config that loads "<FLAGS_infer_model>/model/model" and
// "<FLAGS_infer_model>/model/params" and has XPU enabled.
Config XpuConfig() {
  const std::string model_root = FLAGS_infer_model + "/" + "model";
  const std::string program_path = model_root + "/model";
  const std::string params_path = model_root + "/params";

  Config xpu_config;
  xpu_config.SetModel(program_path, params_path);
  xpu_config.EnableXpu();
  return xpu_config;
}
// Runs the model once through the plain Run() path and checks the output.
TEST(resnet50_xpu, basic) {
  auto predictor = CreatePredictor(XpuConfig());
  PrepareInput(predictor);
  predictor->Run();
  CompareOutput(predictor);
}
// Creates a fresh XPU predictor, feeds it the test input, runs it through
// InternalUtils::RunWithRuntimeConfig with `config_`, checks the output and
// then checks that the predictor's exec stream equals `config_.stream`.
//
// idx_    : suffix pasted onto the generated locals `config<idx_>` and
//           `predictor<idx_>`; it must be unique within the enclosing scope.
// config_ : name of an lvalue runtime config; its address is passed to
//           RunWithRuntimeConfig.
//
// The expansion declares variables in the caller's scope and already ends
// with a semicolon.
#define RUN_WITH_RUNTIME_CONFIG(idx_, config_) \
Config config##idx_ = XpuConfig(); \
auto predictor##idx_ = CreatePredictor(config##idx_); \
PrepareInput(predictor##idx_); \
experimental::InternalUtils::RunWithRuntimeConfig(predictor##idx_.get(), \
&config_); \
CompareOutput(predictor##idx_); \
CHECK_EQ(predictor##idx_->GetExecStream(), config_.stream);
// One predictor run with a runtime config whose stream is null.
TEST(runtime_stream, null_stream) {
  experimental::XpuRuntimeConfig null_stream_cfg{nullptr, 0, nullptr, 0};
  RUN_WITH_RUNTIME_CONFIG(0, null_stream_cfg);
}
// One predictor run on a stream created by the test itself.
TEST(runtime_stream, new_stream) {
  void* user_stream = nullptr;
  xpu_stream_create(&user_stream);
  CHECK_NOTNULL(user_stream);
  {
    // Inner scope: the predictor declared by the macro goes out of scope
    // before the stream is destroyed below.
    experimental::XpuRuntimeConfig user_stream_cfg{
        user_stream, 0, nullptr, 0};
    RUN_WITH_RUNTIME_CONFIG(0, user_stream_cfg);
  }
  xpu_stream_destroy(user_stream);
}
// Two predictors, both run with the same null-stream runtime config.
TEST(runtime_stream, 2_null_stream) {
  experimental::XpuRuntimeConfig null_stream_cfg{nullptr, 0, nullptr, 0};
  RUN_WITH_RUNTIME_CONFIG(0, null_stream_cfg);
  RUN_WITH_RUNTIME_CONFIG(1, null_stream_cfg);
}
// Two predictors: the first runs with a null stream, the second on a stream
// created by the test.
TEST(runtime_stream, null_and_new_stream) {
  experimental::XpuRuntimeConfig null_stream_cfg{nullptr, 0, nullptr, 0};

  void* user_stream = nullptr;
  xpu_stream_create(&user_stream);
  CHECK_NOTNULL(user_stream);
  {
    // Inner scope: both predictors go out of scope before the stream is
    // destroyed below.
    experimental::XpuRuntimeConfig user_stream_cfg{
        user_stream, 0, nullptr, 0};
    RUN_WITH_RUNTIME_CONFIG(0, null_stream_cfg);
    RUN_WITH_RUNTIME_CONFIG(1, user_stream_cfg);
  }
  xpu_stream_destroy(user_stream);
}
// Two predictors that share one stream created by the test.
TEST(runtime_stream, 2_new_same_stream) {
  void* shared_stream = nullptr;
  xpu_stream_create(&shared_stream);
  CHECK_NOTNULL(shared_stream);

  experimental::XpuRuntimeConfig shared_stream_cfg{
      shared_stream, 0, nullptr, 0};
  {
    // Inner scope: both predictors go out of scope before the stream is
    // destroyed below.
    RUN_WITH_RUNTIME_CONFIG(0, shared_stream_cfg);
    RUN_WITH_RUNTIME_CONFIG(1, shared_stream_cfg);
  }
  xpu_stream_destroy(shared_stream);
}
// Two predictors, each run on its own stream created by the test.
TEST(runtime_stream, 2_new_different_stream) {
  void* first_stream = nullptr;
  xpu_stream_create(&first_stream);
  CHECK_NOTNULL(first_stream);
  experimental::XpuRuntimeConfig first_cfg{first_stream, 0, nullptr, 0};

  void* second_stream = nullptr;
  xpu_stream_create(&second_stream);
  CHECK_NOTNULL(second_stream);
  experimental::XpuRuntimeConfig second_cfg{second_stream, 0, nullptr, 0};

  {
    // Inner scope: both predictors go out of scope before the streams are
    // destroyed below.
    RUN_WITH_RUNTIME_CONFIG(0, first_cfg);
    RUN_WITH_RUNTIME_CONFIG(1, second_cfg);
  }
  xpu_stream_destroy(first_stream);
  xpu_stream_destroy(second_stream);
}
void
RunPredictorWithRuntimeConfig
(
std
::
shared_ptr
<
Predictor
>
predictor
,
experimental
::
XpuRuntimeConfig
runtime_config
)
{
PrepareInput
(
predictor
);
experimental
::
InternalUtils
::
RunWithRuntimeConfig
(
predictor
.
get
(),
&
runtime_config
);
CompareOutput
(
predictor
);
CHECK_EQ
(
predictor
->
GetExecStream
(),
runtime_config
.
stream
);
}
// Two predictors on two separate streams: each is first run once on the
// calling thread, then both are run again concurrently, one per thread.
TEST(runtime_stream, 2_thread) {
  void* first_stream = nullptr;
  xpu_stream_create(&first_stream);
  CHECK_NOTNULL(first_stream);
  experimental::XpuRuntimeConfig first_cfg{first_stream, 0, nullptr, 0};

  void* second_stream = nullptr;
  xpu_stream_create(&second_stream);
  CHECK_NOTNULL(second_stream);
  experimental::XpuRuntimeConfig second_cfg{second_stream, 0, nullptr, 0};

  {
    // The macro declares predictor0 / predictor1, which the threads reuse.
    RUN_WITH_RUNTIME_CONFIG(0, first_cfg);
    RUN_WITH_RUNTIME_CONFIG(1, second_cfg);

    std::thread first_worker(
        RunPredictorWithRuntimeConfig, predictor0, first_cfg);
    std::thread second_worker(
        RunPredictorWithRuntimeConfig, predictor1, second_cfg);
    first_worker.join();
    second_worker.join();
  }
  // Both predictors are out of scope by the time the streams are destroyed.
  xpu_stream_destroy(first_stream);
  xpu_stream_destroy(second_stream);
}
}
// namespace paddle_infer
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录