PaddlePaddle / Paddle-Lite

Commit cc927184 (unverified)
Authored Jul 06, 2020 by MaxwellDing; committed via GitHub on Jul 06, 2020

[MLU] add cast on MLU as default, test=develop (#3776)

Parent: 11cbd50e
Showing 55 changed files with 2304 additions and 583 deletions (+2304, -583).
Changed files:

  lite/api/cxx_api_impl.cc                              +4    -6
  lite/api/paddle_api.cc                                +31   -12
  lite/api/paddle_api.h                                 +12   -14
  lite/backends/mlu/target_wrapper.cc                   +43   -8
  lite/backends/mlu/target_wrapper.h                    +21   -5
  lite/core/context.cc                                  +6    -0
  lite/core/context.h                                   +22   -19
  lite/core/device_info.cc                              +0    -48
  lite/core/device_info.h                               +0    -23
  lite/core/mir/fusion/conv_activation_fuse_pass.cc     +1    -0
  lite/core/mir/fusion/fc_fuse_pass.cc                  +9    -1
  lite/core/mir/memory_optimize_pass.cc                 +2    -1
  lite/core/mir/mlu_postprocess_pass.cc                 +405  -72
  lite/core/mir/mlu_postprocess_pass.h                  +8    -2
  lite/core/mir/runtime_context_assign_pass.cc          +4    -0
  lite/core/mir/type_layout_cast_pass.cc                +2    -0
  lite/core/optimizer.h                                 +4    -4
  lite/kernels/mlu/CMakeLists.txt                       +4    -3
  lite/kernels/mlu/bridges/CMakeLists.txt               +7    -1
  lite/kernels/mlu/bridges/act_op.cc                    +4    -0
  lite/kernels/mlu/bridges/act_op_test.cc               +6    -2
  lite/kernels/mlu/bridges/batch_norm_op.cc             +20   -9
  lite/kernels/mlu/bridges/cast_op.cc                   +75   -0
  lite/kernels/mlu/bridges/cast_op_test.cc              +122  -0
  lite/kernels/mlu/bridges/concat_op.cc                 +5    -3
  lite/kernels/mlu/bridges/conv_op.cc                   +93   -29
  lite/kernels/mlu/bridges/conv_op_test.cc              +7    -0
  lite/kernels/mlu/bridges/elementwise_ops.cc           +3    -1
  lite/kernels/mlu/bridges/elementwise_ops_test.cc      +1    -1
  lite/kernels/mlu/bridges/fc_op.cc                     +69   -21
  lite/kernels/mlu/bridges/fc_op_test.cc                +3    -3
  lite/kernels/mlu/bridges/graph.cc                     +5    -1
  lite/kernels/mlu/bridges/graph.h                      +118  -40
  lite/kernels/mlu/bridges/interpolate_op.cc            +1    -0
  lite/kernels/mlu/bridges/layout_op.cc                 +110  -0
  lite/kernels/mlu/bridges/layout_op_test.cc            +190  -0
  lite/kernels/mlu/bridges/paddle_use_bridges.h         +5    -0
  lite/kernels/mlu/bridges/pool_op.cc                   +20   -16
  lite/kernels/mlu/bridges/pool_op_test.cc              +44   -54
  lite/kernels/mlu/bridges/scale_op.cc                  +1    -0
  lite/kernels/mlu/bridges/softmax_op.cc                +7    -3
  lite/kernels/mlu/bridges/softmax_op_test.cc           +1    -1
  lite/kernels/mlu/bridges/tensor.cc                    +99   -5
  lite/kernels/mlu/bridges/tensor.h                     +14   -3
  lite/kernels/mlu/bridges/test_helper.cc               +45   -26
  lite/kernels/mlu/bridges/test_helper.h                +2    -1
  lite/kernels/mlu/bridges/utility.cc                   +10   -21
  lite/kernels/mlu/bridges/utility.h                    +135  -16
  lite/kernels/mlu/io_copy_compute.cc                   +76   -8
  lite/kernels/mlu/layout_compute.cc                    +11   -11
  lite/kernels/mlu/layout_compute.h                     +24   -42
  lite/kernels/mlu/subgraph_compute.cc                  +16   -4
  lite/kernels/mlu/subgraph_compute.h                   +361  -34
  lite/kernels/x86/activation_compute.cc                +10   -0
  lite/tools/build_mlu.sh                               +6    -9
lite/api/cxx_api_impl.cc

@@ -53,12 +53,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
 #endif
 #ifdef LITE_WITH_MLU
   Env<TARGET(kMLU)>::Init();
-  lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
-                                           config.mlu_core_number(),
-                                           config.mlu_use_first_conv(),
-                                           config.mlu_first_conv_mean(),
-                                           config.mlu_first_conv_std(),
-                                           config.mlu_input_layout());
+  lite::TargetWrapperMlu::SetMLURunMode(config.mlu_core_version(),
+                                        config.mlu_core_number(),
+                                        config.mlu_input_layout(),
+                                        config.mlu_firstconv_param());
 #endif  // LITE_WITH_MLU
   auto use_layout_preprocess_pass =
       config.model_dir().find("OPENCL_PRE_PRECESS");
lite/api/paddle_api.cc

@@ -13,6 +13,9 @@
 // limitations under the License.

 #include "lite/api/paddle_api.h"
 #include <utility>
 #include "lite/core/context.h"
 #include "lite/core/device_info.h"
 #include "lite/core/target_wrapper.h"

@@ -22,6 +25,10 @@
 #include "lite/backends/cuda/target_wrapper.h"
 #endif
+#ifdef LITE_WITH_MLU
+#include "lite/backends/mlu/target_wrapper.h"
+#endif

 namespace paddle {
 namespace lite_api {

@@ -97,6 +104,13 @@ void Tensor::CopyFromCpu(const T *src_data) {
         data, src_data, num * sizeof(T), lite::IoDirection::HtoD);
 #else
     LOG(FATAL) << "Please compile the lib with CUDA.";
 #endif
+  } else if (type == TargetType::kMLU) {
+#ifdef LITE_WITH_MLU
+    lite::TargetWrapperMlu::MemcpySync(
+        data, src_data, num * sizeof(T), lite::IoDirection::HtoD);
+#else
+    LOG(FATAL) << "Please compile the lib with MLU.";
+#endif
   } else {
     LOG(FATAL) << "The CopyFromCpu interface just support kHost, kARM, kCUDA";

@@ -117,6 +131,13 @@ void Tensor::CopyToCpu(T *data) const {
         data, src_data, num * sizeof(T), lite::IoDirection::DtoH);
 #else
     LOG(FATAL) << "Please compile the lib with CUDA.";
 #endif
+  } else if (type == TargetType::kMLU) {
+#ifdef LITE_WITH_MLU
+    lite::TargetWrapperMlu::MemcpySync(
+        data, src_data, num * sizeof(T), lite::IoDirection::DtoH);
+#else
+    LOG(FATAL) << "Please compile the lib with MLU.";
+#endif
   } else {
     LOG(FATAL) << "The CopyToCpu interface just support kHost, kARM, kCUDA";

@@ -138,6 +159,11 @@ template void Tensor::CopyFromCpu<int64_t, TargetType::kCUDA>(const int64_t *);
 template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *);
 template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *);

+template void Tensor::CopyFromCpu<int, TargetType::kMLU>(const int *);
+template void Tensor::CopyFromCpu<int64_t, TargetType::kMLU>(const int64_t *);
+template void Tensor::CopyFromCpu<float, TargetType::kMLU>(const float *);
+template void Tensor::CopyFromCpu<int8_t, TargetType::kMLU>(const int8_t *);
+
 template void Tensor::CopyToCpu(float *) const;
 template void Tensor::CopyToCpu(int *) const;
 template void Tensor::CopyToCpu(int8_t *) const;

@@ -228,13 +254,9 @@ void CxxConfig::set_mlu_core_number(int core_number) {
 void CxxConfig::set_mlu_input_layout(DataLayoutType layout) {
   mlu_input_layout_ = layout;
 }
-void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) {
-  mlu_use_first_conv_ = use_first_conv;
-}
-void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) {
+void CxxConfig::set_mlu_firstconv_param(const std::vector<float> &mean,
+                                        const std::vector<float> &std) {
   mlu_first_conv_mean_ = mean;
-}
-void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) {
   mlu_first_conv_std_ = std;
 }
 lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {

@@ -242,12 +264,9 @@ lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
 }
 int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
 DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
-bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; }
-const std::vector<float> &CxxConfig::mlu_first_conv_mean() const {
-  return mlu_first_conv_mean_;
-}
-const std::vector<float> &CxxConfig::mlu_first_conv_std() const {
-  return mlu_first_conv_std_;
+std::pair<std::vector<float>, std::vector<float>>
+CxxConfig::mlu_firstconv_param() const {
+  return std::make_pair(mlu_first_conv_mean_, mlu_first_conv_std_);
 }
 #endif
lite/api/paddle_api.h

@@ -21,6 +21,7 @@
 #define PADDLE_LITE_API_H_
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 #include "paddle_place.h"  // NOLINT

@@ -160,9 +161,8 @@ class LITE_API CxxConfig : public ConfigBase {
   lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
   int mlu_core_number_{1};
   DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
-  bool mlu_use_first_conv_{false};
-  std::vector<float> mlu_first_conv_mean_;
-  std::vector<float> mlu_first_conv_std_;
+  std::vector<float> mlu_first_conv_mean_{};
+  std::vector<float> mlu_first_conv_std_{};
 #endif

  public:

@@ -210,24 +210,22 @@ class LITE_API CxxConfig : public ConfigBase {
   void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
   // set MLU core number, which is used when compiling MLU kernels
   void set_mlu_core_number(int core_number);
-  // set MLU input layout. User can specify layout of input data to be NHWC,
-  // default is NCHW
-  void set_mlu_input_layout(DataLayoutType layout);
   // whether use MLU's first conv kernel. First conv is a special kernel
   // provided by MLU, its input is uint8, and also needs two 3-dimentional
   // vectors which save all inputs' mean and std values
-  void set_mlu_use_first_conv(bool use_first_conv);
-  // set the 3-dimentional mean vector used by MLU's first conv
-  void set_mlu_first_conv_mean(const std::vector<float>& mean);
-  // set the 3-dimentional std vector used by MLU's first conv
-  void set_mlu_first_conv_std(const std::vector<float>& std);
+  // set the 3-dimentional mean vector and 3-dimentional std vector used by
+  // MLU's first conv
+  void set_mlu_firstconv_param(const std::vector<float>& mean,
+                               const std::vector<float>& std);
+  // set MLU input layout. User can specify layout of input data to be NHWC,
+  // default is NCHW
+  void set_mlu_input_layout(DataLayoutType layout);

   lite_api::MLUCoreVersion mlu_core_version() const;
   int mlu_core_number() const;
   DataLayoutType mlu_input_layout() const;
-  bool mlu_use_first_conv() const;
-  const std::vector<float>& mlu_first_conv_mean() const;
-  const std::vector<float>& mlu_first_conv_std() const;
+  // std::pair<mean, std>
+  std::pair<std::vector<float>, std::vector<float>> mlu_firstconv_param() const;
 #endif

   // XPU only, set the size of the workspace memory from L3 cache for the
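Note: taken together, the header change collapses three first-conv setters into a single call, and enabling the first-conv kernel is now implied by passing non-empty mean/std vectors. A minimal sketch of client-side setup against the new CxxConfig interface (the model path and mean/std values are placeholders, not taken from this commit):

    #include "lite/api/paddle_api.h"

    void ConfigureMlu(paddle::lite_api::CxxConfig* config) {
      config->set_model_dir("/path/to/model");  // placeholder path
      config->set_mlu_core_version(paddle::lite_api::MLUCoreVersion::MLU_270);
      config->set_mlu_core_number(1);
      config->set_mlu_input_layout(DATALAYOUT(kNHWC));
      // One call replaces set_mlu_use_first_conv / set_mlu_first_conv_mean /
      // set_mlu_first_conv_std; non-empty vectors implicitly enable first conv
      // (see TargetWrapperMlu::SetMLURunMode in the next file).
      config->set_mlu_firstconv_param({124.0f, 117.0f, 104.0f},  // mean, illustrative
                                      {58.8f, 57.1f, 57.4f});    // std, illustrative
    }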
lite/backends/mlu/target_wrapper.cc

@@ -15,6 +15,7 @@
 #include "lite/backends/mlu/target_wrapper.h"

 #include <memory>
+#include <utility>

 #include "lite/backends/mlu/mlu_utils.h"

@@ -36,6 +37,13 @@ void cnrtMemcpyDtoH(void* dst, const void* src, size_t size) {
 }
 }  // namespace mlu

+thread_local cnmlCoreVersion_t TargetWrapperMlu::mlu_core_version_{CNML_MLU270};
+thread_local int TargetWrapperMlu::mlu_core_number_{1};
+thread_local bool TargetWrapperMlu::use_first_conv_{false};
+thread_local std::vector<float> TargetWrapperMlu::mean_vec_;
+thread_local std::vector<float> TargetWrapperMlu::std_vec_;
+thread_local DataLayoutType TargetWrapperMlu::input_layout_{DATALAYOUT(kNCHW)};
+
 size_t TargetWrapperMlu::num_devices() {
   uint32_t dev_count = 0;
   CNRT_CALL(cnrtGetDeviceCount(&dev_count)) << " cnrt get device count failed";

@@ -77,15 +85,42 @@ void TargetWrapperMlu::MemcpySync(void* dst,
       LOG(FATAL) << "Unsupported IoDirection" << static_cast<int>(dir);
   }
 }

+void TargetWrapperMlu::SetMLURunMode(
+    lite_api::MLUCoreVersion core_version,
+    int core_number,
+    DataLayoutType input_layout,
+    std::pair<std::vector<float>, std::vector<float>> firstconv_param) {
+  switch (core_version) {
+    case (lite_api::MLUCoreVersion::MLU_220):
+      mlu_core_version_ = CNML_MLU220;
+      break;
+    case (lite_api::MLUCoreVersion::MLU_270):
+      mlu_core_version_ = CNML_MLU270;
+      break;
+    default:
+      mlu_core_version_ = CNML_MLU270;
+      break;
+  }
+  mlu_core_number_ = core_number;
+  mean_vec_ = firstconv_param.first;
+  std_vec_ = firstconv_param.second;
+  use_first_conv_ = !(mean_vec_.empty() || std_vec_.empty());
+  input_layout_ = input_layout;
+}
+
+cnmlCoreVersion_t TargetWrapperMlu::MLUCoreVersion() { return mlu_core_version_; }
+int TargetWrapperMlu::MLUCoreNumber() { return mlu_core_number_; }
+bool TargetWrapperMlu::UseFirstConv() { return use_first_conv_; }
+const std::vector<float>& TargetWrapperMlu::MeanVec() { return mean_vec_; }
+const std::vector<float>& TargetWrapperMlu::StdVec() { return std_vec_; }
+
 // void TargetWrapperMlu::MemcpyAsync(void* dst,
 //                                    const void* src,
 //                                    size_t size,
 //                                    IoDirection dir,
 //                                    const stream_t& stream) {
 //   LOG(WARNING) << "Mlu unsupported MemcpyAsync now, use MemcpySync.";
 //   MemcpySync(dst, src, size, dir);
 // }

+DataLayoutType TargetWrapperMlu::InputLayout() { return input_layout_; }
+
 }  // namespace lite
 }  // namespace paddle
lite/backends/mlu/target_wrapper.h

@@ -13,6 +13,8 @@
 // limitations under the License.

 #pragma once
+#include <utility>
+#include <vector>

 #include "lite/backends/mlu/mlu_utils.h"
 #include "lite/core/target_wrapper.h"

@@ -43,11 +45,25 @@ class TargetWrapper<TARGET(kMLU)> {
                          const void* src,
                          size_t size,
                          IoDirection dir);
   // static void MemcpyAsync(void* dst,
   //                         const void* src,
   //                         size_t size,
   //                         IoDirection dir,
   //                         const queue_t& queue);
+  static void SetMLURunMode(
+      lite_api::MLUCoreVersion core_version,
+      int core_number,
+      DataLayoutType input_layout,
+      std::pair<std::vector<float>, std::vector<float>> firstconv_param);
+  static cnmlCoreVersion_t MLUCoreVersion();
+  static int MLUCoreNumber();
+  static bool UseFirstConv();
+  static const std::vector<float>& MeanVec();
+  static const std::vector<float>& StdVec();
+  static DataLayoutType InputLayout();
+
+ private:
+  static thread_local cnmlCoreVersion_t mlu_core_version_;
+  static thread_local int mlu_core_number_;
+  static thread_local bool use_first_conv_;
+  static thread_local std::vector<float> mean_vec_;
+  static thread_local std::vector<float> std_vec_;
+  static thread_local DataLayoutType input_layout_;
 };

 }  // namespace lite
lite/core/context.cc

@@ -27,5 +27,11 @@ thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
 int Context<TargetType::kXPU>::_workspace_l3_size_per_thread{0};
 #endif

+#ifdef LITE_WITH_MLU
+int Context<TargetType::kMLU>::next_queue_id_{0};
+std::map<int, int> Context<TargetType::kMLU>::queue_id_map_;
+std::mutex Context<TargetType::kMLU>::map_mutex_;
+#endif
+
 }  // namespace lite
 }  // namespace paddle
lite/core/context.h

@@ -25,6 +25,7 @@
 #ifdef LITE_WITH_MLU
 #include <cnml.h>
 #include <cnrt.h>
+#include <mutex>  // NOLINT
 #include "lite/backends/mlu/mlu_utils.h"
 #endif

 #ifdef LITE_WITH_XPU

@@ -249,11 +250,11 @@ class Context<TargetType::kMLU> {
   void InitOnce() {}

   MLUContext& operator=(const MLUContext& ctx) {
-    this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_);
+    this->Init(ctx.device_id_, ctx.exec_queue_id_);
     return *this;
   }

-  void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) {
+  void Init(int dev_id, int exec_queue_id = 0) {
     CHECK_GT(devs.size(), 0UL)
         << "Env is not initialized or current target is not exit!";
     if (dev_id >= static_cast<int>(devs.size())) {

@@ -264,21 +265,19 @@ class Context<TargetType::kMLU> {
       device_id_ = dev_id;
     }
     SetMluDevice(device_id_);
-    if (io_queue_id >= devs[dev_id].max_queue()) {
-      LOG(WARNING) << "data queue index exceeds the maximum queue number, "
-                      "set to default qeueu(0)!";
-      io_queue_id = 0;
-    }
-    if (exec_queue_id >= devs[dev_id].max_queue()) {
-      LOG(WARNING) << "exec queue index exceeds the maximum queue number, "
-                      "set to default qeueu(0)!";
-      exec_queue_id = 0;
-    }
-    io_queue_ = devs[dev_id].io_queues()[io_queue_id];
-    exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id];
-    exec_queue_id_ = exec_queue_id;
-    io_queue_id_ = io_queue_id;
+    // get queue id from map
+    std::unique_lock<std::mutex> lk(map_mutex_);
+    if (queue_id_map_.find(exec_queue_id) == queue_id_map_.end()) {
+      queue_id_map_[exec_queue_id] =
+          next_queue_id_++ % devs[dev_id].max_queue();
+    }
+    exec_queue_id_ = queue_id_map_[exec_queue_id];
+    VLOG(4) << "pick mlu queue id: " << exec_queue_id_;
+    lk.unlock();
+    io_queue_ = devs[dev_id].io_queues()[exec_queue_id_];
+    exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id_];
   }

   void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; }

@@ -290,10 +289,12 @@ class Context<TargetType::kMLU> {
   void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; }

   cnmlCoreVersion_t MLUCoreVersion() {
-    return DeviceInfo::Global().MLUCoreVersion();
+    return paddle::lite::TargetWrapperMlu::MLUCoreVersion();
   }

-  int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); }
+  int MLUCoreNumber() {
+    return paddle::lite::TargetWrapperMlu::MLUCoreNumber();
+  }

   u32_t affinity() { return affinity_; }

@@ -304,10 +305,12 @@ class Context<TargetType::kMLU> {
   std::string name() const { return "MLUContext"; }

  private:
+  static int next_queue_id_;
+  static std::map<int, int> queue_id_map_;
+  static std::mutex map_mutex_;
   int device_id_;
   // overall information
   int exec_queue_id_;
-  int io_queue_id_;
   cnrtQueue_t io_queue_;
   cnrtQueue_t exec_queue_;

@@ -455,7 +458,7 @@ class ContextScheduler {
       case TARGET(kMLU): {
         int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice();
         auto& context = ctx->As<MLUContext>();
-        context.Init(dev_id);
+        context.Init(dev_id, exec_stream_id);
         kernel_contexts_[TargetType::kMLU].As<MLUContext>().CopySharedTo(
             &context);
         LOG(INFO) << "New Context for MLU";
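Note: the rewritten Init hands out MLU queues round-robin. Each distinct exec_stream_id is mapped once, under a mutex, to next_queue_id_ % max_queue, and the io and exec queues now share that one id. A standalone sketch of the mapping (names are illustrative):

    #include <map>
    #include <mutex>

    static int next_queue_id = 0;
    static std::map<int, int> queue_id_map;
    static std::mutex map_mutex;

    // Returns the hardware queue assigned to a logical stream id,
    // assigning the next queue round-robin on first use.
    int PickQueueId(int exec_stream_id, int max_queue) {
      std::unique_lock<std::mutex> lk(map_mutex);
      if (queue_id_map.find(exec_stream_id) == queue_id_map.end()) {
        queue_id_map[exec_stream_id] = next_queue_id++ % max_queue;
      }
      return queue_id_map[exec_stream_id];
    }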
lite/core/device_info.cc

@@ -66,15 +66,6 @@ thread_local std::vector<int> DeviceInfo::active_ids_;
 thread_local TensorLite DeviceInfo::workspace_;
 thread_local int64_t DeviceInfo::count_ = 0;

-#ifdef LITE_WITH_MLU
-thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270};
-thread_local int DeviceInfo::mlu_core_number_{1};
-thread_local bool DeviceInfo::use_first_conv_{false};
-thread_local std::vector<float> DeviceInfo::mean_vec_;
-thread_local std::vector<float> DeviceInfo::std_vec_;
-thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)};
-#endif
-
 #ifdef TARGET_IOS
 const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
 const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;

@@ -1089,45 +1080,6 @@ int DeviceInfo::Setup() {
   return 0;
 }

-#ifdef LITE_WITH_MLU
-void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
-                               int core_number,
-                               bool use_first_conv,
-                               const std::vector<float>& mean_vec,
-                               const std::vector<float>& std_vec,
-                               DataLayoutType input_layout) {
-  switch (core_version) {
-    case (lite_api::MLUCoreVersion::MLU_220):
-      mlu_core_version_ = CNML_MLU220;
-      break;
-    case (lite_api::MLUCoreVersion::MLU_270):
-      mlu_core_version_ = CNML_MLU270;
-      break;
-    default:
-      mlu_core_version_ = CNML_MLU270;
-      break;
-  }
-  mlu_core_number_ = core_number;
-  use_first_conv_ = use_first_conv;
-  mean_vec_ = mean_vec;
-  std_vec_ = std_vec;
-  input_layout_ = input_layout;
-}
-
-cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; }
-int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; }
-bool DeviceInfo::UseFirstConv() { return use_first_conv_; }
-const std::vector<float>& DeviceInfo::MeanVec() const { return mean_vec_; }
-const std::vector<float>& DeviceInfo::StdVec() const { return std_vec_; }
-DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; }
-#endif  // LITE_WITH_MLU
-
 void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
 #ifdef ARM_WITH_OMP
   thread_num = std::min(thread_num, core_num_);
lite/core/device_info.h

@@ -55,20 +55,6 @@ class DeviceInfo {
   int Setup();

   void SetRunMode(lite_api::PowerMode mode, int thread_num);
-#ifdef LITE_WITH_MLU
-  void SetMLURunMode(lite_api::MLUCoreVersion core_version,
-                     int core_number,
-                     bool use_first_conv,
-                     const std::vector<float>& mean_vec,
-                     const std::vector<float>& std_vec,
-                     DataLayoutType input_layout);
-  cnmlCoreVersion_t MLUCoreVersion();
-  int MLUCoreNumber();
-  bool UseFirstConv();
-  const std::vector<float>& MeanVec() const;
-  const std::vector<float>& StdVec() const;
-  DataLayoutType InputLayout() const;
-#endif
   void SetCache(int l1size, int l2size, int l3size);
   void SetArch(ARMArch arch) { arch_ = arch; }

@@ -120,15 +106,6 @@ class DeviceInfo {
   static thread_local TensorLite workspace_;
   static thread_local int64_t count_;

-#ifdef LITE_WITH_MLU
-  static thread_local cnmlCoreVersion_t mlu_core_version_;
-  static thread_local int mlu_core_number_;
-  static thread_local bool use_first_conv_;
-  static thread_local std::vector<float> mean_vec_;
-  static thread_local std::vector<float> std_vec_;
-  static thread_local DataLayoutType input_layout_;
-#endif
-
   void SetDotInfo(int argc, ...);
   void SetFP16Info(int argc, ...);
   void SetFP32Info(int argc, ...);
lite/core/mir/fusion/conv_activation_fuse_pass.cc

@@ -64,4 +64,5 @@ REGISTER_MIR_PASS(lite_conv_activation_fuse_pass,
                   paddle::lite::mir::ConvActivationFusePass)
     .BindTargets({TARGET(kAny)})
     .ExcludeTargets({TARGET(kXPU)})
+    .ExcludeTargets({TARGET(kMLU)})
     .BindKernel("conv2d");
lite/core/mir/fusion/fc_fuse_pass.cc

@@ -24,8 +24,13 @@ namespace mir {

 void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
 #ifdef LITE_WITH_X86
+#ifdef LITE_WITH_MLU
+  fusion::FcFuser fuser(false);
+  fuser(graph.get());
+#else
   fusion::FcFuser fuser(true);
   fuser(graph.get());
 #endif
+#endif

   fusion::FcFuser fuser2(false);

@@ -38,6 +43,9 @@ void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
 REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass)
     .BindTargets({TARGET(kAny)})
-    .ExcludeTargets({TARGET(kXPU), TARGET(kX86)})
+    .ExcludeTargets({TARGET(kXPU)})
+#ifndef LITE_WITH_MLU
+    .ExcludeTargets({TARGET(kX86)})
+#endif
     .ExcludeTargets({TARGET(kBM)})
     .BindKernel("fc");
lite/core/mir/memory_optimize_pass.cc

@@ -314,4 +314,5 @@ REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
                      TARGET(kXPU),
                      TARGET(kBM),
                      TARGET(kRKNPU),
-                     TARGET(kAPU)});
+                     TARGET(kAPU),
+                     TARGET(kMLU)});
lite/core/mir/mlu_postprocess_pass.cc

(This diff was collapsed in the original view; +405 -72, not shown here.)
lite/core/mir/mlu_postprocess_pass.h

@@ -79,6 +79,8 @@ class MLUPostprocessPass : public ProgramPass {
                     const Type** arg_type,
                     SSAGraph* graph);

+  void ModifyInputOutputDataType(SSAGraph* graph);
+
   void ModifyLayout(SSAGraph* graph);

   bool NeedInsert(Node* node, const Type* inst_type);

@@ -86,12 +88,14 @@ class MLUPostprocessPass : public ProgramPass {
   void InsertBefore(SSAGraph* graph,
                     Node* head_node,
                     Node* inst_node,
-                    const Type* type);
+                    const Type* type,
+                    bool use_mlu_cast);

   void InsertAfter(SSAGraph* graph,
                    Node* tail_node,
                    Node* inst_node,
-                   const Type* type);
+                   const Type* type,
+                   bool use_mlu_cast);

   Node* InsertCastBefore(const std::string& op_type,
                          const std::string& cast_arg_name,

@@ -115,6 +119,8 @@ class MLUPostprocessPass : public ProgramPass {

   bool IsFirstConvInSubgraph(Node* arg_node, Node* inst);

+  void AdjustSubgraph(Node* subgraph_node, const Type* op_type);
+
  private:
   std::set<std::string> first_conv_nodes_;
 };
lite/core/mir/runtime_context_assign_pass.cc

@@ -44,6 +44,10 @@ class RuntimeContextAssignPass : public StmtPass {
       inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
           inst.picked_kernel().target()));
     }
+#elif LITE_WITH_MLU
+    inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
+        inst.picked_kernel().target(),
+        static_cast<int>(reinterpret_cast<int64_t>(graph.get()))));
 #else
     int stream_id = inst.stream_id_;
lite/core/mir/type_layout_cast_pass.cc

@@ -249,11 +249,13 @@ void OpenCLTypeLayoutTransformPass::Apply(
 REGISTER_MIR_PASS(type_layout_cast_pass,
                   paddle::lite::mir::TypeLayoutTransformPass)
     .BindTargets({TARGET(kAny)})
+    .ExcludeTargets({TARGET(kMLU)})
     .BindKernel("layout_once")
     .BindKernel("layout");

 REGISTER_MIR_PASS(type_layout_cast_preprocess_pass,
                   paddle::lite::mir::OpenCLTypeLayoutTransformPass)
     .BindTargets({TARGET(kAny)})
+    .ExcludeTargets({TARGET(kMLU)})
     .BindKernel("layout_once")
     .BindKernel("layout");
lite/core/optimizer.h

@@ -108,9 +108,13 @@ class Optimizer {
          "bm_subgraph_pass",
          "apu_subgraph_pass",
          "rknpu_subgraph_pass",
+         "mlu_subgraph_pass",
          "static_kernel_pick_pass",  // pick original kernel from graph
          "remove_tf_redundant_ops_pass",
          "variable_place_inference_pass",  // inference arg/var's
+         "mlu_postprocess_pass",
          // info(target/precision/layout/device)
          // using kernel info
          "argument_type_display_pass",  // debug pass: show arg-type-node's

@@ -140,13 +144,9 @@ class Optimizer {
          "variable_place_inference_pass",  //
          "argument_type_display_pass",

-         "mlu_subgraph_pass",
-
          "runtime_context_assign_pass",
          "argument_type_display_pass",
-         "mlu_postprocess_pass",

          "memory_optimize_pass"}};

     if (passes.size() == 1) {
lite/kernels/mlu/CMakeLists.txt

@@ -4,6 +4,7 @@ endif()

 add_subdirectory(bridges)
 add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges})
-add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
-add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
-add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
+add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${target_wrapper_mlu})
+add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps})
+# depend on transpose function in backend/x86/math/math_function
+add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_function})
lite/kernels/mlu/bridges/CMakeLists.txt

@@ -3,7 +3,7 @@ if(NOT LITE_WITH_MLU)
 endif()

 lite_cc_library(subgraph_bridge_utility_mlu SRCS utility.cc DEPS ${mlu_builder_libs} tensor)
-lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs})
+lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs} subgraph_bridge_utility_mlu)
 lite_cc_library(subgraph_bridge_graph_mlu SRCS graph.cc DEPS subgraph_bridge_utility_mlu subgraph_bridge_tensor_mlu)

 set(mlu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_mlu subgraph_bridge_graph_mlu)

@@ -18,6 +18,8 @@ lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${subgraph_bridge_d
 lite_cc_library(subgraph_bridge_scale_op_mlu SRCS scale_op.cc DEPS ${subgraph_bridge_deps_mlu})
 lite_cc_library(subgraph_bridge_interp_op_mlu SRCS interpolate_op.cc DEPS ${subgraph_bridge_deps_mlu})
 lite_cc_library(subgraph_bridge_concat_op_mlu SRCS concat_op.cc DEPS ${subgraph_bridge_deps_mlu})
+lite_cc_library(subgraph_bridge_cast_op_mlu SRCS cast_op.cc DEPS ${subgraph_bridge_deps_mlu})
+lite_cc_library(subgraph_bridge_layout_op_mlu SRCS layout_op.cc DEPS ${subgraph_bridge_deps_mlu})

 set(mlu_subgraph_bridges
     subgraph_bridge_registry
     subgraph_bridge_utility_mlu

@@ -32,6 +34,8 @@ set(mlu_subgraph_bridges
     subgraph_bridge_scale_op_mlu
     subgraph_bridge_interp_op_mlu
    subgraph_bridge_concat_op_mlu
+    subgraph_bridge_cast_op_mlu
+    subgraph_bridge_layout_op_mlu
     CACHE INTERNAL "mlu_subgraph_bridges")

 lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})

@@ -45,4 +49,6 @@ lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer targe
 lite_cc_test(test_scale_converter_mlu SRCS scale_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
 lite_cc_test(test_interp_converter_mlu SRCS interpolate_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
 lite_cc_test(test_concat_converter_mlu SRCS concat_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_layout_converter_mlu SRCS layout_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_cast_converter_mlu SRCS cast_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
 message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}")
lite/kernels/mlu/bridges/act_op.cc

@@ -60,6 +60,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                                      output_tensor->mlu_tensor()));
   }
   graph->FuseOp(activation_op);
+  CNML_CALL(cnmlDestroyBaseOp(&activation_op));
   return SUCCESS;
 }

@@ -72,6 +73,9 @@ REGISTER_SUBGRAPH_BRIDGE(sigmoid,
                          kMLU,
                          paddle::lite::subgraph::mlu::ActConverter);
 REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(relu6,
+                         kMLU,
+                         paddle::lite::subgraph::mlu::ActConverter);
 REGISTER_SUBGRAPH_BRIDGE(tanh, kMLU, paddle::lite::subgraph::mlu::ActConverter);
 REGISTER_SUBGRAPH_BRIDGE(leaky_relu,
                          kMLU,
lite/kernels/mlu/bridges/act_op_test.cc

@@ -13,7 +13,9 @@
 // limitations under the License.

 #include <gtest/gtest.h>
 #include <random>
 #include "lite/core/op_lite.h"
 #include "lite/core/op_registry.h"
 #include "lite/kernels/mlu/bridges/test_helper.h"

@@ -116,7 +118,7 @@ void test_act(std::vector<int64_t> x_shape, std::string op_type) {
     opdesc.SetAttr("offset", 0.5f);
   }

-  // create and convert op to NPU model, then run it on NPU
+  // create and convert op to MLU model, then run it on MLU
   auto op = CreateOp<operators::ActivationOp>(opdesc, &scope);
   // execute reference implementation and save to output tensor
   act_ref(op);

@@ -134,7 +136,8 @@ void test_act(std::vector<int64_t> x_shape, std::string op_type) {
 TEST(MLUBridges, activation) {
   std::vector<std::vector<int64_t>> shapes{{1}, {2, 3}, {1, 2, 3, 4}};
-  std::vector<std::string> types{"sigmoid", "relu", "tanh", "leaky_relu"};
+  std::vector<std::string> types{
+      "sigmoid", "relu", "relu6", "tanh", "leaky_relu"};

   for (auto x_shape : shapes) {
     for (auto op_type : types) {
       test_act(x_shape, op_type);

@@ -149,5 +152,6 @@ TEST(MLUBridges, activation) {
 USE_SUBGRAPH_BRIDGE(sigmoid, kMLU)
 USE_SUBGRAPH_BRIDGE(relu, kMLU)
+USE_SUBGRAPH_BRIDGE(relu6, kMLU)
 USE_SUBGRAPH_BRIDGE(tanh, kMLU)
 USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU)
lite/kernels/mlu/bridges/batch_norm_op.cc

@@ -48,25 +48,32 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto mean = scope->FindVar(mean_var_name)->GetMutable<Tensor>();
   auto mean_dims = mean->dims().Vectorize();
+  if (mean_dims.size() < 4) {
+    mean_dims.insert(mean_dims.begin(), 4 - mean_dims.size(), 1);
+  }
   auto mean_tensor = graph->AddNode(
-      mean_var_name, mean_dims, CNML_CONST, CNML_CNHW, graph->FPType());
+      mean_var_name, mean_dims, CNML_CONST, CNML_NHWC, graph->FPType());

   auto variance = scope->FindVar(variance_var_name)->GetMutable<Tensor>();
   auto variance_dims = variance->dims().Vectorize();
+  if (variance_dims.size() < 4) {
+    variance_dims.insert(variance_dims.begin(), 4 - variance_dims.size(), 1);
+  }
   auto variance_tensor = graph->AddNode(
-      variance_var_name, variance_dims, CNML_CONST, CNML_CNHW, graph->FPType());
+      variance_var_name, variance_dims, CNML_CONST, CNML_NHWC, graph->FPType());

   auto scale = scope->FindVar(scale_var_name)->GetMutable<Tensor>();
   auto bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();

-  int co = static_cast<int>(mean_dims[0]);
+  int co = static_cast<int>(mean_dims[3]);
+  std::vector<float> variance_trans(co);
+  std::vector<float> mean_trans(co);

   for (int i = 0; i < co; ++i) {
-    variance->mutable_data<float>()[i] =
+    variance_trans[i] =
         scale->data<float>()[i] / sqrtf(variance->data<float>()[i] + epsilon);
-    mean->mutable_data<float>()[i] =
-        mean->data<float>()[i] -
-        bias->data<float>()[i] / variance->data<float>()[i];
+    mean_trans[i] =
+        mean->data<float>()[i] - bias->data<float>()[i] / variance_trans[i];
   }

   auto input_tensor = graph->GetNode(x_var_name);

@@ -77,10 +84,14 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                                        mean_tensor->mlu_tensor(),
                                        variance_tensor->mlu_tensor()));

-  graph->BindConstData(variance_var_name, variance);
-  graph->BindConstData(mean_var_name, mean);
+  graph->BindConstRawData(
+      variance_var_name, variance_trans.data(), variance_trans.size(), true);
+  graph->BindConstRawData(
+      mean_var_name, mean_trans.data(), mean_trans.size(), true);

   graph->FuseOp(bn_op);
+  CNML_CALL(cnmlDestroyBaseOp(&bn_op));
   return SUCCESS;
 }
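Note: the converter now folds the four BN inputs into the two tensors the CNML op consumes. Writing v[i] = scale[i] / sqrt(var[i] + eps) and m[i] = mean[i] - bias[i] / v[i], the op's (x - m) * v expands to scale * (x - mean) / sqrt(var + eps) + bias, the usual inference-time batch norm. A self-contained sketch of that folding:

    #include <cmath>
    #include <vector>

    // Fold scale/bias/mean/var into the (mean', var') pair used above, so that
    // (x - mean') * var' == scale * (x - mean) / sqrt(var + eps) + bias.
    void FoldBatchNorm(const std::vector<float>& scale,
                       const std::vector<float>& bias,
                       const std::vector<float>& mean,
                       const std::vector<float>& var,
                       float eps,
                       std::vector<float>* var_trans,
                       std::vector<float>* mean_trans) {
      for (size_t i = 0; i < mean.size(); ++i) {
        (*var_trans)[i] = scale[i] / std::sqrt(var[i] + eps);
        (*mean_trans)[i] = mean[i] - bias[i] / (*var_trans)[i];
      }
    }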
lite/kernels/mlu/bridges/cast_op.cc (new file, mode 100644)

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {

int CastConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[MLU] Converting " + op_type + "...";

  auto x_var_name = op_info->Input("X").front();
  auto out_var_name = op_info->Output("Out").front();
  auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
  auto output_dims = output->dims().Vectorize();

  auto in_dtype = op_info->GetAttr<int>("in_dtype");
  auto out_dtype = op_info->GetAttr<int>("out_dtype");

  CHECK(graph->HasNode(x_var_name));
  auto x_tensor = graph->GetNode(x_var_name);
  cnmlDataType_t out_type;
  cnmlCastType_t cast_type;
  if (in_dtype == 4 && out_dtype == 5) {
    cast_type = CNML_CAST_FLOAT16_TO_FLOAT32;
    out_type = CNML_DATA_FLOAT32;
  } else if (in_dtype == 5 && out_dtype == 4) {
    cast_type = CNML_CAST_FLOAT32_TO_FLOAT16;
    out_type = CNML_DATA_FLOAT16;
  } else {
    CHECK(0) << "Unsupported cast type";
  }
  auto output_tensor = graph->AddNode(
      out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, out_type);

  cnmlBaseOp_t cast_op;
  CNML_CALL(cnmlCreateCastOp(&cast_op,
                             cast_type,
                             x_tensor->mlu_tensor(),
                             output_tensor->mlu_tensor()));
  graph->FuseOp(cast_op);
  CNML_CALL(cnmlDestroyBaseOp(&cast_op));
  return SUCCESS;
}

}  // namespace mlu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(cast, kMLU, paddle::lite::subgraph::mlu::CastConverter);
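Note: the integer in_dtype/out_dtype attributes follow Paddle's framework VarType numbering (BOOL=0, INT16=1, INT32=2, INT64=3, FP16=4, FP32=5, FP64=6), so the two branches above cover exactly FP16-to-FP32 and FP32-to-FP16. A cast op desc that reaches the first branch looks like this (mirroring the test below):

    // FP16 -> FP32 cast; attribute values follow Paddle's VarType numbering.
    cpp::OpDesc opdesc;
    opdesc.SetType("cast");
    opdesc.SetInput("X", {"x"});
    opdesc.SetOutput("Out", {"out"});
    opdesc.SetAttr("in_dtype", 4);   // FP16
    opdesc.SetAttr("out_dtype", 5);  // FP32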
lite/kernels/mlu/bridges/cast_op_test.cc (new file, mode 100644)

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/operators/cast_op.h"
#include <gtest/gtest.h>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {

void test_cast_FP16_to_FP32(std::vector<int64_t> shape) {
  // prepare input&output variables
  std::string x_var_name = "x";
  std::string out_var_name = "out";
  Scope scope;
  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
  x->Resize(DDim(shape));
  auto* x_data = x->mutable_data<paddle::lite::fluid::float16>();

  // initialize input&output data
  for (int i = 0; i < x->dims().production(); i++) {
    x_data[i] = static_cast<paddle::lite::fluid::float16>(i);
  }

  // initialize op desc
  int in_dtype = 4, out_dtype = 5;
  cpp::OpDesc opdesc;
  opdesc.SetType("cast");
  opdesc.SetInput("X", {x_var_name});
  opdesc.SetOutput("Out", {out_var_name});
  opdesc.SetAttr("in_dtype", in_dtype);
  opdesc.SetAttr("out_dtype", out_dtype);

  auto op = CreateOp<operators::CastOp>(opdesc, &scope);
  Tensor data;
  data.Resize(DDim(shape));
  auto* copy_data = data.mutable_data<paddle::lite::fluid::float16>();
  data.CopyDataFrom(*x);
  x->set_precision(paddle::lite_api::PrecisionType::kFP16);
  LaunchOp(op, {x_var_name}, {out_var_name});

  // compare results
  auto* out_data = out->mutable_data<float>();
  for (int i = 0; i < out->dims().production(); i++) {
    VLOG(5) << i;
    EXPECT_NEAR(out_data[i], static_cast<double>(copy_data[i]), 5e-4);
  }
}

void test_cast_FP32_to_FP16(std::vector<int64_t> shape) {
  // prepare input&output variables
  std::string x_var_name = "x";
  std::string out_var_name = "out";
  Scope scope;
  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
  x->Resize(DDim(shape));
  auto* x_data = x->mutable_data<float>();

  // initialize input&output data
  for (int i = 0; i < x->dims().production(); i++) {
    x_data[i] = static_cast<float>(i);
  }

  // initialize op desc
  int in_dtype = 5, out_dtype = 4;
  cpp::OpDesc opdesc;
  opdesc.SetType("cast");
  opdesc.SetInput("X", {x_var_name});
  opdesc.SetOutput("Out", {out_var_name});
  opdesc.SetAttr("in_dtype", in_dtype);
  opdesc.SetAttr("out_dtype", out_dtype);

  auto op = CreateOp<operators::CastOp>(opdesc, &scope);
  Tensor data;
  data.Resize(DDim(shape));
  auto* copy_data = data.mutable_data<float>();
  data.CopyDataFrom(*x);
  x->set_precision(paddle::lite_api::PrecisionType::kFloat);
  LaunchOp(op, {x_var_name}, {out_var_name});

  // compare results
  auto* out_data = out->mutable_data<paddle::lite::fluid::float16>();
  for (int i = 0; i < out->dims().production(); i++) {
    VLOG(5) << i;
    EXPECT_NEAR(static_cast<double>(out_data[i]), copy_data[i], 5e-4);
  }
}

TEST(MLUBridges, cast) {
  test_cast_FP16_to_FP32({2, 3, 4, 5});
  test_cast_FP16_to_FP32({6, 3, 2, 5});
  test_cast_FP32_to_FP16({2, 3, 4, 5});
  test_cast_FP32_to_FP16({6, 3, 2, 5});
}

}  // namespace mlu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

USE_SUBGRAPH_BRIDGE(cast, kMLU);
lite/kernels/mlu/bridges/concat_op.cc

@@ -44,9 +44,10 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto dims = output_dims.size();
   int axis = (param_axis < 0) ? (param_axis + dims) : param_axis;
-  CHECK_LE(axis, 4) << "Unsupport dims in mlu concat";
-  int nchw_to_nhwc_axis_map[4] = {0, 3, 1, 2};
-  int nhwc_axis = nchw_to_nhwc_axis_map[axis];
+  CHECK_LT(axis, dims) << "Unsupport dims in mlu concat";
+  // value of nhwc2nchw_axis is index of nhwc
+  // order of nhwc2nchw_axis is nchw
+  int nhwc_axis = GetAxisNHWC2NCHW<int>(dims)[axis];

   auto output_tensor = graph->AddNode(
       out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());

@@ -60,6 +61,7 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                                  &outputs,
                                  1));
   graph->FuseOp(concat_op);
+  CNML_CALL(cnmlDestroyBaseOp(&concat_op));
   return SUCCESS;
 }
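Note: the hard-coded {0, 3, 1, 2} table only handled rank-4 tensors; GetAxisNHWC2NCHW generalizes the NCHW-axis to NHWC-axis lookup to any rank. A hypothetical rank-aware version of such a mapping (not the utility.h implementation, which may differ):

    #include <vector>

    // Indexed by NCHW axis; value is the matching NHWC axis.
    // For dims == 4 this reproduces {0, 3, 1, 2}.
    std::vector<int> AxisMapNCHW2NHWC(int dims) {
      std::vector<int> axis_map(dims);
      if (dims < 3) {  // rank 1-2: no layout permutation
        for (int i = 0; i < dims; ++i) axis_map[i] = i;
        return axis_map;
      }
      axis_map[0] = 0;         // batch stays first
      axis_map[1] = dims - 1;  // channels move last
      for (int i = 2; i < dims; ++i) axis_map[i] = i - 1;
      return axis_map;
    }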
lite/kernels/mlu/bridges/conv_op.cc

@@ -13,7 +13,9 @@
 // limitations under the License.

 #include "lite/operators/conv_op.h"
+#include <algorithm>
 #include "lite/kernels/mlu/bridges/graph.h"
 #include "lite/kernels/mlu/bridges/utility.h"
 #include "lite/kernels/npu/bridges/registry.h"

@@ -30,6 +32,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   const auto* op_info = op->op_info();
   const auto* scope = op->scope();
   VLOG(3) << "[MLU] Converting " << op_info->Type() << "... ";
+  CHECK(!op_info->HasAttr("act_type"));

   // get input, filter and op attributes
   const auto input_var_name = op_info->Input("Input").front();

@@ -43,8 +46,13 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   const auto output_shape = output->dims().Vectorize();
   const auto bs = input_dims[0];
   const auto oc = filter_dims[0];
+  const auto groups = op_info->GetAttr<int>("groups");
   CHECK_EQ(input_dims.size(), 4u);
   CHECK_EQ(filter_dims.size(), 4u);
+  CHECK(!(op_info->HasAttr("fuse_relu") &&
+          (op_info->GetAttr<bool>("fuse_relu") == true)))
+      << "UnSupported param fuse_relu is true!";
   const auto strides = op_info->GetAttr<std::vector<int>>("strides");
   auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
   auto paddings = op_info->GetAttr<std::vector<int>>("paddings");

@@ -70,13 +78,32 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                           padding_algorithm,
                           input_dims,
                           filter_dims);

   bool is_group_mode = groups > 1;
+  bool is_depthwise_mode = false;
+  if (filter_dims[0] == groups && filter_dims[1] == 1 && dilations[0] == 1 &&
+      dilations[1] == 1) {
+    // depthwise filter shape = {1, ic ,kh ,kw}
+    is_depthwise_mode = true;
+    is_group_mode = false;
+  }
+
+  auto input_tensor = graph->GetNode(input_var_name);
   const auto output_tensor = graph->AddNode(
       output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());

+  std::vector<int64_t> cnml_filter_shape = {
+      filter_dims[0], filter_dims[1], filter_dims[2], filter_dims[3]};
+  if (is_depthwise_mode) {
+    /* paddle filter shape is {oc , ic / groups == 1, kh, kw} while
+       cnml depthwise conv filter expect shape {oc / groups == 1 , ic , kh, kw}
+       so we should shape filter shape
+     */
+    cnml_filter_shape = {
+        filter_dims[1], filter_dims[0], filter_dims[2], filter_dims[3]};
+  }
+
   // Create filter node
   const auto filter_tensor = graph->AddNode(
-      filter_var_name, filter_dims.Vectorize(), CNML_FILTER, CNML_NCHW, graph->FPType());
+      filter_var_name, cnml_filter_shape, CNML_FILTER, CNML_NCHW, graph->FPType());

@@ -89,15 +116,15 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
     dequant(filter_dequant.data(),
             filter->mutable_data<int8_t>(),
             1,
-            filter_dims[0],
-            filter_dims[1] * filter_dims[2] * filter_dims[3],
+            cnml_filter_shape[0],
+            cnml_filter_shape[1] * cnml_filter_shape[2] * cnml_filter_shape[3],
             weight_scale);
     transpose(filter_dequant.data(),
               filter->mutable_data<float>(),
-              {static_cast<int>(filter_dims[0]),
-               static_cast<int>(filter_dims[1]),
-               static_cast<int>(filter_dims[2]),
-               static_cast<int>(filter_dims[3])},
+              {static_cast<int>(cnml_filter_shape[0]),
+               static_cast<int>(cnml_filter_shape[1]),
+               static_cast<int>(cnml_filter_shape[2]),
+               static_cast<int>(cnml_filter_shape[3])},
              {0, 2, 3, 1});
     filter->set_precision(PrecisionType::kFloat);
   } else if (filter->precision() != PrecisionType::kFloat) {

@@ -116,7 +143,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
     std::vector<int64_t> bias_shape;
     if (bias_data_size == oc) {
       // 0: {oc}
-      bias_shape = {oc};
+      bias_shape = {1, 1, 1, oc};
     } else if (bias_data_size == output_data_size / bs) {
       LOG(FATAL) << "Unsupported ... ...";
       // 1: {1, oc, oh, ow}

@@ -130,18 +157,15 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                  << " isn't supported in conv2d Op when output dimension is "
                  << output_dims;
     }
-    bias_tensor = graph->AddNode(bias_var_name,
-                                 bias_dims.Vectorize(),
-                                 CNML_CONST,
-                                 CNML_CNHW,
-                                 graph->FPType());
+    bias_tensor = graph->AddNode(
+        bias_var_name, bias_shape, CNML_CONST, CNML_NHWC, graph->FPType());
     graph->BindConstData(bias_var_name, bias);
   }

   const auto input_scale = op_info->GetAttr<float>("input_scale");

   bool use_first_conv = false;
-  if (lite::DeviceInfo::Global().UseFirstConv() && input_dims[1] == 3) {
+  if (lite::TargetWrapperMlu::UseFirstConv() && input_dims[1] == 3) {
     use_first_conv = true;
   }

@@ -158,38 +182,75 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                                             paddings[0],
                                             paddings[0]));
     const auto mean_tensor = graph->AddNode("first_conv_mean_tensor",
-                                            std::vector<int64_t>{3},
+                                            std::vector<int64_t>{1, 1, 1, 3},
                                             CNML_CONST,
-                                            CNML_CNHW,
+                                            CNML_NHWC,
                                             graph->FPType());
     const auto std_tensor = graph->AddNode("first_conv_std_tensor",
-                                           std::vector<int64_t>{3},
+                                           std::vector<int64_t>{1, 1, 1, 3},
                                            CNML_CONST,
-                                           CNML_CNHW,
+                                           CNML_NHWC,
                                            graph->FPType());

     graph->BindConstRawData("first_conv_mean_tensor",
-                            lite::DeviceInfo::Global().MeanVec().data(),
+                            lite::TargetWrapperMlu::MeanVec().data(),
                             3,
                             false);
     graph->BindConstRawData("first_conv_std_tensor",
-                            lite::DeviceInfo::Global().StdVec().data(),
+                            lite::TargetWrapperMlu::StdVec().data(),
                             3,
                             false);

-    graph->GetNode(input_var_name)->set_mlu_dtype(CNML_DATA_UINT8);
+    input_tensor->set_mlu_dtype(CNML_DATA_UINT8);

     CNML_CALL(cnmlCreateConvFirstOpForward(
         &conv_op,
         conv_param,
-        graph->GetNode(input_var_name)->mlu_tensor(),
+        input_tensor->mlu_tensor(),
         mean_tensor->mlu_tensor(),
         output_tensor->mlu_tensor(),
         filter_tensor->mlu_tensor(),
         bias_tensor ? bias_tensor->mlu_tensor() : nullptr,
         std_tensor->mlu_tensor()));
     CNML_CALL(cnmlDestroyConvFirstOpParam(&conv_param));
+  } else if (is_depthwise_mode) {
+    cnmlConvDepthwiseOpParam_t conv_depthwise_param;
+    cnmlCreateConvDepthwiseOpParam_V2(&conv_depthwise_param,
+                                      strides[0],
+                                      strides[1],
+                                      paddings[0] * 2,
+                                      paddings[2] * 2);
+    CNML_CALL(cnmlCreateConvDepthwiseOpForward(
+        &conv_op,
+        conv_depthwise_param,
+        input_tensor->mlu_tensor(),
+        output_tensor->mlu_tensor(),
+        filter_tensor->mlu_tensor(),
+        bias_tensor ? bias_tensor->mlu_tensor() : nullptr));
+    CNML_CALL(cnmlDestroyConvDepthwiseOpParam(&conv_depthwise_param));
+  } else if (is_group_mode) {
+    cnmlConvOpParam_t conv_param;
+    CNML_CALL(cnmlCreateConvOpParam(&conv_param,
+                                    strides[0],
+                                    strides[1],
+                                    dilations[0],
+                                    dilations[1],
+                                    paddings[0] * 2,
+                                    paddings[2] * 2));
+    CNML_CALL(cnmlCreateConvGroupOpForward(
+        &conv_op,
+        conv_param,
+        input_tensor->mlu_tensor(),
+        output_tensor->mlu_tensor(),
+        filter_tensor->mlu_tensor(),
+        bias_tensor ? bias_tensor->mlu_tensor() : nullptr,
+        groups));
+    CNML_CALL(cnmlDestroyConvOpParam(&conv_param));
   } else {
     cnmlConvOpParam_t conv_param;
+    VLOG(5) << "conv param (" << input_var_name << ")"
+            << "stride: " << strides[0] << ',' << strides[1] << '\t'
+            << "dilations: " << dilations[0] << ',' << dilations[1] << '\t'
+            << "paddings: " << paddings[0] << ',' << paddings[2] << std::endl;
     CNML_CALL(cnmlCreateConvOpParam(&conv_param,
                                     strides[0],
                                     strides[1],

@@ -200,19 +261,21 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
     CNML_CALL(cnmlCreateConvOpForward(
         &conv_op,
         conv_param,
-        graph->GetNode(input_var_name)->mlu_tensor(),
+        input_tensor->mlu_tensor(),
         output_tensor->mlu_tensor(),
         filter_tensor->mlu_tensor(),
         bias_tensor ? bias_tensor->mlu_tensor() : nullptr));
     CNML_CALL(cnmlDestroyConvOpParam(&conv_param));
   }

-  graph->SetComputingDataType(
-      conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale);
-  graph->SetComputingDataType(
-      conv_op,
-      filter_tensor->mlu_tensor(),
-      1 / *min_element(weight_scale.begin(), weight_scale.end()));
+  if (!is_depthwise_mode) {
+    graph->SetComputingDataType(
+        conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale);
+    graph->SetComputingDataType(
+        conv_op,
+        filter_tensor->mlu_tensor(),
+        1 / *max_element(weight_scale.begin(), weight_scale.end()));
+  }
+  CNML_CALL(cnmlSetOperationComputingLayout(conv_op, CNML_NHWC));
   if (HasInputArg(op_info, scope, "Bias")) {
     auto* bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();

@@ -220,6 +283,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   }
   graph->BindConstData(filter_var_name, filter);
   graph->FuseOp(conv_op);
+  CNML_CALL(cnmlDestroyBaseOp(&conv_op));
   return REBUILD_WHEN_SHAPE_CHANGED;
 }
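Note: the new dispatch picks between three CNML conv ops. The depthwise branch is taken when the filter is {groups, 1, kh, kw} with unit dilations, in which case the filter shape is also swapped to the {1, ic, kh, kw} form CNML expects and the int8 computing-data-type hints are skipped. A standalone sketch of the classification:

    #include <cstdint>
    #include <vector>

    enum class ConvMode { kNormal, kGroup, kDepthwise };

    // Mirrors the is_depthwise_mode / is_group_mode logic in ConvConverter.
    ConvMode ClassifyConv(const std::vector<int64_t>& filter_dims,  // {oc, ic/g, kh, kw}
                          const std::vector<int>& dilations,
                          int groups) {
      // Depthwise: one filter per input channel, and CNML's depthwise op
      // does not take dilation parameters.
      if (filter_dims[0] == groups && filter_dims[1] == 1 &&
          dilations[0] == 1 && dilations[1] == 1) {
        return ConvMode::kDepthwise;
      }
      return groups > 1 ? ConvMode::kGroup : ConvMode::kNormal;
    }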
lite/kernels/mlu/bridges/conv_op_test.cc

@@ -13,8 +13,11 @@
 // limitations under the License.

 #include "lite/operators/conv_op.h"
 #include <gtest/gtest.h>
 #include <random>
 #include "lite/core/op_lite.h"
 #include "lite/core/op_registry.h"
 #include "lite/kernels/mlu/bridges/test_helper.h"

@@ -331,6 +334,10 @@ TEST(MLUBridges, conv) {
 #endif
 }

+TEST(MLUBridges, depthwise_conv2d) {
+  test_conv(1, 8, 8, 14, 14, false, false, false, true, 1, 1, 2, 3);
+}
+
 }  // namespace mlu
 }  // namespace subgraph
 }  // namespace lite
lite/kernels/mlu/bridges/elementwise_ops.cc

@@ -23,7 +23,7 @@ namespace mlu {

 std::vector<int64_t> CvtYShape(const Tensor& x, Tensor* y, int axis) {
   auto x_dims = x.dims();
-  CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x";
+  // CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x";
   auto y_dims = y->dims();
   CHECK_GE(x_dims.size(), y_dims.size());

@@ -117,6 +117,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   }
   graph->FuseOp(elementwise_op);
+  CNML_CALL(cnmlDestroyBaseOp(&elementwise_op));

   cnmlBaseOp_t act_op;
   if (op_type == "fusion_elementwise_add_activation") {
     auto mid_tensor = graph->GetNode(out_var_name + "_mid");

@@ -127,6 +128,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                                     mid_tensor->mlu_tensor(),
                                     output_tensor->mlu_tensor()));
     graph->FuseOp(act_op);
+    CNML_CALL(cnmlDestroyBaseOp(&act_op));
   }
   return REBUILD_WHEN_SHAPE_CHANGED;
 }
lite/kernels/mlu/bridges/elementwise_ops_test.cc

@@ -153,7 +153,7 @@ void test_elementwise_add(const std::vector<int64_t>& x_shape,
   opdesc.SetOutput("Out", {out_var_name});
   opdesc.SetAttr("axis", axis);

-  // create and convert op to NPU model, then run it on NPU
+  // create and convert op to MLU model, then run it on MLU
   auto op = CreateOp<operators::ElementwiseOp>(opdesc, &scope);

   // execute reference implementation and save to output tensor
lite/kernels/mlu/bridges/fc_op.cc
浏览文件 @
cc927184
...
@@ -34,7 +34,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto w_var_name = op_info->Input("W").front();
   auto output_var_name = op_info->Output("Out").front();
-  // int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
+  CHECK(!op_info->HasAttr("activation_type"));
   auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
   auto w = scope->FindVar(w_var_name)->GetMutable<Tensor>();
   auto output = scope->FindVar(output_var_name)->GetMutable<Tensor>();
...
@@ -45,9 +45,28 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   CHECK_EQ(w_dims.size(), 2UL);

   // Create w node
-  std::vector<int64_t> w_shape{w_dims[1], w_dims[0]};
+  std::vector<int64_t> cnml_w_shape;
+  if (x_dims.size() == 4) {
+    if (x_dims[1] * x_dims[2] * x_dims[3] == w_dims[0]) {
+      cnml_w_shape = {
+          static_cast<int>(w_dims[1]),
+          static_cast<int>(x_dims[1]),  // input_c
+          static_cast<int>(x_dims[2]),  // input_h
+          static_cast<int>(x_dims[3]),  // input_w
+      };
+    } else {
+      LOG(FATAL) << "in fc op, we expect input_h * input_w * input_c == filter_c"
+                 << " but we got input_c = " << x_dims[1]
+                 << " input_h = " << x_dims[2] << " input_w = " << x_dims[3]
+                 << " filter_c = " << w_dims[0] << std::endl;
+    }
+  } else {
+    cnml_w_shape = {w_dims[1], w_dims[0]};
+  }
   auto w_tensor = graph->AddNode(
-      w_var_name, w_shape, CNML_FILTER, CNML_NCHW, graph->FPType());
+      w_var_name, cnml_w_shape, CNML_FILTER, CNML_NCHW, graph->FPType());

   auto input_scale = op_info->GetAttr<float>("input_scale");
...
@@ -63,15 +82,15 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   if (HasInputArg(op_info, scope, "Bias")) {
     bias_var_name = op_info->Input("Bias").front();
     auto bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
-    auto bias_dims = bias->dims();
+    auto bias_dims = bias->dims().Vectorize();
     CHECK(!graph->HasNode(bias_var_name));
+    if (bias_dims.size() < 4u) {
+      bias_dims.insert(bias_dims.begin(), 4 - bias_dims.size(), 1);
+    }
     // CHECK_EQ(bias_dims.production(), n);
-    bias_tensor = graph->AddNode(
-        bias_var_name, bias_dims.Vectorize(), CNML_CONST, CNML_CNHW, graph->FPType());
+    bias_tensor = graph->AddNode(
+        bias_var_name, bias_dims, CNML_CONST, CNML_NHWC, graph->FPType());
     graph->BindConstData(bias_var_name, bias);
   }
   cnmlBaseOp_t fc_op;
...
@@ -88,18 +107,46 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   if (w->precision() == PrecisionType::kUnk ||
       w->precision() == PrecisionType::kInt8) {
     std::vector<float> w_dequant(w->data_size());
-    dequant(w_dequant.data(), w->mutable_data<int8_t>(), 1, w_dims[1],
-            w_dims[0], weight_scale);
-    for (int i = 0; i < w_dims[1]; i++) {
-      for (int j = 0; j < w_dims[0]; j++) {
-        w->mutable_data<float>()[i * w_dims[0] + j] =
-            w_dequant[i + j * w_dims[1]];
-      }
-    }
+    if (cnml_w_shape.size() == 2) {
+      dequant(w_dequant.data(), w->mutable_data<int8_t>(), 1, cnml_w_shape[0],
+              cnml_w_shape[1], weight_scale);
+      transpose2d(w_dequant.data(),
+                  w->mutable_data<float>(),
+                  {static_cast<int>(cnml_w_shape[0]),
+                   static_cast<int>(cnml_w_shape[1])});
+    } else if (cnml_w_shape.size() == 4) {
+      dequant(w_dequant.data(), w->mutable_data<int8_t>(), 1, cnml_w_shape[0],
+              cnml_w_shape[1] * cnml_w_shape[2] * cnml_w_shape[3],
+              weight_scale);
+      int c_o_num = cnml_w_shape[0];
+      int c_i_num = cnml_w_shape[1];
+      int h_i_num = cnml_w_shape[2];
+      int w_i_num = cnml_w_shape[3];
+      // chw == ci * hi * wi == w_dim[0]
+      // first trans [chw, co] -> [co, chw]
+      std::vector<float> first_trans_output(w_dequant.size());
+      int chw = c_i_num * h_i_num * w_i_num;
+      transpose2d(w_dequant.data(), first_trans_output.data(), {chw, c_o_num});
+      // second trans [co, ci, hi, wi] -> [co, hi, wi, ci]
+      transpose(first_trans_output.data(),
+                w->mutable_data<float>(),
+                {c_o_num, c_i_num, h_i_num, w_i_num},
+                {0, 2, 3, 1});
+    } else {
+      LOG(FATAL) << "expect w_shape.size == 2 or 4, but got "
+                 << cnml_w_shape.size() << std::endl;
+    }
     w->set_precision(PrecisionType::kFloat);
   } else if (w->precision() != PrecisionType::kFloat) {
     LOG(FATAL) << "UnSupported weight precision!";
...
@@ -110,9 +157,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   graph->SetComputingDataType(
       fc_op,
       w_tensor->mlu_tensor(),
-      1 / *min_element(weight_scale.begin(), weight_scale.end()));
+      1 / *max_element(weight_scale.begin(), weight_scale.end()));
   graph->FuseOp(fc_op);
+  CNML_CALL(cnmlDestroyBaseOp(&fc_op));
   return REBUILD_WHEN_SHAPE_CHANGED;
 }
...
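To make the new weight-shape branch above concrete, here is a small standalone sketch (hypothetical shapes, not part of the commit) of how cnml_w_shape is derived:

  #include <cstdint>
  #include <vector>

  // Mirrors the branch in FCConverter: a 4-D input whose c*h*w product matches
  // the filter's first dim gets a {co, ci, hi, wi} weight shape; otherwise the
  // plain transposed 2-D shape {co, ci} is used.
  std::vector<int64_t> FcWeightShape(const std::vector<int64_t>& x_dims,
                                     const std::vector<int64_t>& w_dims) {
    if (x_dims.size() == 4 && x_dims[1] * x_dims[2] * x_dims[3] == w_dims[0]) {
      return {w_dims[1], x_dims[1], x_dims[2], x_dims[3]};
    }
    return {w_dims[1], w_dims[0]};
  }
  // e.g. FcWeightShape({1, 3, 4, 5}, {60, 10}) -> {10, 3, 4, 5}

In the 4-D case the dequantized int8 weights are then reordered in two steps, [chw, co] -> [co, chw] followed by [co, ci, hi, wi] -> [co, hi, wi, ci], which lines the filter up with the MLU's NHWC data layout.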
lite/kernels/mlu/bridges/fc_op_test.cc
...
@@ -175,9 +175,9 @@ void test_fc(const std::vector<int64_t>& input_shape,
 TEST(MLUBridges, fc) {
   for (bool use_bias : {true, false}) {
-    // test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias);
-    // test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias);
-    // test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias);
+    test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias);
+    test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias);
+    test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias);
     test_fc({1, 1024, 1, 1}, {1024, 32}, 1, use_bias);
   }
 }
...
lite/kernels/mlu/bridges/graph.cc
...
@@ -27,10 +27,14 @@ std::shared_ptr<MLUTensor> Graph::AddNode(const std::string& name,
                                           cnmlTensorType_t tensor_type,
                                           cnmlDataOrder_t shape_order,
                                           cnmlDataType_t mlu_dtype,
+                                          cnmlDataOrder_t data_order,
                                           void* raw_ptr) {
   CHECK(!HasNode(name));
+  VLOG(5) << "add mlu node: " << name << "\t data type "
+          << static_cast<int>(mlu_dtype) << "\t data order "
+          << static_cast<int>(data_order);
   auto node = std::shared_ptr<MLUTensor>(
-      new MLUTensor(shape, tensor_type, shape_order, mlu_dtype));
+      new MLUTensor(shape, tensor_type, shape_order, mlu_dtype, data_order));
   node->set_mlu_ptr(raw_ptr);
   nodes_.insert(std::make_pair(name, node));
   return node;
...
lite/kernels/mlu/bridges/graph.h
...
@@ -15,13 +15,15 @@
 #pragma once

 #include <cmath>
 #include <map>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>

 #include "lite/core/op_lite.h"
 #include "lite/core/tensor.h"
 #include "lite/kernels/mlu/bridges/tensor.h"
+#include "lite/utils/env.h"

 #define PRINT_HW_TIME false
...
@@ -45,32 +47,30 @@ class Graph {
     CNRT_CALL(cnrtCreateNotifier(&notifier_end_));
 #endif
   }

   ~Graph() {
     FreeConstData();
     CNML_CALL(cnmlDestroyFusionOp(&fusion_op_));
     for (auto op : ops_) {
       CNML_CALL(cnmlDestroyBaseOp(&op));
     }
 #if PRINT_HW_TIME
     CNRT_CALL(cnrtDestroyNotifier(&notifier_start_));
     CNRT_CALL(cnrtDestroyNotifier(&notifier_end_));
     double total_time = 0;
-    for (auto& f : time_log_) {
-      total_time += f;
+    if (!time_log_.empty()) {
+      for (auto& f : time_log_) {
+        total_time += f;
+      }
+      std::cout << "cnml hardware time for " << time_log_.size()
+                << " process:" << total_time / time_log_.size() << std::endl;
     }
-    std::cout << "cnml hardware time for " << time_log_.size()
-              << " process:" << total_time / time_log_.size() << std::endl;
 #endif
   }

   // Data node
   std::shared_ptr<MLUTensor> AddNode(
       const std::string& name,
       std::vector<int64_t> shape,
       cnmlTensorType_t tensor_type = CNML_TENSOR,
-      cnmlDataOrder_t data_order = CNML_NCHW,
+      cnmlDataOrder_t shape_order = CNML_NCHW,
       cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32,
+      cnmlDataOrder_t data_order = CNML_NHWC,
       void* raw_ptr = nullptr);

   std::shared_ptr<MLUTensor> GetNode(const std::string& name) {
...
@@ -82,9 +82,16 @@ class Graph {
     return nodes_.find(name) != nodes_.end();
   }

-  void AddInput(std::shared_ptr<MLUTensor> tensor) {
+  void AddInput(std::shared_ptr<MLUTensor> tensor,
+                bool disable_batch_size_changeable = true) {
     inputs_.push_back(tensor->mlu_tensor());
     input_tensors_.push_back(tensor);
+    if (!disable_batch_size_changeable) {
+      constexpr int input_dimNb = 4;
+      bool input_dim_mutable[4] = {true, false, false, false};
+      CNML_CALL(cnmlSetTensorDimMutable(
+          tensor->mlu_tensor(), input_dim_mutable, input_dimNb));
+    }
   }

   void AddOutput(std::shared_ptr<MLUTensor> tensor) {
...
@@ -92,6 +99,22 @@ class Graph {
     output_tensors_.push_back(tensor);
   }

+  std::vector<std::shared_ptr<MLUTensor>>* MutableInputs() {
+    return &input_tensors_;
+  }
+
+  std::vector<std::shared_ptr<MLUTensor>>* MutableOutputs() {
+    return &output_tensors_;
+  }
+
+  void GenOfflineModel(const std::string& name) {
+    cnmlModel_t model;
+    const std::string& symbol = "subnet0";
+    const auto& filename = name + ".offline.cambricon";
+    CNML_CALL(cnmlCreateModel(&model, filename.c_str()));
+    CNML_CALL(cnmlAddFusionOpToModel(model, fusion_op_, symbol.c_str()));
+    CNML_CALL(cnmlSaveModel(model, filename.c_str()));
+    CNML_CALL(cnmlDestroyModel(model));
+  }
+
   void FuseOp(cnmlBaseOp_t op) { CNML_CALL(cnmlFuseOp(op, fusion_op_)); }

   void Compile(cnmlCoreVersion_t core_version, int core_number) {
...
@@ -103,18 +126,37 @@ class Graph {
     CNML_CALL(cnmlSetFusionOpCorenum(fusion_op_, core_number));
     CNML_CALL(cnmlSetFusionOpCoreVersion(fusion_op_, core_version));
     CNML_CALL(cnmlCompileFusionOp_V2(fusion_op_));
-    for (auto in : input_tensors_) {
-      input_addrs_.push_back(in->mlu_data());
-    }
-    for (auto out : output_tensors_) {
-      output_addrs_.push_back(out->mlu_data());
-    }
   }

+#define MEASURE_HWTIME_START(que)                       \
+  do {                                                  \
+    CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que)); \
+  } while (0)
+#define MEASURE_HWTIME_END(que)                                                \
+  do {                                                                         \
+    thread_local float hw_time;                                                \
+    CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que));                          \
+    CNRT_CALL(cnrtSyncQueue(que));                                             \
+    CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time)); \
+    hw_time /= 1000.0f;                                                        \
+    DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl;       \
+    std::lock_guard<std::mutex> lk(time_mut_);                                 \
+    time_log_.push_back(hw_time);                                              \
+  } while (0)
+
   void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) {
+    input_addrs_.resize(input_tensors_.size());
+    output_addrs_.resize(output_tensors_.size());
+    for (size_t i = 0; i < input_addrs_.size(); ++i) {
+      input_addrs_[i] = input_tensors_[i]->mlu_data();
+    }
+    for (size_t i = 0; i < output_addrs_.size(); ++i) {
+      output_addrs_[i] = output_tensors_[i]->mlu_data();
+    }
 #if PRINT_HW_TIME
-    thread_local float hw_time;
-    CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que));
+    MEASURE_HWTIME_START(que);
 #endif
     CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_,
                                             input_addrs_.data(),
...
@@ -124,18 +166,46 @@ class Graph {
                                             &forward_param,
                                             que));
 #if PRINT_HW_TIME
-    CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que));
-    CNRT_CALL(cnrtSyncQueue(que));
-    CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time));
-    hw_time /= 1000.0f;
-    DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl;
-    std::lock_guard<std::mutex> lk(time_mut_);
-    time_log_.push_back(hw_time);
+    MEASURE_HWTIME_END(que);
 #endif
   }

+  void Compute(cnrtQueue_t que,
+               const std::vector<std::shared_ptr<MLUTensor>>& in,
+               const std::vector<std::shared_ptr<MLUTensor>>& out) {
+    std::vector<cnmlTensor_t> in_tensor;
+    std::vector<cnmlTensor_t> out_tensor;
+    input_addrs_.resize(in.size());
+    output_addrs_.resize(out.size());
+    for (size_t i = 0; i < input_addrs_.size(); ++i) {
+      input_addrs_[i] = in[i]->mlu_data();
+      in_tensor.push_back(in[i]->mlu_tensor());
+    }
+    for (size_t i = 0; i < output_addrs_.size(); ++i) {
+      output_addrs_[i] = out[i]->mlu_data();
+      out_tensor.push_back(out[i]->mlu_tensor());
+    }
+#if PRINT_HW_TIME
+    MEASURE_HWTIME_START(que);
+#endif
+    /* Because of using cnmlSetTensorDimMutable, cnmlComputeFusionOpForward_V3
+     * -> cnmlComputeFusionOpForward_V4 */
+    CNML_CALL(cnmlComputeFusionOpForward_V4(fusion_op_,
+                                            &in_tensor[0],
+                                            input_addrs_.data(),
+                                            input_addrs_.size(),
+                                            &out_tensor[0],
+                                            output_addrs_.data(),
+                                            output_addrs_.size(),
+                                            que,
+                                            NULL));
+#if PRINT_HW_TIME
+    MEASURE_HWTIME_END(que);
+#endif
+  }
+#undef MEASURE_HWTIME_START
+#undef MEASURE_HWTIME_END

   template <typename T>
   void* RegisterConstData(size_t len) {
...
@@ -165,7 +235,7 @@ class Graph {
       CNML_CALL(cnmlBindConstData_V2(
           nodes_[tensor_name]->mlu_tensor(), alloc_data, false));
     } else if (fp_type_ == CNML_DATA_FLOAT16) {
-      void* data_fp16 = RegisterConstData<::paddle::lite::fluid::float16>(len);
+      void* data_fp16 = RegisterConstData<paddle::lite::fluid::float16>(len);
       CNRT_CALL(cnrtCastDataType(
           const_cast<void*>(static_cast<const void*>(data)),
           CNRT_FLOAT32,
...
@@ -180,7 +250,7 @@ class Graph {
     }
   }

-  void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) {
+  void BindConstData(std::string tensor_name, paddle::lite::Tensor* tensor) {
     const float* data = tensor->data<float>();
     size_t len = tensor->data_size();
     if (fp_type_ == CNML_DATA_FLOAT32) {
...
@@ -189,10 +259,14 @@ class Graph {
           const_cast<void*>(static_cast<const void*>(data)),
           false));
     } else if (fp_type_ == CNML_DATA_FLOAT16) {
-      auto* data_fp16 = tensor->mutable_data<::paddle::lite::fluid::float16>();
-      for (size_t i = 0; i < len; ++i) {
-        data_fp16[i] = static_cast<::paddle::lite::fluid::float16>(data[i]);
-      }
+      void* data_fp16 = RegisterConstData<paddle::lite::fluid::float16>(len);
+      CNRT_CALL(cnrtCastDataType(
+          const_cast<void*>(static_cast<const void*>(data)),
+          CNRT_FLOAT32,
+          data_fp16,
+          CNRT_FLOAT16,
+          len,
+          nullptr));
       CNML_CALL(cnmlBindConstData_V2(nodes_[tensor_name]->mlu_tensor(),
                                      static_cast<void*>(data_fp16),
                                      false));
...
@@ -206,19 +280,23 @@ class Graph {
                              float scale,
                              cnmlDataType_t data_type = CNML_DATA_INT8) {
     cnmlQuantizedParam_t quant_param;
-    CNML_CALL(
-        cnmlCreateQuantizedParam(&quant_param, scale2position(scale), 1, 0.0));
+    int pos = scale2position(scale);
+    auto cnml_scale = pow(2, pos) * scale;
+    VLOG(5) << "[cnml quantized param] pos: " << pos
+            << "\t scale: " << cnml_scale << std::endl;
+    CNML_CALL(cnmlCreateQuantizedParam(&quant_param, pos, cnml_scale, 0.0));
     CNML_CALL(
         cnmlSetOperationComputingDataType(op, tensor, data_type, quant_param));
     CNML_CALL(cnmlDestroyQuantizedParam(&quant_param));
   }

-  void SetFPType(::paddle::lite_api::PrecisionType type) {
+  void SetFPType(paddle::lite_api::PrecisionType type) {
+    origin_fp_type_ = type;
     switch (type) {
-      case ::paddle::lite_api::PrecisionType::kFP16:
+      case paddle::lite_api::PrecisionType::kFP16:
         fp_type_ = CNML_DATA_FLOAT16;
         break;
-      case ::paddle::lite_api::PrecisionType::kFloat:
+      case paddle::lite_api::PrecisionType::kFloat:
         fp_type_ = CNML_DATA_FLOAT32;
         break;
       default:
...
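A worked example of the new quantized-param math above (the 0.0123f scale is made up): scale2position floors -log2(scale), and the residual scale handed to CNML always lands in (0.5, 1]:

  #include <cmath>
  #include <cstdio>

  int main() {
    float scale = 0.0123f;
    int pos = std::floor(-std::log2(scale));      // 6
    float cnml_scale = std::pow(2, pos) * scale;  // ~0.787, always in (0.5, 1]
    std::printf("pos=%d cnml_scale=%f\n", pos, cnml_scale);
    return 0;
  }

Previously the scale argument was hard-coded to 1, which silently discarded the fractional part of -log2(scale); passing pow(2, pos) * scale preserves it.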
@@ -230,14 +308,14 @@ class Graph {
  private:
   cnmlDataType_t fp_type_{CNML_DATA_FLOAT32};
-  std::map<std::string, std::shared_ptr<MLUTensor>> nodes_;
+  paddle::lite_api::PrecisionType origin_fp_type_{PRECISION(kFloat)};
+  std::unordered_map<std::string, std::shared_ptr<MLUTensor>> nodes_;
   std::vector<cnmlTensor_t> inputs_;
   std::vector<cnmlTensor_t> outputs_;
   std::vector<void*> input_addrs_;
   std::vector<void*> output_addrs_;
   std::vector<std::shared_ptr<MLUTensor>> input_tensors_;
   std::vector<std::shared_ptr<MLUTensor>> output_tensors_;
   std::vector<cnmlBaseOp_t> ops_;
   cnmlFusionOp_t fusion_op_;
   std::vector<void*> const_data_storage_;
 #if PRINT_HW_TIME
...
lite/kernels/mlu/bridges/interpolate_op.cc
...
@@ -85,6 +85,7 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                                            nn_param));
   CNML_CALL(cnmlDestroyNearestNeighborOpParam(&nn_param));
   graph->FuseOp(interp_op);
+  CNML_CALL(cnmlDestroyBaseOp(&interp_op));
   return SUCCESS;
 }
...
lite/kernels/mlu/bridges/layout_op.cc
0 → 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {

int LayoutConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[MLU] Converting " + op_type + "...";

  auto x_var_name = op_info->Input("Input").front();
  auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
  auto out_var_name = op_info->Output("Out").front();
  auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
  auto output_dims = output->dims().Vectorize();

  std::shared_ptr<MLUTensor> output_tensor;
  CHECK(graph->HasNode(x_var_name));
  std::vector<int> axis;
  auto x_tensor = graph->GetNode(x_var_name);
  auto x_data_order = x_tensor->dorder();
  auto x_dims = x->dims().Vectorize();
  if (x_data_order == CNML_NCHW) {
    switch (x_dims.size()) {
      case 2:
        axis = {0, 1};
        break;
      case 3:
        axis = {0, 2, 1};
        break;
      case 4:
        axis = {0, 2, 3, 1};
        break;
      case 5:
        axis = {0, 2, 3, 4, 1};
        break;
      default:
        CHECK(0) << "Unsupport shape";
    }
    output_tensor = graph->AddNode(
        out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, x_tensor->dtype());
    VLOG(3) << "layout transpose nchw to nhwc" << std::endl;
  } else {
    switch (x_dims.size()) {
      case 2:
        axis = {0, 1};
        break;
      case 3:
        axis = {0, 2, 1};
        break;
      case 4:
        axis = {0, 3, 1, 2};
        break;
      case 5:
        axis = {0, 4, 1, 2, 3};
        break;
      default:
        CHECK(0) << "Unsupport shape";
    }
    VLOG(3) << "layout transpose nhwc to nchw" << std::endl;
    output_tensor = graph->AddNode(out_var_name,
                                   output_dims,
                                   CNML_TENSOR,
                                   CNML_NCHW,
                                   x_tensor->dtype(),
                                   CNML_NCHW);
  }

  cnmlBaseOp_t layout_op;
  cnmlNdTransposeOpParam_t transpose_param;
  CNML_CALL(cnmlCreateNdTransposeOpParam(
      &transpose_param, axis.data(), axis.size()));
  CNML_CALL(cnmlCreateNdTransposeProOp(&layout_op,
                                       x_tensor->mlu_tensor(),
                                       output_tensor->mlu_tensor(),
                                       transpose_param));
  graph->FuseOp(layout_op);
  CNML_CALL(cnmlDestroyBaseOp(&layout_op));
  return SUCCESS;
}

}  // namespace mlu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(layout,
                         kMLU,
                         paddle::lite::subgraph::mlu::LayoutConverter);
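The converter above only ever emits one of two permutation families; with a hypothetical 4-D tensor the two directions work out as:

  // NCHW -> NHWC: axis = {0, 2, 3, 1}, so {n, c, h, w} becomes {n, h, w, c}
  // NHWC -> NCHW: axis = {0, 3, 1, 2}, so {n, h, w, c} becomes {n, c, h, w}

The 3-D and 5-D cases follow the same rule: move the channel dimension from position 1 to the end, or from the end back to position 1.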
lite/kernels/mlu/bridges/layout_op_test.cc
0 → 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/operators/layout_op.h"
#include <gtest/gtest.h>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {

void test_layout_NHWC2NCHW(std::vector<int64_t> input_shape) {
  // prepare input&output variables
  std::string x_var_name = "input";
  std::string out_var_name = "out";
  Scope scope;
  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
  x->Resize(DDim(input_shape));

  // initialize input&output data
  FillTensor<float>(x);

  // initialize op desc
  cpp::OpDesc opdesc;
  opdesc.SetType("layout");
  opdesc.SetInput("Input", {x_var_name});
  opdesc.SetOutput("Out", {out_var_name});

  auto op = CreateOp<operators::LayoutOp>(opdesc, &scope);

  // execute reference implementation and save to output tensor
  Tensor input;
  input.Resize(DDim(input_shape));
  switch (input_shape.size()) {
    case 2:
      transpose<float>(x->mutable_data<float>(),
                       input.mutable_data<float>(),
                       {static_cast<int>(input_shape[0]),
                        static_cast<int>(input_shape[1])},
                       {0, 1});
      break;
    case 3:
      transpose<float>(x->mutable_data<float>(),
                       input.mutable_data<float>(),
                       {static_cast<int>(input_shape[0]),
                        static_cast<int>(input_shape[2]),
                        static_cast<int>(input_shape[1])},
                       {0, 2, 1});
      break;
    case 4:
      transpose<float>(x->mutable_data<float>(),
                       input.mutable_data<float>(),
                       {static_cast<int>(input_shape[0]),
                        static_cast<int>(input_shape[2]),
                        static_cast<int>(input_shape[3]),
                        static_cast<int>(input_shape[1])},
                       {0, 3, 1, 2});
      break;
    case 5:
      transpose<float>(x->mutable_data<float>(),
                       input.mutable_data<float>(),
                       {static_cast<int>(input_shape[0]),
                        static_cast<int>(input_shape[2]),
                        static_cast<int>(input_shape[3]),
                        static_cast<int>(input_shape[4]),
                        static_cast<int>(input_shape[1])},
                       {0, 4, 1, 2, 3});
      break;
    default:
      CHECK(0) << "Unsupport";
  }
  auto* x_data = input.mutable_data<float>();
  LaunchOp(op, {x_var_name}, {out_var_name});

  // compare results
  auto* out_data = out->mutable_data<float>();
  for (int i = 0; i < out->dims().production(); i++) {
    VLOG(5) << i;
    EXPECT_NEAR(out_data[i], x_data[i], 5e-4);
  }
}

void test_layout_NCHW2NHWC(std::vector<int64_t> input_shape) {
  // prepare input&output variables
  std::string x_var_name = "input";
  std::string out_var_name = "out";
  Scope scope;
  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
  x->Resize(DDim(input_shape));

  // initialize input&output data
  FillTensor<float>(x);

  // initialize op desc
  cpp::OpDesc opdesc;
  opdesc.SetType("layout");
  opdesc.SetInput("Input", {x_var_name});
  opdesc.SetOutput("Out", {out_var_name});

  auto op = CreateOp<operators::LayoutOp>(opdesc, &scope);

  // execute reference implementation and save to output tensor
  Tensor input;
  input.Resize(DDim(input_shape));
  switch (input_shape.size()) {
    case 2:
      transpose<float>(x->mutable_data<float>(),
                       input.mutable_data<float>(),
                       {static_cast<int>(input_shape[0]),
                        static_cast<int>(input_shape[1])},
                       {0, 1});
      break;
    case 3:
      transpose<float>(x->mutable_data<float>(),
                       input.mutable_data<float>(),
                       {static_cast<int>(input_shape[0]),
                        static_cast<int>(input_shape[1]),
                        static_cast<int>(input_shape[2])},
                       {0, 2, 1});
      break;
    case 4:
      transpose<float>(x->mutable_data<float>(),
                       input.mutable_data<float>(),
                       {static_cast<int>(input_shape[0]),
                        static_cast<int>(input_shape[1]),
                        static_cast<int>(input_shape[2]),
                        static_cast<int>(input_shape[3])},
                       {0, 2, 3, 1});
      break;
    case 5:
      transpose<float>(x->mutable_data<float>(),
                       input.mutable_data<float>(),
                       {static_cast<int>(input_shape[0]),
                        static_cast<int>(input_shape[1]),
                        static_cast<int>(input_shape[2]),
                        static_cast<int>(input_shape[3]),
                        static_cast<int>(input_shape[4])},
                       {0, 2, 3, 4, 1});
      break;
    default:
      CHECK(0) << "Unsupport";
  }
  auto* x_data = input.mutable_data<float>();
  LaunchOp(op, {x_var_name}, {out_var_name}, CNML_NCHW);

  // compare results
  auto* out_data = out->mutable_data<float>();
  for (int i = 0; i < out->dims().production(); i++) {
    VLOG(5) << i;
    EXPECT_NEAR(out_data[i], x_data[i], 5e-4);
  }
}

TEST(MLUBridges, layout) {
  test_layout_NHWC2NCHW({12, 32, 4});
  test_layout_NHWC2NCHW({12, 32, 44, 3});
  test_layout_NHWC2NCHW({12, 32, 44, 3, 6});
  test_layout_NCHW2NHWC({12, 32, 55});
  test_layout_NCHW2NHWC({12, 32, 44, 3});
  test_layout_NCHW2NHWC({12, 32, 44, 3, 8});
  test_layout_NHWC2NCHW({12, 32});
  test_layout_NCHW2NHWC({12, 32});
}

}  // namespace mlu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

USE_SUBGRAPH_BRIDGE(layout, kMLU);
lite/kernels/mlu/bridges/paddle_use_bridges.h
...
@@ -15,6 +15,7 @@
 #pragma once

 USE_SUBGRAPH_BRIDGE(relu, kMLU);
+USE_SUBGRAPH_BRIDGE(relu6, kMLU)
 USE_SUBGRAPH_BRIDGE(conv2d, kMLU);
 USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU);
 USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU);
...
@@ -26,3 +27,7 @@ USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU);
 USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU);
 USE_SUBGRAPH_BRIDGE(concat, kMLU);
 USE_SUBGRAPH_BRIDGE(scale, kMLU);
+USE_SUBGRAPH_BRIDGE(sigmoid, kMLU);
+USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU);
+USE_SUBGRAPH_BRIDGE(cast, kMLU);
+USE_SUBGRAPH_BRIDGE(layout, kMLU);
lite/kernels/mlu/bridges/pool_op.cc
...
@@ -55,6 +55,9 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto global_pooling = op_info->GetAttr<bool>("global_pooling");
   auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
   auto strides = op_info->GetAttr<std::vector<int>>("strides");
+  CHECK(!(op_info->HasAttr("exclusive") &&
+          op_info->GetAttr<bool>("exclusive") == false))
+      << "Unsupport param exclusive is false!";
   if (paddings.size() == 2L) {
     for (size_t i = 0; i < 2L; ++i) {
...
@@ -62,8 +65,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
       paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
     }
   }
-  int pad_height = paddings[0];
-  int pad_width = paddings[2];
   std::string padding_algorithm("");
   if (op_info->HasAttr("padding_algorithm")) {
     padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
...
@@ -72,6 +73,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   if (op_info->HasAttr("adaptive")) {
     adaptive = op_info->GetAttr<bool>("adaptive");
   }
+  auto input_dims = x->dims();
+  lite::operators::UpdatePadding(&paddings,
                                  global_pooling,
                                  adaptive,
...
@@ -80,31 +83,31 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                                  strides,
                                  ksize);
   // std::vector<int64_t> output_shape({input_dims[0], input_dims[1]});
   // for (size_t i = 0; i < 2; i++) {
   //   output_shape.push_back(
   //       (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] -
   //        ksize[0]) /
   //           strides[i] +
   //       1);
   // }
   if (global_pooling) {
     ksize.resize(static_cast<size_t>(input_dims.size()) - 2);
     for (size_t i = 0; i < ksize.size(); ++i) {
       ksize[i] = static_cast<int>(input_dims[i + 2]);
     }
   }
   auto output_tensor = graph->AddNode(
       output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
   cnmlPoolOpParam_t pool_param;
-  CNML_CALL(cnmlCreatePoolOpParam_V2(&pool_param,
+  CNML_CALL(cnmlCreatePoolOpParam_V3(&pool_param,
                                      ksize[0],
                                      ksize[1],
                                      strides[0],
                                      strides[1],
-                                     pad_height,
-                                     pad_width,
-                                     1,  // dilation
-                                     1,
+                                     paddings[0],
+                                     paddings[1],
+                                     paddings[2],
+                                     paddings[3],
+                                     1,  // dilation h
+                                     1,  // dilation w
                                      ToCnmlPoolMode(pooling_type),
-                                     ceil_mode ? CNML_POOL_KVALID : CNML_POOL_KFULL,
+                                     ceil_mode ? CNML_POOL_KFULL : CNML_POOL_KVALID,
                                      true, /* real */
                                      1 /* blend factor */));
   cnmlBaseOp_t pool_op;
...
@@ -114,6 +117,7 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                            output_tensor->mlu_tensor()));
   CNML_CALL(cnmlDestroyPoolOpParam(&pool_param));
   graph->FuseOp(pool_op);
+  CNML_CALL(cnmlDestroyBaseOp(&pool_op));
   return SUCCESS;
 }
...
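Note the swapped ceil_mode mapping in the cnmlCreatePoolOpParam_V3 call above. Assuming the usual reading of the CNML pooling-strategy enums (KFULL counts partial windows at the border, KVALID only complete ones), the corrected mapping is:

  // ceil_mode == true  -> CNML_POOL_KFULL   (output extent rounded up)
  // ceil_mode == false -> CNML_POOL_KVALID  (output extent rounded down)

The previous code had the two constants reversed.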
lite/kernels/mlu/bridges/pool_op_test.cc
...
@@ -43,6 +43,12 @@ void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) {
   std::string pooling_type = op_info->GetAttr<std::string>("pooling_type");
   bool global_pooling = op_info->GetAttr<bool>("global_pooling");
+  if (pooling_type == "max") {
+    for (int i = 0; i < out_dims.production(); ++i) {
+      dst_ptr[i] = -65504.f;
+    }
+  }
   int in_n = in_dims[0];
   int in_c = in_dims[1];
   int in_h = in_dims[2];
...
@@ -203,62 +209,46 @@ void test_pool(int bs,
 }

 TEST(MLUBridges, pool) {
-  // for (auto pooling_type : {"max", "avg"}) {
-  //   for (auto ceil_mode : {true, false}) {
-  //     for (auto global_pooling : {/*true, */ false}) {
-  //       for (auto exclusive : {true /*, false*/}) {
-  //         for (auto ksize : {2, 3}) {
-  //           for (auto stride : {1, 2}) {
-  //             for (auto padding : {0, 1}) {
-  //               for (auto bs : {1, 3}) {
-  //                 for (auto ic : {1, 3}) {
-  //                   for (auto ih : {3, 7}) {
-  //                     for (auto iw : {3, 7}) {
-  //                       test_pool(bs,
-  //                                 ic,
-  //                                 ih,
-  //                                 iw,
-  //                                 pooling_type,
-  //                                 ceil_mode,
-  //                                 global_pooling,
-  //                                 exclusive,
-  //                                 ksize,
-  //                                 stride,
-  //                                 padding);
-  //                     }
-  //                   }
-  //                 }
-  //               }
-  //             }
-  //           }
-  //         }
-  //       }
-  //     }
-  //   }
-  // }
   for (auto pooling_type : {"max", "avg"}) {
     for (auto ceil_mode : {true, false}) {
-      bool global_pooling = false;
-      bool exclusive = true;
-      int ksize = 2;
-      int stride = 1;
-      int padding = 0;
-      int bs = 6;
-      int ic = 6;
-      int ih = 6;
-      int iw = 6;
-      test_pool(bs, ic, ih, iw, pooling_type, ceil_mode, global_pooling,
-                exclusive, ksize, stride, padding);
+      for (auto global_pooling : {true, false}) {
+        for (auto exclusive : {true /*, false*/}) {
+          for (auto ksize : {2, 3}) {
+            for (auto stride : {1, 2}) {
+              for (auto padding : {0, 1}) {
+                for (auto bs : {1, 3}) {
+                  for (auto ic : {1, 3}) {
+                    for (auto ih : {3, 7}) {
+                      for (auto iw : {3, 7}) {
+                        LOG(INFO) << "shape: " << bs << ',' << ic << ','
+                                  << ih << ',' << iw << '\t'
+                                  << "pooling type: " << pooling_type << '\t'
+                                  << "ceil model: " << ceil_mode << '\t'
+                                  << "global_pooling: " << global_pooling
+                                  << '\t' << "exclusive: " << exclusive
+                                  << '\t' << "ksize: " << ksize << '\t'
+                                  << "stride: " << stride << '\t'
+                                  << "padding: " << padding;
+                        test_pool(bs,
+                                  ic,
+                                  ih,
+                                  iw,
+                                  pooling_type,
+                                  ceil_mode,
+                                  global_pooling,
+                                  exclusive,
+                                  ksize,
+                                  stride,
+                                  padding);
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
     }
   }
 }
...
lite/kernels/mlu/bridges/scale_op.cc
...
@@ -61,6 +61,7 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                               alpha_tensor->mlu_tensor(),
                               beta_tensor->mlu_tensor()));
   graph->FuseOp(scale_op);
+  CNML_CALL(cnmlDestroyBaseOp(&scale_op));
   return SUCCESS;
 }
...
lite/kernels/mlu/bridges/softmax_op.cc
...
@@ -35,9 +35,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto out_var_name = op_info->Output("Out").front();
   auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
   auto output_dims = output->dims().Vectorize();
+  auto x_shape =
+      scope->FindVar(x_var_name)->GetMutable<Tensor>()->dims().Vectorize();

-  // nchw axis to nhwc aixs
-  int nchw_to_nhwc_aixs_map[4] = {0, 3, 1, 2};
+  // nchw axis to nhwc axis
   int axis = 1;
   if (op_info->HasAttr("axis")) {
     axis = op_info->GetAttr<int>("axis");
...
@@ -45,7 +46,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
       axis = output_dims.size() + axis;
     }
   }
-  int nhwc_axis = nchw_to_nhwc_aixs_map[axis];
+  // value of nhwc2nchw_axis is index of nhwc
+  // order of nhwc2nchw_axis is nchw
+  int nhwc_axis = GetAxisNHWC2NCHW<int>(x_shape.size())[axis];

   auto output_tensor = graph->AddNode(
       out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
...
@@ -55,6 +58,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
                                graph->GetNode(x_var_name)->mlu_tensor(),
                                output_tensor->mlu_tensor()));
   graph->FuseOp(softmax_op);
+  CNML_CALL(cnmlDestroyBaseOp(&softmax_op));
   return SUCCESS;
 }
...
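GetAxisNHWC2NCHW, used above, returns for each NCHW position the index of that dimension in NHWC order, so indexing it with a Paddle (NCHW) axis yields the MLU (NHWC) axis. A quick check for the 4-D case (worked example, not from the diff):

  // GetAxisNHWC2NCHW<int>(4) -> {0, 3, 1, 2}
  // softmax over the channel axis: nchw axis 1 -> nhwc axis 3
  int nhwc_axis = GetAxisNHWC2NCHW<int>(4)[1];  // == 3

This reproduces what the removed nchw_to_nhwc_aixs_map table did for 4-D inputs while also covering other tensor ranks.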
lite/kernels/mlu/bridges/softmax_op_test.cc
...
@@ -93,7 +93,7 @@ void test_softmax(const std::vector<int64_t>& input_shape, int axis) {
   opdesc.SetOutput("Out", {out_var_name});
   opdesc.SetAttr("axis", axis);

-  // create and convert op to NPU model, then run it on NPU
+  // create and convert op to MLU model, then run it on MLU
   auto op = CreateOp<operators::SoftmaxOp>(opdesc, &scope);
   // execute reference implementation and save to output tensor
   softmax_ref<float>(op);
...
lite/kernels/mlu/bridges/tensor.cc
...
@@ -16,6 +16,9 @@
 #include <glog/logging.h>
 #include <algorithm>
 #include <climits>
+#include <fstream>
+#include <sstream>
+#include <string>
 #include <vector>

 namespace paddle {
...
@@ -25,8 +28,9 @@ namespace mlu {
 MLUTensor::MLUTensor(const std::vector<int64_t>& shape,
                      cnmlTensorType_t tensor_type,
-                     cnmlDataOrder_t data_order,
-                     cnmlDataType_t mlu_dtype)
+                     cnmlDataOrder_t shape_order,
+                     cnmlDataType_t mlu_dtype,
+                     cnmlDataOrder_t data_order)
     : mlu_tensor_(nullptr), tensor_type_(tensor_type), mlu_ptr_(nullptr) {
   std::vector<int> int_shape;
   for (auto i : shape) {
...
@@ -36,15 +40,18 @@ MLUTensor::MLUTensor(const std::vector<int64_t>& shape,
       LOG(FATAL) << "Shape size is beyond the limitation of MLUTensor!";
     }
   }
-  remember(int_shape, tensor_type, mlu_dtype, data_order);
+  remember(int_shape, tensor_type, mlu_dtype, shape_order, data_order);
 }

 void MLUTensor::remember(const std::vector<int>& shape,
                          cnmlTensorType_t tensor_type,
                          cnmlDataType_t mlu_dtype,
-                         cnmlDataOrder_t shape_order) {
+                         cnmlDataOrder_t shape_order,
+                         cnmlDataOrder_t data_order) {
   tensor_type_ = tensor_type;
   mlu_dtype_ = mlu_dtype;
+  data_order_ = data_order;
+  origin_shape_.assign(shape.begin(), shape.end());

   int size = 4;
   if (shape.size() > 4 || shape_order == CNML_ARRAY) {
...
@@ -239,13 +246,22 @@ void MLUTensor::remember(const std::vector<int>& shape,
         break;
     }
   }
-  dim_ = shape_.size();
+  auto shape_NCHW = DimNHWC2NCHW(shape_);
+  shape_NCHW.erase(shape_NCHW.begin() + shape.size(), shape_NCHW.end());
+  dim_ = shape_NCHW.size();
+  shape_ = DimNCHW2NHWC(shape_NCHW);
 }

 void MLUTensor::Create() {
   if (mlu_tensor_ == nullptr) {
     CNML_CALL(cnmlCreateTensor_V2(&mlu_tensor_, tensor_type_));
+    std::vector<int> dim_shape(shape_);
+    if (data_order_ == CNML_NCHW) {
+      std::transform(origin_shape_.cbegin(),
+                     origin_shape_.cend(),
+                     dim_shape.begin(),
+                     [](DDim::value_type in) { return static_cast<int>(in); });
+    }
     int* dim_strides = nullptr;
     CNML_CALL(cnmlSetTensorShape_V2(
         mlu_tensor_, dim_, dim_shape.data(), dim_strides));
...
@@ -258,6 +274,84 @@ cnmlTensor_t MLUTensor::mlu_tensor() {
   return mlu_tensor_;
 }

+void MLUTensor::ToFile(std::string file_name) {
+  if (mlu_ptr_) {
+    VLOG(5) << "to dump mlu ptr: " << mlu_ptr_ << " to: " << file_name;
+    int count = 1;
+    for (size_t i = 0; i < shape_.size(); i++) {
+      count *= shape_[i];
+    }
+    VLOG(6) << " dump count: " << count;
+    VLOG(6) << " dump shape: ";
+    for (size_t i = 0; i < shape_.size(); i++) {
+      VLOG(6) << shape_[i] << " ";
+    }
+    std::vector<float> cpu_data_fp32(count);
+    // fp16 to fp32
+    if (mlu_dtype_ == CNML_DATA_FLOAT16) {
+      VLOG(6) << " convert fp16 to fp32 ";
+      std::vector<uint16_t> cpu_data_fp16(count);
+      cnrtMemcpy(cpu_data_fp16.data(),
+                 mlu_ptr_,
+                 count * sizeof(uint16_t),
+                 CNRT_MEM_TRANS_DIR_DEV2HOST);
+      for (int i = 0; i < count; i++) {
+        cnrtConvertHalfToFloat(&(cpu_data_fp32[i]), cpu_data_fp16[i]);
+      }
+    } else {
+      cnrtMemcpy(cpu_data_fp32.data(),
+                 mlu_ptr_,
+                 count * sizeof(float),
+                 CNRT_MEM_TRANS_DIR_DEV2HOST);
+    }
+    // trans to nchw
+    std::vector<float> cpu_data_trans(count);
+    if (data_order_ != CNML_NCHW) {
+      switch (shape_.size()) {
+        case 4:
+          transpose(cpu_data_fp32.data(),
+                    cpu_data_trans.data(),
+                    shape_,
+                    {0, 3, 1, 2});
+          break;
+        case 3:
+          transpose(cpu_data_fp32.data(),
+                    cpu_data_trans.data(),
+                    shape_,
+                    {0, 2, 1});
+          break;
+        case 2:
+          transpose(cpu_data_fp32.data(),
+                    cpu_data_trans.data(),
+                    shape_,
+                    {0, 1});
+          break;
+        case 1:
+          transpose(cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0});
+          break;
+        default:
+          CHECK(0) << "ToFile only support dim <= 4";
+          break;
+      }
+    }
+    // to file
+    std::ostringstream outs;
+    for (int i = 0; i < count; i++) {
+      if (data_order_ == CNML_NCHW) {
+        outs << cpu_data_fp32[i] << std::endl;
+      } else {
+        outs << cpu_data_trans[i] << std::endl;
+      }
+    }
+    std::ofstream of;
+    of.open(file_name, std::ios::out);
+    of << outs.str();
+    of.close();
+  } else {
+    LOG(FATAL) << "mlu ptr is null, can not dump mlu content to: " << file_name;
+  }
+}
+
 MLUTensor::~MLUTensor() {
   if (mlu_tensor_ != nullptr) {
     CNML_CALL(cnmlDestroyTensor(&mlu_tensor_));
...
lite/kernels/mlu/bridges/tensor.h
...
@@ -14,6 +14,8 @@
 #pragma once

+#include <fstream>
+#include <string>
 #include <vector>

 #include "lite/kernels/mlu/bridges/utility.h"
...
@@ -33,13 +35,15 @@ class MLUTensor {
   MLUTensor(const std::vector<int64_t>& shape,
             cnmlTensorType_t tensor_type = CNML_TENSOR,
-            cnmlDataOrder_t data_order = CNML_NCHW,
-            cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32);
+            cnmlDataOrder_t shape_order = CNML_NCHW,
+            cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32,
+            cnmlDataOrder_t data_order = CNML_NHWC);

   void remember(const std::vector<int>& shape,
                 cnmlTensorType_t tensor_type,
                 cnmlDataType_t mlu_dtype,
-                cnmlDataOrder_t shape_order);
+                cnmlDataOrder_t shape_order,
+                cnmlDataOrder_t data_order);
   void Create();
   cnmlTensor_t mlu_tensor();

   void* mlu_data() {
...
@@ -47,14 +51,21 @@ class MLUTensor {
     return mlu_ptr_;
   }

   cnmlDataType_t dtype() { return mlu_dtype_; }
   void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; }

+  const std::vector<int64_t>& get_origin_shape() const { return origin_shape_; }
+
   ~MLUTensor();

+  void ToFile(std::string file_name);
+
+  cnmlDataOrder_t dorder() { return data_order_; }
+
  private:
   cnmlTensor_t mlu_tensor_;

   std::vector<int> shape_;
+  std::vector<int64_t> origin_shape_;
   cnmlTensorType_t tensor_type_;
   cnmlDataType_t mlu_dtype_;
   int dim_{0};
...
lite/kernels/mlu/bridges/test_helper.cc
...
@@ -24,18 +24,38 @@ namespace lite {
 namespace subgraph {
 namespace mlu {

+template <lite_api::PrecisionType Dtype>
+void PrepareInput(Graph* graph,
+                  const std::string& input_name,
+                  Tensor* input_tensor,
+                  cnmlDataOrder_t order) {
+  thread_local Tensor temp_input;
+  temp_input.Resize(input_tensor->dims().Vectorize());
+  temp_input.CopyDataFrom(*input_tensor);
+  using data_type = typename MLUTypeTraits<Dtype>::type;
+  auto input_node =
+      graph->AddNode(input_name,
+                     input_tensor->dims().Vectorize(),
+                     CNML_TENSOR,
+                     CNML_NCHW,
+                     MLUTypeTraits<Dtype>::cnml_type,
+                     order,
+                     reinterpret_cast<void*>(
+                         input_tensor->template mutable_data<data_type>(
+                             TARGET(kMLU))));
+  CHECK(input_node);
+  CNRT_CHECK(cnrtMemcpy(input_tensor->template mutable_data<data_type>(),
+                        temp_input.mutable_data<data_type>(),
+                        sizeof(data_type) * input_tensor->dims().production(),
+                        CNRT_MEM_TRANS_DIR_HOST2DEV));
+}
+
 void LaunchOp(const std::shared_ptr<lite::OpLite> op,
               const std::vector<std::string>& input_var_names,
-              const std::vector<std::string>& output_var_names) {
+              const std::vector<std::string>& output_var_names,
+              cnmlDataOrder_t order) {
   CNRT_CALL(cnrtInit(0));
-  ::paddle::lite::SetMluDevice(0);
+  lite::SetMluDevice(0);
   cnrtQueue_t queue_;
   cnrtInvokeFuncParam_t forward_param;
   u32_t affinity = 1;
   int data_param = 1;
   forward_param.data_parallelism = &data_param;
   forward_param.affinity = &affinity;
   forward_param.end = CNRT_PARAM_END;
   CNRT_CALL(cnrtCreateQueue(&queue_));
   cnrtDev_t dev_handle;
   CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, 0));
...
@@ -50,23 +70,21 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
   // Convert input data var and add it into the MLU IR graph
   for (auto& input_name : input_var_names) {
     auto input_tensor = scope->FindMutableTensor(input_name);
     CHECK(input_tensor);
-    Tensor temp_input;
-    temp_input.Resize(input_tensor->dims().Vectorize());
-    temp_input.CopyDataFrom(*input_tensor);
-    auto input_node =
-        graph.AddNode(input_name,
-                      input_tensor->dims().Vectorize(),
-                      CNML_TENSOR,
-                      CNML_NCHW,
-                      graph.FPType(),
-                      reinterpret_cast<void*>(
-                          input_tensor->mutable_data<float>(TARGET(kMLU))));
-    CHECK(input_node);
-    CNRT_CHECK(cnrtMemcpy(input_tensor->mutable_data<float>(),
-                          temp_input.mutable_data<float>(),
-                          sizeof(float) * input_tensor->dims().production(),
-                          CNRT_MEM_TRANS_DIR_HOST2DEV));
+    auto data_type = input_tensor->precision();
+    switch (data_type) {
+#define PREPARE_INPUT(type__)                                                 \
+  case PRECISION(type__):                                                     \
+    PrepareInput<PRECISION(type__)>(&graph, input_name, input_tensor, order); \
+    break;
+      PREPARE_INPUT(kFP16)
+      PREPARE_INPUT(kFloat)
+      PREPARE_INPUT(kInt8)
+      PREPARE_INPUT(kInt32)
+#undef PREPARE_INPUT
+      default:
+        CHECK(0);
+    }
   }
   op->CheckShape();
   op->InferShape();
...
@@ -89,8 +107,9 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
   }
   graph.Compile(CNML_MLU270, 1);
-  graph.Compute(forward_param, queue_);
+  graph.Compute(queue_, *(graph.MutableInputs()), *(graph.MutableOutputs()));
+  CNRT_CALL(cnrtSyncQueue(queue_));

   for (auto& output_name : output_var_names) {
     auto output_tensor = scope->FindMutableTensor(output_name);
     Tensor temp_out;
...
lite/kernels/mlu/bridges/test_helper.h
...
@@ -58,7 +58,8 @@ void FillTensor(Tensor* x,
 void LaunchOp(const std::shared_ptr<lite::OpLite> op,
               const std::vector<std::string>& input_var_names,
-              const std::vector<std::string>& output_var_names);
+              const std::vector<std::string>& output_var_names,
+              cnmlDataOrder_t order = CNML_NHWC);

 }  // namespace mlu
 }  // namespace subgraph
...
lite/kernels/mlu/bridges/utility.cc
...
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "lite/kernels/mlu/bridges/utility.h"
+#include <utility>

 namespace paddle {
...
@@ -20,33 +21,21 @@ namespace lite {
 namespace subgraph {
 namespace mlu {

-void transpose(float* input_data,
-               float* output_data,
-               std::vector<int> input_shape,
-               std::vector<int> axis) {
+void transpose2d(float* input_data,
+                 float* output_data,
+                 std::vector<int> input_shape) {
+  CHECK_EQ(input_shape.size(), 2);
   int old_index = -1;
   int new_index = -1;
-  int dim[4] = {0};
-  std::vector<int> shape = input_shape;
-  for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) {
-    for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) {
-      for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) {
-        for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) {
-          old_index = dim[0] * shape[1] * shape[2] * shape[3] +
-                      dim[1] * shape[2] * shape[3] + dim[2] * shape[3] +
-                      dim[3];
-          new_index =
-              dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] +
-              dim[axis[1]] * shape[axis[2]] * shape[axis[3]] +
-              dim[axis[2]] * shape[axis[3]] + dim[axis[3]];
-          output_data[new_index] = input_data[old_index];
-        }
-      }
+  for (int i = 0; i < input_shape[0]; i++) {
+    for (int j = 0; j < input_shape[1]; j++) {
+      old_index = i * input_shape[1] + j;
+      new_index = j * input_shape[0] + i;
+      output_data[new_index] = input_data[old_index];
     }
   }
 }

-int scale2position(float scale) { return static_cast<int>(-std::log2(scale)); }
-
 void dequant(float* dst, int8_t* src, size_t size, float scale) {
   for (size_t i = 0; i < size; ++i) {
     dst[i] = static_cast<float>(src[i]) * scale;
...
lite/kernels/mlu/bridges/utility.h
...
@@ -16,24 +16,76 @@
 #include <cnml.h>
 #include <cnrt.h>
 #include <memory>
+#include <string>
 #include <vector>

 #include "lite/backends/mlu/mlu_utils.h"
 #include "lite/core/op_lite.h"
 #include "lite/core/tensor.h"
-#include "lite/fluid/data_type.h"
+#include "lite/fluid/float16.h"

 namespace paddle {
 namespace lite {
 namespace subgraph {
 namespace mlu {

-void transpose(float* input_data,
-               float* output_data,
-               std::vector<int> input_shape,
-               std::vector<int> axis);
+void transpose2d(float* input_data,
+                 float* output_data,
+                 std::vector<int> input_shape);
+
+template <typename dtype>
+void transpose(dtype* input_data,
+               dtype* output_data,
+               std::vector<int> input_shape,
+               std::vector<int> axis) {
+  int old_index = -1;
+  int new_index = -1;
+  std::vector<int> shape;
+  std::vector<int> expand_axis;
+  if (input_shape.size() < 5u) {
+    for (size_t i = 0; i < 5 - input_shape.size(); i++) {
+      shape.push_back(1);
+      expand_axis.push_back(i);
+    }
+    for (size_t i = 0; i < input_shape.size(); i++) {
+      shape.push_back(input_shape[i]);
+      expand_axis.push_back(axis[i] + 5 - input_shape.size());
+    }
+  } else {
+    shape = input_shape;
+    expand_axis = axis;
+  }
+  int dim[5] = {0};
+  for (dim[0] = 0; dim[0] < shape[0]; dim[0]++) {
+    for (dim[1] = 0; dim[1] < shape[1]; dim[1]++) {
+      for (dim[2] = 0; dim[2] < shape[2]; dim[2]++) {
+        for (dim[3] = 0; dim[3] < shape[3]; dim[3]++) {
+          for (dim[4] = 0; dim[4] < shape[4]; dim[4]++) {
+            old_index = dim[0] * shape[1] * shape[2] * shape[3] * shape[4] +
+                        dim[1] * shape[2] * shape[3] * shape[4] +
+                        dim[2] * shape[3] * shape[4] + dim[3] * shape[4] +
+                        dim[4];
+            new_index = dim[expand_axis[0]] * shape[expand_axis[1]] *
+                            shape[expand_axis[2]] * shape[expand_axis[3]] *
+                            shape[expand_axis[4]] +
+                        dim[expand_axis[1]] * shape[expand_axis[2]] *
+                            shape[expand_axis[3]] * shape[expand_axis[4]] +
+                        dim[expand_axis[2]] * shape[expand_axis[3]] *
+                            shape[expand_axis[4]] +
+                        dim[expand_axis[3]] * shape[expand_axis[4]] +
+                        dim[expand_axis[4]];
+            output_data[new_index] = input_data[old_index];
+          }
+        }
+      }
+    }
+  }
+}
+
-int scale2position(float scale);
+inline int scale2position(float scale) { return std::floor(-std::log2(scale)); }

 void dequant(float* dst, int8_t* src, size_t size, float scale);

 void dequant(float* dst,
...
@@ -64,27 +116,94 @@ inline const ::paddle::lite::DDimLite DimNCHW2NHWC(
       std::vector<int64_t>({dim[0], dim[2], dim[3], dim[1]}));
 }

-inline const std::vector<int64_t> DimNHWC2NCHW(const std::vector<int64_t>& dim) {
-  return std::vector<int64_t>({dim[0], dim[3], dim[1], dim[2]});
-}
+template <typename data_type>
+inline const std::vector<data_type> DimNHWC2NCHW(
+    const std::vector<data_type>& dim) {
+  switch (dim.size()) {
+    case 1:
+      return dim;
+    case 2:
+      return dim;
+    case 3:
+      return std::vector<data_type>({dim[0], dim[2], dim[1]});
+    case 4:
+      return std::vector<data_type>({dim[0], dim[3], dim[1], dim[2]});
+    case 5:
+      return std::vector<data_type>({dim[0], dim[4], dim[1], dim[2], dim[3]});
+    default:
+      CHECK(0) << "unsupport dimension";
+  }
+}

-inline const std::vector<int64_t> DimNCHW2NHWC(const std::vector<int64_t>& dim) {
-  return std::vector<int64_t>({dim[0], dim[2], dim[3], dim[1]});
-}
+template <typename data_type>
+inline const std::vector<data_type> DimNCHW2NHWC(
+    const std::vector<data_type>& dim) {
+  switch (dim.size()) {
+    case 1:
+      return dim;
+    case 2:
+      return dim;
+    case 3:
+      return std::vector<data_type>({dim[0], dim[2], dim[1]});
+    case 4:
+      return std::vector<data_type>({dim[0], dim[2], dim[3], dim[1]});
+    case 5:
+      return std::vector<data_type>({dim[0], dim[2], dim[3], dim[4], dim[1]});
+    default:
+      CHECK(0) << "unsupport dimension";
+  }
+}

+template <typename data_type>
+inline std::vector<data_type> GetAxisNHWC2NCHW(size_t n_dims) {
+  std::vector<data_type> nhwc2nchw_axis(n_dims);
+  nhwc2nchw_axis[0] = 0;
+  if (n_dims > 1) nhwc2nchw_axis[1] = n_dims - 1;
+  for (size_t i = 2; i < n_dims; ++i) {
+    nhwc2nchw_axis[i] = i - 1;
+  }
+  return nhwc2nchw_axis;
+}
+
+template <typename data_type>
+inline std::vector<data_type> GetAxisNCHW2NHWC(size_t n_dims) {
+  std::vector<data_type> nchw2nhwc_axis(n_dims);
+  nchw2nhwc_axis[0] = 0;
+  for (size_t i = 1; i < n_dims - 1; ++i) {
+    nchw2nhwc_axis[i] = i + 1;
+  }
+  if (n_dims > 1) nchw2nhwc_axis[n_dims - 1] = 1;
+  return nchw2nhwc_axis;
+}

-template <paddle::lite_api::PrecisionType>
-struct FPTypeTraits {};
+template <paddle::lite_api::PrecisionType>
+struct MLUTypeTraits {
+  /* using type = void; */
+  /* static constexpr cnmlDataType_t cnml_type = CNML_DATA_INVALID; */
+};

-template <>
-struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
-  typedef float T;
-};
+template <>
+struct MLUTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
+  using type = float;
+  static constexpr cnmlDataType_t cnml_type = CNML_DATA_FLOAT32;
+};

-template <>
-struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
-  typedef paddle::lite::fluid::float16 T;
-};
+template <>
+struct MLUTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
+  using type = paddle::lite::fluid::float16;
+  static constexpr cnmlDataType_t cnml_type = CNML_DATA_FLOAT16;
+};
+
+template <>
+struct MLUTypeTraits<paddle::lite_api::PrecisionType::kInt8> {
+  using type = int8_t;
+  static constexpr cnmlDataType_t cnml_type = CNML_DATA_INT8;
+};
+
+template <>
+struct MLUTypeTraits<paddle::lite_api::PrecisionType::kInt32> {
+  using type = int32_t;
+  static constexpr cnmlDataType_t cnml_type = CNML_DATA_INT32;
+};

 }  // namespace mlu
...
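The templated transpose above canonicalizes everything to five dimensions before permuting. A worked example of the expansion (shapes are hypothetical):

  // input_shape = {2, 3, 4}, axis = {0, 2, 1}
  // shape       = {1, 1, 2, 3, 4}   (two leading 1s added)
  // expand_axis = {0, 1, 2, 4, 3}   (original axes shifted by 5 - 3 = 2)

so a rank-3 transpose runs through the same five nested loops as a rank-5 one, with the padded dimensions iterating exactly once.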
lite/kernels/mlu/io_copy_compute.cc
...
@@ -41,6 +41,9 @@ class IoCopyHostToMluCompute
     auto mem_size = param.x->memory_size();
     // LOG(INFO) << "copy size " << mem_size;
     auto* data = param.y->mutable_data(TARGET(kMLU), mem_size);
+    VLOG(6) << "io_copy host to mlu] memory size: " << mem_size
+            << " precision type: " << PrecisionToStr(Precision);
+    param.y->set_precision(param.x->precision());
     CopyFromHostSync(data, param.x->raw_data(), mem_size);
   }
...
@@ -79,6 +82,13 @@ class IoCopyMluToHostCompute
     CHECK(param.x->target() == TARGET(kMLU));
     auto mem_size = param.x->memory_size();
     auto* data = param.y->mutable_data(TARGET(kHost), mem_size);
+    VLOG(6) << "io_copy mlu to host] memory size: " << mem_size
+            << " precision type: " << PrecisionToStr(Precision);
+
+    // sync queue to ensure process done
+    auto& mlu_context = this->ctx_->template As<MLUContext>();
+    CNRT_CALL(cnrtSyncQueue(mlu_context.exec_queue()));
+
     CopyToHostSync(data, param.x->raw_data(), mem_size);
   }
...
@@ -97,8 +107,14 @@ REGISTER_LITE_KERNEL(
     kNHWC,
     paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kFloat)>,
     host_to_device_kFloat)
-    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kAny))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kMLU),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kAny))})
     .Finalize();

 REGISTER_LITE_KERNEL(
...
@@ -108,8 +124,31 @@ REGISTER_LITE_KERNEL(
     kNHWC,
     paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kFP16)>,
     host_to_device_kFP16)
-    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kAny))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kMLU),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kAny))})
     .Finalize();

+REGISTER_LITE_KERNEL(
+    io_copy,
+    kMLU,
+    kInt32,
+    kNHWC,
+    paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kInt32)>,
+    host_to_device_kInt32)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kInt32),
+                                      DATALAYOUT(kAny))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kMLU),
+                                       PRECISION(kInt32),
+                                       DATALAYOUT(kAny))})
+    .Finalize();
+
 REGISTER_LITE_KERNEL(
...
@@ -119,8 +158,14 @@ REGISTER_LITE_KERNEL(
     kNHWC,
     paddle::lite::kernels::mlu::IoCopyMluToHostCompute<PRECISION(kFloat)>,
     device_to_host_kFloat)
-    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kMLU),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kAny))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kAny))})
     .Finalize();

 REGISTER_LITE_KERNEL(
...
@@ -130,6 +175,29 @@ REGISTER_LITE_KERNEL(
     kNHWC,
     paddle::lite::kernels::mlu::IoCopyMluToHostCompute<PRECISION(kFP16)>,
     device_to_host_kFP16)
-    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kMLU),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kAny))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kAny))})
     .Finalize();

+REGISTER_LITE_KERNEL(
+    io_copy,
+    kMLU,
+    kInt8,
+    kNHWC,
+    paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kInt8)>,
+    host_to_device_to_kInt8)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kInt8),
+                                      DATALAYOUT(kAny))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kMLU),
+                                       PRECISION(kInt8),
+                                       DATALAYOUT(kAny))})
+    .Finalize();
lite/kernels/mlu/layout_compute.cc
...
@@ -24,9 +24,9 @@ namespace mlu {}  // namespace mlu

 REGISTER_LITE_KERNEL(
     layout,
-    kMLU,
+    kX86,
     kFloat,
-    kNHWC,
+    kNCHW,
     paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute<PRECISION(kFloat)>,
     def_layout_nhwc2nchw_fp32)
     .BindInput("Input",
...
@@ -41,9 +41,9 @@ REGISTER_LITE_KERNEL(
 REGISTER_LITE_KERNEL(
     layout,
-    kMLU,
+    kX86,
     kFP16,
-    kNHWC,
+    kNCHW,
     paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute<PRECISION(kFP16)>,
     def_layout_nhwc2nchw_fp16)
     .BindInput("Input",
...
@@ -58,9 +58,9 @@ REGISTER_LITE_KERNEL(
 REGISTER_LITE_KERNEL(
     layout,
-    kMLU,
+    kX86,
     kFloat,
-    kNHWC,
+    kNCHW,
     paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kFloat)>,
     def_layout_nchw2nhwc_fp32)
     .BindInput("Input",
...
@@ -75,9 +75,9 @@ REGISTER_LITE_KERNEL(
 REGISTER_LITE_KERNEL(
     layout,
-    kMLU,
+    kX86,
     kFP16,
-    kNHWC,
+    kNCHW,
     paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kFP16)>,
     def_layout_nchw2nhwc_fp16)
     .BindInput("Input",
...
@@ -92,11 +92,11 @@ REGISTER_LITE_KERNEL(
 REGISTER_LITE_KERNEL(
     layout,
-    kMLU,
+    kX86,
     kInt8,
-    kNHWC,
+    kNCHW,
     paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kInt8)>,
-    def_layout_nchw2nhwc_fp32_int8)
+    def_layout_nchw2nhwc_int8)
     .BindInput("Input",
                {LiteType::GetTensorTy(TARGET(kHost),
                                       PRECISION(kInt8),
...
lite/kernels/mlu/layout_compute.h
...
...
@@ -22,6 +22,7 @@
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/operators/layout_op.h"
namespace
paddle
{
...
...
@@ -29,24 +30,6 @@ namespace lite {
namespace
kernels
{
namespace
mlu
{
template
<
paddle
::
lite_api
::
PrecisionType
>
struct
FPTypeTraits
{};
template
<
>
struct
FPTypeTraits
<
paddle
::
lite_api
::
PrecisionType
::
kFloat
>
{
typedef
float
T
;
};
template
<
>
struct
FPTypeTraits
<
paddle
::
lite_api
::
PrecisionType
::
kFP16
>
{
typedef
paddle
::
lite
::
fluid
::
float16
T
;
};
template
<
>
struct
FPTypeTraits
<
paddle
::
lite_api
::
PrecisionType
::
kInt8
>
{
typedef
int8_t
T
;
};
template
<
lite
::
TargetType
Target
,
typename
T
>
inline
void
LayoutTransCompute
(
const
int
dim
,
const
lite
::
Context
<
Target
>&
context
,
...
...
@@ -73,7 +56,7 @@ inline void LayoutTransCompute(const int dim,
template
<
PrecisionType
Precision
>
class
LayoutNchwToNhwcCompute
:
public
KernelLite
<
TARGET
(
k
MLU
),
Precision
,
DATALAYOUT
(
kNHWC
)
>
{
:
public
KernelLite
<
TARGET
(
k
X86
),
Precision
,
DATALAYOUT
(
kNCHW
)
>
{
public:
using
param_t
=
operators
::
LayoutParam
;
...
...
@@ -81,36 +64,37 @@ class LayoutNchwToNhwcCompute
    auto& param = this->template Param<param_t>();
    auto* x = param.x;
    auto* out = param.y;
-   out->template mutable_data<typename FPTypeTraits<Precision>::T>();
-   auto x_dims = param.x->dims().size();
+   out->template mutable_data<
+       typename subgraph::mlu::MLUTypeTraits<Precision>::type>();
+   auto x_ndims = param.x->dims().size();
    auto& context = this->ctx_->template As<X86Context>();
    const auto origin_dims = out->dims().Vectorize();

    std::vector<int> axis;
-   switch (x_dims) {
+   switch (x_ndims) {
      case 2:
        axis = {0, 1};
        break;
      case 3:
        axis = {0, 2, 1};
-       out->Resize(std::vector<int64_t>{
-           out->dims()[0], out->dims()[2], out->dims()[1]});
+       out->Resize(std::vector<int64_t>{
+           origin_dims[0], origin_dims[2], origin_dims[1]});
        break;
      case 4:
        axis = {0, 2, 3, 1};
-       out->Resize(std::vector<int64_t>{
-           out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]});
+       out->Resize(std::vector<int64_t>{
+           origin_dims[0], origin_dims[2], origin_dims[3], origin_dims[1]});
        break;
      default:
        CHECK(0) << "Unsupport dim in mlu layout nchw to nhwc";
    }

    LayoutTransCompute<lite::TargetType::kX86,
-                      typename FPTypeTraits<Precision>::T>(
-       x_dims, context, *x, out, axis);
+                      typename subgraph::mlu::MLUTypeTraits<Precision>::type>(
+       x_ndims, context, *x, out, axis);

-   if (x_dims > 2) {
+   if (x_ndims > 2) {
      out->Resize(origin_dims);
    }
  }
...
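As a quick sanity check on the axis tables above, here is a small standalone sketch (the shape values are hypothetical) showing that permuting an NCHW shape by {0, 2, 3, 1} yields the NHWC shape:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int64_t> nchw = {1, 3, 224, 224};  // N, C, H, W
  std::vector<int> axis = {0, 2, 3, 1};          // the case-4 table above
  std::vector<int64_t> nhwc(4);
  for (int i = 0; i < 4; ++i) nhwc[i] = nchw[axis[i]];  // permute the dims
  for (auto d : nhwc) std::printf("%lld ", static_cast<long long>(d));
  std::printf("\n");  // prints: 1 224 224 3  (N, H, W, C)
  return 0;
}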
@@ -122,7 +106,7 @@ class LayoutNchwToNhwcCompute
template <PrecisionType Precision>
class LayoutNhwcToNchwCompute
-   : public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
+   : public KernelLite<TARGET(kX86), Precision, DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::LayoutParam;
...
@@ -130,25 +114,27 @@ class LayoutNhwcToNchwCompute
    auto& param = this->template Param<param_t>();
    auto* x = param.x;
    auto* out = param.y;
-   out->template mutable_data<typename FPTypeTraits<Precision>::T>();
-   auto x_dims = param.x->dims().size();
+   out->template mutable_data<
+       typename subgraph::mlu::MLUTypeTraits<Precision>::type>();
    auto& context = this->ctx_->template As<X86Context>();
-   const auto origin_dims = out->dims().Vectorize();

+   TensorLite tmp_t;
+   tmp_t.ShareDataWith(*x);
+   const auto x_dims = x->dims().Vectorize();
+   auto x_ndims = param.x->dims().size();
    std::vector<int> axis;
-   switch (x_dims) {
+   switch (x_ndims) {
      case 2:
        axis = {0, 1};
        break;
      case 3:
        out->Resize(std::vector<int64_t>{
            out->dims()[0], out->dims()[2], out->dims()[1]});
+       tmp_t.Resize(std::vector<int64_t>{x_dims[0], x_dims[2], x_dims[1]});
        axis = {0, 2, 1};
        break;
      case 4:
        out->Resize(std::vector<int64_t>{
            out->dims()[0], out->dims()[3], out->dims()[1], out->dims()[2]});
+       tmp_t.Resize(std::vector<int64_t>{
+           x_dims[0], x_dims[2], x_dims[3], x_dims[1]});
        axis = {0, 3, 1, 2};
        break;
      default:
...
@@ -156,12 +142,8 @@ class LayoutNhwcToNchwCompute
    }

    LayoutTransCompute<lite::TargetType::kX86,
-                      typename FPTypeTraits<Precision>::T>(
-       x_dims, context, *x, out, axis);
-
-   if (x_dims > 2) {
-     out->Resize(origin_dims);
-   }
+                      typename subgraph::mlu::MLUTypeTraits<Precision>::type>(
+       x_ndims, context, tmp_t, out, axis);
  }

  std::string doc() const override {
...
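One way to read the tmp_t indirection above (an interpretation, not stated in the diff): the incoming tensor's dims() are recorded in NCHW order while its bytes are physically NHWC, so tmp_t shares the same buffer but is resized to the physical order before the transpose runs. With assumed numbers for the 4-D case:

// x->dims()            = {1, 3, 224, 224}  // logical NCHW record
// tmp_t (after Resize) = {1, 224, 224, 3}  // x_dims[{0, 2, 3, 1}]: physical NHWC
// axis                 = {0, 3, 1, 2}      // NHWC -> NCHW permutation
// LayoutTransCompute then transposes tmp_t into out, whose dims were already
// set to the NCHW target shape by the out->Resize calls above.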
lite/kernels/mlu/subgraph_compute.cc
...
@@ -36,8 +36,14 @@ REGISTER_LITE_KERNEL(
    kNHWC,
    paddle::lite::kernels::mlu::SubgraphCompute<PRECISION(kFloat)>,
    def_kFloat)
-   .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))})
-   .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))})
+   .BindInput("Inputs",
+              {LiteType::GetTensorTy(TARGET(kMLU),
+                                     PRECISION(kAny),
+                                     DATALAYOUT(kAny))})
+   .BindOutput("Outputs",
+               {LiteType::GetTensorTy(TARGET(kMLU),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
    .Finalize();

REGISTER_LITE_KERNEL(
...
@@ -47,6 +53,12 @@ REGISTER_LITE_KERNEL(
    kNHWC,
    paddle::lite::kernels::mlu::SubgraphCompute<PRECISION(kFP16)>,
    def_FP16)
-   .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))})
-   .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))})
+   .BindInput("Inputs",
+              {LiteType::GetTensorTy(TARGET(kMLU),
+                                     PRECISION(kAny),
+                                     DATALAYOUT(kAny))})
+   .BindOutput("Outputs",
+               {LiteType::GetTensorTy(TARGET(kMLU),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
    .Finalize();
lite/kernels/mlu/subgraph_compute.h
...
@@ -14,17 +14,24 @@
#pragma once

#include <algorithm>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "lite/api/paddle_place.h"
#include "lite/core/kernel.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
#include "lite/core/types.h"
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/tensor.h"
#include "lite/kernels/npu/bridges/engine.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/utils/env.h"

namespace paddle {
namespace lite {
...
@@ -40,10 +47,19 @@ class SubgraphEngine : public subgraph::Engine {
                const std::vector<std::string>& input_names,
                const std::vector<std::string>& output_names,
                Scope* scope,
-               ::paddle::lite_api::PrecisionType type)
+               paddle::lite_api::PrecisionType type)
      : subgraph::Engine(
-           ctx, block_idx, block_desc, input_names, output_names, scope) {
-   graph_.SetFPType(type);
-  }
+           ctx, block_idx, block_desc, input_names, output_names, scope),
+       fp_type_(type) {
+   VLOG(4) << "[MLU] PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL is "
+           << GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL");
+   VLOG(4) << "[MLU] PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE is "
+           << GetBoolFromEnv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE");
+   VLOG(4) << "[MLU] LITE_DISABLE_MLU_CAST is "
+           << GetBoolFromEnv("LITE_DISABLE_MLU_CAST");
+   if (GetBoolFromEnv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE")) {
+     disable_batch_size_changeable_ = true;
+   }
+ }

  int Build() {
...
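The three environment switches logged by the constructor can be flipped from the hosting process before the predictor is created. A minimal sketch, assuming a POSIX platform (setenv from <cstdlib>); the function name is invented for illustration:

#include <cstdlib>

// Hypothetical driver snippet: set the switches that the constructor above
// reads via GetBoolFromEnv().
void ConfigureMluEngineEnv() {
  setenv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL", "1", /*overwrite=*/1);
  setenv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE", "1", 1);
  setenv("LITE_DISABLE_MLU_CAST", "1", 1);  // opt out of the MLU-side cast
}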
@@ -72,24 +88,97 @@ class SubgraphEngine : public subgraph::Engine {
    return 0;
  }

  bool InputShapeChanged() {
    std::vector<std::vector<int64_t>> new_shape;
    // used in the batch-changeable situation
    std::vector<std::vector<int64_t>> all_shape;
    for (auto origin_itensor : origin_itensors_) {
      if (!disable_batch_size_changeable_) {
        auto iv = origin_itensor->dims().Vectorize();
        all_shape.push_back(iv);
        iv.erase(iv.begin());
        new_shape.push_back(iv);
      } else {
        new_shape.push_back(origin_itensor->dims().Vectorize());
      }
    }
    inputs_shape_ = new_shape;
    all_inputs_shape_ = all_shape;
    if (shape_graph_map_.count(inputs_shape_) > 0) {
      return false;
    }
    VLOG(3) << "MLU graph input shape changed" << std::endl;
    return true;
  }

  inline cnmlDataType_t PrecisionToDatatype(PrecisionType data_type) {
    switch (data_type) {
      case paddle::lite_api::PrecisionType::kFP16:
        return CNML_DATA_FLOAT16;
      case paddle::lite_api::PrecisionType::kFloat:
        return CNML_DATA_FLOAT32;
      case paddle::lite_api::PrecisionType::kInt32:
        return CNML_DATA_INT32;
      case paddle::lite_api::PrecisionType::kInt8:
        return CNML_DATA_UINT8;
      default:
        return PrecisionToDatatype(fp_type_);
    }
  }
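Read directly off the switch above, the mapping behaves as follows; anything unlisted falls back to the graph-wide fp_type_:

// PrecisionToDatatype(PrecisionType::kFloat) -> CNML_DATA_FLOAT32
// PrecisionToDatatype(PrecisionType::kFP16)  -> CNML_DATA_FLOAT16
// PrecisionToDatatype(PrecisionType::kInt32) -> CNML_DATA_INT32
// PrecisionToDatatype(PrecisionType::kInt8)  -> CNML_DATA_UINT8 (note: UINT8)
// any other precision                        -> PrecisionToDatatype(fp_type_)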
 protected:
  int BuildDeviceProgram() override {
    if (!error_compile_batch_size_changeable_ &&
        !disable_batch_size_changeable_) {
      int status = BuildDeviceProgramImpl();
      if (subgraph::CHECK_SUCCESS(status)) {
        return status;
      }
      LOG(INFO) << "[MLU] build batch_size changeable subgraph op failed, "
                   "changed to input_shape changeable";
    }
    error_compile_batch_size_changeable_ = true;
    disable_batch_size_changeable_ = true;
    return BuildDeviceProgramImpl();
  }

  int BuildDeviceProgramImpl() {
    int status = 0;
    auto graph = std::make_shared<paddle::lite::subgraph::mlu::Graph>();
    graph->SetFPType(fp_type_);
    std::vector<std::vector<int64_t>> new_shape;
    origin_itensors_.clear();
    origin_otensors_.clear();
    auto data_order = block_desc_->GetOp<cpp::OpDesc>(0)->Type() == "layout"
                          ? CNML_NCHW
                          : CNML_NHWC;
    // Convert all input data vars and add them into the MLU IR graph
    status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
    for (auto& input_name : input_names_) {
      auto input_tensor = scope_->FindMutableTensor(input_name);
      auto data_type = input_tensor->precision();
      cnmlDataType_t fp_type = PrecisionToDatatype(data_type);
      origin_itensors_.push_back(input_tensor);
      if (!disable_batch_size_changeable_) {
        auto iv = input_tensor->dims().Vectorize();
        iv.erase(iv.begin());
        new_shape.push_back(iv);
      } else {
        new_shape.push_back(input_tensor->dims().Vectorize());
      }
      CHECK(input_tensor);
-     auto input_node =
-         graph_.AddNode(input_name,
-                        input_tensor->dims().Vectorize(),
-                        CNML_TENSOR,
-                        CNML_NCHW,
-                        graph_.FPType(),
-                        const_cast<void*>(input_tensor->raw_data()));
      VLOG(4) << "subgraph input tensor " << input_name << std::endl;
+     auto input_node = graph->AddNode(input_name,
+                                      input_tensor->dims().Vectorize(),
+                                      CNML_TENSOR,
+                                      CNML_NCHW,
+                                      fp_type,
+                                      data_order);
      CHECK(input_node);
-     // MLU doesn't support dynamic dimensions/shapes, so we need to rebuild
-     // the program when the shape of any input tensor is changed.
-     status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
    }
    LOG(INFO) << "START TO CONVERT ";
    // Convert all ops and their weights and add them into the MLU IR graph
...
@@ -98,6 +187,18 @@ class SubgraphEngine : public subgraph::Engine {
      auto op = inst.op();
      CHECK(op);
      std::string op_type = op->op_info()->Type();
+     // since cnml's compile API does not return errors for now, we simply
+     // check the op's type
+     if (!disable_batch_size_changeable_ &&
+         std::find(unsupport_batch_size_changeable_op_type_.begin(),
+                   unsupport_batch_size_changeable_op_type_.end(),
+                   op_type) !=
+             unsupport_batch_size_changeable_op_type_.end()) {
+       status |= subgraph::FAILED;
+       VLOG(4) << "[MLU] found unsupported batch_size changeable op type: "
+               << op_type;
+       return status;
+     }
      op->CheckShape();
      const_cast<OpLite*>(op)->InferShape();
      if (!bridges.Exists(op_type, TARGET(kMLU))) {
...
@@ -106,7 +207,7 @@ class SubgraphEngine : public subgraph::Engine {
      }
      auto kernel = inst.kernel();
      status |= bridges.Select(op_type, TARGET(kMLU))(
-         reinterpret_cast<void*>(&graph_),
+         reinterpret_cast<void*>(graph.get()),
          const_cast<OpLite*>(op),
          const_cast<KernelBase*>(kernel));
      if (subgraph::CHECK_FAILED(status)) {
...
@@ -115,46 +216,272 @@ class SubgraphEngine : public subgraph::Engine {
    }
    // Obtain the output nodes of the MLU IR graph and build the graph to MLU
    // runtime
    std::vector<std::string> valid_output_names;
    for (auto& output_name : output_names_) {
-     if (graph_.HasNode(output_name)) {
-       graph_.AddOutput(graph_.GetNode(output_name));
+     if (graph->HasNode(output_name)) {
+       graph->AddOutput(graph->GetNode(output_name));
        auto output_tensor = scope_->FindMutableTensor(output_name);
-       void* p_data = static_cast<void*>(
-           output_tensor->mutable_data<
-               typename ::paddle::lite::subgraph::mlu::FPTypeTraits<
-                   Precision>::T>(TARGET(kMLU)));
-       auto node = graph_.GetNode(output_name);
-       CHECK(p_data);
-       node->set_mlu_ptr(p_data);
        valid_output_names.push_back(output_name);
        origin_otensors_.push_back(output_tensor);
        VLOG(4) << "subgraph output tensor " << output_name << std::endl;
+       // auto node = graph->GetNode(output_name);
+       // CHECK(p_data);
+       // node->set_mlu_ptr(p_data);
      }
    }
    for (auto& input_name : input_names_) {
-     graph_.AddInput(graph_.GetNode(input_name));
+     graph->AddInput(graph->GetNode(input_name),
+                     disable_batch_size_changeable_);
    }
-   CHECK(!valid_output_names.empty()) << "[MLU] no valid output names";
+   CHECK(!origin_otensors_.empty()) << "[MLU] no valid output names";
    auto& mlu_context = this->ctx_->template As<MLUContext>();
    auto core_version = mlu_context.MLUCoreVersion();
    auto core_number = mlu_context.MLUCoreNumber();
-   graph_.Compile(core_version, core_number);
+   graph->Compile(core_version, core_number);
+   shape_graph_map_[new_shape] = graph;
+   if (GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL")) {
+     graph->GenOfflineModel(GetOfflineModName());
+   }
    return status;
  }
  std::string TrimStrings(const std::string& origin_str) {
    std::string str = origin_str;
    std::size_t found = str.find("0x");
    std::size_t found_end = 0;
    const std::vector<std::string> del_strs = {
        "/trans_io_copy", "/trans_cast", "/trans_layout"};
    for (const auto& iterm : del_strs) {
      found_end = str.find(iterm);
      // trim the pointer address and one of the del_strs
      if (found != std::string::npos && found_end != std::string::npos) {
        str.replace(found, found_end - found, "");
        found_end = str.find(iterm);
        str.replace(found_end, iterm.size(), "");
        break;
      }
    }
    return str;
  }
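A worked trace of TrimStrings on a hypothetical tensor name (the address and suffix are invented for illustration): the span from "0x" up to the matched suffix is erased first, then the suffix itself:

// TrimStrings("conv1_out0x7f3a5c/trans_cast")
//   step 1: erase [find("0x"), find("/trans_cast")) -> "conv1_out/trans_cast"
//   step 2: erase "/trans_cast"                     -> "conv1_out"
// result: "conv1_out"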
  std::string GetOfflineModName() {
    sort(input_names_.begin(), input_names_.end());
    sort(output_names_.begin(), output_names_.end());
    const auto& delimiter = "__";
    const auto& delimiter_num = "_";
    const auto& input_shape_str = "input_shape_";
    const auto& output_shape_str = "output_shape_";
    std::string name = "";
    std::string tmp = "";
    for (const auto& input_name : input_names_) {
      tmp = input_name;
      name += TrimStrings(tmp) + delimiter + input_shape_str;
      auto input_tensor = scope_->FindMutableTensor(input_name);
      for (const auto& iterm : input_tensor->dims().Vectorize()) {
        name += std::to_string(iterm) + delimiter_num;
      }
      name += delimiter;
    }
    for (const auto& output_name : output_names_) {
      tmp = output_name;
      name += TrimStrings(tmp) + delimiter + output_shape_str;
      auto output_tensor = scope_->FindMutableTensor(output_name);
      for (const auto& iterm : output_tensor->dims().Vectorize()) {
        name += std::to_string(iterm) + delimiter_num;
      }
      name += delimiter;
    }
    std::replace(name.begin(), name.end(), '/', '-');
    return name;
  }
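With a hypothetical single input "image" of shape {1, 3, 224, 224} and a single output "fc_out" of shape {1, 1000}, the name assembled above comes out as (trailing underscores included; any '/' is already replaced by '-'):

// image__input_shape_1_3_224_224___fc_out__output_shape_1_1000___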
  void InferOutputsShapeOnly() {
    // infer the outputs' shape when BATCH_SIZE_CHANGEABLE is enabled
    const auto iter = in_out_shape_map_.find(all_inputs_shape_);
    if (iter != in_out_shape_map_.end()) {
      for (size_t i = 0; i < origin_otensors_.size(); ++i) {
        origin_otensors_[i]->Resize(iter->second[i]);
      }
    } else {
      for (auto& inst : origin_program_) {
        auto op = inst.op();
        CHECK(op);
        op->CheckShape();
        const_cast<OpLite*>(op)->InferShape();
      }
      std::vector<std::vector<int64_t>> outs_shape;
      for (size_t i = 0; i < origin_otensors_.size(); ++i) {
        outs_shape.push_back(origin_otensors_[i]->dims().Vectorize());
      }
      in_out_shape_map_[all_inputs_shape_] = outs_shape;
    }
  }
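The effect of in_out_shape_map_ is that a given set of full input shapes pays the per-op CheckShape()/InferShape() walk only once; a sketch with assumed shapes:

// first call with all_inputs_shape_ = {{1, 3, 224, 224}}:
//   cache miss -> walk origin_program_, infer and record {{1, 1000}}
// later calls with the same input shapes:
//   cache hit  -> just Resize origin_otensors_ to the recorded {{1, 1000}}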
  inline void* GetOutputDataPtr(Tensor* tensor, bool use_mlu_cast) {
    if (use_mlu_cast) {
      // output is float, since the cast is fused into the subgraph
      return static_cast<void*>(tensor->mutable_data<float>(TARGET(kMLU)));
    } else {
      return static_cast<void*>(
          tensor->template mutable_data<
              typename subgraph::mlu::MLUTypeTraits<Precision>::type>(
              TARGET(kMLU)));
    }
  }
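The two branches differ only in the element type the MLU output buffer is allocated with; under the new default (cast fused into the subgraph) the host side always sees float:

// GetOutputDataPtr(t, /*use_mlu_cast=*/true)   // float buffer on kMLU
// GetOutputDataPtr(t, /*use_mlu_cast=*/false)  // MLUTypeTraits<Precision>::type
//                                              // buffer, e.g. float16 for kFP16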
  int LaunchDeviceProgram() override {
    // prepare input and output memory
    auto& mlu_context = this->ctx_->template As<MLUContext>();
    auto exec_queue = mlu_context.exec_queue();
-   u32_t affinity = mlu_context.affinity();
-   cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
-   int data_param = 1;
-   forward_param.data_parallelism = &data_param;
-   forward_param.affinity = &affinity;
-   forward_param.end = CNRT_PARAM_END;
-   graph_.Compute(forward_param, exec_queue);
    auto graph = shape_graph_map_[inputs_shape_];
    auto* graph_input = graph->MutableInputs();
    auto* graph_output = graph->MutableOutputs();
    CHECK_EQ(graph_input->size(), origin_itensors_.size());
    CHECK_EQ(graph_output->size(), origin_otensors_.size());
    bool disable_mlu_cast = GetBoolFromEnv("LITE_DISABLE_MLU_CAST");
    if (!disable_batch_size_changeable_) {
      std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
          graph_in;
      if (shape_tensor_map_in_.find(all_inputs_shape_) !=
          shape_tensor_map_in_.end()) {
        graph_in = shape_tensor_map_in_[all_inputs_shape_];
        for (size_t i = 0; i < origin_itensors_.size(); ++i) {
          graph_in[i]->set_mlu_ptr(
              const_cast<void*>(origin_itensors_[i]->raw_data()));
        }
      } else {
        graph_in.reserve(origin_itensors_.size());
        for (size_t i = 0; i < origin_itensors_.size(); ++i) {
          paddle::lite::subgraph::mlu::MLUTensor tmp(
              origin_itensors_[i]->dims().Vectorize());
          tmp.set_mlu_dtype(graph_input->at(i)->dtype());
          tmp.set_mlu_ptr(const_cast<void*>(origin_itensors_[i]->raw_data()));
          graph_in.push_back(
              std::make_shared<paddle::lite::subgraph::mlu::MLUTensor>(tmp));
        }
        shape_tensor_map_in_[all_inputs_shape_] = graph_in;
      }
      // TODO(zhangmingwei): we just call every op's infer_shape to get the
      // outputs' shape; it may be better to use cnml's API to get the output
      // shape. This can be done when cnml's tensor dimension is totally equal
      // to lite's tensor shape.
      InferOutputsShapeOnly();
      // const std::vector<std::vector<int64_t>> new_output_size =
      //     graph->InferOutputsShape(graph_in);
      std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>
          graph_out;
      if (shape_tensor_map_out_.find(all_inputs_shape_) !=
          shape_tensor_map_out_.end()) {
        graph_out = shape_tensor_map_out_[all_inputs_shape_];
        for (size_t i = 0; i < origin_otensors_.size(); ++i) {
          // origin_otensors_[i]->Resize(new_output_size.at(i));
          graph_out[i]->set_mlu_ptr(
              GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast));
        }
      } else {
        graph_out.reserve(origin_otensors_.size());
        for (size_t i = 0; i < origin_otensors_.size(); ++i) {
          // origin_otensors_[i]->Resize(new_output_size.at(i));
          paddle::lite::subgraph::mlu::MLUTensor tmp(
              origin_otensors_[i]->dims().Vectorize());
          tmp.set_mlu_dtype(graph_output->at(i)->dtype());
          tmp.set_mlu_ptr(
              GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast));
          graph_out.push_back(
              std::make_shared<paddle::lite::subgraph::mlu::MLUTensor>(tmp));
        }
        shape_tensor_map_out_[all_inputs_shape_] = graph_out;
      }
      graph->Compute(exec_queue, graph_in, graph_out);
    } else {
      for (size_t i = 0; i < origin_itensors_.size(); ++i) {
        graph_input->at(i)->set_mlu_ptr(
            const_cast<void*>(origin_itensors_[i]->raw_data()));
      }
      for (size_t i = 0; i < origin_otensors_.size(); ++i) {
        origin_otensors_[i]->Resize(graph_output->at(i)->get_origin_shape());
        graph_output->at(i)->set_mlu_ptr(
            GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast));
      }
      // only cnmlComputeFusionOpForward_V3 needs cnrtInvokeFuncParam_t
      cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
      int data_param = 1;
      forward_param.data_parallelism = &data_param;
      u32_t affinity = mlu_context.affinity();
      forward_param.affinity = &affinity;
      forward_param.end = CNRT_PARAM_END;
      graph->Compute(forward_param, exec_queue);
#ifdef MLU_DUMP_SUBGRAPH_IO
      // Graph nodes store compile-time tensors while batch-size mutable is
      // set; only when batch-size mutable is disabled does data exist in the
      // graph nodes at runtime.
      // =========== DUMP ===================
      for (auto input_name : input_names_) {
        auto input_tensor =
            shape_graph_map_[inputs_shape_]->GetNode(input_name);
        auto dump_name = input_name;
        while (dump_name.find("/") != std::string::npos) {
          dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
        }
        VLOG(6) << "dump_name: " << dump_name;
        input_tensor->ToFile(dump_name);
      }
      for (auto output_name : output_names_) {
        if (shape_graph_map_[inputs_shape_]->HasNode(output_name)) {
          auto output_tensor =
              shape_graph_map_[inputs_shape_]->GetNode(output_name);
          auto dump_name = output_name;
          while (dump_name.find("/") != std::string::npos) {
            dump_name = dump_name.replace(dump_name.find("/"), 1, "_");
          }
          VLOG(6) << "dump_name: " << dump_name;
          output_tensor->ToFile(dump_name);
        } else {
          VLOG(6) << "graph does not have " << output_name << " as output"
                  << std::endl;
        }
      }
#endif
      // =========== DUMP END ================
    }
    return 0;
  }
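The maps used above (shape_graph_map_, shape_tensor_map_in_/_out_, in_out_shape_map_) all follow one pattern: key every compiled or wrapped artifact by the vector of input shapes. A self-contained sketch of the idea (all names here are hypothetical):

#include <cstdint>
#include <map>
#include <memory>
#include <vector>

struct CompiledGraph {};  // stand-in for subgraph::mlu::Graph

using ShapeKey = std::vector<std::vector<int64_t>>;

std::shared_ptr<CompiledGraph> GetOrBuild(
    std::map<ShapeKey, std::shared_ptr<CompiledGraph>>* cache,
    const ShapeKey& shapes) {
  auto it = cache->find(shapes);
  if (it != cache->end()) return it->second;       // shapes seen before: reuse
  auto graph = std::make_shared<CompiledGraph>();  // otherwise compile once...
  (*cache)[shapes] = graph;                        // ...and memoize it
  return graph;
}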
- paddle::lite::subgraph::mlu::Graph graph_;
  paddle::lite_api::PrecisionType fp_type_;
  std::vector<std::vector<int64_t>> inputs_shape_{};
  std::vector<std::vector<int64_t>> all_inputs_shape_{};
  std::map<std::vector<std::vector<int64_t>>,
           std::shared_ptr<paddle::lite::subgraph::mlu::Graph>>
      shape_graph_map_{};
  // batch size changeable is enabled by default; this could be changed by the
  // environment variable PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE and by
  // whether the op can be compiled in the batch-size-changeable way
  bool disable_batch_size_changeable_{false};
  bool error_compile_batch_size_changeable_{false};
  std::vector<std::string> unsupport_batch_size_changeable_op_type_{"concat"};
  // search the output runtime MLUTensor for a certain output shape when
  // BATCH_SIZE_CHANGEABLE is enabled
  std::map<std::vector<std::vector<int64_t>>,
           std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>>
      shape_tensor_map_out_{};
  // search the input runtime MLUTensor for a certain input shape when
  // BATCH_SIZE_CHANGEABLE is enabled
  std::map<std::vector<std::vector<int64_t>>,
           std::vector<std::shared_ptr<paddle::lite::subgraph::mlu::MLUTensor>>>
      shape_tensor_map_in_{};
  // search the output shape for a certain input shape when
  // BATCH_SIZE_CHANGEABLE is enabled
  std::map<std::vector<std::vector<int64_t>>,
           std::vector<std::vector<int64_t>>>
      in_out_shape_map_{};
};

template <PrecisionType Precision>
...
lite/kernels/x86/activation_compute.cc
...
@@ -78,3 +78,13 @@ REGISTER_LITE_KERNEL(softsign,
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
    .Finalize();
+
+REGISTER_LITE_KERNEL(sigmoid,
+                     kX86,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::x86::SoftsignCompute<float>,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
lite/tools/build_mlu.sh
...
@@ -4,7 +4,7 @@ set -ex
# global variables with default value
NEUWARE_HOME="${NEUWARE_HOME}"
TARGET_NAME="all"  # default target
-BUILD_EXTRA=OFF    # ON(with sequence ops)/OFF
+BUILD_EXTRA=ON     # ON(with sequence ops)/OFF
WITH_TESTING=ON    # ON/OFF

function print_usage {
...
@@ -28,16 +28,13 @@ readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
readonly workspace=$(pwd)

function prepare_thirdparty {
-    if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then
+    if [ ! -d $workspace/third-party ]; then
         rm -rf $workspace/third-party
-        if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
-            wget $THIRDPARTY_TAR
-        fi
-        tar xzf third-party-05b862.tar.gz
-    else
-        git submodule update --init --recursive
     fi
+    if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
+        wget $THIRDPARTY_TAR
+    fi
+    tar xvf third-party-05b862.tar.gz
}
# for code gen, a source file is generated after a test, but is depended on by some targets in cmake.
...