magicwindyyd / mindspore (forked from MindSpore / mindspore)

Commit 28c42d55
Authored Sep 07, 2020 by mindspore-ci-bot; committed by Gitee on Sep 07, 2020
!5725 Add fp16 matmul & fc ops for arm64
Merge pull request !5725 from zhanyuan/dev
Parents: 77dd91a6, 9646c850
Showing 6 changed files with 623 additions and 0 deletions (+623, -0):
mindspore/lite/nnacl/fp16/matmul_fp16.c (+60, -0)
mindspore/lite/nnacl/fp16/matmul_fp16.h (+12, -0)
mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc (+187, -0)
mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h (+56, -0)
mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc (+250, -0)
mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h (+58, -0)
mindspore/lite/nnacl/fp16/matmul_fp16.c
@@ -238,3 +238,63 @@ void RowMajor2Col16MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row,
  }
  return;
}

void Fp32RowMajor2Fp16Col16Major(float *src, float16_t *dst, size_t row, size_t col) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int r_div16 = r / 16;
      int r_mod16 = r % 16;
      dst[r_div16 * 16 * col + c * 16 + r_mod16] = (float16_t)(src[r * col + c]);
    }
  }
}

void Fp16RowMajor2Fp16Col16Major(float16_t *src, float16_t *dst, size_t row, size_t col) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int r_div16 = r / 16;
      int r_mod16 = r % 16;
      dst[r_div16 * 16 * col + c * 16 + r_mod16] = src[r * col + c];
    }
  }
}

void Fp32RowMajor2Fp16Row16Major(float *src, float16_t *dst, size_t row, size_t col) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int c_div16 = c / 16;
      int c_mod16 = c % 16;
      dst[c_div16 * 16 * row + r * 16 + c_mod16] = (float16_t)(src[r * col + c]);
    }
  }
}

void Fp16RowMajor2Fp16Row16Major(float16_t *src, float16_t *dst, size_t row, size_t col) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int c_div16 = c / 16;
      int c_mod16 = c % 16;
      dst[c_div16 * 16 * row + r * 16 + c_mod16] = src[r * col + c];
    }
  }
}

void Fp32RowMajor2Fp16Row8Major(float *src, float16_t *dst, size_t row, size_t col) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int c_div8 = c / 8;
      int c_mod8 = c % 8;
      dst[c_div8 * 8 * row + r * 8 + c_mod8] = (float16_t)src[r * col + c];
    }
  }
}

void Fp32RowMajor2Fp16Col8Major(float *src, float16_t *dst, size_t row, size_t col) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int r_div8 = r / 8;
      int r_mod8 = r % 8;
      dst[r_div8 * 8 * col + c * 8 + r_mod8] = (float16_t)src[r * col + c];
    }
  }
}
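All six packers share one pattern and differ only in the block size (16 rows for the left operand, 8 for the right) and in whether the source needs an fp32-to-fp16 cast. As a sanity check of the layout, the index mapping of Fp32RowMajor2Fp16Col16Major can be exercised on its own; the following is a hypothetical host-side sketch, not part of the commit, that prints where each source element lands:

#include <stdio.h>

/* Same mapping as the packers above: rows are grouped into blocks of 16, and
 * within a block the 16 row-entries of a single column are contiguous. */
static int Col16MajorIndex(int r, int c, int col) {
  return (r / 16) * 16 * col + c * 16 + (r % 16);
}

int main(void) {
  const int row = 20, col = 3; /* row is deliberately not a multiple of 16 */
  /* The destination must hold UP_ROUND(row, 16) * col = 96 elements; rows
   * 16..19 land in the second, partially padded block (indices 48..95). */
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      printf("src[%2d][%d] -> dst[%3d]\n", r, c, Col16MajorIndex(r, c, col));
    }
  }
  return 0;
}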
mindspore/lite/nnacl/fp16/matmul_fp16.h
@@ -39,6 +39,18 @@ void RowMajor2Col16MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row,
void MatmulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
                      size_t depth, size_t row, size_t col, size_t stride, bool write_nhwc);

void Fp32RowMajor2Fp16Col16Major(float *src, float16_t *dst, size_t row, size_t col);
void Fp16RowMajor2Fp16Col16Major(float16_t *src, float16_t *dst, size_t row, size_t col);
void Fp32RowMajor2Fp16Row16Major(float *src, float16_t *dst, size_t row, size_t col);
void Fp16RowMajor2Fp16Row16Major(float16_t *src, float16_t *dst, size_t row, size_t col);
void Fp32RowMajor2Fp16Row8Major(float *src, float16_t *dst, size_t row, size_t col);
void Fp32RowMajor2Fp16Col8Major(float *src, float16_t *dst, size_t row, size_t col);
#ifdef __cplusplus
}
#endif
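These declarations spell out the packing contract: the left operand is padded to a multiple of 16 rows and the right operand to a multiple of 8 columns before MatmulFp16Neon64 consumes them. A minimal sizing sketch (hypothetical standalone code; UP_ROUND is written out here the way nnacl defines it):

#include <stdio.h>

#define UP_ROUND(x, y) ((((x) + (y) - 1) / (y)) * (y)) /* round x up to a multiple of y */

int main(void) {
  /* Made-up problem size: C(row x col) = A(row x deep) * B(deep x col). */
  int row = 20, deep = 30, col = 17;
  printf("packed A: %d fp16 elements\n", UP_ROUND(row, 16) * deep); /* 32 * 30 = 960 */
  printf("packed B: %d fp16 elements\n", UP_ROUND(col, 8) * deep);  /* 24 * 30 = 720 */
  /* The pad region must be zeroed (see the memset calls in the kernels below)
   * so the extra rows and columns contribute nothing to the result. */
  return 0;
}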
mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
new file mode 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16/fullconnection_fp16.h"
#include "nnacl/fp16/matmul_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "src/runtime/runtime_api.h"
#include "include/errorcode.h"
#include "src/kernel_registry.h"
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_INPUT_TENSOR_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_FullConnection;

namespace mindspore::kernel {
FullconnectionFP16CPUKernel::~FullconnectionFP16CPUKernel() { FreeTmpBuffer(); }

void FullconnectionFP16CPUKernel::FreeTmpBuffer() {
  if (a_pack_ptr_ != nullptr) {
    ctx_->allocator->Free(a_pack_ptr_);
    a_pack_ptr_ = nullptr;
  }
  if (b_pack_ptr_ != nullptr) {
    ctx_->allocator->Free(b_pack_ptr_);
    b_pack_ptr_ = nullptr;
  }
  if (bias_ptr_ != nullptr) {
    ctx_->allocator->Free(bias_ptr_);
    bias_ptr_ = nullptr;
  }
  if (output_fp16_ != nullptr) {
    ctx_->allocator->Free(output_fp16_);
    output_fp16_ = nullptr;
  }
}

int FullconnectionFP16CPUKernel::ReSize() {
  FreeTmpBuffer();
  fc_param_->row_ = (in_tensors_[0]->shape())[0];
  fc_param_->col_ = (in_tensors_[1]->shape())[0];
  fc_param_->deep_ = (in_tensors_[1]->shape())[1];
  fc_param_->row_16_ = UP_ROUND(fc_param_->row_, C16NUM);
  fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM);
  thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_, C8NUM));
  thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_, C8NUM), thread_count_) * C8NUM;
  a_pack_ptr_ =
    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->row_16_ * fc_param_->deep_ * sizeof(float16_t)));
  if (a_pack_ptr_ == nullptr) {
    FreeTmpBuffer();
    return RET_MEMORY_FAILED;
  }
  memset(a_pack_ptr_, 0, fc_param_->row_16_ * fc_param_->deep_ * sizeof(float16_t));
  b_pack_ptr_ =
    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->col_8_ * fc_param_->deep_ * sizeof(float16_t)));
  if (b_pack_ptr_ == nullptr) {
    FreeTmpBuffer();
    return RET_MEMORY_FAILED;
  }
  memset(b_pack_ptr_, 0, fc_param_->col_8_ * fc_param_->deep_ * sizeof(float16_t));
  InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->Data()), b_pack_ptr_);
  if (in_tensors_.size() == 3) {
    bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->col_8_ * sizeof(float16_t)));
    if (bias_ptr_ == nullptr) {
      FreeTmpBuffer();
      return RET_MEMORY_FAILED;
    }
    memset(bias_ptr_, 0, fc_param_->col_8_ * sizeof(float16_t));
    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->Data()), bias_ptr_, fc_param_->col_);
  }
  if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
    output_fp16_ =
      reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(fc_param_->row_ * fc_param_->col_ * sizeof(float16_t)));
  }
  return RET_OK;
}

void FullconnectionFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) {
  Fp32RowMajor2Fp16Col16Major(a_ptr, a_pack_ptr, fc_param_->row_, fc_param_->deep_);
}

void FullconnectionFP16CPUKernel::InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr) {
  Fp16RowMajor2Fp16Col16Major(a_ptr, a_pack_ptr, fc_param_->row_, fc_param_->deep_);
}

void FullconnectionFP16CPUKernel::InitMatrixB(float *b_ptr, float16_t *b_pack_ptr) {
  Fp32RowMajor2Fp16Col8Major(b_ptr, b_pack_ptr, fc_param_->col_, fc_param_->deep_);
}

int FullconnectionFP16CPUKernel::Init() {
  if (!InferShapeDone()) {
    return RET_OK;
  }
  return ReSize();
}

int FullconnectionFP16CPUKernel::RunImpl(int task_id) {
  int cur_stride = fc_param_->col_ - task_id * thread_stride_;
  int cur_oc = MSMIN(thread_stride_, cur_stride);
  if (cur_oc <= 0) {
    return RET_OK;
  }
  auto b = b_pack_ptr_ + task_id * thread_stride_ * fc_param_->deep_;
  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
  auto c = output_ptr_ + task_id * thread_stride_;
  MatMulFp16(a_pack_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc,
             fc_param_->col_, true);
  return RET_OK;
}

int FcFP16Run(void *cdata, int task_id) {
  auto op = reinterpret_cast<FullconnectionFP16CPUKernel *>(cdata);
  auto error_code = op->RunImpl(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int FullconnectionFP16CPUKernel::Run() {
  auto prepare_ret = Prepare();
  if (prepare_ret != RET_OK) {
    MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
    return prepare_ret;
  }
  auto out_tensor = out_tensors_[0];
  if (out_tensor->data_type() == kNumberTypeFloat32) {
    output_ptr_ = output_fp16_;
  } else {
    output_ptr_ = reinterpret_cast<float16_t *>(out_tensor->Data());
  }
  if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
    InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_pack_ptr_);
  } else {
    InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->Data()), a_pack_ptr_);
  }
  ParallelLaunch(THREAD_POOL_DEFAULT, FcFP16Run, this, thread_count_);
  if (out_tensor->data_type() == kNumberTypeFloat32) {
    auto size = out_tensor->ElementsNum();
    auto out_tensor_data = reinterpret_cast<float *>(out_tensor->Data());
    Float16ToFloat32(output_fp16_, out_tensor_data, size);
  }
  return RET_OK;
}

kernel::LiteKernel *CpuFullConnectionFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                       const std::vector<lite::tensor::Tensor *> &outputs,
                                                       OpParameter *opParameter, const lite::Context *ctx,
                                                       const kernel::KernelKey &desc,
                                                       const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) FullconnectionFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "kernel is nullptr.";
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    delete kernel;
    return nullptr;
  }
  return kernel;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_FullConnection, CpuFullConnectionFp16KernelCreator)
}  // namespace mindspore::kernel
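ReSize and RunImpl together implement a static split of the output columns across threads: thread_stride_ is rounded up to a multiple of C8NUM so every task starts on a packed 8-column block boundary, and the final task is clipped via cur_oc. A standalone sketch of the resulting partition (hypothetical sizes; UP_DIV and MSMIN expanded as nnacl defines them):

#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
  int col = 100, thread_count = 3; /* made-up sizes */
  /* Never use more threads than there are 8-column blocks. */
  thread_count = MSMIN(thread_count, UP_DIV(col, 8));
  int thread_stride = UP_DIV(UP_DIV(col, 8), thread_count) * 8;
  for (int task_id = 0; task_id < thread_count; task_id++) {
    int cur_oc = MSMIN(thread_stride, col - task_id * thread_stride);
    if (cur_oc <= 0) continue; /* same early-out as RunImpl */
    printf("task %d: columns [%3d, %3d)\n", task_id, task_id * thread_stride,
           task_id * thread_stride + cur_oc);
  }
  return 0; /* prints [0,40), [40,80), [80,100) */
}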
mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h
new file mode 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_FULLCONNECTION_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_FULLCONNECTION_H_
#ifdef ENABLE_NEON
#include <arm_neon.h>
#endif
#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/matmul_parameter.h"
#include "src/runtime/kernel/arm/base/fullconnection_base.h"
namespace mindspore::kernel {
class FullconnectionFP16CPUKernel : public FullconnectionBaseCPUKernel {
 public:
  explicit FullconnectionFP16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                       const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                                       const mindspore::lite::PrimitiveC *primitive)
      : FullconnectionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
  ~FullconnectionFP16CPUKernel() override;

  int Init() override;
  int ReSize() override;
  int Run() override;
  int RunImpl(int task_id);

 private:
  void InitMatrixA(float *a_ptr, float16_t *a_pack_ptr);
  void InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr);
  void InitMatrixB(float *b_ptr, float16_t *b_pack_ptr);
  void FreeTmpBuffer();

 private:
  float16_t *a_pack_ptr_ = nullptr;
  float16_t *b_pack_ptr_ = nullptr;
  float16_t *bias_ptr_ = nullptr;
  float16_t *output_fp16_ = nullptr;
  float16_t *output_ptr_ = nullptr;
};
}  // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_FULLCONNECTION_H_
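The pointer pair output_fp16_/output_ptr_ marks the mixed-precision boundary: when the output tensor is fp32, RunImpl writes into the fp16 staging buffer and Run converts it once at the end with Float16ToFloat32. A hypothetical scalar equivalent of that conversion, assuming an ARM toolchain where float16_t comes from <arm_neon.h>:

#include <arm_neon.h>
#include <stddef.h>

/* Scalar stand-in for nnacl's Float16ToFloat32 (illustrative, not the
 * library routine): widening fp16 -> fp32 is exact, since every fp16
 * value is representable as an fp32 value. */
static void Fp16ToFp32Scalar(const float16_t *in, float *out, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    out[i] = (float)in[i];
  }
}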
mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
new file mode 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16/matmul_fp16.h"
#include "nnacl/fp16/matmul_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "src/runtime/runtime_api.h"
#include "include/errorcode.h"
#include "src/kernel_registry.h"
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_INPUT_TENSOR_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_MatMul;

namespace mindspore::kernel {
MatmulFP16CPUKernel::~MatmulFP16CPUKernel() { FreeTmpBuffer(); }

void MatmulFP16CPUKernel::FreeTmpBuffer() {
  if (a_pack_ptr_ != nullptr) {
    ctx_->allocator->Free(a_pack_ptr_);
    a_pack_ptr_ = nullptr;
  }
  if (b_pack_ptr_ != nullptr) {
    ctx_->allocator->Free(b_pack_ptr_);
    b_pack_ptr_ = nullptr;
  }
  if (bias_ptr_ != nullptr) {
    ctx_->allocator->Free(bias_ptr_);
    bias_ptr_ = nullptr;
  }
  if (output_ptr_ != nullptr) {
    ctx_->allocator->Free(output_ptr_);
    output_ptr_ = nullptr;
  }
}

int MatmulFP16CPUKernel::ReSize() {
  FreeTmpBuffer();
  int batch = 1;
  auto a_shape = in_tensors_[0]->shape();
  auto c_shape = out_tensors_[0]->shape();
  if (in_tensors_.size() == 3) {
    auto bias_shape = in_tensors_[2]->shape();
    if (bias_shape[bias_shape.size() - 1] != c_shape[c_shape.size() - 1]) {
      MS_LOG(ERROR) << "The bias' dimension is not equal with column";
      return RET_INPUT_TENSOR_ERROR;
    }
  }
  for (size_t i = 0; i < a_shape.size() - 2; ++i) {
    batch *= a_shape[i];
  }
  params_->batch = batch;
  params_->row_ = c_shape[c_shape.size() - 2];
  params_->col_ = c_shape[c_shape.size() - 1];
  params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
  params_->row_16_ = UP_ROUND(params_->row_, C16NUM);
  params_->col_8_ = UP_ROUND(params_->col_, C8NUM);
  thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_, C8NUM));
  thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;
  a_pack_ptr_ = reinterpret_cast<float16_t *>(
    ctx_->allocator->Malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t)));
  if (a_pack_ptr_ == nullptr) {
    FreeTmpBuffer();
    return RET_MEMORY_FAILED;
  }
  memset(a_pack_ptr_, 0, params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t));
  b_pack_ptr_ = reinterpret_cast<float16_t *>(
    ctx_->allocator->Malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t)));
  if (b_pack_ptr_ == nullptr) {
    FreeTmpBuffer();
    return RET_MEMORY_FAILED;
  }
  memset(b_pack_ptr_, 0, params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t));
  params_->a_const_ = (in_tensors_[0]->Data() != nullptr);
  params_->b_const_ = (in_tensors_[1]->Data() != nullptr);
  if (params_->a_const_ == true) {
    if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
      InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_pack_ptr_);
    } else {
      InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->Data()), a_pack_ptr_);
    }
  }
  if (params_->b_const_ == true) {
    InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->Data()), b_pack_ptr_);
  }
  if (in_tensors_.size() == 3) {
    bias_ptr_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(params_->col_8_ * sizeof(float16_t)));
    if (bias_ptr_ == nullptr) {
      FreeTmpBuffer();
      return RET_MEMORY_FAILED;
    }
    memset(bias_ptr_, 0, params_->col_8_ * sizeof(float16_t));
    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->Data()), bias_ptr_, params_->col_);
  }
  if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
    output_ptr_ = reinterpret_cast<float16_t *>(
      ctx_->allocator->Malloc(params_->batch * params_->row_ * params_->col_ * sizeof(float16_t)));
  }
  return RET_OK;
}

void MatmulFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) {
  for (int i = 0; i < params_->batch; i++) {
    float *src = a_ptr + i * params_->deep_ * params_->row_;
    float16_t *dst = a_pack_ptr + i * params_->deep_ * params_->row_16_;
    if (params_->a_transpose_) {
      Fp32RowMajor2Fp16Row16Major(src, dst, params_->deep_, params_->row_);
    } else {
      Fp32RowMajor2Fp16Col16Major(src, dst, params_->row_, params_->deep_);
    }
  }
}

void MatmulFP16CPUKernel::InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr) {
  for (int i = 0; i < params_->batch; i++) {
    float16_t *src = a_ptr + i * params_->deep_ * params_->row_;
    float16_t *dst = a_pack_ptr + i * params_->deep_ * params_->row_16_;
    if (params_->a_transpose_) {
      Fp16RowMajor2Fp16Row16Major(src, dst, params_->deep_, params_->row_);
    } else {
      Fp16RowMajor2Fp16Col16Major(src, dst, params_->row_, params_->deep_);
    }
  }
}

void MatmulFP16CPUKernel::InitMatrixB(float *b_ptr, float16_t *b_pack_ptr) {
  for (int i = 0; i < params_->batch; i++) {
    float *src = b_ptr + i * params_->deep_ * params_->col_;
    float16_t *dst = b_pack_ptr + i * params_->deep_ * params_->col_8_;
    if (params_->b_transpose_) {
      Fp32RowMajor2Fp16Col8Major(src, dst, params_->col_, params_->deep_);
    } else {
      Fp32RowMajor2Fp16Row8Major(src, dst, params_->deep_, params_->col_);
    }
  }
}

int MatmulFP16CPUKernel::Init() {
  if (!InferShapeDone()) {
    return RET_OK;
  }
  return ReSize();
}

int MatmulFP16CPUKernel::RunImpl(int task_id) {
  int cur_stride = params_->col_ - task_id * thread_stride_;
  int cur_oc = MSMIN(thread_stride_, cur_stride);
  if (cur_oc <= 0) {
    return RET_OK;
  }
  auto b = current_b_ + task_id * thread_stride_ * params_->deep_;
  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
  auto c = current_c_ + task_id * thread_stride_;
  MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, true);
  return RET_OK;
}

int MatmulFP16Run(void *cdata, int task_id) {
  auto op = reinterpret_cast<MatmulFP16CPUKernel *>(cdata);
  auto error_code = op->RunImpl(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int MatmulFP16CPUKernel::Run() {
  auto prepare_ret = Prepare();
  if (prepare_ret != RET_OK) {
    MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
    return prepare_ret;
  }
  auto b = reinterpret_cast<float *>(in_tensors_[1]->Data());
  auto out_tensor = out_tensors_[0];
  float16_t *c_ptr;
  if (out_tensor->data_type() == kNumberTypeFloat32) {
    c_ptr = output_ptr_;
  } else {
    c_ptr = reinterpret_cast<float16_t *>(out_tensor->Data());
  }
  if (params_->a_const_ == false) {
    if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
      InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->Data()), a_pack_ptr_);
    } else {
      InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->Data()), a_pack_ptr_);
    }
  }
  if (params_->b_const_ == false) {
    InitMatrixB(b, b_pack_ptr_);
  }
  for (int i = 0; i < params_->batch; ++i) {
    current_a_ = a_pack_ptr_ + i * params_->row_16_ * params_->deep_;
    current_b_ = b_pack_ptr_ + i * params_->deep_ * params_->col_8_;
    current_c_ = c_ptr + i * params_->row_ * params_->col_;
    ParallelLaunch(THREAD_POOL_DEFAULT, MatmulFP16Run, this, thread_count_);
  }
  if (out_tensor->data_type() == kNumberTypeFloat32) {
    auto size = out_tensor->ElementsNum();
    auto out_tensor_data = reinterpret_cast<float *>(out_tensor->Data());
    Float16ToFloat32(output_ptr_, out_tensor_data, size);
  }
  return RET_OK;
}

kernel::LiteKernel *CpuMatmulFp16KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                               const std::vector<lite::tensor::Tensor *> &outputs,
                                               OpParameter *opParameter, const lite::Context *ctx,
                                               const kernel::KernelKey &desc,
                                               const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) MatmulFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "kernel is nullptr.";
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    delete kernel;
    return nullptr;
  }
  return kernel;
}

REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MatMul, CpuMatmulFp16KernelCreator)
}  // namespace mindspore::kernel
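Unlike the fully-connected kernel, this kernel is batched: ReSize folds every axis before the last two into params_->batch, and Run then advances current_a_/current_b_/current_c_ by one packed matrix per iteration. A small sketch of the shape folding (hypothetical 4-D shape):

#include <stdio.h>

int main(void) {
  /* Same folding as MatmulFP16CPUKernel::ReSize, on a made-up input shape. */
  int a_shape[] = {2, 3, 64, 32}; /* two batch axes, then M = 64, K = 32 */
  int ndim = (int)(sizeof(a_shape) / sizeof(a_shape[0]));
  int batch = 1;
  for (int i = 0; i < ndim - 2; ++i) {
    batch *= a_shape[i];
  }
  /* The kernel runs `batch` independent matmuls over the packed buffers. */
  printf("batch = %d, row = %d, deep = %d\n", batch, a_shape[ndim - 2], a_shape[ndim - 1]);
  return 0;
}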
mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h
new file mode 100644
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_
#ifdef ENABLE_NEON
#include <arm_neon.h>
#endif
#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/matmul_parameter.h"
#include "src/runtime/kernel/arm/base/matmul_base.h"
namespace mindspore::kernel {
class MatmulFP16CPUKernel : public MatmulBaseCPUKernel {
 public:
  explicit MatmulFP16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                               const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                               const mindspore::lite::PrimitiveC *primitive)
      : MatmulBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
  ~MatmulFP16CPUKernel() override;

  int Init() override;
  int ReSize() override;
  int Run() override;
  int RunImpl(int task_id);

 private:
  void InitMatrixA(float *a_ptr, float16_t *a_pack_ptr);
  void InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr);
  void InitMatrixB(float *b_ptr, float16_t *b_pack_ptr);
  void FreeTmpBuffer();

 private:
  float16_t *a_pack_ptr_ = nullptr;
  float16_t *b_pack_ptr_ = nullptr;
  float16_t *bias_ptr_ = nullptr;
  float16_t *output_ptr_ = nullptr;
  float16_t *current_a_ = nullptr;
  float16_t *current_b_ = nullptr;
  float16_t *current_c_ = nullptr;
};
}  // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_