Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
e3c053c4
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e3c053c4
编写于
8月 23, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
8月 23, 2020
浏览文件
操作
浏览文件
下载
差异文件
!4961 Optimize the performance of BatchNorm and FusedBatchNorm, add Fp16 kernel
Merge pull request !4961 from sunsuodong/batch_norm_fp16
上级
80d570f0
9734f2a8
变更
13
隐藏空白更改
内联
并排
Showing
13 changed file
with
534 addition
and
313 deletion
+534
-313
mindspore/lite/nnacl/fp16/batchnorm_fp16.c
mindspore/lite/nnacl/fp16/batchnorm_fp16.c
+52
-0
mindspore/lite/nnacl/fp16/batchnorm_fp16.h
mindspore/lite/nnacl/fp16/batchnorm_fp16.h
+37
-0
mindspore/lite/nnacl/fp32/batchnorm.c
mindspore/lite/nnacl/fp32/batchnorm.c
+29
-13
mindspore/lite/nnacl/fp32/batchnorm.h
mindspore/lite/nnacl/fp32/batchnorm.h
+4
-6
mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.cc
mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.cc
+87
-0
mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.h
mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.h
+36
-0
mindspore/lite/src/runtime/kernel/arm/fp16/fused_batchnorm_fp16.cc
.../lite/src/runtime/kernel/arm/fp16/fused_batchnorm_fp16.cc
+103
-0
mindspore/lite/src/runtime/kernel/arm/fp16/fused_batchnorm_fp16.h
...e/lite/src/runtime/kernel/arm/fp16/fused_batchnorm_fp16.h
+36
-0
mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc
mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc
+46
-75
mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h
mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h
+13
-13
mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm.cc
...spore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm.cc
+30
-109
mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm.h
mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm.h
+10
-21
mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/batchnorm_fp32_tests.cc
...st/ut/src/runtime/kernel/arm/fp32/batchnorm_fp32_tests.cc
+51
-76
未找到文件。
mindspore/lite/nnacl/fp16/batchnorm_fp16.c
0 → 100644
浏览文件 @
e3c053c4
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16/batchnorm_fp16.h"
#include <math.h>
void
BatchNormFp16
(
const
void
*
input
,
const
void
*
mean
,
const
void
*
variance
,
BatchNormParameter
*
param
,
int
task_id
,
void
*
output
)
{
int
units_per_thread
=
UP_DIV
(
param
->
unit_
,
param
->
op_parameter_
.
thread_num_
);
int
completed_units
=
task_id
*
units_per_thread
;
int
cur_unit
=
MSMIN
(
units_per_thread
,
param
->
unit_
-
completed_units
);
int
cur_offset
=
completed_units
*
param
->
channel_
;
for
(
int
i
=
0
;
i
<
cur_unit
;
i
++
)
{
for
(
int
c
=
0
;
c
<
param
->
channel_
;
c
++
)
{
float16_t
variance_sqrt
=
sqrt
(((
const
float16_t
*
)
variance
)[
c
]
+
param
->
epsilon_
);
((
float16_t
*
)
output
)[
cur_offset
+
c
]
=
(((
const
float16_t
*
)
input
)[
cur_offset
+
c
]
-
((
const
float16_t
*
)
mean
)[
c
])
/
variance_sqrt
;
}
cur_offset
+=
param
->
channel_
;
}
}
void
FusedBatchNormFp16
(
const
void
*
input
,
const
void
*
scale
,
const
void
*
offset
,
const
void
*
mean
,
const
void
*
variance
,
BatchNormParameter
*
param
,
int
task_id
,
void
*
output
)
{
int
units_per_thread
=
UP_DIV
(
param
->
unit_
,
param
->
op_parameter_
.
thread_num_
);
int
completed_units
=
task_id
*
units_per_thread
;
int
cur_unit
=
MSMIN
(
units_per_thread
,
param
->
unit_
-
completed_units
);
int
cur_offset
=
completed_units
*
param
->
channel_
;
for
(
int
i
=
0
;
i
<
cur_unit
;
i
++
)
{
for
(
int
c
=
0
;
c
<
param
->
channel_
;
c
++
)
{
float16_t
variance_sqrt
=
sqrt
(((
const
float16_t
*
)
variance
)[
c
]
+
param
->
epsilon_
);
float16_t
norm_val
=
(((
const
float16_t
*
)
input
)[
cur_offset
+
c
]
-
((
const
float16_t
*
)
mean
)[
c
])
/
variance_sqrt
;
((
float16_t
*
)
output
)[
cur_offset
+
c
]
=
norm_val
*
((
const
float16_t
*
)
scale
)[
c
]
+
((
const
float16_t
*
)
offset
)[
c
];
}
cur_offset
+=
param
->
channel_
;
}
}
mindspore/lite/nnacl/fp16/batchnorm_fp16.h
0 → 100644
浏览文件 @
e3c053c4
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_BATCHNORM_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_BATCHNORM_FP16_H_
#ifdef ENABLE_NEON
#include <arm_neon.h>
#endif
#include "nnacl/batchnorm_parameter.h"
#ifdef __cplusplus
extern
"C"
{
#endif
void
BatchNormFp16
(
const
void
*
input
,
const
void
*
mean
,
const
void
*
variance
,
BatchNormParameter
*
param
,
int
task_id
,
void
*
output
);
void
FusedBatchNormFp16
(
const
void
*
input
,
const
void
*
scale
,
const
void
*
offset
,
const
void
*
mean
,
const
void
*
variance
,
BatchNormParameter
*
param
,
int
task_id
,
void
*
output
);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP16_BATCHNORM_FP16_H_
mindspore/lite/nnacl/fp32/batchnorm.c
浏览文件 @
e3c053c4
...
...
@@ -15,26 +15,42 @@
*/
#include "nnacl/fp32/batchnorm.h"
#include "nnacl/fp16/batchnorm_fp16.h"
#include <math.h>
#include "nnacl/batchnorm_parameter.h"
#include "nnacl/op_base.h"
#include "nnacl/errorcode.h"
void
BatchNorm
(
float
*
output_ptr
,
const
float
*
input_ptr
,
const
float
*
mean_ptr
,
const
float
*
variance_ptr
,
int
task_id
,
BatchNormParameter
*
param
)
{
for
(
int
c
=
task_id
;
c
<
param
->
channel_
;
c
+=
param
->
op_parameter_
.
thread_num_
)
{
float
variance_sqrt
=
sqrt
(
variance_ptr
[
c
]
+
param
->
epsilon_
);
for
(
int
u
=
0
;
u
<
param
->
unit_
;
u
++
)
{
output_ptr
[
u
*
param
->
channel_
+
c
]
=
(
input_ptr
[
u
*
param
->
channel_
+
c
]
-
mean_ptr
[
c
])
/
variance_sqrt
;
void
BatchNormFp32
(
const
void
*
input
,
const
void
*
mean
,
const
void
*
variance
,
BatchNormParameter
*
param
,
int
task_id
,
void
*
output
)
{
int
units_per_thread
=
UP_DIV
(
param
->
unit_
,
param
->
op_parameter_
.
thread_num_
);
int
completed_units
=
task_id
*
units_per_thread
;
int
cur_unit
=
MSMIN
(
units_per_thread
,
param
->
unit_
-
completed_units
);
int
cur_offset
=
completed_units
*
param
->
channel_
;
for
(
int
i
=
0
;
i
<
cur_unit
;
i
++
)
{
for
(
int
c
=
0
;
c
<
param
->
channel_
;
c
++
)
{
float
variance_sqrt
=
sqrt
(((
const
float
*
)
variance
)[
c
]
+
param
->
epsilon_
);
((
float
*
)
output
)[
cur_offset
+
c
]
=
(((
const
float
*
)
input
)[
cur_offset
+
c
]
-
((
const
float
*
)
mean
)[
c
])
/
variance_sqrt
;
}
cur_offset
+=
param
->
channel_
;
}
}
void
FusedBatchNorm
(
float
*
output_ptr
,
const
float
*
input_ptr
,
const
float
*
scale_ptr
,
const
float
*
offest_ptr
,
const
float
*
mean_ptr
,
const
float
*
variance_ptr
,
int
task_id
,
BatchNormParameter
*
param
)
{
for
(
int
c
=
task_id
;
c
<
param
->
channel_
;
c
+=
param
->
op_parameter_
.
thread_num_
)
{
float
variance_sqrt
=
sqrt
(
variance_ptr
[
c
]
+
param
->
epsilon_
);
for
(
int
u
=
0
;
u
<
param
->
unit_
;
u
++
)
{
output_ptr
[
u
*
param
->
channel_
+
c
]
=
(
input_ptr
[
u
*
param
->
channel_
+
c
]
-
mean_ptr
[
c
])
/
variance_sqrt
*
scale_ptr
[
c
]
+
offest_ptr
[
c
];
void
FusedBatchNormFp32
(
const
void
*
input
,
const
void
*
scale
,
const
void
*
offset
,
const
void
*
mean
,
const
void
*
variance
,
BatchNormParameter
*
param
,
int
task_id
,
void
*
output
)
{
int
units_per_thread
=
UP_DIV
(
param
->
unit_
,
param
->
op_parameter_
.
thread_num_
);
int
completed_units
=
task_id
*
units_per_thread
;
int
cur_unit
=
MSMIN
(
units_per_thread
,
param
->
unit_
-
completed_units
);
int
cur_offset
=
completed_units
*
param
->
channel_
;
for
(
int
i
=
0
;
i
<
cur_unit
;
i
++
)
{
for
(
int
c
=
0
;
c
<
param
->
channel_
;
c
++
)
{
float
variance_sqrt
=
sqrt
(((
const
float
*
)
variance
)[
c
]
+
param
->
epsilon_
);
float
norm_val
=
(((
const
float
*
)
input
)[
cur_offset
+
c
]
-
((
const
float
*
)
mean
)[
c
])
/
variance_sqrt
;
((
float
*
)
output
)[
cur_offset
+
c
]
=
norm_val
*
((
const
float
*
)
scale
)[
c
]
+
((
const
float
*
)
offset
)[
c
];
}
cur_offset
+=
param
->
channel_
;
}
}
mindspore/lite/nnacl/fp32/batchnorm.h
浏览文件 @
e3c053c4
...
...
@@ -17,18 +17,16 @@
#ifndef MINDSPORE_LITE_NNACL_FP32_BATCHNORM_H_
#define MINDSPORE_LITE_NNACL_FP32_BATCHNORM_H_
#include "nnacl/op_base.h"
#include "nnacl/batchnorm_parameter.h"
#ifdef __cplusplus
extern
"C"
{
#endif
void
BatchNorm
(
float
*
output_ptr
,
const
float
*
input_ptr
,
const
float
*
mean_ptr
,
const
float
*
variance_ptr
,
int
task_id
,
BatchNormParameter
*
param
);
void
FusedBatchNorm
(
float
*
output_ptr
,
const
float
*
input_ptr
,
const
float
*
scale_ptr
,
const
float
*
offest_ptr
,
const
float
*
mean_ptr
,
const
float
*
variance_ptr
,
int
task_id
,
BatchNormParameter
*
param
);
void
BatchNormFp32
(
const
void
*
input
,
const
void
*
mean
,
const
void
*
variance
,
BatchNormParameter
*
param
,
int
task_id
,
void
*
output
);
void
FusedBatchNormFp32
(
const
void
*
input
,
const
void
*
scale
,
const
void
*
offset
,
const
void
*
mean
,
const
void
*
variance
,
BatchNormParameter
*
param
,
int
task_id
,
void
*
output
);
#ifdef __cplusplus
}
...
...
mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.cc
0 → 100644
浏览文件 @
e3c053c4
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16/batchnorm_fp16.h"
#include "nnacl/fp16/batchnorm_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "src/kernel_registry.h"
using
mindspore
::
lite
::
KernelRegistrar
;
using
mindspore
::
schema
::
PrimitiveType_BatchNorm
;
namespace
mindspore
::
kernel
{
int
BatchnormFp16CPUKernel
::
DoExecute
(
int
task_id
)
{
auto
param
=
reinterpret_cast
<
BatchNormParameter
*>
(
op_parameter_
);
if
(
in_tensors_
.
at
(
0
)
->
data_type
()
==
kNumberTypeFloat32
)
{
auto
input
=
in_tensors_
.
at
(
0
);
auto
mean
=
in_tensors_
.
at
(
1
);
auto
variance
=
in_tensors_
.
at
(
2
);
auto
output
=
out_tensors_
.
at
(
0
);
auto
input_fp16
=
context_
->
allocator
->
Malloc
(
input
->
ElementsNum
()
*
sizeof
(
float16_t
));
auto
mean_fp16
=
context_
->
allocator
->
Malloc
(
mean
->
ElementsNum
()
*
sizeof
(
float16_t
));
auto
variance_fp16
=
context_
->
allocator
->
Malloc
(
variance
->
ElementsNum
()
*
sizeof
(
float16_t
));
auto
output_fp16
=
context_
->
allocator
->
Malloc
(
output
->
ElementsNum
()
*
sizeof
(
float16_t
));
if
(
input_fp16
==
nullptr
||
mean_fp16
==
nullptr
||
variance_fp16
==
nullptr
||
output_fp16
==
nullptr
)
{
context_
->
allocator
->
Free
(
input_fp16
);
context_
->
allocator
->
Free
(
mean_fp16
);
context_
->
allocator
->
Free
(
variance_fp16
);
context_
->
allocator
->
Free
(
output_fp16
);
}
Float32ToFloat16
(
reinterpret_cast
<
float
*>
(
input
->
Data
()),
reinterpret_cast
<
float16_t
*>
(
input_fp16
),
input
->
ElementsNum
());
Float32ToFloat16
(
reinterpret_cast
<
float
*>
(
mean
->
Data
()),
reinterpret_cast
<
float16_t
*>
(
mean_fp16
),
mean
->
ElementsNum
());
Float32ToFloat16
(
reinterpret_cast
<
float
*>
(
variance
->
Data
()),
reinterpret_cast
<
float16_t
*>
(
variance_fp16
),
variance
->
ElementsNum
());
BatchNormFp16
(
input_fp16
,
mean_fp16
,
variance_fp16
,
param
,
task_id
,
output_fp16
);
Float16ToFloat32
(
reinterpret_cast
<
float16_t
*>
(
output_fp16
),
reinterpret_cast
<
float
*>
(
output
),
output
->
ElementsNum
());
context_
->
allocator
->
Free
(
input_fp16
);
context_
->
allocator
->
Free
(
mean_fp16
);
context_
->
allocator
->
Free
(
variance_fp16
);
context_
->
allocator
->
Free
(
output_fp16
);
return
mindspore
::
lite
::
RET_OK
;
}
BatchNormFp16
(
in_tensors_
.
at
(
0
)
->
Data
(),
mean_
,
variance_
,
param
,
task_id
,
out_tensors_
.
at
(
0
)
->
Data
());
return
mindspore
::
lite
::
RET_OK
;
}
kernel
::
LiteKernel
*
CpuBatchnormFp16KernelCreator
(
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
inputs
,
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
outputs
,
OpParameter
*
opParameter
,
const
lite
::
Context
*
ctx
,
const
kernel
::
KernelKey
&
desc
,
const
mindspore
::
lite
::
PrimitiveC
*
primitive
)
{
auto
*
kernel
=
new
(
std
::
nothrow
)
BatchnormFp16CPUKernel
(
opParameter
,
inputs
,
outputs
,
ctx
,
primitive
);
if
(
kernel
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"new BatchnormFp16CPUKernel fail!"
;
return
nullptr
;
}
auto
ret
=
kernel
->
Init
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Init kernel failed, name: "
<<
opParameter
->
name_
<<
", type: "
<<
schema
::
EnumNamePrimitiveType
(
static_cast
<
schema
::
PrimitiveType
>
(
opParameter
->
type_
));
delete
kernel
;
return
nullptr
;
}
return
kernel
;
}
// REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BatchNorm, CpuBatchnormFp16KernelCreator)
}
// namespace mindspore::kernel
mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.h
0 → 100644
浏览文件 @
e3c053c4
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BATCHNORM_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BATCHNORM_FP16_H_
#include <vector>
#include "src/runtime/kernel/arm/fp32/batchnorm.h"
namespace
mindspore
::
kernel
{
class
BatchnormFp16CPUKernel
:
public
BatchnormCPUKernel
{
public:
BatchnormFp16CPUKernel
(
OpParameter
*
parameter
,
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
inputs
,
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
outputs
,
const
Context
*
ctx
,
const
mindspore
::
lite
::
PrimitiveC
*
primitive
)
:
BatchnormCPUKernel
(
parameter
,
inputs
,
outputs
,
ctx
,
primitive
)
{}
virtual
~
BatchnormFp16CPUKernel
()
{}
virtual
int
DoExecute
(
int
task_id
);
};
}
// namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BATCHNORM_FP16_H_
mindspore/lite/src/runtime/kernel/arm/fp16/fused_batchnorm_fp16.cc
0 → 100644
浏览文件 @
e3c053c4
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16/fused_batchnorm_fp16.h"
#include "nnacl/fp16/batchnorm_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "src/kernel_registry.h"
using
mindspore
::
lite
::
KernelRegistrar
;
using
mindspore
::
schema
::
PrimitiveType_FusedBatchNorm
;
namespace
mindspore
::
kernel
{
int
FusedBatchnormFp16CPUKernel
::
DoExecute
(
int
task_id
)
{
auto
param
=
reinterpret_cast
<
BatchNormParameter
*>
(
op_parameter_
);
if
(
in_tensors_
.
at
(
0
)
->
data_type
()
==
kNumberTypeFloat32
)
{
auto
input
=
in_tensors_
.
at
(
0
);
auto
scale
=
in_tensors_
.
at
(
1
);
auto
offset
=
in_tensors_
.
at
(
2
);
auto
mean
=
in_tensors_
.
at
(
3
);
auto
variance
=
in_tensors_
.
at
(
4
);
auto
output
=
out_tensors_
.
at
(
0
);
auto
input_fp16
=
context_
->
allocator
->
Malloc
(
input
->
ElementsNum
()
*
sizeof
(
float16_t
));
auto
scale_fp16
=
context_
->
allocator
->
Malloc
(
scale
->
ElementsNum
()
*
sizeof
(
float16_t
));
auto
offset_fp16
=
context_
->
allocator
->
Malloc
(
offset
->
ElementsNum
()
*
sizeof
(
float16_t
));
auto
mean_fp16
=
context_
->
allocator
->
Malloc
(
mean
->
ElementsNum
()
*
sizeof
(
float16_t
));
auto
variance_fp16
=
context_
->
allocator
->
Malloc
(
variance
->
ElementsNum
()
*
sizeof
(
float16_t
));
auto
output_fp16
=
context_
->
allocator
->
Malloc
(
output
->
ElementsNum
()
*
sizeof
(
float16_t
));
if
(
input_fp16
==
nullptr
||
scale_fp16
==
nullptr
||
offset_fp16
==
nullptr
||
mean_fp16
==
nullptr
||
variance_fp16
==
nullptr
||
output_fp16
==
nullptr
)
{
context_
->
allocator
->
Free
(
input_fp16
);
context_
->
allocator
->
Free
(
scale_fp16
);
context_
->
allocator
->
Free
(
offset_fp16
);
context_
->
allocator
->
Free
(
mean_fp16
);
context_
->
allocator
->
Free
(
variance_fp16
);
context_
->
allocator
->
Free
(
output_fp16
);
}
Float32ToFloat16
(
reinterpret_cast
<
float
*>
(
input
->
Data
()),
reinterpret_cast
<
float16_t
*>
(
input_fp16
),
input
->
ElementsNum
());
Float32ToFloat16
(
reinterpret_cast
<
float
*>
(
scale
->
Data
()),
reinterpret_cast
<
float16_t
*>
(
scale_fp16
),
scale
->
ElementsNum
());
Float32ToFloat16
(
reinterpret_cast
<
float
*>
(
offset
->
Data
()),
reinterpret_cast
<
float16_t
*>
(
offset_fp16
),
offset
->
ElementsNum
());
Float32ToFloat16
(
reinterpret_cast
<
float
*>
(
mean
->
Data
()),
reinterpret_cast
<
float16_t
*>
(
mean_fp16
),
mean
->
ElementsNum
());
Float32ToFloat16
(
reinterpret_cast
<
float
*>
(
variance
->
Data
()),
reinterpret_cast
<
float16_t
*>
(
variance_fp16
),
variance
->
ElementsNum
());
FusedBatchNormFp16
(
input_fp16
,
scale_fp16
,
offset_fp16
,
mean_fp16
,
variance_fp16
,
param
,
task_id
,
output_fp16
);
Float16ToFloat32
(
reinterpret_cast
<
float16_t
*>
(
output_fp16
),
reinterpret_cast
<
float
*>
(
output
),
output
->
ElementsNum
());
context_
->
allocator
->
Free
(
input_fp16
);
context_
->
allocator
->
Free
(
scale_fp16
);
context_
->
allocator
->
Free
(
offset_fp16
);
context_
->
allocator
->
Free
(
mean_fp16
);
context_
->
allocator
->
Free
(
variance_fp16
);
context_
->
allocator
->
Free
(
output_fp16
);
return
mindspore
::
lite
::
RET_OK
;
}
FusedBatchNormFp16
(
in_tensors_
.
at
(
0
)
->
Data
(),
scale_
,
offset_
,
mean_
,
variance_
,
param
,
task_id
,
out_tensors_
.
at
(
0
)
->
Data
());
return
mindspore
::
lite
::
RET_OK
;
}
kernel
::
LiteKernel
*
CpuFusedBatchnormFp16KernelCreator
(
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
inputs
,
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
outputs
,
OpParameter
*
op_parameter
,
const
lite
::
Context
*
ctx
,
const
kernel
::
KernelKey
&
desc
,
const
mindspore
::
lite
::
PrimitiveC
*
primitive
)
{
FusedBatchnormFp16CPUKernel
*
kernel
=
new
(
std
::
nothrow
)
FusedBatchnormFp16CPUKernel
(
op_parameter
,
inputs
,
outputs
,
ctx
,
primitive
);
if
(
kernel
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"new FusedBatchnormFp16CPUKernel fail!"
;
return
nullptr
;
}
auto
ret
=
kernel
->
Init
();
if
(
ret
!=
RET_OK
)
{
delete
kernel
;
MS_LOG
(
ERROR
)
<<
"Init kernel failed, name: "
<<
op_parameter
->
name_
<<
", type: "
<<
schema
::
EnumNamePrimitiveType
(
static_cast
<
schema
::
PrimitiveType
>
(
op_parameter
->
type_
));
return
nullptr
;
}
return
kernel
;
}
// REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_FusedBatchNorm, CpuFusedBatchnormFp16KernelCreator)
}
// namespace mindspore::kernel
mindspore/lite/src/runtime/kernel/arm/fp16/fused_batchnorm_fp16.h
0 → 100644
浏览文件 @
e3c053c4
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_FUSED_BATCHNORM_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_FUSED_BATCHNORM_FP16_H_
#include <vector>
#include "src/runtime/kernel/arm/fp32/fused_batchnorm.h"
namespace
mindspore
::
kernel
{
class
FusedBatchnormFp16CPUKernel
:
public
FusedBatchnormCPUKernel
{
public:
FusedBatchnormFp16CPUKernel
(
OpParameter
*
parameter
,
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
inputs
,
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
outputs
,
const
Context
*
ctx
,
const
mindspore
::
lite
::
PrimitiveC
*
primitive
)
:
FusedBatchnormCPUKernel
(
parameter
,
inputs
,
outputs
,
ctx
,
primitive
)
{}
virtual
~
FusedBatchnormFp16CPUKernel
()
{}
virtual
int
DoExecute
(
int
task_id
);
};
}
// namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_FUSED_BATCHNORM_FP16_H_
mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc
浏览文件 @
e3c053c4
...
...
@@ -15,50 +15,12 @@
*/
#include "src/runtime/kernel/arm/fp32/batchnorm.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"
#include "nnacl/batchnorm_parameter.h"
#include "nnacl/fp32/batchnorm.h"
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kCPU
;
using
mindspore
::
lite
::
KernelRegistrar
;
using
mindspore
::
lite
::
RET_ERROR
;
using
mindspore
::
lite
::
RET_OK
;
using
mindspore
::
schema
::
PrimitiveType_BatchNorm
;
namespace
mindspore
::
kernel
{
BatchnormCPUKernel
::~
BatchnormCPUKernel
()
{
if
(
mean_addr_
!=
nullptr
)
{
free
(
mean_addr_
);
mean_addr_
=
nullptr
;
}
if
(
var_addr_
!=
nullptr
)
{
free
(
var_addr_
);
var_addr_
=
nullptr
;
}
}
int
BatchnormCPUKernel
::
InitConstTensor
()
{
auto
mean
=
in_tensors_
[
1
];
mean_addr_
=
reinterpret_cast
<
float
*>
(
malloc
(
mean
->
ElementsNum
()
*
sizeof
(
float
)));
if
(
mean_addr_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memcpy
(
mean_addr_
,
mean
->
Data
(),
mean
->
ElementsNum
()
*
sizeof
(
float
));
auto
variance
=
in_tensors_
[
2
];
var_addr_
=
reinterpret_cast
<
float
*>
(
malloc
(
variance
->
ElementsNum
()
*
sizeof
(
float
)));
if
(
var_addr_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memcpy
(
var_addr_
,
variance
->
Data
(),
variance
->
ElementsNum
()
*
sizeof
(
float
));
return
RET_OK
;
}
int
BatchnormCPUKernel
::
Init
()
{
if
(
!
InferShapeDone
())
{
return
RET_OK
;
...
...
@@ -67,62 +29,72 @@ int BatchnormCPUKernel::Init() {
}
int
BatchnormCPUKernel
::
ReSize
()
{
if
(
mean_addr_
!=
nullptr
)
{
free
(
mean_addr_
);
mean_addr_
=
nullptr
;
FreeMeanAndVariance
();
FillParam
();
return
InitConstTensor
();
}
void
BatchnormCPUKernel
::
FreeMeanAndVariance
()
{
if
(
mean_
!=
nullptr
)
{
free
(
mean_
);
mean_
=
nullptr
;
}
if
(
var
_addr
_
!=
nullptr
)
{
free
(
var
_addr
_
);
var
_addr
_
=
nullptr
;
if
(
var
iance
_
!=
nullptr
)
{
free
(
var
iance
_
);
var
iance
_
=
nullptr
;
}
}
void
BatchnormCPUKernel
::
FillParam
()
{
auto
input_shapes
=
in_tensors_
[
0
]
->
shape
();
auto
n_dim
=
input_shapes
.
size
();
batchnorm_param_
->
channel_
=
input_shapes
[
n_dim
-
1
];
batchnorm_param_
->
unit_
=
1
;
auto
param
=
reinterpret_cast
<
BatchNormParameter
*>
(
op_parameter_
);
param
->
channel_
=
input_shapes
[
n_dim
-
1
];
param
->
unit_
=
1
;
for
(
size_t
i
=
0
;
i
<
n_dim
-
1
;
i
++
)
{
batchnorm_param_
->
unit_
*=
input_shapes
[
i
];
param
->
unit_
*=
input_shapes
[
i
];
}
batchnorm_param_
->
op_parameter_
.
thread_num_
=
MSMIN
(
batchnorm_param_
->
op_parameter_
.
thread_num_
,
batchnorm_param_
->
channel_
);
}
auto
ret
=
InitConstTensor
();
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"Batchnorm fp32 InitConstTensor failed."
;
int
BatchnormCPUKernel
::
InitConstTensor
()
{
mean_
=
malloc
(
in_tensors_
[
1
]
->
Size
());
variance_
=
malloc
(
in_tensors_
[
2
]
->
Size
());
if
(
mean_
==
nullptr
||
variance_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Memory allocation failed"
;
FreeMeanAndVariance
();
return
RET_ERROR
;
}
memcpy
(
mean_
,
in_tensors_
[
1
]
->
Data
(),
in_tensors_
[
1
]
->
Size
());
memcpy
(
variance_
,
in_tensors_
[
2
]
->
Data
(),
in_tensors_
[
2
]
->
Size
());
return
RET_OK
;
}
int
BatchnormCPUKernel
::
DoExecute
(
int
task_id
)
{
BatchNorm
(
out_addr_
,
in_addr_
,
mean_addr_
,
var_addr_
,
task_id
,
batchnorm_param_
);
return
RET_OK
;
}
int
BatchNormRun
(
int
task_id
,
LiteParallelGroupEnv
*
penv
,
void
*
cdata
)
{
auto
g_kernel
=
reinterpret_cast
<
BatchnormCPUKernel
*>
(
cdata
);
auto
ret
=
g_kernel
->
DoExecute
(
task_id
);
int
BatchnormCPUKernel
::
Run
()
{
auto
ret
=
Prepare
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"
BatchnormRun error task_id["
<<
task_id
<<
"] error_code["
<<
ret
<<
"]"
;
MS_LOG
(
ERROR
)
<<
"
Prepare fail! Ret error code: "
<<
ret
;
return
ret
;
}
return
RET_OK
;
ret
=
LiteBackendParallelLaunch
(
BatchNormRun
,
this
,
op_parameter_
->
thread_num_
);
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"BatchnormRun error error_code["
<<
ret
<<
"]"
;
}
return
ret
;
}
int
BatchnormCPUKernel
::
Run
()
{
auto
prepare_ret
=
Prepare
();
if
(
prepare_ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Prepare fail! Ret error code: "
<<
prepare_ret
;
return
prepare_ret
;
}
in_addr_
=
reinterpret_cast
<
float
*>
(
in_tensors_
.
at
(
0
)
->
Data
());
out_addr_
=
reinterpret_cast
<
float
*>
(
out_tensors_
.
at
(
0
)
->
Data
());
int
BatchnormCPUKernel
::
DoExecute
(
int
task_id
)
{
auto
param
=
reinterpret_cast
<
BatchNormParameter
*>
(
op_parameter_
);
BatchNormFp32
(
in_tensors_
.
at
(
0
)
->
Data
(),
mean_
,
variance_
,
param
,
task_id
,
out_tensors_
.
at
(
0
)
->
Data
());
return
mindspore
::
lite
::
RET_OK
;
}
int
ret
=
LiteBackendParallelLaunch
(
BatchNormRun
,
this
,
batchnorm_param_
->
op_parameter_
.
thread_num_
);
int
BatchNormRun
(
int
task_id
,
LiteParallelGroupEnv
*
penv
,
void
*
cdata
)
{
auto
kernel
=
reinterpret_cast
<
BatchnormCPUKernel
*>
(
cdata
);
auto
ret
=
kernel
->
DoExecute
(
task_id
);
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"BatchnormRun error error_code["
<<
ret
<<
"]"
;
return
ret
;
MS_LOG
(
ERROR
)
<<
"BatchnormRun error task_id["
<<
task_id
<<
"] error_code["
<<
ret
<<
"]"
;
}
return
RET_OK
;
return
ret
;
}
kernel
::
LiteKernel
*
CpuBatchnormKernelCreator
(
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
inputs
,
...
...
@@ -131,7 +103,6 @@ kernel::LiteKernel *CpuBatchnormKernelCreator(const std::vector<lite::tensor::Te
const
kernel
::
KernelKey
&
desc
,
const
mindspore
::
lite
::
PrimitiveC
*
primitive
)
{
MS_ASSERT
(
opParameter
!=
nullptr
);
MS_ASSERT
(
desc
.
type
==
schema
::
PrimitiveType_BatchNorm
);
auto
*
kernel
=
new
(
std
::
nothrow
)
BatchnormCPUKernel
(
opParameter
,
inputs
,
outputs
,
ctx
,
primitive
);
if
(
kernel
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"new BatchNormCPUKernel fail!"
;
...
...
mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h
浏览文件 @
e3c053c4
...
...
@@ -22,6 +22,7 @@
#include "include/context.h"
#include "nnacl/fp32/batchnorm.h"
#include "nnacl/batchnorm_parameter.h"
#include "src/runtime/runtime_api.h"
using
mindspore
::
lite
::
Context
;
...
...
@@ -31,24 +32,23 @@ class BatchnormCPUKernel : public LiteKernel {
BatchnormCPUKernel
(
OpParameter
*
parameter
,
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
inputs
,
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
outputs
,
const
Context
*
ctx
,
const
mindspore
::
lite
::
PrimitiveC
*
primitive
)
:
LiteKernel
(
parameter
,
inputs
,
outputs
,
ctx
,
primitive
)
{
batchnorm_param_
=
reinterpret_cast
<
BatchNormParameter
*>
(
parameter
);
}
~
BatchnormCPUKernel
()
override
;
:
LiteKernel
(
parameter
,
inputs
,
outputs
,
ctx
,
primitive
)
{}
virtual
~
BatchnormCPUKernel
()
{
FreeMeanAndVariance
();
}
int
Init
()
override
;
int
ReSize
()
override
;
int
Run
()
override
;
int
InitConstTensor
();
int
DoExecute
(
int
tid
);
private:
float
*
in_addr_
=
nullptr
;
float
*
mean_addr_
=
nullptr
;
float
*
var_addr_
=
nullptr
;
float
*
out_addr_
=
nullptr
;
BatchNormParameter
*
batchnorm_param_
;
virtual
int
InitConstTensor
();
virtual
int
DoExecute
(
int
task_id
);
protected:
void
FillParam
();
void
FreeMeanAndVariance
();
void
*
mean_
=
nullptr
;
void
*
variance_
=
nullptr
;
};
int
BatchNormRun
(
int
task_id
,
LiteParallelGroupEnv
*
penv
,
void
*
cdata
);
}
// namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BATCHNORM_H_
mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm.cc
浏览文件 @
e3c053c4
...
...
@@ -15,133 +15,59 @@
*/
#include "src/runtime/kernel/arm/fp32/fused_batchnorm.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"
#include "nnacl/batchnorm_parameter.h"
#include "nnacl/fp32/batchnorm.h"
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kCPU
;
using
mindspore
::
lite
::
KernelRegistrar
;
using
mindspore
::
lite
::
RET_ERROR
;
using
mindspore
::
lite
::
RET_OK
;
using
mindspore
::
schema
::
PrimitiveType_FusedBatchNorm
;
namespace
mindspore
::
kernel
{
FusedBatchnormCPUKernel
::~
FusedBatchnormCPUKernel
()
{
FreeTmpBuffer
();
}
int
FusedBatchnormCPUKernel
::
ReSize
()
{
FreeMeanAndVariance
();
FreeScaleAndOffset
();
FillParam
();
return
InitConstTensor
();
}
void
FusedBatchnormCPUKernel
::
FreeTmpBuffer
()
{
if
(
scale_addr_
!=
nullptr
)
{
free
(
scale_addr_
);
scale_addr_
=
nullptr
;
}
if
(
offset_addr_
!=
nullptr
)
{
free
(
offset_addr_
);
offset_addr_
=
nullptr
;
void
FusedBatchnormCPUKernel
::
FreeScaleAndOffset
()
{
if
(
scale_
!=
nullptr
)
{
free
(
scale_
);
scale_
=
nullptr
;
}
if
(
mean_addr_
!=
nullptr
)
{
free
(
mean_addr_
);
mean_addr_
=
nullptr
;
}
if
(
var_addr_
!=
nullptr
)
{
free
(
var_addr_
);
var_addr_
=
nullptr
;
if
(
offset_
!=
nullptr
)
{
free
(
offset_
);
offset_
=
nullptr
;
}
}
int
FusedBatchnormCPUKernel
::
InitConstTensor
()
{
auto
scale
=
in_tensors_
[
1
];
scale_addr_
=
reinterpret_cast
<
float
*>
(
malloc
(
scale
->
ElementsNum
()
*
sizeof
(
float
)));
if
(
scale_addr_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memcpy
(
scale_addr_
,
scale
->
Data
(),
scale
->
ElementsNum
()
*
sizeof
(
float
));
auto
offset
=
in_tensors_
[
2
];
offset_addr_
=
reinterpret_cast
<
float
*>
(
malloc
(
offset
->
ElementsNum
()
*
sizeof
(
float
)));
if
(
offset_addr_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memcpy
(
offset_addr_
,
offset
->
Data
(),
offset
->
ElementsNum
()
*
sizeof
(
float
));
auto
mean
=
in_tensors_
[
3
];
mean_addr_
=
reinterpret_cast
<
float
*>
(
malloc
(
mean
->
ElementsNum
()
*
sizeof
(
float
)));
if
(
mean_addr_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memcpy
(
mean_addr_
,
mean
->
Data
(),
mean
->
ElementsNum
()
*
sizeof
(
float
));
auto
variance
=
in_tensors_
[
4
];
var_addr_
=
reinterpret_cast
<
float
*>
(
malloc
(
variance
->
ElementsNum
()
*
sizeof
(
float
)));
if
(
var_addr_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memcpy
(
var_addr_
,
variance
->
Data
(),
variance
->
ElementsNum
()
*
sizeof
(
float
));
return
RET_OK
;
}
int
FusedBatchnormCPUKernel
::
Init
()
{
if
(
!
InferShapeDone
())
{
return
RET_OK
;
}
return
ReSize
();
}
scale_
=
malloc
(
scale
->
Size
());
offset_
=
malloc
(
offset
->
Size
());
mean_
=
malloc
(
mean
->
Size
());
variance_
=
malloc
(
variance
->
Size
());
int
FusedBatchnormCPUKernel
::
ReSize
()
{
FreeTmpBuffer
();
auto
input_shapes
=
in_tensors_
[
0
]
->
shape
();
auto
n_dim
=
input_shapes
.
size
();
batchnorm_param_
->
channel_
=
input_shapes
[
n_dim
-
1
];
batchnorm_param_
->
unit_
=
1
;
for
(
size_t
i
=
0
;
i
<
n_dim
-
1
;
i
++
)
{
batchnorm_param_
->
unit_
*=
input_shapes
[
i
];
}
batchnorm_param_
->
op_parameter_
.
thread_num_
=
MSMIN
(
batchnorm_param_
->
op_parameter_
.
thread_num_
,
batchnorm_param_
->
channel_
);
auto
ret
=
InitConstTensor
();
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"FusedBatchnorm fp32 InitConstTensor failed."
;
if
(
scale_
==
nullptr
||
offset_
==
nullptr
||
mean_
==
nullptr
||
variance_
==
nullptr
)
{
FreeMeanAndVariance
();
FreeScaleAndOffset
();
MS_LOG
(
ERROR
)
<<
"Memory allocation failed"
;
return
RET_ERROR
;
}
memcpy
(
scale_
,
scale
->
Data
(),
scale
->
Size
());
memcpy
(
offset_
,
offset
->
Data
(),
offset
->
Size
());
memcpy
(
mean_
,
mean
->
Data
(),
mean
->
Size
());
memcpy
(
variance_
,
variance
->
Data
(),
variance
->
Size
());
return
RET_OK
;
}
int
FusedBatchnormCPUKernel
::
Execute
(
int
task_id
)
{
FusedBatchNorm
(
out_addr_
,
in_addr_
,
scale_addr_
,
offset_addr_
,
mean_addr_
,
var_addr_
,
task_id
,
batchnorm_param_
);
return
RET_OK
;
}
int
FusedBatchNormRun
(
int
task_id
,
LiteParallelGroupEnv
*
penv
,
void
*
cdata
)
{
auto
g_kernel
=
reinterpret_cast
<
FusedBatchnormCPUKernel
*>
(
cdata
);
auto
ret
=
g_kernel
->
Execute
(
task_id
);
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"FusedBatchnormRun error task_id["
<<
task_id
<<
"] error_code["
<<
ret
<<
"]"
;
return
ret
;
}
return
RET_OK
;
}
int
FusedBatchnormCPUKernel
::
Run
()
{
auto
prepare_ret
=
Prepare
();
if
(
prepare_ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Prepare fail! Ret error code: "
<<
prepare_ret
;
return
prepare_ret
;
}
in_addr_
=
reinterpret_cast
<
float
*>
(
in_tensors_
.
at
(
0
)
->
Data
());
out_addr_
=
reinterpret_cast
<
float
*>
(
out_tensors_
.
at
(
0
)
->
Data
());
int
ret
=
LiteBackendParallelLaunch
(
FusedBatchNormRun
,
this
,
batchnorm_param_
->
op_parameter_
.
thread_num_
);
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"FusedBatchnormRun error error_code["
<<
ret
<<
"]"
;
return
ret
;
}
return
RET_OK
;
int
FusedBatchnormCPUKernel
::
DoExecute
(
int
task_id
)
{
auto
param
=
reinterpret_cast
<
BatchNormParameter
*>
(
op_parameter_
);
FusedBatchNormFp32
(
in_tensors_
.
at
(
0
)
->
Data
(),
scale_
,
offset_
,
mean_
,
variance_
,
param
,
task_id
,
out_tensors_
.
at
(
0
)
->
Data
());
return
mindspore
::
lite
::
RET_OK
;
}
kernel
::
LiteKernel
*
CpuFusedBatchnormKernelCreator
(
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
inputs
,
...
...
@@ -149,11 +75,6 @@ kernel::LiteKernel *CpuFusedBatchnormKernelCreator(const std::vector<lite::tenso
OpParameter
*
op_parameter
,
const
lite
::
Context
*
ctx
,
const
kernel
::
KernelKey
&
desc
,
const
mindspore
::
lite
::
PrimitiveC
*
primitive
)
{
if
(
op_parameter
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Input parameter is nullptr!"
;
return
nullptr
;
}
MS_ASSERT
(
desc
.
type
==
schema
::
PrimitiveType_FusedBatchNorm
);
FusedBatchnormCPUKernel
*
kernel
=
new
(
std
::
nothrow
)
FusedBatchnormCPUKernel
(
op_parameter
,
inputs
,
outputs
,
ctx
,
primitive
);
if
(
kernel
==
nullptr
)
{
...
...
mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm.h
浏览文件 @
e3c053c4
...
...
@@ -18,37 +18,26 @@
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_FUSED_BATCHNORM_H_
#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/batchnorm_parameter.h"
#include "src/runtime/kernel/arm/fp32/batchnorm.h"
namespace
mindspore
::
kernel
{
class
FusedBatchnormCPUKernel
:
public
Lite
Kernel
{
class
FusedBatchnormCPUKernel
:
public
BatchnormCPU
Kernel
{
public:
FusedBatchnormCPUKernel
(
OpParameter
*
parameter
,
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
inputs
,
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
outputs
,
const
lite
::
Context
*
ctx
,
const
mindspore
::
lite
::
PrimitiveC
*
primitive
)
:
LiteKernel
(
parameter
,
inputs
,
outputs
,
ctx
,
primitive
)
{
batchnorm_param_
=
reinterpret_cast
<
BatchNormParameter
*>
(
parameter
);
}
~
FusedBatchnormCPUKernel
()
override
;
:
BatchnormCPUKernel
(
parameter
,
inputs
,
outputs
,
ctx
,
primitive
)
{}
~
FusedBatchnormCPUKernel
()
{
FreeScaleAndOffset
();
}
int
Init
()
override
;
int
ReSize
()
override
;
int
Run
()
override
;
int
InitConstTensor
();
int
Execute
(
int
task_id
)
;
int
InitConstTensor
()
override
;
int
DoExecute
(
int
task_id
)
override
;
private:
void
FreeTmpBuffer
();
float
*
in_addr_
=
nullptr
;
float
*
mean_addr_
=
nullptr
;
float
*
var_addr_
=
nullptr
;
float
*
scale_addr_
=
nullptr
;
float
*
offset_addr_
=
nullptr
;
float
*
out_addr_
=
nullptr
;
BatchNormParameter
*
batchnorm_param_
;
protected:
void
FreeScaleAndOffset
();
void
*
scale_
=
nullptr
;
void
*
offset_
=
nullptr
;
};
}
// namespace mindspore::kernel
...
...
mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/batchnorm_fp32_tests.cc
浏览文件 @
e3c053c4
...
...
@@ -31,40 +31,32 @@ TEST_F(TestBatchnormFp32, BNTest) {
-
1.1983503
,
-
6.6790967
,
6.383416
,
-
13.3213005
,
-
8.693595
,
9.476344
};
std
::
vector
<
float
>
in_data1
=
{
12.352293
,
5.122387
,
14.249514
};
std
::
vector
<
float
>
in_data2
=
{
14.632595
,
0.70900035
,
11.179003
};
std
::
vector
<
lite
::
tensor
::
Tensor
*>
inputs_tensor
;
std
::
vector
<
lite
::
tensor
::
Tensor
*>
outputs_tensor
;
BatchNormParameter
op_param
;
op_param
.
op_parameter_
.
type_
=
schema
::
PrimitiveType_BatchNorm
;
op_param
.
epsilon_
=
0.001
f
;
std
::
vector
<
int
>
shape
=
{
1
,
2
,
2
,
3
};
lite
::
tensor
::
Tensor
input0_tensor
;
lite
::
tensor
::
Tensor
input1_tensor
;
lite
::
tensor
::
Tensor
input2_tensor
;
inputs_tensor
.
push_back
(
&
input0_tensor
);
inputs_tensor
.
push_back
(
&
input1_tensor
);
inputs_tensor
.
push_back
(
&
input2_tensor
);
lite
::
tensor
::
Tensor
input0_tensor
(
kNumberTypeFloat32
,
{
1
,
2
,
2
,
3
});
lite
::
tensor
::
Tensor
input1_tensor
(
kNumberTypeFloat32
,
{
3
});
lite
::
tensor
::
Tensor
input2_tensor
(
kNumberTypeFloat32
,
{
3
});
input0_tensor
.
SetData
(
in_data
.
data
());
input1_tensor
.
SetData
(
in_data1
.
data
());
input2_tensor
.
SetData
(
in_data2
.
data
());
input0_tensor
.
set_shape
(
shape
);
input1_tensor
.
set_shape
({
3
});
input2_tensor
.
set_shape
({
3
});
std
::
vector
<
lite
::
tensor
::
Tensor
*>
inputs_tensor
=
{
&
input0_tensor
,
&
input1_tensor
,
&
input2_tensor
};
std
::
vector
<
float
>
output
(
12
);
std
::
vector
<
float
>
corr_out
=
{
-
6.1533737
,
7.4904885
,
-
0.8563998
,
-
0.289212
,
-
9.356432
,
0.13245535
,
-
3.5422924
,
-
14.005781
,
-
2.3525476
,
-
6.7113695
,
-
16.396551
,
-
1.4275324
};
lite
::
tensor
::
Tensor
output0_tensor
;
outputs_tensor
.
push_back
(
&
output0_tensor
);
lite
::
tensor
::
Tensor
output0_tensor
(
kNumberTypeFloat32
,
{
1
,
2
,
2
,
3
});
output0_tensor
.
SetData
(
output
.
data
());
output0_tensor
.
set_shape
(
shape
);
std
::
vector
<
lite
::
tensor
::
Tensor
*>
outputs_tensor
=
{
&
output0_tensor
};
kernel
::
KernelKey
desc
=
{
kernel
::
KERNEL_ARCH
::
kCPU
,
kNumberTypeFloat32
,
schema
::
PrimitiveType_BatchNorm
};
auto
creator
=
lite
::
KernelRegistry
::
GetInstance
()
->
GetCreator
(
desc
);
ASSERT_NE
(
creator
,
nullptr
);
lite
::
Context
ctx
;
ctx
.
thread_num_
=
1
;
ctx
.
thread_num_
=
2
;
kernel
::
LiteKernel
*
kernel
=
creator
(
inputs_tensor
,
outputs_tensor
,
reinterpret_cast
<
OpParameter
*>
(
&
op_param
),
&
ctx
,
desc
,
nullptr
);
ASSERT_NE
(
kernel
,
nullptr
);
...
...
@@ -82,7 +74,6 @@ TEST_F(TestBatchnormFp32, BNTest) {
input1_tensor
.
SetData
(
nullptr
);
input2_tensor
.
SetData
(
nullptr
);
output0_tensor
.
SetData
(
nullptr
);
MS_LOG
(
INFO
)
<<
"TestBathNormFp32 accuracy passed"
;
}
TEST_F
(
TestBatchnormFp32
,
FusedBNTest
)
{
...
...
@@ -92,118 +83,102 @@ TEST_F(TestBatchnormFp32, FusedBNTest) {
std
::
vector
<
float
>
offset
=
{
27.888096
,
24.533648
,
15.335093
};
std
::
vector
<
float
>
mean
=
{
11.5127125
,
0.47681615
,
5.851508
};
std
::
vector
<
float
>
var
=
{
1.270583
,
13.005714
,
6.089223
};
std
::
vector
<
lite
::
tensor
::
Tensor
*>
inputs_tensor
;
std
::
vector
<
lite
::
tensor
::
Tensor
*>
outputs_tensor
;
BatchNormParameter
op_param
;
op_param
.
op_parameter_
.
type_
=
schema
::
PrimitiveType_BatchNorm
;
op_param
.
epsilon_
=
0.001
f
;
std
::
vector
<
int
>
shape
=
{
1
,
2
,
2
,
3
};
lite
::
tensor
::
Tensor
input
[
5
];
input
[
0
].
SetData
(
in_data
.
data
());
input
[
1
].
SetData
(
scale
.
data
());
input
[
2
].
SetData
(
offset
.
data
());
input
[
3
].
SetData
(
mean
.
data
());
input
[
4
].
SetData
(
var
.
data
());
input
[
0
].
set_shape
(
shape
);
for
(
int
i
=
1
;
i
<
5
;
i
++
)
{
input
[
i
].
set_shape
({
3
});
}
for
(
int
i
=
0
;
i
<
5
;
i
++
)
{
inputs_tensor
.
push_back
(
&
input
[
i
]);
}
lite
::
tensor
::
Tensor
input0
(
kNumberTypeFloat32
,
{
1
,
2
,
2
,
3
});
lite
::
tensor
::
Tensor
input1
(
kNumberTypeFloat32
,
{
3
});
lite
::
tensor
::
Tensor
input2
(
kNumberTypeFloat32
,
{
3
});
lite
::
tensor
::
Tensor
input3
(
kNumberTypeFloat32
,
{
3
});
lite
::
tensor
::
Tensor
input4
(
kNumberTypeFloat32
,
{
3
});
input0
.
SetData
(
in_data
.
data
());
input1
.
SetData
(
scale
.
data
());
input2
.
SetData
(
offset
.
data
());
input3
.
SetData
(
mean
.
data
());
input4
.
SetData
(
var
.
data
());
std
::
vector
<
lite
::
tensor
::
Tensor
*>
inputs_tensor
=
{
&
input0
,
&
input1
,
&
input2
,
&
input3
,
&
input4
};
std
::
vector
<
float
>
output
(
12
);
std
::
vector
<
float
>
corr_out
=
{
-
195.5765
,
67.03745
,
-
4.243883
,
-
42.028015
,
74.37044
,
9.075897
,
5.1857452
,
56.60399
,
-
77.215096
,
-
181.18402
,
49.81066
,
-
59.204563
};
lite
::
tensor
::
Tensor
output0
_tensor
;
output
s_tensor
.
push_back
(
&
output0_tensor
);
output0_tensor
.
SetData
(
output
.
data
())
;
output0_tensor
.
set_shape
(
shape
);
lite
::
tensor
::
Tensor
output0
(
kNumberTypeFloat32
,
{
1
,
2
,
2
,
3
})
;
output
0
.
SetData
(
output
.
data
()
);
std
::
vector
<
lite
::
tensor
::
Tensor
*>
outputs_tensor
=
{
&
output0
}
;
kernel
::
KernelKey
desc
=
{
kernel
::
KERNEL_ARCH
::
kCPU
,
kNumberTypeFloat32
,
schema
::
PrimitiveType_FusedBatchNorm
};
auto
creator
=
lite
::
KernelRegistry
::
GetInstance
()
->
GetCreator
(
desc
);
ASSERT_NE
(
creator
,
nullptr
);
lite
::
Context
ctx
;
ctx
.
thread_num_
=
1
;
ctx
.
thread_num_
=
2
;
kernel
::
LiteKernel
*
kernel
=
creator
(
inputs_tensor
,
outputs_tensor
,
reinterpret_cast
<
OpParameter
*>
(
&
op_param
),
&
ctx
,
desc
,
nullptr
);
ASSERT_NE
(
kernel
,
nullptr
);
auto
output_tensor_shape
=
output0_tensor
.
shape
();
kernel
->
Run
();
printf
(
"==================output data=================
\n
"
);
for
(
int
i
=
0
;
i
<
output0
_tensor
.
ElementsNum
();
i
++
)
{
for
(
int
i
=
0
;
i
<
output0
.
ElementsNum
();
i
++
)
{
std
::
cout
<<
output
[
i
]
<<
" ,"
;
}
std
::
cout
<<
std
::
endl
;
CompareOutputData
(
output
.
data
(),
corr_out
.
data
(),
output0_tensor
.
ElementsNum
(),
0.001
);
for
(
int
i
=
1
;
i
<
5
;
i
++
)
{
input
[
i
].
SetData
(
nullptr
);
}
output0_tensor
.
SetData
(
nullptr
);
MS_LOG
(
INFO
)
<<
"TestFusedBathNormFp32 accuracy passed"
;
CompareOutputData
(
output
.
data
(),
corr_out
.
data
(),
output0
.
ElementsNum
(),
0.001
);
input0
.
SetData
(
nullptr
);
input1
.
SetData
(
nullptr
);
input2
.
SetData
(
nullptr
);
input3
.
SetData
(
nullptr
);
input4
.
SetData
(
nullptr
);
output0
.
SetData
(
nullptr
);
}
TEST_F
(
TestBatchnormFp32
,
easyTest
)
{
std
::
vector
<
float
>
in_data
=
{
1
,
4
,
2
,
5
,
3
,
6
,
-
1
,
-
4
,
-
2
,
-
5
,
-
3
,
-
6
};
std
::
vector
<
float
>
in_data1
=
{
0.1
,
0.6
};
std
::
vector
<
float
>
in_data2
=
{
3
,
4
};
std
::
vector
<
lite
::
tensor
::
Tensor
*>
inputs_tensor
;
std
::
vector
<
lite
::
tensor
::
Tensor
*>
outputs_tensor
;
BatchNormParameter
op_param
;
op_param
.
op_parameter_
.
type_
=
schema
::
PrimitiveType_BatchNorm
;
op_param
.
epsilon_
=
0.001
f
;
std
::
vector
<
int
>
shape
=
{
1
,
1
,
6
,
2
};
lite
::
tensor
::
Tensor
input0_tensor
;
lite
::
tensor
::
Tensor
input1_tensor
;
lite
::
tensor
::
Tensor
input2_tensor
;
inputs_tensor
.
push_back
(
&
input0_tensor
);
inputs_tensor
.
push_back
(
&
input1_tensor
);
inputs_tensor
.
push_back
(
&
input2_tensor
);
input0_tensor
.
SetData
(
in_data
.
data
());
input1_tensor
.
SetData
(
in_data1
.
data
());
input2_tensor
.
SetData
(
in_data2
.
data
());
input0_tensor
.
set_shape
(
shape
);
input1_tensor
.
set_shape
({
2
});
input2_tensor
.
set_shape
({
2
});
lite
::
tensor
::
Tensor
input0
(
kNumberTypeFloat32
,
{
1
,
1
,
6
,
2
});
lite
::
tensor
::
Tensor
input1
(
kNumberTypeFloat32
,
{
2
});
lite
::
tensor
::
Tensor
input2
(
kNumberTypeFloat32
,
{
2
});
input0
.
SetData
(
in_data
.
data
());
input1
.
SetData
(
in_data1
.
data
());
input2
.
SetData
(
in_data2
.
data
());
std
::
vector
<
lite
::
tensor
::
Tensor
*>
inputs_tensor
=
{
&
input0
,
&
input1
,
&
input2
};
std
::
vector
<
float
>
output
(
12
);
std
::
vector
<
float
>
corr_out
=
{
0.519529
,
1.69979
,
1.09678
,
2.19973
,
1.67404
,
2.69966
,
-
0.63498
,
-
2.29971
,
-
1.21223
,
-
2.79965
,
-
1.78949
,
-
3.29959
};
lite
::
tensor
::
Tensor
output0
_tensor
;
output
s_tensor
.
push_back
(
&
output0_tensor
);
output0_tensor
.
SetData
(
output
.
data
())
;
output0_tensor
.
set_shape
(
shape
);
lite
::
tensor
::
Tensor
output0
(
kNumberTypeFloat32
,
{
1
,
1
,
6
,
2
})
;
output
0
.
SetData
(
output
.
data
()
);
std
::
vector
<
lite
::
tensor
::
Tensor
*>
outputs_tensor
=
{
&
output0
}
;
kernel
::
KernelKey
desc
=
{
kernel
::
KERNEL_ARCH
::
kCPU
,
kNumberTypeFloat32
,
schema
::
PrimitiveType_BatchNorm
};
auto
creator
=
lite
::
KernelRegistry
::
GetInstance
()
->
GetCreator
(
desc
);
ASSERT_NE
(
creator
,
nullptr
);
lite
::
Context
ctx
;
ctx
.
thread_num_
=
1
;
ctx
.
thread_num_
=
2
;
kernel
::
LiteKernel
*
kernel
=
creator
(
inputs_tensor
,
outputs_tensor
,
reinterpret_cast
<
OpParameter
*>
(
&
op_param
),
&
ctx
,
desc
,
nullptr
);
ASSERT_NE
(
kernel
,
nullptr
);
auto
output_tensor_shape
=
output0_tensor
.
shape
();
kernel
->
Run
();
printf
(
"==================output data=================
\n
"
);
for
(
int
i
=
0
;
i
<
output0
_tensor
.
ElementsNum
();
i
++
)
{
for
(
int
i
=
0
;
i
<
output0
.
ElementsNum
();
i
++
)
{
std
::
cout
<<
output
[
i
]
<<
" ,"
;
}
std
::
cout
<<
std
::
endl
;
CompareOutputData
(
output
.
data
(),
corr_out
.
data
(),
output0
_tensor
.
ElementsNum
(),
0.001
);
CompareOutputData
(
output
.
data
(),
corr_out
.
data
(),
output0
.
ElementsNum
(),
0.001
);
input0_tensor
.
SetData
(
nullptr
);
input1_tensor
.
SetData
(
nullptr
);
input2_tensor
.
SetData
(
nullptr
);
output0_tensor
.
SetData
(
nullptr
);
MS_LOG
(
INFO
)
<<
"TestBathNormFp32 accuracy passed"
;
input0
.
SetData
(
nullptr
);
input1
.
SetData
(
nullptr
);
input2
.
SetData
(
nullptr
);
output0
.
SetData
(
nullptr
);
}
}
// namespace mindspore
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录