Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
f2423c2d
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
338
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
f2423c2d
编写于
6月 28, 2018
作者:
D
dolphin8
浏览文件
操作
浏览文件
下载
差异文件
merge with upstream
上级
2ebd3c16
5f84ccc8
变更
49
隐藏空白更改
内联
并排
Showing
49 changed file
with
228 addition
and
81 deletion
+228
-81
CMakeLists.txt
CMakeLists.txt
+3
-2
src/framework/operator.h
src/framework/operator.h
+7
-4
src/io/io.cpp
src/io/io.cpp
+9
-0
src/operators/feed_op.h
src/operators/feed_op.h
+2
-0
src/operators/fetch_op.h
src/operators/fetch_op.h
+2
-0
src/operators/kernel/arm/batchnorm_kernel.cpp
src/operators/kernel/arm/batchnorm_kernel.cpp
+5
-0
src/operators/kernel/arm/box_coder_kernel.cpp
src/operators/kernel/arm/box_coder_kernel.cpp
+5
-0
src/operators/kernel/arm/concat_kernel.cpp
src/operators/kernel/arm/concat_kernel.cpp
+5
-0
src/operators/kernel/arm/conv_add_kernel.cpp
src/operators/kernel/arm/conv_add_kernel.cpp
+5
-0
src/operators/kernel/arm/conv_add_relu_kernel.cpp
src/operators/kernel/arm/conv_add_relu_kernel.cpp
+6
-0
src/operators/kernel/arm/conv_kernel.cpp
src/operators/kernel/arm/conv_kernel.cpp
+5
-0
src/operators/kernel/arm/depthwise_conv_kernel.cpp
src/operators/kernel/arm/depthwise_conv_kernel.cpp
+5
-0
src/operators/kernel/arm/elementwise_add_kernel.cpp
src/operators/kernel/arm/elementwise_add_kernel.cpp
+6
-0
src/operators/kernel/arm/fusion_fc_kernel.cpp
src/operators/kernel/arm/fusion_fc_kernel.cpp
+5
-0
src/operators/kernel/arm/lrn_kernel.cpp
src/operators/kernel/arm/lrn_kernel.cpp
+5
-0
src/operators/kernel/arm/mul_kernel.cpp
src/operators/kernel/arm/mul_kernel.cpp
+5
-0
src/operators/kernel/arm/multiclass_nms_kernel.cpp
src/operators/kernel/arm/multiclass_nms_kernel.cpp
+6
-0
src/operators/kernel/arm/pool_kernel.cpp
src/operators/kernel/arm/pool_kernel.cpp
+5
-0
src/operators/kernel/arm/prior_box_kernel.cpp
src/operators/kernel/arm/prior_box_kernel.cpp
+5
-0
src/operators/kernel/arm/relu_kernel.cpp
src/operators/kernel/arm/relu_kernel.cpp
+5
-0
src/operators/kernel/arm/reshape_kernel.cpp
src/operators/kernel/arm/reshape_kernel.cpp
+5
-0
src/operators/kernel/arm/sigmoid_kernel.cpp
src/operators/kernel/arm/sigmoid_kernel.cpp
+5
-0
src/operators/kernel/arm/softmax_kernel.cpp
src/operators/kernel/arm/softmax_kernel.cpp
+5
-0
src/operators/kernel/arm/transpose_kernel.cpp
src/operators/kernel/arm/transpose_kernel.cpp
+5
-0
src/operators/kernel/batchnorm_kernel.h
src/operators/kernel/batchnorm_kernel.h
+1
-0
src/operators/kernel/box_coder_kernel.h
src/operators/kernel/box_coder_kernel.h
+1
-0
src/operators/kernel/concat_kernel.h
src/operators/kernel/concat_kernel.h
+1
-0
src/operators/kernel/conv_add_kernel.h
src/operators/kernel/conv_add_kernel.h
+1
-0
src/operators/kernel/conv_add_relu_kernel.h
src/operators/kernel/conv_add_relu_kernel.h
+1
-0
src/operators/kernel/conv_kernel.h
src/operators/kernel/conv_kernel.h
+1
-0
src/operators/kernel/depthwise_conv_kernel.h
src/operators/kernel/depthwise_conv_kernel.h
+1
-0
src/operators/kernel/elementwise_add_kernel.h
src/operators/kernel/elementwise_add_kernel.h
+1
-0
src/operators/kernel/fpga/conv_kernel.cpp
src/operators/kernel/fpga/conv_kernel.cpp
+5
-0
src/operators/kernel/fusion_fc_kernel.h
src/operators/kernel/fusion_fc_kernel.h
+1
-0
src/operators/kernel/lrn_kernel.h
src/operators/kernel/lrn_kernel.h
+1
-0
src/operators/kernel/mali/batchnorm_kernel.cpp
src/operators/kernel/mali/batchnorm_kernel.cpp
+5
-0
src/operators/kernel/mali/conv_kernel.cpp
src/operators/kernel/mali/conv_kernel.cpp
+5
-0
src/operators/kernel/mul_kernel.h
src/operators/kernel/mul_kernel.h
+1
-0
src/operators/kernel/multiclass_nms_kernel.h
src/operators/kernel/multiclass_nms_kernel.h
+1
-0
src/operators/kernel/pool_kernel.h
src/operators/kernel/pool_kernel.h
+1
-0
src/operators/kernel/prior_box_kernel.h
src/operators/kernel/prior_box_kernel.h
+1
-0
src/operators/kernel/relu_kernel.h
src/operators/kernel/relu_kernel.h
+1
-0
src/operators/kernel/reshape_kernel.h
src/operators/kernel/reshape_kernel.h
+1
-0
src/operators/kernel/sigmoid_kernel.h
src/operators/kernel/sigmoid_kernel.h
+1
-0
src/operators/kernel/softmax_kernel.h
src/operators/kernel/softmax_kernel.h
+1
-0
src/operators/kernel/transpose_kernel.h
src/operators/kernel/transpose_kernel.h
+1
-0
src/operators/math/gemm.cpp
src/operators/math/gemm.cpp
+55
-67
src/operators/math/gemm.h
src/operators/math/gemm.h
+4
-4
test/common/test_gemm.cpp
test/common/test_gemm.cpp
+14
-4
未找到文件。
CMakeLists.txt
浏览文件 @
f2423c2d
...
...
@@ -9,7 +9,7 @@ option(LOG_PROFILE "log profile" ON)
option
(
CPU
"armv7 with neon"
ON
)
option
(
MALI_GPU
"mali gpu"
OFF
)
option
(
FPGA
"fpga"
OFF
)
set
(
DEBUGING ON
)
if
(
CPU
)
add_definitions
(
-DPADDLE_MOBILE_CPU
)
endif
()
...
...
@@ -28,7 +28,7 @@ set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
if
(
DEBUGING
)
message
(
STATUS
"debug"
)
set
(
CMAKE_BUILD_TYPE Debug
)
set
(
CMAKE_CXX_FLAGS_DEBUG
"-g"
)
set
(
CMAKE_CXX_FLAGS_DEBUG
"-g
-DNDEBUG
"
)
add_definitions
(
-DPADDLE_MOBILE_DEBUG
)
if
(
ANDROID_NDK_TOOLCHAIN_INCLUDED
)
add_definitions
(
-DARMV7
)
...
...
@@ -36,6 +36,7 @@ if (DEBUGING)
endif
()
else
()
set
(
CMAKE_BUILD_TYPE Release
)
set
(
CMAKE_CXX_FLAGS_RELEASE
"-DNDEBUG"
)
add_definitions
(
-fvisibility=hidden -fvisibility-inlines-hidden
)
endif
()
...
...
src/framework/operator.h
浏览文件 @
f2423c2d
...
...
@@ -63,6 +63,7 @@ class OperatorBase {
std
::
vector
<
string
>
GetOutKeys
()
const
;
virtual
void
RunImpl
()
const
=
0
;
virtual
void
Init
()
const
=
0
;
/*
* @b op 运算所需的输入, 如上一层的输出结果、卷积核
* */
...
...
@@ -110,15 +111,17 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
const
VariableNameMap
&
outputs
,
const
AttributeMap
&
attrs
,
std
::
shared_ptr
<
Scope
>
scope
)
:
OperatorBase
<
Dtype
>
(
type
,
inputs
,
outputs
,
attrs
,
scope
),
param_
(
inputs
,
outputs
,
attrs
,
*
scope
)
{
PADDLE_MOBILE_ENFORCE
(
kernel_
.
Init
(
param_
),
" %s kernel init failed"
,
this
->
type_
.
c_str
());
}
param_
(
inputs
,
outputs
,
attrs
,
*
scope
)
{}
virtual
void
RunImpl
()
const
{
this
->
kernel_
.
Compute
(
this
->
param_
);
}
virtual
void
InferShape
()
const
=
0
;
void
Init
()
const
{
PADDLE_MOBILE_ENFORCE
(
kernel_
.
Init
(
param_
),
" %s kernel init failed"
,
this
->
type_
.
c_str
());
}
protected:
KernelType
kernel_
;
ParamType
param_
;
...
...
src/io/io.cpp
浏览文件 @
f2423c2d
...
...
@@ -198,6 +198,13 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
}
else
{
InitMemory
();
}
std
::
shared_ptr
<
framework
::
BlockDesc
>
to_predict_block
=
to_predict_program_
->
Block
(
0
);
auto
&
ops
=
ops_of_block_
[
*
to_predict_block
.
get
()];
for
(
const
auto
&
op
:
ops
)
{
op
->
Init
();
}
}
template
<
typename
Dtype
,
Precision
P
>
...
...
@@ -416,6 +423,8 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
clock_gettime
(
CLOCK_MONOTONIC
,
&
ts
);
profile
[
i
].
runBegin
=
(
uint64_t
)
ts
.
tv_sec
*
1e9
+
ts
.
tv_nsec
;
#endif
// to Run
ops
[
i
]
->
Run
();
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime
(
CLOCK_MONOTONIC
,
&
ts
);
...
...
src/operators/feed_op.h
浏览文件 @
f2423c2d
...
...
@@ -32,6 +32,8 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
param_
(
inputs
,
outputs
,
attrs
,
*
scope
)
{}
void
RunImpl
()
const
{
param_
.
Out
()
->
ShareDataWith
(
*
param_
.
InputX
());
}
void
Init
()
const
{}
void
InferShape
()
const
{
auto
out_dims
=
param_
.
Out
()
->
dims
();
out_dims
[
0
]
=
param_
.
BatchSize
();
...
...
src/operators/fetch_op.h
浏览文件 @
f2423c2d
...
...
@@ -33,6 +33,8 @@ class FetchOp : public framework::OperatorBase<DeviceType> {
param_
(
inputs
,
outputs
,
attrs
,
*
scope
)
{}
void
RunImpl
()
const
{
param_
.
Out
()
->
ShareDataWith
(
*
param_
.
InputX
());
}
void
Init
()
const
{}
void
InferShape
()
const
{
auto
x_dims
=
param_
.
InputX
()
->
dims
();
param_
.
Out
()
->
Resize
(
x_dims
);
...
...
src/operators/kernel/arm/batchnorm_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -21,6 +21,11 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
BatchNormKernel
<
CPU
,
float
>::
Init
(
const
BatchNormParam
&
para
)
const
{
return
true
;
}
template
<
>
void
BatchNormKernel
<
CPU
,
float
>::
Compute
(
const
BatchNormParam
&
param
)
const
{
const
Tensor
*
input_x
=
param
.
InputX
();
...
...
src/operators/kernel/arm/box_coder_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -110,6 +110,11 @@ void DecodeCenterSize(const framework::Tensor& target_box,
}
}
template
<
>
bool
BoxCoderKernel
<
CPU
,
float
>::
Init
(
const
BoxCoderParam
&
para
)
const
{
return
true
;
}
template
<
>
void
BoxCoderKernel
<
CPU
,
float
>::
Compute
(
const
BoxCoderParam
&
param
)
const
{
const
auto
*
input_priorbox
=
param
.
InputPriorBox
();
...
...
src/operators/kernel/arm/concat_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -52,6 +52,11 @@ class ConcatFunctor {
}
};
template
<
>
bool
ConcatKernel
<
CPU
,
float
>::
Init
(
const
ConcatParam
&
para
)
const
{
return
true
;
}
template
<
>
void
ConcatKernel
<
CPU
,
float
>::
Compute
(
const
ConcatParam
&
param
)
const
{
auto
inputs
=
param
.
Inputs
();
...
...
src/operators/kernel/arm/conv_add_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -18,6 +18,11 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
ConvAddKernel
<
CPU
,
float
>::
Init
(
const
FusionConvAddParam
&
para
)
const
{
return
true
;
}
template
<
>
void
ConvAddKernel
<
CPU
,
float
>::
Compute
(
const
FusionConvAddParam
&
param
)
const
{
const
Tensor
*
input
=
param
.
Input
();
...
...
src/operators/kernel/arm/conv_add_relu_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -19,6 +19,12 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
ConvAddReluKernel
<
CPU
,
float
>::
Init
(
const
FusionConvAddReluParam
&
para
)
const
{
return
true
;
}
template
<
>
void
ConvAddReluKernel
<
CPU
,
float
>::
Compute
(
const
FusionConvAddReluParam
&
param
)
const
{
...
...
src/operators/kernel/arm/conv_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -19,6 +19,11 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
ConvKernel
<
CPU
,
float
>::
Init
(
const
ConvParam
&
para
)
const
{
return
true
;
}
template
<
>
void
ConvKernel
<
CPU
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{
const
Tensor
*
input
=
param
.
Input
();
...
...
src/operators/kernel/arm/depthwise_conv_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -20,6 +20,11 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
DepthwiseConvKernel
<
CPU
,
float
>::
Init
(
const
ConvParam
&
para
)
const
{
return
true
;
}
template
<
>
void
DepthwiseConvKernel
<
CPU
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{
LOG
(
kLOG_DEBUG
)
<<
param
;
...
...
src/operators/kernel/arm/elementwise_add_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -26,6 +26,12 @@ struct AddFunctor {
inline
T
operator
()(
T
a
,
T
b
)
const
{
return
a
+
b
;
}
};
template
<
>
bool
ElementwiseAddKernel
<
CPU
,
float
>::
Init
(
const
ElementwiseAddParam
&
para
)
const
{
return
true
;
}
template
<
>
void
ElementwiseAddKernel
<
CPU
,
float
>::
Compute
(
const
ElementwiseAddParam
&
param
)
const
{
...
...
src/operators/kernel/arm/fusion_fc_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -21,6 +21,11 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
FusionFcKernel
<
CPU
,
float
>::
Init
(
const
FusionFcParam
&
para
)
const
{
return
true
;
}
template
<
>
void
FusionFcKernel
<
CPU
,
float
>::
Compute
(
const
FusionFcParam
&
param
)
const
{
const
Tensor
*
input_x
=
param
.
InputX
();
...
...
src/operators/kernel/arm/lrn_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -21,6 +21,11 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
LrnKernel
<
CPU
,
float
>::
Init
(
const
LrnParam
&
para
)
const
{
return
true
;
}
template
<
>
void
LrnKernel
<
CPU
,
float
>::
Compute
(
const
LrnParam
&
param
)
const
{
const
Tensor
*
input_x
=
param
.
InputX
();
...
...
src/operators/kernel/arm/mul_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -21,6 +21,11 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
MulKernel
<
CPU
,
float
>::
Init
(
const
MulParam
&
para
)
const
{
return
true
;
}
template
<
>
void
MulKernel
<
CPU
,
float
>::
Compute
(
const
MulParam
&
param
)
const
{
const
Tensor
*
input_x
=
param
.
InputX
();
...
...
src/operators/kernel/arm/multiclass_nms_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -203,6 +203,12 @@ void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
}
}
template
<
>
bool
MultiClassNMSKernel
<
CPU
,
float
>::
Init
(
const
MultiClassNMSParam
&
para
)
const
{
return
true
;
}
template
<
>
void
MultiClassNMSKernel
<
CPU
,
float
>::
Compute
(
const
MultiClassNMSParam
&
param
)
const
{
...
...
src/operators/kernel/arm/pool_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -35,6 +35,11 @@ inline void PoolBasic(std::string pooling_type, std::vector<int> ksize,
}
}
template
<
>
bool
PoolKernel
<
CPU
,
float
>::
Init
(
const
PoolParam
&
para
)
const
{
return
true
;
}
template
<
>
void
PoolKernel
<
CPU
,
float
>::
Compute
(
const
PoolParam
&
param
)
const
{
const
Tensor
*
in_x
=
param
.
Input
();
...
...
src/operators/kernel/arm/prior_box_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -26,6 +26,11 @@ struct ClipFunctor {
}
};
template
<
>
bool
PriorBoxKernel
<
CPU
,
float
>::
Init
(
const
PriorBoxParam
&
para
)
const
{
return
true
;
}
template
<
>
void
PriorBoxKernel
<
CPU
,
float
>::
Compute
(
const
PriorBoxParam
&
param
)
const
{
const
auto
*
input_
=
param
.
Input
();
...
...
src/operators/kernel/arm/relu_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -25,6 +25,11 @@ struct ReluFunctor {
inline
T
operator
()(
T
in
)
const
{
return
in
>
0
?
in
:
0
;
}
};
template
<
>
bool
ReluKernel
<
CPU
,
float
>::
Init
(
const
ReluParam
&
para
)
const
{
return
true
;
}
/*
* @b 特化到具体平台的实现, param 从 op 层传入
* */
...
...
src/operators/kernel/arm/reshape_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -19,6 +19,11 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
ReshapeKernel
<
CPU
,
float
>::
Init
(
const
ReshapeParam
&
para
)
const
{
return
true
;
}
template
<
>
void
ReshapeKernel
<
CPU
,
float
>::
Compute
(
const
ReshapeParam
&
param
)
const
{
const
auto
*
input_x
=
param
.
InputX
();
...
...
src/operators/kernel/arm/sigmoid_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -71,6 +71,11 @@ void sigmoid(const Tensor *X, Tensor *Y) {
#endif
}
template
<
>
bool
SigmoidKernel
<
CPU
,
float
>::
Init
(
const
SigmoidParam
&
para
)
const
{
return
true
;
}
template
<
>
void
SigmoidKernel
<
CPU
,
float
>::
Compute
(
const
SigmoidParam
&
param
)
const
{
const
Tensor
*
in_x
=
param
.
InputX
();
...
...
src/operators/kernel/arm/softmax_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -19,6 +19,11 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
SoftmaxKernel
<
CPU
,
float
>::
Init
(
const
SoftmaxParam
&
para
)
const
{
return
true
;
}
template
<
>
void
SoftmaxKernel
<
CPU
,
float
>::
Compute
(
const
SoftmaxParam
&
param
)
const
{
const
Tensor
*
in_x
=
param
.
InputX
();
...
...
src/operators/kernel/arm/transpose_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -34,6 +34,11 @@ namespace operators {
// }
// }
template
<
>
bool
TransposeKernel
<
CPU
,
float
>::
Init
(
const
TransposeParam
&
para
)
const
{
return
true
;
}
template
<
>
void
TransposeKernel
<
CPU
,
float
>::
Compute
(
const
TransposeParam
&
param
)
const
{
const
auto
*
input_x
=
param
.
InputX
();
...
...
src/operators/kernel/batchnorm_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -29,6 +29,7 @@ class BatchNormKernel
:
public
framework
::
OpKernelBase
<
DeviceType
,
BatchNormParam
>
{
public:
void
Compute
(
const
BatchNormParam
&
param
)
const
;
bool
Init
(
const
BatchNormParam
&
para
)
const
;
};
}
// namespace operators
...
...
src/operators/kernel/box_coder_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -30,6 +30,7 @@ class BoxCoderKernel
:
public
framework
::
OpKernelBase
<
DeviceType
,
BoxCoderParam
>
{
public:
void
Compute
(
const
BoxCoderParam
&
param
)
const
;
bool
Init
(
const
BoxCoderParam
&
para
)
const
;
};
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/concat_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -27,6 +27,7 @@ template <typename DeviceType, typename T>
class
ConcatKernel
:
public
framework
::
OpKernelBase
<
DeviceType
,
ConcatParam
>
{
public:
void
Compute
(
const
ConcatParam
&
param
)
const
;
bool
Init
(
const
ConcatParam
&
para
)
const
;
};
}
// namespace operators
...
...
src/operators/kernel/conv_add_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -38,6 +38,7 @@ template <typename DeviceType, typename T>
class
ConvAddKernel
:
public
OpKernelBase
<
DeviceType
,
FusionConvAddParam
>
{
public:
void
Compute
(
const
FusionConvAddParam
&
param
)
const
;
bool
Init
(
const
FusionConvAddParam
&
para
)
const
;
};
}
// namespace operators
...
...
src/operators/kernel/conv_add_relu_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -36,6 +36,7 @@ class ConvAddReluKernel
:
public
OpKernelBase
<
DeviceType
,
FusionConvAddReluParam
>
{
public:
void
Compute
(
const
FusionConvAddReluParam
&
param
)
const
;
bool
Init
(
const
FusionConvAddReluParam
&
para
)
const
;
};
}
// namespace operators
...
...
src/operators/kernel/conv_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -32,6 +32,7 @@ template <typename DeviceType, typename T>
class
ConvKernel
:
public
OpKernelBase
<
DeviceType
,
ConvParam
>
{
public:
void
Compute
(
const
ConvParam
&
param
)
const
;
bool
Init
(
const
ConvParam
&
para
)
const
;
};
inline
bool
IsExpand
(
const
std
::
vector
<
int64_t
>
&
filter_dim
,
...
...
src/operators/kernel/depthwise_conv_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -31,6 +31,7 @@ template <typename DeviceType, typename T>
class
DepthwiseConvKernel
:
public
OpKernelBase
<
DeviceType
,
ConvParam
>
{
public:
void
Compute
(
const
ConvParam
&
param
)
const
;
bool
Init
(
const
ConvParam
&
para
)
const
;
};
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/elementwise_add_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -30,6 +30,7 @@ class ElementwiseAddKernel
:
public
framework
::
OpKernelBase
<
DeviceType
,
ElementwiseAddParam
>
{
public:
void
Compute
(
const
ElementwiseAddParam
&
param
)
const
;
bool
Init
(
const
ElementwiseAddParam
&
para
)
const
;
};
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/fpga/conv_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -19,6 +19,11 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
ConvKernel
<
FPGA
,
float
>::
Init
(
const
ConvParam
&
para
)
const
{
return
true
;
}
template
<
>
void
ConvKernel
<
FPGA
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{}
template
class
ConvKernel
<
FPGA
,
float
>;
...
...
src/operators/kernel/fusion_fc_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -28,6 +28,7 @@ class FusionFcKernel
:
public
framework
::
OpKernelBase
<
DeviceType
,
FusionFcParam
>
{
public:
void
Compute
(
const
FusionFcParam
&
param
)
const
;
bool
Init
(
const
FusionFcParam
&
para
)
const
;
};
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/lrn_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -170,6 +170,7 @@ template <typename DeviceType, typename T>
class
LrnKernel
:
public
framework
::
OpKernelBase
<
DeviceType
,
LrnParam
>
{
public:
void
Compute
(
const
LrnParam
&
param
)
const
;
bool
Init
(
const
LrnParam
&
para
)
const
;
};
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/mali/batchnorm_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -20,6 +20,11 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
BatchNormKernel
<
GPU_MALI
,
float
>::
Init
(
const
BatchNormParam
&
para
)
const
{
return
true
;
}
template
<
>
void
BatchNormKernel
<
GPU_MALI
,
float
>::
Compute
(
const
BatchNormParam
&
param
)
const
{}
...
...
src/operators/kernel/mali/conv_kernel.cpp
浏览文件 @
f2423c2d
...
...
@@ -19,6 +19,11 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
ConvKernel
<
GPU_MALI
,
float
>::
Init
(
const
ConvParam
&
para
)
const
{
return
true
;
}
template
<
>
void
ConvKernel
<
GPU_MALI
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{
// ArmConvImplement imp;
...
...
src/operators/kernel/mul_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -29,6 +29,7 @@ template <typename DeviceType, typename T>
class
MulKernel
:
public
framework
::
OpKernelBase
<
DeviceType
,
MulParam
>
{
public:
void
Compute
(
const
MulParam
&
param
)
const
;
bool
Init
(
const
MulParam
&
para
)
const
;
};
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/multiclass_nms_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -28,6 +28,7 @@ class MultiClassNMSKernel
:
public
framework
::
OpKernelBase
<
DeviceType
,
MultiClassNMSParam
>
{
public:
void
Compute
(
const
MultiClassNMSParam
&
param
)
const
;
bool
Init
(
const
MultiClassNMSParam
&
para
)
const
;
};
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/pool_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -28,6 +28,7 @@ template <typename DeviceType, typename T>
class
PoolKernel
:
public
OpKernelBase
<
DeviceType
,
PoolParam
>
{
public:
void
Compute
(
const
PoolParam
&
param
)
const
override
;
bool
Init
(
const
PoolParam
&
para
)
const
;
};
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/prior_box_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -55,6 +55,7 @@ class PriorBoxKernel
:
public
framework
::
OpKernelBase
<
DeviceType
,
PriorBoxParam
>
{
public:
void
Compute
(
const
PriorBoxParam
&
param
)
const
;
bool
Init
(
const
PriorBoxParam
&
para
)
const
;
};
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/relu_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -27,6 +27,7 @@ template <typename DeviceType, typename T>
class
ReluKernel
:
public
framework
::
OpKernelBase
<
DeviceType
,
ReluParam
>
{
public:
void
Compute
(
const
ReluParam
&
param
)
const
;
bool
Init
(
const
ReluParam
&
para
)
const
;
};
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/reshape_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -71,6 +71,7 @@ template <typename DeviceType, typename T>
class
ReshapeKernel
:
public
framework
::
OpKernelBase
<
DeviceType
,
ReshapeParam
>
{
public:
void
Compute
(
const
ReshapeParam
&
param
)
const
;
bool
Init
(
const
ReshapeParam
&
para
)
const
;
};
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/sigmoid_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -26,6 +26,7 @@ template <typename DeviceType, typename T>
class
SigmoidKernel
:
public
OpKernelBase
<
DeviceType
,
SigmoidParam
>
{
public:
void
Compute
(
const
SigmoidParam
&
param
)
const
override
;
bool
Init
(
const
SigmoidParam
&
para
)
const
;
};
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/softmax_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -29,6 +29,7 @@ template <typename DeviceType, typename T>
class
SoftmaxKernel
:
public
OpKernelBase
<
DeviceType
,
SoftmaxParam
>
{
public:
void
Compute
(
const
SoftmaxParam
&
param
)
const
override
;
bool
Init
(
const
SoftmaxParam
&
para
)
const
;
};
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/transpose_kernel.h
浏览文件 @
f2423c2d
...
...
@@ -29,6 +29,7 @@ class TransposeKernel
:
public
framework
::
OpKernelBase
<
DeviceType
,
TransposeParam
>
{
public:
void
Compute
(
const
TransposeParam
&
param
)
const
;
bool
Init
(
const
TransposeParam
&
para
)
const
;
};
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/math/gemm.cpp
浏览文件 @
f2423c2d
...
...
@@ -26,12 +26,12 @@ alignas(64) float packedA[MC * KC];
alignas
(
64
)
float
packedB
[
KC
*
NC
];
alignas
(
64
)
float
ab
[
MR
*
NR
];
// 将A矩阵分块复制到连续内存(ColMajor)
void
PackMatrixA
(
int
m
,
int
k
,
int
paddingM
,
const
float
*
A
,
int
lda
,
void
PackMatrixA
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
)
{
int
i
,
j
;
const
float
*
Aij
;
for
(
i
=
0
;
i
<
m
-
paddingM
;
i
+=
MR
)
{
for
(
int
j
=
0
;
j
<
k
;
++
j
)
{
for
(
i
=
0
;
i
<
m
-
m_tail
;
i
+=
MR
)
{
for
(
j
=
0
;
j
<
k
;
++
j
)
{
Aij
=
&
A
(
i
,
j
);
*
buffer
++
=
*
Aij
;
*
buffer
++
=
*
(
Aij
+
1
);
...
...
@@ -39,13 +39,13 @@ void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
*
buffer
++
=
*
(
Aij
+
3
);
}
}
if
(
paddingM
!=
0
)
{
if
(
m_tail
!=
0
)
{
for
(
j
=
0
;
j
<
k
;
++
j
)
{
Aij
=
&
A
(
m
-
paddingM
,
j
);
for
(
i
=
0
;
i
<
paddingM
;
++
i
)
{
Aij
=
&
A
(
m
-
m_tail
,
j
);
for
(
i
=
0
;
i
<
m_tail
;
++
i
)
{
*
buffer
++
=
*
(
Aij
+
i
);
}
for
(
i
=
paddingM
;
i
<
MR
;
++
i
)
{
for
(
i
=
m_tail
;
i
<
MR
;
++
i
)
{
*
buffer
++
=
0
;
}
}
...
...
@@ -53,11 +53,11 @@ void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
}
// 将A矩阵分块复制到连续内存(RowMajor)
void
PackMatrixA_
(
int
m
,
int
k
,
int
paddingM
,
const
float
*
A
,
int
lda
,
void
PackMatrixA_
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
)
{
int
i
,
j
;
const
float
*
Ai
,
*
Ai1
,
*
Ai2
,
*
Ai3
;
for
(
i
=
0
;
i
<
m
-
paddingM
;
i
+=
MR
)
{
for
(
i
=
0
;
i
<
m
-
m_tail
;
i
+=
MR
)
{
Ai
=
&
A
(
i
,
0
);
Ai1
=
&
A
(
i
+
1
,
0
);
Ai2
=
&
A
(
i
+
2
,
0
);
...
...
@@ -69,12 +69,12 @@ void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
*
buffer
++
=
*
Ai3
++
;
}
}
if
(
paddingM
!=
0
)
{
if
(
m_tail
!=
0
)
{
for
(
j
=
0
;
j
<
k
;
++
j
)
{
for
(
i
=
m
-
paddingM
;
i
<
m
;
++
i
)
{
for
(
i
=
m
-
m_tail
;
i
<
m
;
++
i
)
{
*
buffer
++
=
A
(
i
,
j
);
}
for
(
i
=
m
;
i
<
m
+
(
MR
-
paddingM
);
++
i
)
{
for
(
i
=
m
;
i
<
m
+
(
MR
-
m_tail
);
++
i
)
{
*
buffer
++
=
0
;
}
}
...
...
@@ -82,11 +82,11 @@ void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
}
// 将B矩阵分块复制到连续内存(ColMajor)
void
PackMatrixB
(
int
k
,
int
n
,
int
paddingN
,
const
float
*
B
,
int
ldb
,
void
PackMatrixB
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
)
{
int
i
,
j
;
const
float
*
Bj
,
*
Bj1
,
*
Bj2
,
*
Bj3
;
for
(
j
=
0
;
j
<
n
-
paddingN
;
j
+=
NR
)
{
for
(
j
=
0
;
j
<
n
-
n_tail
;
j
+=
NR
)
{
Bj
=
&
B
(
0
,
j
);
Bj1
=
&
B
(
0
,
j
+
1
);
Bj2
=
&
B
(
0
,
j
+
2
);
...
...
@@ -98,12 +98,12 @@ void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
*
buffer
++
=
*
Bj3
++
;
}
}
if
(
paddingN
!=
0
)
{
if
(
n_tail
!=
0
)
{
for
(
i
=
0
;
i
<
k
;
++
i
)
{
for
(
int
j
=
n
-
paddingN
;
j
<
n
;
++
j
)
{
for
(
int
j
=
n
-
n_tail
;
j
<
n
;
++
j
)
{
*
buffer
++
=
B
(
i
,
j
);
}
for
(
int
j
=
n
;
j
<
n
+
(
NR
-
paddingN
);
++
j
)
{
for
(
int
j
=
n
;
j
<
n
+
(
NR
-
n_tail
);
++
j
)
{
*
buffer
++
=
0
;
}
}
...
...
@@ -111,11 +111,11 @@ void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
}
// 将B矩阵分块复制到连续内存(RowMajor)
void
PackMatrixB_
(
int
k
,
int
n
,
int
paddingN
,
const
float
*
B
,
int
ldb
,
void
PackMatrixB_
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
)
{
int
i
,
j
;
const
float
*
Bij
;
for
(
j
=
0
;
j
<
n
-
paddingN
;
j
+=
NR
)
{
for
(
j
=
0
;
j
<
n
-
n_tail
;
j
+=
NR
)
{
for
(
i
=
0
;
i
<
k
;
++
i
)
{
Bij
=
&
B
(
i
,
j
);
asm
volatile
(
...
...
@@ -126,13 +126,13 @@ void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
:
"memory"
,
"q0"
);
}
}
if
(
paddingN
!=
0
)
{
if
(
n_tail
!=
0
)
{
for
(
i
=
0
;
i
<
k
;
++
i
)
{
Bij
=
&
B
(
i
,
n
-
paddingN
);
for
(
int
j
=
n
-
paddingN
;
j
<
n
;
++
j
)
{
Bij
=
&
B
(
i
,
n
-
n_tail
);
for
(
int
j
=
n
-
n_tail
;
j
<
n
;
++
j
)
{
*
buffer
++
=
*
Bij
++
;
}
for
(
int
j
=
n
;
j
<
n
+
(
NR
-
paddingN
);
++
j
)
{
for
(
int
j
=
n
;
j
<
n
+
(
NR
-
n_tail
);
++
j
)
{
*
buffer
++
=
0
;
}
}
...
...
@@ -143,33 +143,25 @@ void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
void
InnerKernel
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
int
first_time
)
{
int
Buff_A_M
=
m
;
int
Buff_B_N
=
n
;
int
m_block
=
(
m
+
MR
-
1
)
/
MR
*
MR
;
int
n_block
=
(
n
+
NR
-
1
)
/
NR
*
NR
;
int
_mc
=
m
%
MR
;
int
_nc
=
n
%
NR
;
if
(
_mc
!=
0
)
{
Buff_A_M
=
m
+
(
MR
-
_mc
);
}
if
(
_nc
!=
0
)
{
Buff_B_N
=
n
+
(
NR
-
_nc
);
}
int
m_tail
=
m
%
MR
;
int
n_tail
=
n
%
NR
;
if
(
first_time
)
{
PackMatrixB_
(
k
,
n
,
_nc
,
B
,
ldb
,
packedB
);
PackMatrixB_
(
k
,
n
,
n_tail
,
B
,
ldb
,
packedB
);
}
PackMatrixA_
(
m
,
k
,
_mc
,
A
,
lda
,
packedA
);
PackMatrixA_
(
m
,
k
,
m_tail
,
A
,
lda
,
packedA
);
int
i
,
j
,
mc
,
nc
;
// B 取 4 列, 打包预热
for
(
j
=
0
;
j
<
Buff_B_N
;
j
+=
NR
)
{
nc
=
(
n
-
j
)
<
NR
?
_nc
:
NR
;
for
(
j
=
0
;
j
<
n_block
;
j
+=
NR
)
{
nc
=
(
n
-
j
)
<
NR
?
n_tail
:
NR
;
// A 取 4 行,打包预热
for
(
i
=
0
;
i
<
Buff_A_M
;
i
+=
MR
)
{
mc
=
(
m
-
i
)
<
MR
?
_mc
:
MR
;
for
(
i
=
0
;
i
<
m_block
;
i
+=
MR
)
{
mc
=
(
m
-
i
)
<
MR
?
m_tail
:
MR
;
AddDot4x4
(
k
,
alpha
,
&
packedA
[
i
*
k
],
4
,
&
packedB
[
j
*
k
],
k
,
beta
,
&
C
(
i
,
j
),
ldc
,
mc
,
nc
);
}
...
...
@@ -180,36 +172,25 @@ void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
void
InnerKernel_relu
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
int
first_time
,
bool
relu
=
false
)
{
int
Buff_A_M
=
m
;
int
Buff_B_N
=
n
;
int
_mc
=
m
%
MR
;
int
_nc
=
n
%
NR
;
int
m_block
=
(
m
+
MR
-
1
)
/
MR
*
MR
;
int
n_block
=
(
n
+
NR
-
1
)
/
NR
*
NR
;
if
(
_mc
!=
0
)
{
Buff_A_M
=
m
+
(
MR
-
_mc
);
}
if
(
_nc
!=
0
)
{
Buff_B_N
=
n
+
(
NR
-
_nc
);
}
float
packedA
[
MC
*
KC
];
static
float
packedB
[
KC
*
NC
];
int
m_tail
=
m
%
MR
;
int
n_tail
=
n
%
NR
;
if
(
first_time
)
{
PackMatrixB_
(
k
,
n
,
_nc
,
B
,
ldb
,
packedB
);
PackMatrixB_
(
k
,
n
,
n_tail
,
B
,
ldb
,
packedB
);
}
PackMatrixA_
(
m
,
k
,
_mc
,
A
,
lda
,
packedA
);
PackMatrixA_
(
m
,
k
,
m_tail
,
A
,
lda
,
packedA
);
int
i
,
j
,
mc
,
nc
;
// B 取 4 列, 打包预热
for
(
j
=
0
;
j
<
Buff_B_N
;
j
+=
NR
)
{
nc
=
(
n
-
j
)
<
NR
?
_nc
:
NR
;
for
(
j
=
0
;
j
<
n_block
;
j
+=
NR
)
{
nc
=
(
n
-
j
)
<
NR
?
n_tail
:
NR
;
// A 取 4 行,打包预热
for
(
i
=
0
;
i
<
Buff_A_M
;
i
+=
MR
)
{
mc
=
(
m
-
i
)
<
MR
?
_mc
:
MR
;
for
(
i
=
0
;
i
<
m_block
;
i
+=
MR
)
{
mc
=
(
m
-
i
)
<
MR
?
m_tail
:
MR
;
AddDot4x4_relu
(
k
,
alpha
,
&
packedA
[
i
*
k
],
4
,
&
packedB
[
j
*
k
],
k
,
beta
,
&
C
(
i
,
j
),
ldc
,
mc
,
nc
,
relu
);
}
...
...
@@ -375,12 +356,15 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
"subs %[kc2], %[kc2], #1
\n\t
"
"blt end_kc2_%=
\n\t
"
"loop_kc2_%=:
\n\t
"
"vld1.32 {q0}, [%[a]]!
\n\t
"
"vld1.32 {q1}, [%[b]]!
\n\t
"
"vmla.f32 q10, q1, d0[0]
\n\t
"
"vmla.f32 q11, q1, d0[1]
\n\t
"
"vmla.f32 q12, q1, d1[0]
\n\t
"
"vmla.f32 q13, q1, d1[1]
\n\t
"
"subs %[kc2], %[kc2], #1
\n\t
"
"bge loop_kc2_%=
\n\t
"
"end_kc2_%=:
\n\t
"
"cmp %[mc], #4
\n\t
"
...
...
@@ -525,12 +509,15 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
"subs %[kc2], %[kc2], #1
\n\t
"
"blt end_kc2_%=
\n\t
"
"loop_kc2_%=:
\n\t
"
"vld1.32 {q0}, [%[a]]!
\n\t
"
"vld1.32 {q1}, [%[b]]!
\n\t
"
"vmla.f32 q10, q1, d0[0]
\n\t
"
"vmla.f32 q11, q1, d0[1]
\n\t
"
"vmla.f32 q12, q1, d1[0]
\n\t
"
"vmla.f32 q13, q1, d1[1]
\n\t
"
"subs %[kc2], %[kc2], #1
\n\t
"
"bge loop_kc2_%=
\n\t
"
"end_kc2_%=:
\n\t
"
"cmp %[mc], #4
\n\t
"
...
...
@@ -578,10 +565,10 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
"vmla.f32 q13, q3, d8[1]
\n\t
"
"memory_%=:
\n\t
"
"vmax.f32 q10, q10, q14
\n\t
"
"vmax.f32 q11, q11, q14
\n\t
"
"vmax.f32 q12, q12, q14
\n\t
"
"vmax.f32 q13, q13, q14
\n\t
"
"vmax.f32 q10, q10, q14
\n\t
"
"vmax.f32 q11, q11, q14
\n\t
"
"vmax.f32 q12, q12, q14
\n\t
"
"vmax.f32 q13, q13, q14
\n\t
"
"mov r5, %[C]
\n\t
"
"mov r6, %[bytes_ldc]
\n\t
"
"vst1.32 {q10}, [r5], r6
\n\t
"
...
...
@@ -599,7 +586,8 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
[
kc2
]
"r"
(
kc2
),
[
mc
]
"r"
(
mc
),
[
nc
]
"r"
(
nc
),
[
alpha
]
"r"
(
alpha
),
[
beta
]
"r"
(
beta
),
[
bytes_ldc
]
"r"
(
bytes_ldc
),
[
flag_alpha
]
"r"
(
flag_alpha
),
[
flag_beta
]
"r"
(
flag_beta
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
);
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
,
"q14"
);
if
(
mc
!=
MR
||
nc
!=
NR
)
{
int
i
,
j
;
...
...
src/operators/math/gemm.h
浏览文件 @
f2423c2d
...
...
@@ -33,19 +33,19 @@ namespace operators {
namespace
math
{
// 将 A 矩阵分块复制到连续内存(ColMajor)
void
PackMatrixA
(
int
m
,
int
k
,
int
paddingM
,
const
float
*
A
,
int
lda
,
void
PackMatrixA
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
// 将 B 矩阵分块复制到连续内存(ColMajor)
void
PackMatrixB
(
int
k
,
int
n
,
int
paddingN
,
const
float
*
B
,
int
ldb
,
void
PackMatrixB
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
// 将 A 矩阵分块复制到连续内存(RowMajor)
void
PackMatrixA_
(
int
m
,
int
k
,
int
paddingM
,
const
float
*
A
,
int
lda
,
void
PackMatrixA_
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
// 将 B 矩阵分块复制到连续内存(RowMajor)
void
PackMatrixB_
(
int
k
,
int
n
,
int
paddingN
,
const
float
*
B
,
int
ldb
,
void
PackMatrixB_
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
// 分块矩阵乘法
...
...
test/common/test_gemm.cpp
浏览文件 @
f2423c2d
...
...
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "common/log.h"
#include "memory/t_malloc.h"
#include "operators/math/gemm.h"
#define a(i, j) a[(i)*lda + (j)]
...
...
@@ -29,10 +31,15 @@ int main() {
int
ldb
=
n
;
int
ldc
=
n
;
float
a
[
62
*
74
];
float
b
[
74
*
63
];
float
c
[
62
*
63
]
=
{
0
};
float
c1
[
62
*
63
]
=
{
0
};
float
*
a
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
m
*
k
));
float
*
b
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
k
*
n
));
float
*
c
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
m
*
n
));
float
*
c1
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
m
*
n
));
for
(
int
i
=
0
;
i
<
m
*
k
;
++
i
)
{
a
[
i
]
=
2
;
}
...
...
@@ -44,8 +51,11 @@ int main() {
c1
[
i
]
=
2
;
}
auto
time1
=
time
();
paddle_mobile
::
operators
::
math
::
sgemm
(
m
,
n
,
k
,
0.9
,
a
,
lda
,
b
,
ldb
,
0.3
,
c
,
ldc
);
auto
time2
=
time
();
DLOG
<<
"gemm cost :"
<<
time_diff
(
time1
,
time2
)
<<
"ms
\n
"
;
for
(
int
i
=
0
;
i
<
m
*
n
;
++
i
)
{
std
::
cout
<<
c
[
i
]
<<
" | "
;
if
(
i
%
n
==
(
n
-
1
))
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录