Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
d333db62
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
d333db62
编写于
11月 12, 2018
作者:
X
xiebaiyuan
提交者:
GitHub
11月 12, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into develop
上级
aeb6d50e
d487a285
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
399 addition
and
19 deletion
+399
-19
src/io/api_paddle_mobile.cc
src/io/api_paddle_mobile.cc
+4
-0
src/io/api_paddle_mobile.h
src/io/api_paddle_mobile.h
+2
-0
src/io/paddle_inference_api.h
src/io/paddle_inference_api.h
+1
-1
src/io/paddle_mobile.cpp
src/io/paddle_mobile.cpp
+242
-1
src/io/paddle_mobile.h
src/io/paddle_mobile.h
+3
-0
src/operators/kernel/cl/cl_kernel/feed_kernel.cl
src/operators/kernel/cl/cl_kernel/feed_kernel.cl
+9
-1
src/operators/kernel/cl/feed_kernel.cpp
src/operators/kernel/cl/feed_kernel.cpp
+3
-0
src/operators/math/gemm.cpp
src/operators/math/gemm.cpp
+12
-12
test/net/test_yologpu.cpp
test/net/test_yologpu.cpp
+123
-4
未找到文件。
src/io/api_paddle_mobile.cc
浏览文件 @
d333db62
...
...
@@ -52,6 +52,10 @@ bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
paddle_mobile_
->
SetThreadNum
(
config
.
thread_num
);
return
true
;
}
/**
 * Forwards to PaddleMobile::GetPredictTime() on the wrapped engine and
 * returns its result unchanged.
 *
 * @return the value produced by paddle_mobile_->GetPredictTime().
 */
template <typename Dtype, Precision P>
double PaddleMobilePredictor<Dtype, P>::CaculatePredictTime() {
  return paddle_mobile_->GetPredictTime();
}
template
<
typename
Dtype
,
Precision
P
>
bool
PaddleMobilePredictor
<
Dtype
,
P
>::
Run
(
...
...
src/io/api_paddle_mobile.h
浏览文件 @
d333db62
...
...
@@ -40,6 +40,8 @@ class PaddleMobilePredictor : public PaddlePredictor {
std
::
vector
<
PaddleTensor
>*
output_data
,
int
batch_size
=
-
1
)
override
;
double
CaculatePredictTime
()
override
;
~
PaddleMobilePredictor
()
override
;
private:
...
...
src/io/paddle_inference_api.h
浏览文件 @
d333db62
...
...
@@ -98,7 +98,7 @@ class PaddlePredictor {
virtual
bool
Run
(
const
std
::
vector
<
PaddleTensor
>&
inputs
,
std
::
vector
<
PaddleTensor
>*
output_data
,
int
batch_size
=
-
1
)
=
0
;
  // Returns a timing measurement as a double. Every concrete predictor must
  // implement it (pure virtual).
  // NOTE(review): units are presumably milliseconds, as produced by
  // paddle_mobile::time_diff() - confirm against that helper.
  virtual double CaculatePredictTime() = 0;
// Destroy the Predictor.
virtual
~
PaddlePredictor
()
=
default
;
...
...
src/io/paddle_mobile.cpp
浏览文件 @
d333db62
...
...
@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "io/paddle_mobile.h"
#ifdef PADDLE_MOBILE_CL
#include <CL/cl.h>
#include "framework/cl/cl_tensor.h"
#endif
#include "common/common.h"
#include "operators/math/gemm.h"
namespace
paddle_mobile
{
static
std
::
mutex
lc
;
...
...
@@ -119,6 +124,40 @@ void PaddleMobile<Dtype, P>::Clear() {
loader_
=
nullptr
;
}
/**
 * Times a single fixed-size Sgemm call (32 x 27 times 27 x 224*224) and
 * returns the elapsed time reported by paddle_mobile::time_diff().
 *
 * The operand matrices are filled element by element before the clock is
 * started; only the Sgemm call itself is inside the timed interval.
 */
template <typename Dtype, Precision P>
double PaddleMobile<Dtype, P>::GetPredictTime() {
  // Problem size: C(rows x cols) = A(rows x depth) * B(depth x cols).
  const int rows = 32;
  const int cols = 224 * 224;
  const int depth = 27;
  const int a_stride = depth;
  const int b_stride = cols;
  const int c_stride = cols;

  float *lhs = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * rows * depth));
  float *rhs = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * depth * cols));
  float *out = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * rows * cols));

  // With span == 1 the remainder is always 0, so every element becomes
  // `base`; rand() is still called once per element, A first and then B.
  const int base = 1;
  const int span = 1;
  for (float *p = lhs, *last = lhs + rows * depth; p != last; ++p) {
    *p = base + rand() % span;
  }
  for (float *p = rhs, *last = rhs + depth * cols; p != last; ++p) {
    *p = base + rand() % span;
  }

  paddle_mobile::operators::math::Gemm gemm;

  const auto started = paddle_mobile::time();
  gemm.Sgemm(rows, cols, depth, 1.0f, lhs, a_stride, rhs, b_stride, 0.0f, out,
             c_stride, false, nullptr);
  const auto finished = paddle_mobile::time();
  const double elapsed = paddle_mobile::time_diff(started, finished);

  paddle_mobile::memory::Free(lhs);
  paddle_mobile::memory::Free(rhs);
  paddle_mobile::memory::Free(out);
  return elapsed;
}
template
<
typename
Dtype
,
Precision
P
>
PaddleMobile
<
Dtype
,
P
>::~
PaddleMobile
()
{
executor_
=
nullptr
;
...
...
@@ -167,6 +206,208 @@ void PaddleMobile<Dtype, P>::SetCLPath(std::string path) {
framework
::
CLEngine
::
Instance
()
->
setClPath
(
path
);
}
}
/**
 * Times one "conv_3x3" OpenCL kernel launch on the first GPU device of the
 * first OpenCL platform and returns the elapsed time reported by
 * paddle_mobile::time_diff().
 *
 * Steps:
 *   1. pick platform 0 / its GPU devices and load both kernel sources;
 *   2. run the "feed" kernel twice to fill the input image (1x3x224x224) and
 *      the filter image (32x3x3x3) from host buffers;
 *   3. time a single launch of "conv_3x3" (global size 8 x 224 x 224),
 *      including the clFinish() that waits for it.
 *
 * The host input/filter buffers are not initialised; only the execution time
 * is of interest, not the numerical result.
 *
 * @return elapsed time of the convolution launch, or -1 when no OpenCL
 *         platform / GPU device is available, a host allocation fails, or a
 *         kernel source file cannot be read.
 */
template <>
double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() {
  cl_int status;

  // ---- Platform / device discovery --------------------------------------
  cl_uint nPlatform = 0;
  clGetPlatformIDs(0, NULL, &nPlatform);
  if (nPlatform == 0) {
    return -1;  // nothing to index below
  }
  cl_platform_id *listPlatform = static_cast<cl_platform_id *>(
      malloc(nPlatform * sizeof(cl_platform_id)));
  if (listPlatform == NULL) {
    return -1;
  }
  clGetPlatformIDs(nPlatform, listPlatform, NULL);

  cl_uint nDevice = 0;
  clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice);
  cl_device_id *listDevice =
      nDevice == 0 ? NULL
                   : static_cast<cl_device_id *>(
                         malloc(nDevice * sizeof(cl_device_id)));
  if (listDevice == NULL) {
    free(listPlatform);
    return -1;
  }
  clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, nDevice, listDevice,
                 NULL);

  // ---- Kernel sources ----------------------------------------------------
  // Both files are read before any OpenCL object is created so that a missing
  // file only requires releasing host memory.
  char *feed_code = NULL;
  char *conv_code = NULL;
  std::string path = framework::CLEngine::Instance()->GetCLPath() +
                     "/cl_kernel/feed_kernel.cl";
  const int feed_len = readText(path.c_str(), &feed_code);
  int conv_len = -1;
  if (feed_len >= 0) {
    path = framework::CLEngine::Instance()->GetCLPath() +
           "/cl_kernel/conv_kernel.cl";
    conv_len = readText(path.c_str(), &conv_code);
  }
  if (feed_len < 0 || conv_len < 0) {
    // On failure readText leaves the corresponding pointer NULL.
    free(feed_code);
    free(conv_code);
    free(listDevice);
    free(listPlatform);
    return -1;
  }
  // readText reports the byte count including the trailing NUL; it is passed
  // to OpenCL unchanged, as before.
  size_t length = static_cast<size_t>(feed_len);
  size_t length1 = static_cast<size_t>(conv_len);

  // ---- Context and first queue ------------------------------------------
  cl_context context =
      clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
  cl_command_queue queue =
      clCreateCommandQueue(context, listDevice[0], 0, &status);

  // ---- Problem description ----------------------------------------------
  int n = 1;
  int c = 3;
  int h = 224;
  int w = 224;
  float *input = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * 3 * 224 * 224));
  float *filter = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * 32 * 27));

  // Image extents: four channels are packed into one RGBA texel.
  int input_w = w * (c + 3) / 4;
  int input_h = n * h;
  int filter_w = 3 * (3 + 3) / 4;
  int filter_h = 32 * 3;
  int output_w = 224 * (32 + 3) / 4;
  int output_h = 1 * 224;

  framework::DDim input_dims = {1, 3, 224, 224};
  framework::CLTensor input_cl_tensor(context, queue);
  input_cl_tensor.Resize(input_dims);
  cl_mem inputBuffer = input_cl_tensor.mutable_with_data<float>(input);

  framework::DDim filter_dims = {32, 3, 3, 3};
  framework::CLTensor filter_cl_tensor(context, queue);
  // Fixed: the filter tensor (not the input tensor) takes the filter dims.
  filter_cl_tensor.Resize(filter_dims);
  cl_mem filterBuffer = filter_cl_tensor.mutable_with_data<float>(filter);

  cl_image_format cf;
  cf.image_channel_order = CL_RGBA;
  cf.image_channel_data_type = CL_HALF_FLOAT;
  cl_mem cl_input_image =
      clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf, input_w, input_h, 0,
                      NULL, &status);
  cl_mem cl_filter_image =
      clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf, filter_w, filter_h,
                      0, NULL, &status);
  cl_mem cl_output_image =
      clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf, output_w, output_h,
                      0, NULL, &status);

  // ---- Phase 1: "feed" kernel fills the input and filter images ----------
  const char *feed_src = feed_code;
  cl_program program =
      clCreateProgramWithSource(context, 1, &feed_src, &length, NULL);
  std::string path1 = "-cl-fast-relaxed-math -I " +
                      framework::CLEngine::Instance()->GetCLPath() +
                      "/cl_kernel";
  clBuildProgram(program, 0, 0, path1.c_str(), NULL, NULL);
  cl_kernel kernel = clCreateKernel(program, "feed", &status);
  CL_CHECK_ERRORS(status);

  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_input_image);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 2, sizeof(cl_int), &input_w);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 3, sizeof(cl_int), &input_h);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c);
  CL_CHECK_ERRORS(status);

  size_t global_work_size[2] = {static_cast<size_t>(input_w),
                                static_cast<size_t>(input_h)};
  status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size,
                                  NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);

  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &filterBuffer);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_filter_image);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 2, sizeof(cl_int), &filter_w);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 3, sizeof(cl_int), &filter_h);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c);
  CL_CHECK_ERRORS(status);

  size_t global_work_size1[2] = {static_cast<size_t>(filter_w),
                                 static_cast<size_t>(filter_h)};
  status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size1,
                                  NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);

  clFinish(queue);

  // Both feed launches have completed; the feed kernel/program handles are
  // about to be overwritten, so release them now instead of leaking them.
  clReleaseKernel(kernel);
  clReleaseProgram(program);

  // ---- Phase 2: timed "conv_3x3" launch on a second queue ----------------
  // NOTE(review): `context` and the first `queue` were handed to the CLTensor
  // objects above, whose destructors run after this function's last
  // statement. They are deliberately not released here - confirm CLTensor's
  // ownership rules before adding clReleaseContext/clReleaseCommandQueue.
  cl_command_queue conv_queue =
      clCreateCommandQueue(context, listDevice[0], 0, &status);

  const char *conv_src = conv_code;
  program = clCreateProgramWithSource(context, 1, &conv_src, &length1, &status);
  CL_CHECK_ERRORS(status);
  clBuildProgram(program, 0, 0, path1.c_str(), NULL, NULL);
  kernel = clCreateKernel(program, "conv_3x3", &status);
  CL_CHECK_ERRORS(status);

  int c_block = (32 + 3) / 4;
  int nh = n * h;
  int stride = 1;
  int offset = 0;
  int input_c = (c + 3) / 4;
  int dilation = 1;
  int input_width = 224;
  int input_height = 224;
  int output_width = 224;
  int output_height = 224;

  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &cl_input_image);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &cl_filter_image);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &cl_output_image);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
  CL_CHECK_ERRORS(status);

  size_t global_work_size2[3] = {8, 224, 224};

  // Timed interval: enqueue + wait for completion.
  auto time1 = paddle_mobile::time();
  status = clEnqueueNDRangeKernel(conv_queue, kernel, 3, NULL,
                                  global_work_size2, NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);
  clFinish(conv_queue);
  auto time2 = paddle_mobile::time();

  // ---- Cleanup of everything created and used only in this function ------
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseMemObject(cl_input_image);
  clReleaseMemObject(cl_filter_image);
  clReleaseMemObject(cl_output_image);
  clReleaseCommandQueue(conv_queue);
  free(feed_code);
  free(conv_code);
  free(listDevice);
  free(listPlatform);

  paddle_mobile::memory::Free(input);
  paddle_mobile::memory::Free(filter);
  return paddle_mobile::time_diff(time1, time2);
}
/**
 * Reads the whole file at `kernelPath` into a newly malloc'ed, NUL-terminated
 * buffer and stores the buffer in `*pcode`.
 *
 * @param kernelPath path of the text file to read (opened in binary mode).
 * @param pcode      receives the buffer on success; the caller releases it
 *                   with free(). On failure it is either left untouched or
 *                   set to NULL, never left pointing at a live allocation.
 * @return file size in bytes plus one (the trailing NUL is counted), or -1
 *         when the file cannot be opened, measured, allocated for, or read.
 */
template <typename Dtype, Precision P>
int PaddleMobile<Dtype, P>::readText(const char *kernelPath, char **pcode) {
  FILE *fp = fopen(kernelPath, "rb");
  if (!fp) {
    printf("<readText> Open file failed\n");
    return -1;
  }

  // Every failure path below closes the stream before returning.
  if (fseek(fp, 0, SEEK_END) != 0) {
    printf("<readText> Seek end of file failed\n");
    fclose(fp);
    return -1;
  }

  const long end_pos = ftell(fp);
  if (end_pos < 0) {
    printf("<readText> Get file position failed\n");
    fclose(fp);
    return -1;
  }
  const int size = static_cast<int>(end_pos);
  rewind(fp);

  *pcode = static_cast<char *>(malloc(size + 1));
  if (*pcode == NULL) {
    printf("<readText> Allocate space failed\n");
    fclose(fp);
    return -1;
  }

  // A short read would leave part of the buffer uninitialised; treat it as
  // an error instead of handing back garbage.
  if (fread(*pcode, 1, size, fp) != static_cast<size_t>(size)) {
    printf("<readText> Read file failed\n");
    free(*pcode);
    *pcode = NULL;
    fclose(fp);
    return -1;
  }
  (*pcode)[size] = '\0';

  fclose(fp);
  return size + 1;
}
#endif
template
class
PaddleMobile
<
CPU
,
Precision
::
FP32
>;
...
...
src/io/paddle_mobile.h
浏览文件 @
d333db62
...
...
@@ -65,6 +65,7 @@ class PaddleMobile {
void
SetThreadNum
(
int
num
);
void
Clear
();
double
GetPredictTime
();
~
PaddleMobile
();
...
...
@@ -80,6 +81,8 @@ class PaddleMobile {
#ifdef PADDLE_MOBILE_CL
public:
void
SetCLPath
(
std
::
string
cl_path
);
int
readText
(
const
char
*
kernelPath
,
char
**
pcode
);
// 读取文本文件放入 pcode,返回字符串长度
#endif
private:
...
...
src/operators/kernel/cl/cl_kernel/feed_kernel.cl
浏览文件 @
d333db62
...
...
@@ -13,14 +13,22 @@ See the License for the specific language governing permissions and
limitations
under
the
License.
*/
#
pragma
OPENCL
EXTENSION
cl_khr_fp16
:
enable
__kernel
void
feed
(
__global
float
*in,
__write_only
image2d_t
outputImage,int
h,int
w
)
__kernel
void
feed
(
__global
float
*in,
__write_only
image2d_t
outputImage,int
h,int
w
,int
c
)
{
int
i
=
get_global_id
(
0
)
;
int
j
=
get_global_id
(
1
)
;
half4
pixel
;
pixel.x
=
convert_half
(
in[
(
i
*
w
+
j
)
]
)
;
if
(
c>=2
)
{
pixel.y
=
convert_half
(
in[h
*
w
+
(
i
*
w
+
j
)
]
)
;
}else{
pixel.y
=
0.0
;
}
if
(
c>=3
)
{
pixel.z
=
convert_half
(
in[2
*
h
*
w
+
(
i
*
w
+
j
)
]
)
;
}else{
pixel.z
=
0.0
;
}
pixel.w
=
0.0
;
int2
coords
;
coords.x
=
j
;
...
...
src/operators/kernel/cl/feed_kernel.cpp
浏览文件 @
d333db62
...
...
@@ -34,6 +34,7 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> ¶m) {
const
float
*
input_data
=
input
->
data
<
float
>
();
int
numel
=
input
->
numel
();
cl_mem
cl_image
=
output
->
GetCLImage
();
int
c
=
input
->
dims
()[
1
];
int
height
=
output
->
dims
()[
2
];
int
width
=
output
->
dims
()[
3
];
CLTensor
input_cl_tensor
(
this
->
cl_helper_
.
CLContext
(),
...
...
@@ -49,6 +50,8 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> ¶m) {
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
3
,
sizeof
(
cl_int
),
&
height
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
4
,
sizeof
(
cl_int
),
&
c
);
CL_CHECK_ERRORS
(
status
);
size_t
global_work_size
[
2
]
=
{
width
,
height
};
...
...
src/operators/math/gemm.cpp
浏览文件 @
d333db62
...
...
@@ -3230,6 +3230,8 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
int
L1
=
64
/
max_threads
*
1024
;
KC
=
k
;
zero
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
));
memset
(
static_cast
<
void
*>
(
zero
),
0
,
sizeof
(
float
)
*
KC
);
if
(
m
>
n
)
{
// 对 A 分块
MC
=
L1
/
(
KC
*
sizeof
(
float
));
...
...
@@ -3255,7 +3257,7 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
packedB
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
*
NC
));
(
*
this
.
*
procPackB
)(
KC
,
NC
,
NC
%
NR
,
B
,
ldb
,
packedB
);
(
*
this
.
*
procPackB
)(
KC
,
n
,
n
%
NR
,
B
,
ldb
,
packedB
);
packedA
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
KC
*
max_threads
));
}
else
{
...
...
@@ -3284,12 +3286,10 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
packedA
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
KC
));
(
*
this
.
*
procPackA
)(
MC
,
KC
,
MC
%
MR
,
A
,
lda
,
packedA
);
(
*
this
.
*
procPackA
)(
m
,
KC
,
m
%
MR
,
A
,
lda
,
packedA
);
packedB
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
*
NC
*
max_threads
));
}
zero
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
));
memset
(
static_cast
<
void
*>
(
zero
),
0
,
sizeof
(
float
)
*
KC
);
packedC
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
NC
*
max_threads
));
...
...
@@ -3352,6 +3352,8 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
int
L1
=
64
/
max_threads
*
1024
;
KC
=
k
;
zero
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
));
memset
(
static_cast
<
void
*>
(
zero
),
0
,
sizeof
(
float
)
*
KC
);
if
(
m
>
n
)
{
// 对 A 分块
MC
=
L1
/
(
KC
*
sizeof
(
float
));
...
...
@@ -3377,7 +3379,7 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
packedB
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
*
NC
));
(
*
this
.
*
procPackB
)(
KC
,
NC
,
NC
%
NR
,
B
,
ldb
,
packedB
);
(
*
this
.
*
procPackB
)(
KC
,
n
,
n
%
NR
,
B
,
ldb
,
packedB
);
packedA
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
KC
*
max_threads
));
}
else
{
...
...
@@ -3405,12 +3407,10 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
packedA
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
KC
));
(
*
this
.
*
procPackA
)(
MC
,
KC
,
MC
%
MR
,
A
,
lda
,
packedA
);
(
*
this
.
*
procPackA
)(
m
,
KC
,
m
%
MR
,
A
,
lda
,
packedA
);
packedB
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
*
NC
*
max_threads
));
}
zero
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
));
memset
(
static_cast
<
void
*>
(
zero
),
0
,
sizeof
(
float
)
*
KC
);
packedC
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
NC
*
max_threads
));
...
...
@@ -3480,6 +3480,8 @@ void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
int
L1
=
8
*
1024
;
KC
=
k
;
zero
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
));
memset
(
static_cast
<
void
*>
(
zero
),
0
,
sizeof
(
float
)
*
KC
);
if
(
m
>
n
)
{
// 对 A 分块
MC
=
L1
/
(
KC
*
sizeof
(
float
));
...
...
@@ -3505,7 +3507,7 @@ void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
packedB
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
*
NC
));
(
*
this
.
*
procPackB
)(
KC
,
NC
,
NC
%
NR
,
B
,
ldb
,
packedB
);
(
*
this
.
*
procPackB
)(
KC
,
n
,
n
%
NR
,
B
,
ldb
,
packedB
);
packedA
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
KC
*
max_threads
));
}
else
{
...
...
@@ -3533,12 +3535,10 @@ void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
packedA
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
KC
));
(
*
this
.
*
procPackA
)(
MC
,
KC
,
MC
%
MR
,
A
,
lda
,
packedA
);
(
*
this
.
*
procPackA
)(
m
,
KC
,
m
%
MR
,
A
,
lda
,
packedA
);
packedB
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
*
NC
*
max_threads
));
}
zero
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
));
memset
(
static_cast
<
void
*>
(
zero
),
0
,
sizeof
(
float
)
*
KC
);
packedC
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
NC
*
max_threads
));
...
...
test/net/test_yologpu.cpp
浏览文件 @
d333db62
...
...
@@ -13,17 +13,75 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <thread>
#include "../../src/common/types.h"
#include "../test_helper.h"
#include "../test_include.h"
void
t1
()
{
paddle_mobile
::
PaddleMobile
<
paddle_mobile
::
GPU_CL
>
paddle_mobile_gpu
;
paddle_mobile
::
PaddleMobile
<
paddle_mobile
::
CPU
>
paddle_mobile_cpu
;
// paddle_mobile.SetThreadNum(4);
#ifdef PADDLE_MOBILE_CL
paddle_mobile_gpu
.
SetCLPath
(
"/data/local/tmp/bin"
);
#endif
printf
(
"cpu time:%f
\n
"
,
paddle_mobile_cpu
.
GetPredictTime
());
printf
(
"gpu time:%f
\n
"
,
paddle_mobile_gpu
.
GetPredictTime
());
auto
time1
=
paddle_mobile
::
time
();
auto
isok
=
paddle_mobile_gpu
.
Load
(
std
::
string
(
g_yolo_mul
)
+
"/model"
,
std
::
string
(
g_yolo_mul
)
+
"/params"
,
true
);
int
main
()
{
// auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true);
if
(
isok
)
{
auto
time2
=
paddle_mobile
::
time
();
std
::
cout
<<
"load cost :"
<<
paddle_mobile
::
time_diff
(
time1
,
time2
)
<<
"ms"
<<
std
::
endl
;
std
::
vector
<
float
>
input
;
std
::
vector
<
int64_t
>
dims
{
1
,
3
,
416
,
416
};
GetInput
<
float
>
(
g_yolo_img
,
&
input
,
dims
);
std
::
vector
<
float
>
vec_result
;
// = paddle_mobile.Predict(input, dims);
auto
time3
=
paddle_mobile
::
time
();
int
max
=
10
;
for
(
int
i
=
0
;
i
<
max
;
++
i
)
{
vec_result
=
paddle_mobile_gpu
.
Predict
(
input
,
dims
);
}
auto
time4
=
paddle_mobile
::
time
();
// auto time3 = paddle_mobile::time();
// for (int i = 0; i < 10; ++i) {
// auto vec_result = paddle_mobile.Predict(input, dims);
// }
// auto time4 = paddle_mobile::time();
std
::
cout
<<
"predict cost :"
<<
paddle_mobile
::
time_diff
(
time3
,
time4
)
/
max
<<
"ms"
<<
std
::
endl
;
std
::
vector
<
float
>::
iterator
biggest
=
std
::
max_element
(
std
::
begin
(
vec_result
),
std
::
end
(
vec_result
));
std
::
cout
<<
" Max element is "
<<
*
biggest
<<
" at position "
<<
std
::
distance
(
std
::
begin
(
vec_result
),
biggest
)
<<
std
::
endl
;
// for (float i : vec_result) {
// std::cout << i << std::endl;
// }
}
}
void
t2
()
{
paddle_mobile
::
PaddleMobile
<
paddle_mobile
::
GPU_CL
>
paddle_mobile
;
// paddle_mobile.SetThreadNum(4);
#ifdef PADDLE_MOBILE_CL
paddle_mobile
.
SetCLPath
(
"/data/local/tmp/bin"
);
#endif
auto
time1
=
paddle_mobile
::
time
();
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect
) + "/model",
// std::string(g_mobilenet_detect
) + "/params", true);
auto
isok
=
paddle_mobile
.
Load
(
std
::
string
(
g_yolo_mul
)
+
"/model"
,
std
::
string
(
g_yolo_mul
)
+
"/params"
,
true
);
auto
isok
=
paddle_mobile
.
Load
(
std
::
string
(
g_yolo_mul
),
true
);
//
auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true);
if
(
isok
)
{
auto
time2
=
paddle_mobile
::
time
();
std
::
cout
<<
"load cost :"
<<
paddle_mobile
::
time_diff
(
time1
,
time2
)
<<
"ms"
...
...
@@ -62,5 +120,66 @@ int main() {
// std::cout << i << std::endl;
// }
}
}
void
t3
()
{
paddle_mobile
::
PaddleMobile
<
paddle_mobile
::
CPU
>
paddle_mobile
;
// paddle_mobile.SetThreadNum(4);
//#ifdef PADDLE_MOBILE_CL
// paddle_mobile.SetCLPath("/data/local/tmp/bin");
//#endif
auto
time1
=
paddle_mobile
::
time
();
auto
isok
=
paddle_mobile
.
Load
(
std
::
string
(
g_yolo_mul
)
+
"/model"
,
std
::
string
(
g_yolo_mul
)
+
"/params"
,
true
);
// auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true);
if
(
isok
)
{
auto
time2
=
paddle_mobile
::
time
();
std
::
cout
<<
"load cost :"
<<
paddle_mobile
::
time_diff
(
time1
,
time2
)
<<
"ms"
<<
std
::
endl
;
std
::
vector
<
float
>
input
;
std
::
vector
<
int64_t
>
dims
{
1
,
3
,
416
,
416
};
GetInput
<
float
>
(
g_yolo_img
,
&
input
,
dims
);
std
::
vector
<
float
>
vec_result
=
paddle_mobile
.
Predict
(
input
,
dims
);
auto
time3
=
paddle_mobile
::
time
();
int
max
=
10
;
for
(
int
i
=
0
;
i
<
max
;
++
i
)
{
vec_result
=
paddle_mobile
.
Predict
(
input
,
dims
);
}
auto
time4
=
paddle_mobile
::
time
();
// auto time3 = paddle_mobile::time();
// for (int i = 0; i < 10; ++i) {
// auto vec_result = paddle_mobile.Predict(input, dims);
// }
// auto time4 = paddle_mobile::time();
std
::
cout
<<
"predict cost :"
<<
paddle_mobile
::
time_diff
(
time3
,
time4
)
/
max
<<
"ms"
<<
std
::
endl
;
std
::
vector
<
float
>::
iterator
biggest
=
std
::
max_element
(
std
::
begin
(
vec_result
),
std
::
end
(
vec_result
));
std
::
cout
<<
" Max element is "
<<
*
biggest
<<
" at position "
<<
std
::
distance
(
std
::
begin
(
vec_result
),
biggest
)
<<
std
::
endl
;
// for (float i : vec_result) {
// std::cout << i << std::endl;
// }
}
}
int
main
()
{
// std::thread th1(t1);
// std::thread th2(t2);
std
::
thread
th3
(
t3
);
// std::thread th1(t1);
// th1.join();
// th2.join();
th3
.
join
();
// th1.join();
return
0
;
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录