Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
efebd83b
O
Opencv
项目概览
Greenplum
/
Opencv
大约 1 年 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
efebd83b
编写于
10月 29, 2014
作者:
A
Alexander Alekhin
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #3159 from ElenaGvozdeva:ocl_gemm
上级
4a9cfbf8
d88fdd03
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
197 addition
and
2 deletion
+197
-2
modules/core/src/matmul.cpp
modules/core/src/matmul.cpp
+85
-2
modules/core/src/opencl/gemm.cl
modules/core/src/opencl/gemm.cl
+112
-0
未找到文件。
modules/core/src/matmul.cpp
浏览文件 @
efebd83b
...
...
@@ -693,7 +693,7 @@ static void GEMMStore_64fc( const Complexd* c_data, size_t c_step,
#ifdef HAVE_CLAMDBLAS
static
bool
ocl_gemm
(
InputArray
matA
,
InputArray
matB
,
double
alpha
,
static
bool
ocl_gemm
_amdblas
(
InputArray
matA
,
InputArray
matB
,
double
alpha
,
InputArray
matC
,
double
beta
,
OutputArray
matD
,
int
flags
)
{
int
type
=
matA
.
type
(),
esz
=
CV_ELEM_SIZE
(
type
);
...
...
@@ -775,6 +775,84 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
#endif
#ifdef HAVE_OPENCL
static
bool
ocl_gemm
(
InputArray
matA
,
InputArray
matB
,
double
alpha
,
InputArray
matC
,
double
beta
,
OutputArray
matD
,
int
flags
)
{
int
depth
=
matA
.
depth
(),
cn
=
matA
.
channels
();
int
type
=
CV_MAKETYPE
(
depth
,
cn
);
CV_Assert
(
type
==
matB
.
type
()
&&
(
type
==
CV_32FC1
||
type
==
CV_64FC1
||
type
==
CV_32FC2
||
type
==
CV_64FC2
)
);
const
ocl
::
Device
&
dev
=
ocl
::
Device
::
getDefault
();
bool
doubleSupport
=
dev
.
doubleFPConfig
()
>
0
;
if
(
!
doubleSupport
&&
depth
==
CV_64F
)
return
false
;
bool
haveC
=
matC
.
kind
()
!=
cv
::
_InputArray
::
NONE
;
Size
sizeA
=
matA
.
size
(),
sizeB
=
matB
.
size
(),
sizeC
=
haveC
?
matC
.
size
()
:
Size
(
0
,
0
);
bool
atrans
=
(
flags
&
GEMM_1_T
)
!=
0
,
btrans
=
(
flags
&
GEMM_2_T
)
!=
0
,
ctrans
=
(
flags
&
GEMM_3_T
)
!=
0
;
if
(
atrans
)
sizeA
=
Size
(
sizeA
.
height
,
sizeA
.
width
);
if
(
btrans
)
sizeB
=
Size
(
sizeB
.
height
,
sizeB
.
width
);
if
(
haveC
&&
ctrans
)
sizeC
=
Size
(
sizeC
.
height
,
sizeC
.
width
);
Size
sizeD
(
sizeB
.
width
,
sizeA
.
height
);
CV_Assert
(
!
haveC
||
matC
.
type
()
==
type
);
CV_Assert
(
sizeA
.
width
==
sizeB
.
height
&&
(
!
haveC
||
sizeC
==
sizeD
)
);
int
max_wg_size
=
(
int
)
dev
.
maxWorkGroupSize
();
int
block_size
=
(
max_wg_size
/
(
32
*
cn
)
<
32
)
?
(
max_wg_size
/
(
16
*
cn
)
<
16
)
?
(
max_wg_size
/
(
8
*
cn
)
<
8
)
?
1
:
8
:
16
:
32
;
matD
.
create
(
sizeD
,
type
);
UMat
A
=
matA
.
getUMat
(),
B
=
matB
.
getUMat
(),
D
=
matD
.
getUMat
();
if
(
atrans
)
A
=
A
.
t
();
if
(
btrans
)
B
=
B
.
t
();
if
(
haveC
)
ctrans
?
transpose
(
matC
,
D
)
:
matC
.
copyTo
(
D
);
int
vectorWidths
[]
=
{
4
,
4
,
2
,
2
,
1
,
4
,
cn
,
-
1
};
int
kercn
=
ocl
::
checkOptimalVectorWidth
(
vectorWidths
,
B
,
D
);
String
opts
=
format
(
"-D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d %s %s %s"
,
ocl
::
typeToStr
(
type
),
ocl
::
typeToStr
(
depth
),
ocl
::
typeToStr
(
CV_MAKETYPE
(
depth
,
kercn
)),
cn
,
kercn
,
block_size
,
(
sizeA
.
width
%
block_size
!=
0
)
?
"-D NO_MULT"
:
""
,
haveC
?
"-D HAVE_C"
:
""
,
doubleSupport
?
" -D DOUBLE_SUPPORT"
:
""
);
ocl
::
Kernel
k
(
"gemm"
,
cv
::
ocl
::
core
::
gemm_oclsrc
,
opts
);
if
(
k
.
empty
())
return
false
;
if
(
depth
==
CV_64F
)
k
.
args
(
ocl
::
KernelArg
::
ReadOnlyNoSize
(
A
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
B
,
cn
,
kercn
),
ocl
::
KernelArg
::
ReadWrite
(
D
,
cn
,
kercn
),
sizeA
.
width
,
alpha
,
beta
);
else
k
.
args
(
ocl
::
KernelArg
::
ReadOnlyNoSize
(
A
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
B
,
cn
,
kercn
),
ocl
::
KernelArg
::
ReadWrite
(
D
,
cn
,
kercn
),
sizeA
.
width
,
(
float
)
alpha
,
(
float
)
beta
);
size_t
globalsize
[
2
]
=
{
sizeD
.
width
*
cn
/
kercn
,
sizeD
.
height
};
size_t
localsize
[
2
]
=
{
block_size
,
block_size
};
return
k
.
run
(
2
,
globalsize
,
block_size
!=
1
?
localsize
:
NULL
,
false
);
}
#endif
}
void
cv
::
gemm
(
InputArray
matA
,
InputArray
matB
,
double
alpha
,
...
...
@@ -783,7 +861,12 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha,
#ifdef HAVE_CLAMDBLAS
CV_OCL_RUN
(
ocl
::
haveAmdBlas
()
&&
matA
.
dims
()
<=
2
&&
matB
.
dims
()
<=
2
&&
matC
.
dims
()
<=
2
&&
_matD
.
isUMat
()
&&
matA
.
cols
()
>
20
&&
matA
.
rows
()
>
20
&&
matB
.
cols
()
>
20
,
// since it works incorrect for small sizes
ocl_gemm
(
matA
,
matB
,
alpha
,
matC
,
beta
,
_matD
,
flags
))
ocl_gemm_amdblas
(
matA
,
matB
,
alpha
,
matC
,
beta
,
_matD
,
flags
))
#endif
#ifdef HAVE_OPENCL
CV_OCL_RUN
(
_matD
.
isUMat
()
&&
matA
.
dims
()
<=
2
&&
matB
.
dims
()
<=
2
&&
matC
.
dims
()
<=
2
,
ocl_gemm
(
matA
,
matB
,
alpha
,
matC
,
beta
,
_matD
,
flags
))
#endif
const
int
block_lin_size
=
128
;
...
...
modules/core/src/opencl/gemm.cl
0 → 100644
浏览文件 @
efebd83b
//
This
file
is
part
of
OpenCV
project.
//
It
is
subject
to
the
license
terms
in
the
LICENSE
file
found
in
the
top-level
directory
//
of
this
distribution
and
at
http://opencv.org/license.html.
//
Copyright
(
C
)
2014
,
Itseez,
Inc.,
all
rights
reserved.
//
Third
party
copyrights
are
property
of
their
respective
owners.
#
ifdef
DOUBLE_SUPPORT
#
ifdef
cl_amd_fp64
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
elif
defined
(
cl_khr_fp64
)
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
endif
#
endif
#
define
TSIZE
(
int
)
sizeof
(
T
)
#
define
WTSIZE
(
int
)
sizeof
(
WT
)
#
define
IND_A
mad24
(
y,
A_step,
A_offset
)
#
define
IND_B
mad24
(
x,
WTSIZE,
B_offset
)
#
define
STEP_B
B_step
/
WTSIZE
#
define
LOCAL_SIZE_ODD
(
LOCAL_SIZE
+
1
)
#
if
cn==2
#
if
kercn==2
#
define
MUL
(
a,
b
)
\
{
\
sum.x
+=
fma
(
a.x,
b.x,
-
a.y
*
b.y
)
;\
sum.y
+=
fma
(
a.x,
b.y,
a.y
*
b.x
)
;\
}
#
else
#
define
MUL
(
a,
b
)
\
{
\
sum.x
+=
fma
(
a.x,
b.x,
-
a.y
*
b.y
)
;\
sum.y
+=
fma
(
a.x,
b.y,
a.y
*
b.x
)
;\
sum.z
+=
fma
(
a.x,
b.z,
-
a.y
*
b.w
)
;\
sum.w
+=
fma
(
a.x,
b.w,
a.y
*
b.z
)
;\
}
#
endif
#
else
#
define
MUL
(
a,
b
)
sum
=
fma
(
a,
b,
sum
)
;
#
endif
__kernel
void
gemm
(
__global
const
uchar
*
A_ptr,
int
A_step,
int
A_offset,
__global
const
uchar
*
B_ptr,
int
B_step,
int
B_offset,
__global
uchar
*
D_ptr,
int
D_step,
int
D_offset,
int
D_rows,
int
D_cols,
int
n,
T1
alpha,
T1
beta
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
lidx
=
get_local_id
(
0
)
;
int
lidy
=
get_local_id
(
1
)
;
__global
const
T*
A
=
(
__global
const
T*
)(
A_ptr
+
IND_A
)
;
__global
const
WT*
B
=
(
__global
const
WT*
)(
B_ptr
+
IND_B
)
;
WT
sum
=
(
WT
)(
0
)
;
#
if
LOCAL_SIZE
==
1
if
(
x
<
D_cols
&&
y
<
D_rows
)
{
for
(
int
i
=
0
; i < n; ++i)
MUL
(
A[i],
B[i*STEP_B]
)
;
#
else
__local
T
a_local[LOCAL_SIZE_ODD*LOCAL_SIZE]
;
__local
WT
b_local[LOCAL_SIZE_ODD*LOCAL_SIZE]
;
int
reps
;
#
if
NO_MULT
reps
=
(
n
+
LOCAL_SIZE-1
)
/LOCAL_SIZE
;
#
else
reps
=
n/LOCAL_SIZE
;
#
endif
for
(
int
p
=
0
; p < reps; ++p)
{
if
(
p
*
LOCAL_SIZE
+
lidx
<
n
&&
y
<
D_rows
)
a_local[mad24
(
lidy,
LOCAL_SIZE_ODD,
lidx
)
]
=
A[mad24
(
p,
LOCAL_SIZE,
lidx
)
]
;
if
(
p
*
LOCAL_SIZE
+
lidy
<
n
&&
x
<
D_cols
)
b_local[mad24
(
lidy,
LOCAL_SIZE_ODD,
lidx
)
]
=
B[mad24
(
p,
LOCAL_SIZE,
lidy
)
*STEP_B]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
x
<
D_cols
&&
y
<
D_rows
)
{
#
if
NO_MULT
int
ie
=
min
(
LOCAL_SIZE,
n
-
p
*
LOCAL_SIZE
)
;
for
(
int
i
=
0
; i < ie; ++i)
#
else
for
(
int
i
=
0
; i < LOCAL_SIZE; ++i)
#
endif
MUL
(
a_local[mad24
(
lidy,
LOCAL_SIZE_ODD,
i
)
],
b_local[mad24
(
i,
LOCAL_SIZE_ODD,
lidx
)
]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
x
<
D_cols
&&
y
<
D_rows
)
{
#
endif
__global
WT*
D
=
(
__global
WT*
)(
D_ptr
+
mad24
(
y,
D_step,
mad24
(
x,
WTSIZE,
D_offset
)))
;
#
if
HAVE_C
D[0]
=
mad
(
alpha,
sum,
D[0]*beta
)
;
#
else
D[0]
=
alpha
*
sum
;
#
endif
}
}
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录