Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
4d9d7e6d
O
Opencv
项目概览
Greenplum
/
Opencv
大约 1 年 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
4d9d7e6d
编写于
8月 27, 2014
作者:
V
Vadim Pisarevsky
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #3160 from akarsakov:ocl_dft_double_support
上级
2811d408
a89ff402
变更
3
显示空白变更内容
内联
并排
Showing
3 changed file
with
199 addition
and
172 deletion
+199
-172
modules/core/src/dxt.cpp
modules/core/src/dxt.cpp
+56
-38
modules/core/src/opencl/fft.cl
modules/core/src/opencl/fft.cl
+136
-128
modules/core/test/ocl/test_dft.cpp
modules/core/test/ocl/test_dft.cpp
+7
-6
未找到文件。
modules/core/src/dxt.cpp
浏览文件 @
4d9d7e6d
...
@@ -1802,11 +1802,14 @@ private:
...
@@ -1802,11 +1802,14 @@ private:
String
buildOptions
;
String
buildOptions
;
int
thread_count
;
int
thread_count
;
int
dft_size
;
int
dft_size
;
int
dft_depth
;
bool
status
;
bool
status
;
public:
public:
OCL_FftPlan
(
int
_size
)
:
dft_size
(
_size
),
status
(
true
)
OCL_FftPlan
(
int
_size
,
int
_depth
)
:
dft_size
(
_size
),
dft_depth
(
_depth
),
status
(
true
)
{
{
CV_Assert
(
dft_depth
==
CV_32F
||
dft_depth
==
CV_64F
);
int
min_radix
;
int
min_radix
;
std
::
vector
<
int
>
radixes
,
blocks
;
std
::
vector
<
int
>
radixes
,
blocks
;
ocl_getRadixes
(
dft_size
,
radixes
,
blocks
,
min_radix
);
ocl_getRadixes
(
dft_size
,
radixes
,
blocks
,
min_radix
);
...
@@ -1832,31 +1835,15 @@ public:
...
@@ -1832,31 +1835,15 @@ public:
n
*=
radix
;
n
*=
radix
;
}
}
twiddles
.
create
(
1
,
twiddle_size
,
CV_32FC2
);
twiddles
.
create
(
1
,
twiddle_size
,
CV_MAKE_TYPE
(
dft_depth
,
2
));
Mat
tw
=
twiddles
.
getMat
(
ACCESS_WRITE
);
if
(
dft_depth
==
CV_32F
)
float
*
ptr
=
tw
.
ptr
<
float
>
();
fillRadixTable
<
float
>
(
twiddles
,
radixes
);
int
ptr_index
=
0
;
else
fillRadixTable
<
double
>
(
twiddles
,
radixes
);
n
=
1
;
for
(
size_t
i
=
0
;
i
<
radixes
.
size
();
i
++
)
{
int
radix
=
radixes
[
i
];
n
*=
radix
;
for
(
int
j
=
1
;
j
<
radix
;
j
++
)
{
double
theta
=
-
CV_2PI
*
j
/
n
;
for
(
int
k
=
0
;
k
<
(
n
/
radix
);
k
++
)
{
ptr
[
ptr_index
++
]
=
(
float
)
cos
(
k
*
theta
);
ptr
[
ptr_index
++
]
=
(
float
)
sin
(
k
*
theta
);
}
}
}
buildOptions
=
format
(
"-D LOCAL_SIZE=%d -D kercn=%d -D RADIX_PROCESS=%s"
,
buildOptions
=
format
(
"-D LOCAL_SIZE=%d -D kercn=%d -D FT=%s -D CT=%s%s -D RADIX_PROCESS=%s"
,
dft_size
,
min_radix
,
radix_processing
.
c_str
());
dft_size
,
min_radix
,
ocl
::
typeToStr
(
dft_depth
),
ocl
::
typeToStr
(
CV_MAKE_TYPE
(
dft_depth
,
2
)),
dft_depth
==
CV_64F
?
" -D DOUBLE_SUPPORT"
:
""
,
radix_processing
.
c_str
());
}
}
bool
enqueueTransform
(
InputArray
_src
,
OutputArray
_dst
,
int
num_dfts
,
int
flags
,
int
fftType
,
bool
rows
=
true
)
const
bool
enqueueTransform
(
InputArray
_src
,
OutputArray
_dst
,
int
num_dfts
,
int
flags
,
int
fftType
,
bool
rows
=
true
)
const
...
@@ -1913,7 +1900,7 @@ public:
...
@@ -1913,7 +1900,7 @@ public:
if
(
k
.
empty
())
if
(
k
.
empty
())
return
false
;
return
false
;
k
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
src
),
ocl
::
KernelArg
::
WriteOnly
(
dst
),
ocl
::
KernelArg
::
PtrReadOnly
(
twiddles
),
thread_count
,
num_dfts
);
k
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
src
),
ocl
::
KernelArg
::
WriteOnly
(
dst
),
ocl
::
KernelArg
::
ReadOnlyNoSize
(
twiddles
),
thread_count
,
num_dfts
);
return
k
.
run
(
2
,
globalsize
,
localsize
,
false
);
return
k
.
run
(
2
,
globalsize
,
localsize
,
false
);
}
}
...
@@ -1986,6 +1973,32 @@ private:
...
@@ -1986,6 +1973,32 @@ private:
min_radix
=
min
(
min_radix
,
block
*
radix
);
min_radix
=
min
(
min_radix
,
block
*
radix
);
}
}
}
}
template
<
typename
T
>
static
void
fillRadixTable
(
UMat
twiddles
,
const
std
::
vector
<
int
>&
radixes
)
{
Mat
tw
=
twiddles
.
getMat
(
ACCESS_WRITE
);
T
*
ptr
=
tw
.
ptr
<
T
>
();
int
ptr_index
=
0
;
int
n
=
1
;
for
(
size_t
i
=
0
;
i
<
radixes
.
size
();
i
++
)
{
int
radix
=
radixes
[
i
];
n
*=
radix
;
for
(
int
j
=
1
;
j
<
radix
;
j
++
)
{
double
theta
=
-
CV_2PI
*
j
/
n
;
for
(
int
k
=
0
;
k
<
(
n
/
radix
);
k
++
)
{
ptr
[
ptr_index
++
]
=
(
T
)
cos
(
k
*
theta
);
ptr
[
ptr_index
++
]
=
(
T
)
sin
(
k
*
theta
);
}
}
}
}
};
};
class
OCL_FftPlanCache
class
OCL_FftPlanCache
...
@@ -1997,17 +2010,18 @@ public:
...
@@ -1997,17 +2010,18 @@ public:
return
planCache
;
return
planCache
;
}
}
Ptr
<
OCL_FftPlan
>
getFftPlan
(
int
dft_size
)
Ptr
<
OCL_FftPlan
>
getFftPlan
(
int
dft_size
,
int
depth
)
{
{
std
::
map
<
int
,
Ptr
<
OCL_FftPlan
>
>::
iterator
f
=
planStorage
.
find
(
dft_size
);
int
key
=
(
dft_size
<<
16
)
|
(
depth
&
0xFFFF
);
std
::
map
<
int
,
Ptr
<
OCL_FftPlan
>
>::
iterator
f
=
planStorage
.
find
(
key
);
if
(
f
!=
planStorage
.
end
())
if
(
f
!=
planStorage
.
end
())
{
{
return
f
->
second
;
return
f
->
second
;
}
}
else
else
{
{
Ptr
<
OCL_FftPlan
>
newPlan
=
Ptr
<
OCL_FftPlan
>
(
new
OCL_FftPlan
(
dft_size
));
Ptr
<
OCL_FftPlan
>
newPlan
=
Ptr
<
OCL_FftPlan
>
(
new
OCL_FftPlan
(
dft_size
,
depth
));
planStorage
[
dft_size
]
=
newPlan
;
planStorage
[
key
]
=
newPlan
;
return
newPlan
;
return
newPlan
;
}
}
}
}
...
@@ -2027,21 +2041,25 @@ protected:
...
@@ -2027,21 +2041,25 @@ protected:
static
bool
ocl_dft_rows
(
InputArray
_src
,
OutputArray
_dst
,
int
nonzero_rows
,
int
flags
,
int
fftType
)
static
bool
ocl_dft_rows
(
InputArray
_src
,
OutputArray
_dst
,
int
nonzero_rows
,
int
flags
,
int
fftType
)
{
{
Ptr
<
OCL_FftPlan
>
plan
=
OCL_FftPlanCache
::
getInstance
().
getFftPlan
(
_src
.
cols
());
int
type
=
_src
.
type
(),
depth
=
CV_MAT_DEPTH
(
type
);
Ptr
<
OCL_FftPlan
>
plan
=
OCL_FftPlanCache
::
getInstance
().
getFftPlan
(
_src
.
cols
(),
depth
);
return
plan
->
enqueueTransform
(
_src
,
_dst
,
nonzero_rows
,
flags
,
fftType
,
true
);
return
plan
->
enqueueTransform
(
_src
,
_dst
,
nonzero_rows
,
flags
,
fftType
,
true
);
}
}
static
bool
ocl_dft_cols
(
InputArray
_src
,
OutputArray
_dst
,
int
nonzero_cols
,
int
flags
,
int
fftType
)
static
bool
ocl_dft_cols
(
InputArray
_src
,
OutputArray
_dst
,
int
nonzero_cols
,
int
flags
,
int
fftType
)
{
{
Ptr
<
OCL_FftPlan
>
plan
=
OCL_FftPlanCache
::
getInstance
().
getFftPlan
(
_src
.
rows
());
int
type
=
_src
.
type
(),
depth
=
CV_MAT_DEPTH
(
type
);
Ptr
<
OCL_FftPlan
>
plan
=
OCL_FftPlanCache
::
getInstance
().
getFftPlan
(
_src
.
rows
(),
depth
);
return
plan
->
enqueueTransform
(
_src
,
_dst
,
nonzero_cols
,
flags
,
fftType
,
false
);
return
plan
->
enqueueTransform
(
_src
,
_dst
,
nonzero_cols
,
flags
,
fftType
,
false
);
}
}
static
bool
ocl_dft
(
InputArray
_src
,
OutputArray
_dst
,
int
flags
,
int
nonzero_rows
)
static
bool
ocl_dft
(
InputArray
_src
,
OutputArray
_dst
,
int
flags
,
int
nonzero_rows
)
{
{
int
type
=
_src
.
type
(),
cn
=
CV_MAT_CN
(
type
);
int
type
=
_src
.
type
(),
cn
=
CV_MAT_CN
(
type
)
,
depth
=
CV_MAT_DEPTH
(
type
)
;
Size
ssize
=
_src
.
size
();
Size
ssize
=
_src
.
size
();
if
(
!
(
type
==
CV_32FC1
||
type
==
CV_32FC2
)
)
bool
doubleSupport
=
ocl
::
Device
::
getDefault
().
doubleFPConfig
()
>
0
;
if
(
!
((
cn
==
1
||
cn
==
2
)
&&
(
depth
==
CV_32F
||
(
depth
==
CV_64F
&&
doubleSupport
)))
)
return
false
;
return
false
;
// if is not a multiplication of prime numbers { 2, 3, 5 }
// if is not a multiplication of prime numbers { 2, 3, 5 }
...
@@ -2082,7 +2100,7 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro
...
@@ -2082,7 +2100,7 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro
if
(
fftType
==
C2C
||
fftType
==
R2C
)
if
(
fftType
==
C2C
||
fftType
==
R2C
)
{
{
// complex output
// complex output
_dst
.
create
(
src
.
size
(),
CV_
32FC2
);
_dst
.
create
(
src
.
size
(),
CV_
MAKETYPE
(
depth
,
2
)
);
output
=
_dst
.
getUMat
();
output
=
_dst
.
getUMat
();
}
}
else
else
...
@@ -2090,13 +2108,13 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro
...
@@ -2090,13 +2108,13 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro
// real output
// real output
if
(
is1d
)
if
(
is1d
)
{
{
_dst
.
create
(
src
.
size
(),
CV_
32FC1
);
_dst
.
create
(
src
.
size
(),
CV_
MAKETYPE
(
depth
,
1
)
);
output
=
_dst
.
getUMat
();
output
=
_dst
.
getUMat
();
}
}
else
else
{
{
_dst
.
create
(
src
.
size
(),
CV_
32FC1
);
_dst
.
create
(
src
.
size
(),
CV_
MAKETYPE
(
depth
,
1
)
);
output
.
create
(
src
.
size
(),
CV_
32FC2
);
output
.
create
(
src
.
size
(),
CV_
MAKETYPE
(
depth
,
2
)
);
}
}
}
}
...
...
modules/core/src/opencl/fft.cl
浏览文件 @
4d9d7e6d
...
@@ -12,22 +12,30 @@
...
@@ -12,22 +12,30 @@
#
define
fft5_4
-1.538841768587f
#
define
fft5_4
-1.538841768587f
#
define
fft5_5
0.363271264002f
#
define
fft5_5
0.363271264002f
#
ifdef
DOUBLE_SUPPORT
#
ifdef
cl_amd_fp64
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
elif
defined
(
cl_khr_fp64
)
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
endif
#
endif
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
float2
mul_float2
(
float2
a,
float2
b
)
{
CT
mul_complex
(
CT
a,
CT
b
)
{
return
(
float2
)(
fma
(
a.x,
b.x,
-a.y
*
b.y
)
,
fma
(
a.x,
b.y,
a.y
*
b.x
))
;
return
(
CT
)(
fma
(
a.x,
b.x,
-a.y
*
b.y
)
,
fma
(
a.x,
b.y,
a.y
*
b.x
))
;
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
float2
twiddle
(
float2
a
)
{
CT
twiddle
(
CT
a
)
{
return
(
float2
)(
a.y,
-a.x
)
;
return
(
CT
)(
a.y,
-a.x
)
;
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
butterfly2
(
float2
a0,
float2
a1,
__local
float2*
smem,
__global
const
float2
*
twiddles,
void
butterfly2
(
CT
a0,
CT
a1,
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x,
const
int
block_size
)
const
int
x,
const
int
block_size
)
{
{
const
int
k
=
x
&
(
block_size
-
1
)
;
const
int
k
=
x
&
(
block_size
-
1
)
;
a1
=
mul_
float2
(
twiddles[k],
a1
)
;
a1
=
mul_
complex
(
twiddles[k],
a1
)
;
const
int
dst_ind
=
(
x
<<
1
)
-
k
;
const
int
dst_ind
=
(
x
<<
1
)
-
k
;
smem[dst_ind]
=
a0
+
a1
;
smem[dst_ind]
=
a0
+
a1
;
...
@@ -35,19 +43,19 @@ void butterfly2(float2 a0, float2 a1, __local float2* smem, __global const float
...
@@ -35,19 +43,19 @@ void butterfly2(float2 a0, float2 a1, __local float2* smem, __global const float
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
butterfly4
(
float2
a0,
float2
a1,
float2
a2,
float2
a3,
__local
float2*
smem,
__global
const
float2
*
twiddles,
void
butterfly4
(
CT
a0,
CT
a1,
CT
a2,
CT
a3,
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x,
const
int
block_size
)
const
int
x,
const
int
block_size
)
{
{
const
int
k
=
x
&
(
block_size
-
1
)
;
const
int
k
=
x
&
(
block_size
-
1
)
;
a1
=
mul_
float2
(
twiddles[k],
a1
)
;
a1
=
mul_
complex
(
twiddles[k],
a1
)
;
a2
=
mul_
float2
(
twiddles[k
+
block_size],
a2
)
;
a2
=
mul_
complex
(
twiddles[k
+
block_size],
a2
)
;
a3
=
mul_
float2
(
twiddles[k
+
2*block_size],
a3
)
;
a3
=
mul_
complex
(
twiddles[k
+
2*block_size],
a3
)
;
const
int
dst_ind
=
((
x
-
k
)
<<
2
)
+
k
;
const
int
dst_ind
=
((
x
-
k
)
<<
2
)
+
k
;
float2
b0
=
a0
+
a2
;
CT
b0
=
a0
+
a2
;
a2
=
a0
-
a2
;
a2
=
a0
-
a2
;
float2
b1
=
a1
+
a3
;
CT
b1
=
a1
+
a3
;
a3
=
twiddle
(
a1
-
a3
)
;
a3
=
twiddle
(
a1
-
a3
)
;
smem[dst_ind]
=
b0
+
b1
;
smem[dst_ind]
=
b0
+
b1
;
...
@@ -57,17 +65,17 @@ void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem
...
@@ -57,17 +65,17 @@ void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
butterfly3
(
float2
a0,
float2
a1,
float2
a2,
__local
float2*
smem,
__global
const
float2
*
twiddles,
void
butterfly3
(
CT
a0,
CT
a1,
CT
a2,
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x,
const
int
block_size
)
const
int
x,
const
int
block_size
)
{
{
const
int
k
=
x
%
block_size
;
const
int
k
=
x
%
block_size
;
a1
=
mul_
float2
(
twiddles[k],
a1
)
;
a1
=
mul_
complex
(
twiddles[k],
a1
)
;
a2
=
mul_
float2
(
twiddles[k+block_size],
a2
)
;
a2
=
mul_
complex
(
twiddles[k+block_size],
a2
)
;
const
int
dst_ind
=
((
x
-
k
)
*
3
)
+
k
;
const
int
dst_ind
=
((
x
-
k
)
*
3
)
+
k
;
float2
b1
=
a1
+
a2
;
CT
b1
=
a1
+
a2
;
a2
=
twiddle
(
sin_120*
(
a1
-
a2
))
;
a2
=
twiddle
(
sin_120*
(
a1
-
a2
))
;
float2
b0
=
a0
-
(
float2
)(
0.5f
)
*b1
;
CT
b0
=
a0
-
(
CT
)(
0.5f
)
*b1
;
smem[dst_ind]
=
a0
+
b1
;
smem[dst_ind]
=
a0
+
b1
;
smem[dst_ind
+
block_size]
=
b0
+
a2
;
smem[dst_ind
+
block_size]
=
b0
+
a2
;
...
@@ -75,19 +83,19 @@ void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global
...
@@ -75,19 +83,19 @@ void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
butterfly5
(
float2
a0,
float2
a1,
float2
a2,
float2
a3,
float2
a4,
__local
float2*
smem,
__global
const
float2
*
twiddles,
void
butterfly5
(
CT
a0,
CT
a1,
CT
a2,
CT
a3,
CT
a4,
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x,
const
int
block_size
)
const
int
x,
const
int
block_size
)
{
{
const
int
k
=
x
%
block_size
;
const
int
k
=
x
%
block_size
;
a1
=
mul_
float2
(
twiddles[k],
a1
)
;
a1
=
mul_
complex
(
twiddles[k],
a1
)
;
a2
=
mul_
float2
(
twiddles[k
+
block_size],
a2
)
;
a2
=
mul_
complex
(
twiddles[k
+
block_size],
a2
)
;
a3
=
mul_
float2
(
twiddles[k+2*block_size],
a3
)
;
a3
=
mul_
complex
(
twiddles[k+2*block_size],
a3
)
;
a4
=
mul_
float2
(
twiddles[k+3*block_size],
a4
)
;
a4
=
mul_
complex
(
twiddles[k+3*block_size],
a4
)
;
const
int
dst_ind
=
((
x
-
k
)
*
5
)
+
k
;
const
int
dst_ind
=
((
x
-
k
)
*
5
)
+
k
;
__local
float2
*
dst
=
smem
+
dst_ind
;
__local
CT
*
dst
=
smem
+
dst_ind
;
float2
b0,
b1,
b5
;
CT
b0,
b1,
b5
;
b1
=
a1
+
a4
;
b1
=
a1
+
a4
;
a1
-=
a4
;
a1
-=
a4
;
...
@@ -96,11 +104,11 @@ void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local f
...
@@ -96,11 +104,11 @@ void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local f
a3
-=
a2
;
a3
-=
a2
;
a2
=
b1
+
a4
;
a2
=
b1
+
a4
;
b0
=
a0
-
(
float2
)
0.25f
*
a2
;
b0
=
a0
-
(
CT
)
0.25f
*
a2
;
b1
=
fft5_2
*
(
b1
-
a4
)
;
b1
=
fft5_2
*
(
b1
-
a4
)
;
a4
=
fft5_3
*
(
float2
)(
-a1.y
-
a3.y,
a1.x
+
a3.x
)
;
a4
=
fft5_3
*
(
CT
)(
-a1.y
-
a3.y,
a1.x
+
a3.x
)
;
b5
=
(
float2
)(
a4.x
-
fft5_5
*
a1.y,
a4.y
+
fft5_5
*
a1.x
)
;
b5
=
(
CT
)(
a4.x
-
fft5_5
*
a1.y,
a4.y
+
fft5_5
*
a1.x
)
;
a4.x
+=
fft5_4
*
a3.y
;
a4.x
+=
fft5_4
*
a3.y
;
a4.y
-=
fft5_4
*
a3.x
;
a4.y
-=
fft5_4
*
a3.x
;
...
@@ -116,9 +124,9 @@ void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local f
...
@@ -116,9 +124,9 @@ void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local f
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
fft_radix2
(
__local
float2*
smem,
__global
const
float2
*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
void
fft_radix2
(
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
{
{
float2
a0,
a1
;
CT
a0,
a1
;
if
(
x
<
t
)
if
(
x
<
t
)
{
{
...
@@ -135,10 +143,10 @@ void fft_radix2(__local float2* smem, __global const float2* twiddles, const int
...
@@ -135,10 +143,10 @@ void fft_radix2(__local float2* smem, __global const float2* twiddles, const int
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
fft_radix2_B2
(
__local
float2*
smem,
__global
const
float2
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
void
fft_radix2_B2
(
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
{
{
const
int
x2
=
x1
+
t/2
;
const
int
x2
=
x1
+
t/2
;
float2
a0,
a1,
a2,
a3
;
CT
a0,
a1,
a2,
a3
;
if
(
x1
<
t/2
)
if
(
x1
<
t/2
)
{
{
...
@@ -158,11 +166,11 @@ void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const
...
@@ -158,11 +166,11 @@ void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
fft_radix2_B3
(
__local
float2*
smem,
__global
const
float2
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
void
fft_radix2_B3
(
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
{
{
const
int
x2
=
x1
+
t/3
;
const
int
x2
=
x1
+
t/3
;
const
int
x3
=
x1
+
2*t/3
;
const
int
x3
=
x1
+
2*t/3
;
float2
a0,
a1,
a2,
a3,
a4,
a5
;
CT
a0,
a1,
a2,
a3,
a4,
a5
;
if
(
x1
<
t/3
)
if
(
x1
<
t/3
)
{
{
...
@@ -184,13 +192,13 @@ void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const
...
@@ -184,13 +192,13 @@ void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
fft_radix2_B4
(
__local
float2*
smem,
__global
const
float2
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
void
fft_radix2_B4
(
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
{
{
const
int
thread_block
=
t/4
;
const
int
thread_block
=
t/4
;
const
int
x2
=
x1
+
thread_block
;
const
int
x2
=
x1
+
thread_block
;
const
int
x3
=
x1
+
2*thread_block
;
const
int
x3
=
x1
+
2*thread_block
;
const
int
x4
=
x1
+
3*thread_block
;
const
int
x4
=
x1
+
3*thread_block
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7
;
CT
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7
;
if
(
x1
<
t/4
)
if
(
x1
<
t/4
)
{
{
...
@@ -214,14 +222,14 @@ void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const
...
@@ -214,14 +222,14 @@ void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
fft_radix2_B5
(
__local
float2*
smem,
__global
const
float2
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
void
fft_radix2_B5
(
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
{
{
const
int
thread_block
=
t/5
;
const
int
thread_block
=
t/5
;
const
int
x2
=
x1
+
thread_block
;
const
int
x2
=
x1
+
thread_block
;
const
int
x3
=
x1
+
2*thread_block
;
const
int
x3
=
x1
+
2*thread_block
;
const
int
x4
=
x1
+
3*thread_block
;
const
int
x4
=
x1
+
3*thread_block
;
const
int
x5
=
x1
+
4*thread_block
;
const
int
x5
=
x1
+
4*thread_block
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7,
a8,
a9
;
CT
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7,
a8,
a9
;
if
(
x1
<
t/5
)
if
(
x1
<
t/5
)
{
{
...
@@ -247,9 +255,9 @@ void fft_radix2_B5(__local float2* smem, __global const float2* twiddles, const
...
@@ -247,9 +255,9 @@ void fft_radix2_B5(__local float2* smem, __global const float2* twiddles, const
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
fft_radix4
(
__local
float2*
smem,
__global
const
float2
*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
void
fft_radix4
(
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
{
{
float2
a0,
a1,
a2,
a3
;
CT
a0,
a1,
a2,
a3
;
if
(
x
<
t
)
if
(
x
<
t
)
{
{
...
@@ -265,10 +273,10 @@ void fft_radix4(__local float2* smem, __global const float2* twiddles, const int
...
@@ -265,10 +273,10 @@ void fft_radix4(__local float2* smem, __global const float2* twiddles, const int
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
fft_radix4_B2
(
__local
float2*
smem,
__global
const
float2
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
void
fft_radix4_B2
(
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
{
{
const
int
x2
=
x1
+
t/2
;
const
int
x2
=
x1
+
t/2
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7
;
CT
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7
;
if
(
x1
<
t/2
)
if
(
x1
<
t/2
)
{
{
...
@@ -288,11 +296,11 @@ void fft_radix4_B2(__local float2* smem, __global const float2* twiddles, const
...
@@ -288,11 +296,11 @@ void fft_radix4_B2(__local float2* smem, __global const float2* twiddles, const
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
fft_radix4_B3
(
__local
float2*
smem,
__global
const
float2
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
void
fft_radix4_B3
(
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
{
{
const
int
x2
=
x1
+
t/3
;
const
int
x2
=
x1
+
t/3
;
const
int
x3
=
x2
+
t/3
;
const
int
x3
=
x2
+
t/3
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7,
a8,
a9,
a10,
a11
;
CT
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7,
a8,
a9,
a10,
a11
;
if
(
x1
<
t/3
)
if
(
x1
<
t/3
)
{
{
...
@@ -314,35 +322,35 @@ void fft_radix4_B3(__local float2* smem, __global const float2* twiddles, const
...
@@ -314,35 +322,35 @@ void fft_radix4_B3(__local float2* smem, __global const float2* twiddles, const
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
fft_radix8
(
__local
float2*
smem,
__global
const
float2
*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
void
fft_radix8
(
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
{
{
const
int
k
=
x
%
block_size
;
const
int
k
=
x
%
block_size
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7
;
CT
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7
;
if
(
x
<
t
)
if
(
x
<
t
)
{
{
int
tw_ind
=
block_size
/
8
;
int
tw_ind
=
block_size
/
8
;
a0
=
smem[x]
;
a0
=
smem[x]
;
a1
=
mul_
float2
(
twiddles[k],
smem[x
+
t]
)
;
a1
=
mul_
complex
(
twiddles[k],
smem[x
+
t]
)
;
a2
=
mul_
float2
(
twiddles[k
+
block_size],smem[x+2*t]
)
;
a2
=
mul_
complex
(
twiddles[k
+
block_size],smem[x+2*t]
)
;
a3
=
mul_
float2
(
twiddles[k+2*block_size],smem[x+3*t]
)
;
a3
=
mul_
complex
(
twiddles[k+2*block_size],smem[x+3*t]
)
;
a4
=
mul_
float2
(
twiddles[k+3*block_size],smem[x+4*t]
)
;
a4
=
mul_
complex
(
twiddles[k+3*block_size],smem[x+4*t]
)
;
a5
=
mul_
float2
(
twiddles[k+4*block_size],smem[x+5*t]
)
;
a5
=
mul_
complex
(
twiddles[k+4*block_size],smem[x+5*t]
)
;
a6
=
mul_
float2
(
twiddles[k+5*block_size],smem[x+6*t]
)
;
a6
=
mul_
complex
(
twiddles[k+5*block_size],smem[x+6*t]
)
;
a7
=
mul_
float2
(
twiddles[k+6*block_size],smem[x+7*t]
)
;
a7
=
mul_
complex
(
twiddles[k+6*block_size],smem[x+7*t]
)
;
float2
b0,
b1,
b6,
b7
;
CT
b0,
b1,
b6,
b7
;
b0
=
a0
+
a4
;
b0
=
a0
+
a4
;
a4
=
a0
-
a4
;
a4
=
a0
-
a4
;
b1
=
a1
+
a5
;
b1
=
a1
+
a5
;
a5
=
a1
-
a5
;
a5
=
a1
-
a5
;
a5
=
(
float2
)(
SQRT_2
)
*
(
float2
)(
a5.x
+
a5.y,
-a5.x
+
a5.y
)
;
a5
=
(
CT
)(
SQRT_2
)
*
(
CT
)(
a5.x
+
a5.y,
-a5.x
+
a5.y
)
;
b6
=
twiddle
(
a2
-
a6
)
;
b6
=
twiddle
(
a2
-
a6
)
;
a2
=
a2
+
a6
;
a2
=
a2
+
a6
;
b7
=
a3
-
a7
;
b7
=
a3
-
a7
;
b7
=
(
float2
)(
SQRT_2
)
*
(
float2
)(
-b7.x
+
b7.y,
-b7.x
-
b7.y
)
;
b7
=
(
CT
)(
SQRT_2
)
*
(
CT
)(
-b7.x
+
b7.y,
-b7.x
-
b7.y
)
;
a3
=
a3
+
a7
;
a3
=
a3
+
a7
;
a0
=
b0
+
a2
;
a0
=
b0
+
a2
;
...
@@ -361,7 +369,7 @@ void fft_radix8(__local float2* smem, __global const float2* twiddles, const int
...
@@ -361,7 +369,7 @@ void fft_radix8(__local float2* smem, __global const float2* twiddles, const int
if
(
x
<
t
)
if
(
x
<
t
)
{
{
const
int
dst_ind
=
((
x
-
k
)
<<
3
)
+
k
;
const
int
dst_ind
=
((
x
-
k
)
<<
3
)
+
k
;
__local
float2
*
dst
=
smem
+
dst_ind
;
__local
CT
*
dst
=
smem
+
dst_ind
;
dst[0]
=
a0
+
a1
;
dst[0]
=
a0
+
a1
;
dst[block_size]
=
a4
+
a5
;
dst[block_size]
=
a4
+
a5
;
...
@@ -377,9 +385,9 @@ void fft_radix8(__local float2* smem, __global const float2* twiddles, const int
...
@@ -377,9 +385,9 @@ void fft_radix8(__local float2* smem, __global const float2* twiddles, const int
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
fft_radix3
(
__local
float2*
smem,
__global
const
float2
*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
void
fft_radix3
(
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
{
{
float2
a0,
a1,
a2
;
CT
a0,
a1,
a2
;
if
(
x
<
t
)
if
(
x
<
t
)
{
{
...
@@ -395,10 +403,10 @@ void fft_radix3(__local float2* smem, __global const float2* twiddles, const int
...
@@ -395,10 +403,10 @@ void fft_radix3(__local float2* smem, __global const float2* twiddles, const int
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
fft_radix3_B2
(
__local
float2*
smem,
__global
const
float2
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
void
fft_radix3_B2
(
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
{
{
const
int
x2
=
x1
+
t/2
;
const
int
x2
=
x1
+
t/2
;
float2
a0,
a1,
a2,
a3,
a4,
a5
;
CT
a0,
a1,
a2,
a3,
a4,
a5
;
if
(
x1
<
t/2
)
if
(
x1
<
t/2
)
{
{
...
@@ -418,11 +426,11 @@ void fft_radix3_B2(__local float2* smem, __global const float2* twiddles, const
...
@@ -418,11 +426,11 @@ void fft_radix3_B2(__local float2* smem, __global const float2* twiddles, const
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
fft_radix3_B3
(
__local
float2*
smem,
__global
const
float2
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
void
fft_radix3_B3
(
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
{
{
const
int
x2
=
x1
+
t/3
;
const
int
x2
=
x1
+
t/3
;
const
int
x3
=
x2
+
t/3
;
const
int
x3
=
x2
+
t/3
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7,
a8
;
CT
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7,
a8
;
if
(
x1
<
t/3
)
if
(
x1
<
t/3
)
{
{
...
@@ -444,13 +452,13 @@ void fft_radix3_B3(__local float2* smem, __global const float2* twiddles, const
...
@@ -444,13 +452,13 @@ void fft_radix3_B3(__local float2* smem, __global const float2* twiddles, const
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
fft_radix3_B4
(
__local
float2*
smem,
__global
const
float2
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
void
fft_radix3_B4
(
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
{
{
const
int
thread_block
=
t/4
;
const
int
thread_block
=
t/4
;
const
int
x2
=
x1
+
thread_block
;
const
int
x2
=
x1
+
thread_block
;
const
int
x3
=
x1
+
2*thread_block
;
const
int
x3
=
x1
+
2*thread_block
;
const
int
x4
=
x1
+
3*thread_block
;
const
int
x4
=
x1
+
3*thread_block
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7,
a8,
a9,
a10,
a11
;
CT
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7,
a8,
a9,
a10,
a11
;
if
(
x1
<
t/4
)
if
(
x1
<
t/4
)
{
{
...
@@ -474,10 +482,10 @@ void fft_radix3_B4(__local float2* smem, __global const float2* twiddles, const
...
@@ -474,10 +482,10 @@ void fft_radix3_B4(__local float2* smem, __global const float2* twiddles, const
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
fft_radix5
(
__local
float2*
smem,
__global
const
float2
*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
void
fft_radix5
(
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x,
const
int
block_size,
const
int
t
)
{
{
const
int
k
=
x
%
block_size
;
const
int
k
=
x
%
block_size
;
float2
a0,
a1,
a2,
a3,
a4
;
CT
a0,
a1,
a2,
a3,
a4
;
if
(
x
<
t
)
if
(
x
<
t
)
{
{
...
@@ -493,10 +501,10 @@ void fft_radix5(__local float2* smem, __global const float2* twiddles, const int
...
@@ -493,10 +501,10 @@ void fft_radix5(__local float2* smem, __global const float2* twiddles, const int
}
}
__attribute__
((
always_inline
))
__attribute__
((
always_inline
))
void
fft_radix5_B2
(
__local
float2*
smem,
__global
const
float2
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
void
fft_radix5_B2
(
__local
CT*
smem,
__global
const
CT
*
twiddles,
const
int
x1,
const
int
block_size,
const
int
t
)
{
{
const
int
x2
=
x1+t/2
;
const
int
x2
=
x1+t/2
;
float2
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7,
a8,
a9
;
CT
a0,
a1,
a2,
a3,
a4,
a5,
a6,
a7,
a8,
a9
;
if
(
x1
<
t/2
)
if
(
x1
<
t/2
)
{
{
...
@@ -523,32 +531,32 @@ void fft_radix5_B2(__local float2* smem, __global const float2* twiddles, const
...
@@ -523,32 +531,32 @@ void fft_radix5_B2(__local float2* smem, __global const float2* twiddles, const
__kernel
void
fft_multi_radix_rows
(
__global
const
uchar*
src_ptr,
int
src_step,
int
src_offset,
int
src_rows,
int
src_cols,
__kernel
void
fft_multi_radix_rows
(
__global
const
uchar*
src_ptr,
int
src_step,
int
src_offset,
int
src_rows,
int
src_cols,
__global
uchar*
dst_ptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
__global
uchar*
dst_ptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
__global
float2*
twiddles_ptr
,
const
int
t
,
const
int
nz
)
__global
CT*
twiddles_ptr,
int
twiddles_step,
int
twiddles_offset
,
const
int
t
,
const
int
nz
)
{
{
const
int
x
=
get_global_id
(
0
)
;
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_group_id
(
1
)
;
const
int
y
=
get_group_id
(
1
)
;
const
int
block_size
=
LOCAL_SIZE/kercn
;
const
int
block_size
=
LOCAL_SIZE/kercn
;
if
(
y
<
nz
)
if
(
y
<
nz
)
{
{
__local
float2
smem[LOCAL_SIZE]
;
__local
CT
smem[LOCAL_SIZE]
;
__global
const
float2*
twiddles
=
(
__global
float2*
)
twiddles_ptr
;
__global
const
CT*
twiddles
=
(
__global
const
CT*
)(
twiddles_ptr
+
twiddles_offset
)
;
const
int
ind
=
x
;
const
int
ind
=
x
;
#
ifdef
IS_1D
#
ifdef
IS_1D
float
scale
=
1.f
/dst_cols
;
FT
scale
=
(
FT
)
1
/dst_cols
;
#
else
#
else
float
scale
=
1.f
/
(
dst_cols*dst_rows
)
;
FT
scale
=
(
FT
)
1
/
(
dst_cols*dst_rows
)
;
#
endif
#
endif
#
ifdef
COMPLEX_INPUT
#
ifdef
COMPLEX_INPUT
__global
const
float2*
src
=
(
__global
const
float2*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
x,
(
int
)(
sizeof
(
float
)
*2
)
,
src_offset
)))
;
__global
const
CT*
src
=
(
__global
const
CT*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
x,
(
int
)(
sizeof
(
CT
)
)
,
src_offset
)))
;
#
pragma
unroll
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
for
(
int
i=0
; i<kercn; i++)
smem[x+i*block_size]
=
src[i*block_size]
;
smem[x+i*block_size]
=
src[i*block_size]
;
#
else
#
else
__global
const
float*
src
=
(
__global
const
float*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
x,
(
int
)
sizeof
(
float
)
,
src_offset
)))
;
__global
const
FT*
src
=
(
__global
const
FT*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
x,
(
int
)
sizeof
(
FT
)
,
src_offset
)))
;
#
pragma
unroll
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
for
(
int
i=0
; i<kercn; i++)
smem[x+i*block_size]
=
(
float2
)(
src[i*block_size],
0.f
)
;
smem[x+i*block_size]
=
(
CT
)(
src[i*block_size],
0.f
)
;
#
endif
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
@@ -562,14 +570,14 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
...
@@ -562,14 +570,14 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
const
int
cols
=
dst_cols
;
const
int
cols
=
dst_cols
;
#
endif
#
endif
__global
float2*
dst
=
(
__global
float2
*
)(
dst_ptr
+
mad24
(
y,
dst_step,
dst_offset
))
;
__global
CT*
dst
=
(
__global
CT
*
)(
dst_ptr
+
mad24
(
y,
dst_step,
dst_offset
))
;
#
pragma
unroll
#
pragma
unroll
for
(
int
i=x
; i<cols; i+=block_size)
for
(
int
i=x
; i<cols; i+=block_size)
dst[i]
=
SCALE_VAL
(
smem[i],
scale
)
;
dst[i]
=
SCALE_VAL
(
smem[i],
scale
)
;
#
else
#
else
//
pack
row
to
CCS
//
pack
row
to
CCS
__local
float*
smem_1cn
=
(
__local
float
*
)
smem
;
__local
FT*
smem_1cn
=
(
__local
FT
*
)
smem
;
__global
float*
dst
=
(
__global
float
*
)(
dst_ptr
+
mad24
(
y,
dst_step,
dst_offset
))
;
__global
FT*
dst
=
(
__global
FT
*
)(
dst_ptr
+
mad24
(
y,
dst_step,
dst_offset
))
;
for
(
int
i=x
; i<dst_cols-1; i+=block_size)
for
(
int
i=x
; i<dst_cols-1; i+=block_size)
dst[i+1]
=
SCALE_VAL
(
smem_1cn[i+2],
scale
)
;
dst[i+1]
=
SCALE_VAL
(
smem_1cn[i+2],
scale
)
;
if
(
x
==
0
)
if
(
x
==
0
)
...
@@ -580,9 +588,9 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
...
@@ -580,9 +588,9 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
{
{
//
fill
with
zero
other
rows
//
fill
with
zero
other
rows
#
ifdef
COMPLEX_OUTPUT
#
ifdef
COMPLEX_OUTPUT
__global
float2*
dst
=
(
__global
float2
*
)(
dst_ptr
+
mad24
(
y,
dst_step,
dst_offset
))
;
__global
CT*
dst
=
(
__global
CT
*
)(
dst_ptr
+
mad24
(
y,
dst_step,
dst_offset
))
;
#
else
#
else
__global
float*
dst
=
(
__global
float
*
)(
dst_ptr
+
mad24
(
y,
dst_step,
dst_offset
))
;
__global
FT*
dst
=
(
__global
FT
*
)(
dst_ptr
+
mad24
(
y,
dst_step,
dst_offset
))
;
#
endif
#
endif
#
pragma
unroll
#
pragma
unroll
for
(
int
i=x
; i<dst_cols; i+=block_size)
for
(
int
i=x
; i<dst_cols; i+=block_size)
...
@@ -592,60 +600,60 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
...
@@ -592,60 +600,60 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
__kernel
void
fft_multi_radix_cols
(
__global
const
uchar*
src_ptr,
int
src_step,
int
src_offset,
int
src_rows,
int
src_cols,
__kernel
void
fft_multi_radix_cols
(
__global
const
uchar*
src_ptr,
int
src_step,
int
src_offset,
int
src_rows,
int
src_cols,
__global
uchar*
dst_ptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
__global
uchar*
dst_ptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
__global
float2*
twiddles_ptr
,
const
int
t
,
const
int
nz
)
__global
CT*
twiddles_ptr,
int
twiddles_step,
int
twiddles_offset
,
const
int
t
,
const
int
nz
)
{
{
const
int
x
=
get_group_id
(
0
)
;
const
int
x
=
get_group_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
const
int
y
=
get_global_id
(
1
)
;
if
(
x
<
nz
)
if
(
x
<
nz
)
{
{
__local
float2
smem[LOCAL_SIZE]
;
__local
CT
smem[LOCAL_SIZE]
;
__global
const
uchar*
src
=
src_ptr
+
mad24
(
y,
src_step,
mad24
(
x,
(
int
)(
sizeof
(
float
)
*2
)
,
src_offset
))
;
__global
const
uchar*
src
=
src_ptr
+
mad24
(
y,
src_step,
mad24
(
x,
(
int
)(
sizeof
(
CT
)
)
,
src_offset
))
;
__global
const
float2*
twiddles
=
(
__global
float2*
)
twiddles_ptr
;
__global
const
CT*
twiddles
=
(
__global
const
CT*
)(
twiddles_ptr
+
twiddles_offset
)
;
const
int
ind
=
y
;
const
int
ind
=
y
;
const
int
block_size
=
LOCAL_SIZE/kercn
;
const
int
block_size
=
LOCAL_SIZE/kercn
;
float
scale
=
1.f/
(
dst_rows*dst_cols
)
;
FT
scale
=
1.f/
(
dst_rows*dst_cols
)
;
#
pragma
unroll
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
for
(
int
i=0
; i<kercn; i++)
smem[y+i*block_size]
=
*
((
__global
const
float2
*
)(
src
+
i*block_size*src_step
))
;
smem[y+i*block_size]
=
*
((
__global
const
CT
*
)(
src
+
i*block_size*src_step
))
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
RADIX_PROCESS
;
RADIX_PROCESS
;
#
ifdef
COMPLEX_OUTPUT
#
ifdef
COMPLEX_OUTPUT
__global
uchar*
dst
=
dst_ptr
+
mad24
(
y,
dst_step,
mad24
(
x,
(
int
)(
sizeof
(
float
)
*2
)
,
dst_offset
))
;
__global
uchar*
dst
=
dst_ptr
+
mad24
(
y,
dst_step,
mad24
(
x,
(
int
)(
sizeof
(
CT
)
)
,
dst_offset
))
;
#
pragma
unroll
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
for
(
int
i=0
; i<kercn; i++)
*
((
__global
float2
*
)(
dst
+
i*block_size*dst_step
))
=
SCALE_VAL
(
smem[y
+
i*block_size],
scale
)
;
*
((
__global
CT
*
)(
dst
+
i*block_size*dst_step
))
=
SCALE_VAL
(
smem[y
+
i*block_size],
scale
)
;
#
else
#
else
if
(
x
==
0
)
if
(
x
==
0
)
{
{
//
pack
first
column
to
CCS
//
pack
first
column
to
CCS
__local
float*
smem_1cn
=
(
__local
float
*
)
smem
;
__local
FT*
smem_1cn
=
(
__local
FT
*
)
smem
;
__global
uchar*
dst
=
dst_ptr
+
mad24
(
y+1,
dst_step,
dst_offset
)
;
__global
uchar*
dst
=
dst_ptr
+
mad24
(
y+1,
dst_step,
dst_offset
)
;
for
(
int
i=y
; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
for
(
int
i=y
; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
*
((
__global
float
*
)
dst
)
=
SCALE_VAL
(
smem_1cn[i+2],
scale
)
;
*
((
__global
FT
*
)
dst
)
=
SCALE_VAL
(
smem_1cn[i+2],
scale
)
;
if
(
y
==
0
)
if
(
y
==
0
)
*
((
__global
float
*
)
(
dst_ptr
+
dst_offset
))
=
SCALE_VAL
(
smem_1cn[0],
scale
)
;
*
((
__global
FT
*
)
(
dst_ptr
+
dst_offset
))
=
SCALE_VAL
(
smem_1cn[0],
scale
)
;
}
}
else
if
(
x
==
(
dst_cols+1
)
/2
)
else
if
(
x
==
(
dst_cols+1
)
/2
)
{
{
//
pack
last
column
to
CCS
(
if
needed
)
//
pack
last
column
to
CCS
(
if
needed
)
__local
float*
smem_1cn
=
(
__local
float
*
)
smem
;
__local
FT*
smem_1cn
=
(
__local
FT
*
)
smem
;
__global
uchar*
dst
=
dst_ptr
+
mad24
(
dst_cols-1,
(
int
)
sizeof
(
float
)
,
mad24
(
y+1,
dst_step,
dst_offset
))
;
__global
uchar*
dst
=
dst_ptr
+
mad24
(
dst_cols-1,
(
int
)
sizeof
(
FT
)
,
mad24
(
y+1,
dst_step,
dst_offset
))
;
for
(
int
i=y
; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
for
(
int
i=y
; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
*
((
__global
float
*
)
dst
)
=
SCALE_VAL
(
smem_1cn[i+2],
scale
)
;
*
((
__global
FT
*
)
dst
)
=
SCALE_VAL
(
smem_1cn[i+2],
scale
)
;
if
(
y
==
0
)
if
(
y
==
0
)
*
((
__global
float*
)
(
dst_ptr
+
mad24
(
dst_cols-1,
(
int
)
sizeof
(
float
)
,
dst_offset
)))
=
SCALE_VAL
(
smem_1cn[0],
scale
)
;
*
((
__global
FT*
)
(
dst_ptr
+
mad24
(
dst_cols-1,
(
int
)
sizeof
(
FT
)
,
dst_offset
)))
=
SCALE_VAL
(
smem_1cn[0],
scale
)
;
}
}
else
else
{
{
__global
uchar*
dst
=
dst_ptr
+
mad24
(
x,
(
int
)
sizeof
(
float
)
*2,
mad24
(
y,
dst_step,
dst_offset
-
(
int
)
sizeof
(
float
)))
;
__global
uchar*
dst
=
dst_ptr
+
mad24
(
x,
(
int
)
sizeof
(
FT
)
*2,
mad24
(
y,
dst_step,
dst_offset
-
(
int
)
sizeof
(
FT
)))
;
#
pragma
unroll
#
pragma
unroll
for
(
int
i=y
; i<dst_rows; i+=block_size, dst+=block_size*dst_step)
for
(
int
i=y
; i<dst_rows; i+=block_size, dst+=block_size*dst_step)
vstore2
(
SCALE_VAL
(
smem[i],
scale
)
,
0
,
(
__global
float
*
)
dst
)
;
vstore2
(
SCALE_VAL
(
smem[i],
scale
)
,
0
,
(
__global
FT
*
)
dst
)
;
}
}
#
endif
#
endif
}
}
...
@@ -653,25 +661,25 @@ __kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
...
@@ -653,25 +661,25 @@ __kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
__kernel
void
ifft_multi_radix_rows
(
__global
const
uchar*
src_ptr,
int
src_step,
int
src_offset,
int
src_rows,
int
src_cols,
__kernel
void
ifft_multi_radix_rows
(
__global
const
uchar*
src_ptr,
int
src_step,
int
src_offset,
int
src_rows,
int
src_cols,
__global
uchar*
dst_ptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
__global
uchar*
dst_ptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
__global
float2*
twiddles_ptr
,
const
int
t
,
const
int
nz
)
__global
CT*
twiddles_ptr,
int
twiddles_step,
int
twiddles_offset
,
const
int
t
,
const
int
nz
)
{
{
const
int
x
=
get_global_id
(
0
)
;
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_group_id
(
1
)
;
const
int
y
=
get_group_id
(
1
)
;
const
int
block_size
=
LOCAL_SIZE/kercn
;
const
int
block_size
=
LOCAL_SIZE/kercn
;
#
ifdef
IS_1D
#
ifdef
IS_1D
const
float
scale
=
1.f
/dst_cols
;
const
FT
scale
=
(
FT
)
1
/dst_cols
;
#
else
#
else
const
float
scale
=
1.f
/
(
dst_cols*dst_rows
)
;
const
FT
scale
=
(
FT
)
1
/
(
dst_cols*dst_rows
)
;
#
endif
#
endif
if
(
y
<
nz
)
if
(
y
<
nz
)
{
{
__local
float2
smem[LOCAL_SIZE]
;
__local
CT
smem[LOCAL_SIZE]
;
__global
const
float2*
twiddles
=
(
__global
float2*
)
twiddles_ptr
;
__global
const
CT*
twiddles
=
(
__global
const
CT*
)(
twiddles_ptr
+
twiddles_offset
)
;
const
int
ind
=
x
;
const
int
ind
=
x
;
#
if
defined
(
COMPLEX_INPUT
)
&&
!defined
(
NO_CONJUGATE
)
#
if
defined
(
COMPLEX_INPUT
)
&&
!defined
(
NO_CONJUGATE
)
__global
const
float2*
src
=
(
__global
const
float2*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
x,
(
int
)(
sizeof
(
float
)
*2
)
,
src_offset
)))
;
__global
const
CT*
src
=
(
__global
const
CT*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
x,
(
int
)(
sizeof
(
CT
)
)
,
src_offset
)))
;
#
pragma
unroll
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
for
(
int
i=0
; i<kercn; i++)
{
{
...
@@ -681,7 +689,7 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
...
@@ -681,7 +689,7 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
#
else
#
else
#
if
!defined
(
REAL_INPUT
)
&&
defined
(
NO_CONJUGATE
)
#
if
!defined
(
REAL_INPUT
)
&&
defined
(
NO_CONJUGATE
)
__global
const
float2*
src
=
(
__global
const
float2*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
2
,
(
int
)
sizeof
(
float
)
,
src_offset
)))
;
__global
const
CT*
src
=
(
__global
const
CT*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
2
,
(
int
)
sizeof
(
FT
)
,
src_offset
)))
;
#
pragma
unroll
#
pragma
unroll
for
(
int
i=x
; i<(LOCAL_SIZE-1)/2; i+=block_size)
for
(
int
i=x
; i<(LOCAL_SIZE-1)/2; i+=block_size)
...
@@ -695,7 +703,7 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
...
@@ -695,7 +703,7 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
#
pragma
unroll
#
pragma
unroll
for
(
int
i=x
; i<(LOCAL_SIZE-1)/2; i+=block_size)
for
(
int
i=x
; i<(LOCAL_SIZE-1)/2; i+=block_size)
{
{
float2
src
=
vload2
(
0
,
(
__global
const
float*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
2*i+1,
(
int
)
sizeof
(
float
)
,
src_offset
))))
;
CT
src
=
vload2
(
0
,
(
__global
const
FT*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
2*i+1,
(
int
)
sizeof
(
FT
)
,
src_offset
))))
;
smem[i+1].x
=
src.x
;
smem[i+1].x
=
src.x
;
smem[i+1].y
=
-src.y
;
smem[i+1].y
=
-src.y
;
...
@@ -706,7 +714,7 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
...
@@ -706,7 +714,7 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
if
(
x==0
)
if
(
x==0
)
{
{
smem[0].x
=
*
(
__global
const
float
*
)(
src_ptr
+
mad24
(
y,
src_step,
src_offset
))
;
smem[0].x
=
*
(
__global
const
FT
*
)(
src_ptr
+
mad24
(
y,
src_step,
src_offset
))
;
smem[0].y
=
0.f
;
smem[0].y
=
0.f
;
if
(
LOCAL_SIZE
%
2
==0
)
if
(
LOCAL_SIZE
%
2
==0
)
...
@@ -714,7 +722,7 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
...
@@ -714,7 +722,7 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
#
if
!defined
(
REAL_INPUT
)
&&
defined
(
NO_CONJUGATE
)
#
if
!defined
(
REAL_INPUT
)
&&
defined
(
NO_CONJUGATE
)
smem[LOCAL_SIZE/2].x
=
src[LOCAL_SIZE/2-1].x
;
smem[LOCAL_SIZE/2].x
=
src[LOCAL_SIZE/2-1].x
;
#
else
#
else
smem[LOCAL_SIZE/2].x
=
*
(
__global
const
float*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
LOCAL_SIZE-1,
(
int
)
sizeof
(
float
)
,
src_offset
)))
;
smem[LOCAL_SIZE/2].x
=
*
(
__global
const
FT*
)(
src_ptr
+
mad24
(
y,
src_step,
mad24
(
LOCAL_SIZE-1,
(
int
)
sizeof
(
FT
)
,
src_offset
)))
;
#
endif
#
endif
smem[LOCAL_SIZE/2].y
=
0.f
;
smem[LOCAL_SIZE/2].y
=
0.f
;
}
}
...
@@ -727,7 +735,7 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
...
@@ -727,7 +735,7 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
//
copy
data
to
dst
//
copy
data
to
dst
#
ifdef
COMPLEX_OUTPUT
#
ifdef
COMPLEX_OUTPUT
__global
float2*
dst
=
(
__global
float*
)(
dst_ptr
+
mad24
(
y,
dst_step,
mad24
(
x,
(
int
)(
sizeof
(
float
)
*2
)
,
dst_offset
)))
;
__global
CT*
dst
=
(
__global
CT*
)(
dst_ptr
+
mad24
(
y,
dst_step,
mad24
(
x,
(
int
)(
sizeof
(
CT
)
)
,
dst_offset
)))
;
#
pragma
unroll
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
for
(
int
i=0
; i<kercn; i++)
{
{
...
@@ -735,7 +743,7 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
...
@@ -735,7 +743,7 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
dst[i*block_size].y
=
SCALE_VAL
(
-smem[x
+
i*block_size].y,
scale
)
;
dst[i*block_size].y
=
SCALE_VAL
(
-smem[x
+
i*block_size].y,
scale
)
;
}
}
#
else
#
else
__global
float*
dst
=
(
__global
float*
)(
dst_ptr
+
mad24
(
y,
dst_step,
mad24
(
x,
(
int
)(
sizeof
(
float
))
,
dst_offset
)))
;
__global
FT*
dst
=
(
__global
FT*
)(
dst_ptr
+
mad24
(
y,
dst_step,
mad24
(
x,
(
int
)(
sizeof
(
FT
))
,
dst_offset
)))
;
#
pragma
unroll
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
for
(
int
i=0
; i<kercn; i++)
{
{
...
@@ -747,9 +755,9 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
...
@@ -747,9 +755,9 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
{
{
//
fill
with
zero
other
rows
//
fill
with
zero
other
rows
#
ifdef
COMPLEX_OUTPUT
#
ifdef
COMPLEX_OUTPUT
__global
float2*
dst
=
(
__global
float2
*
)(
dst_ptr
+
mad24
(
y,
dst_step,
dst_offset
))
;
__global
CT*
dst
=
(
__global
CT
*
)(
dst_ptr
+
mad24
(
y,
dst_step,
dst_offset
))
;
#
else
#
else
__global
float*
dst
=
(
__global
float
*
)(
dst_ptr
+
mad24
(
y,
dst_step,
dst_offset
))
;
__global
FT*
dst
=
(
__global
FT
*
)(
dst_ptr
+
mad24
(
y,
dst_step,
dst_offset
))
;
#
endif
#
endif
#
pragma
unroll
#
pragma
unroll
for
(
int
i=x
; i<dst_cols; i+=block_size)
for
(
int
i=x
; i<dst_cols; i+=block_size)
...
@@ -759,7 +767,7 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
...
@@ -759,7 +767,7 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
__kernel
void
ifft_multi_radix_cols
(
__global
const
uchar*
src_ptr,
int
src_step,
int
src_offset,
int
src_rows,
int
src_cols,
__kernel
void
ifft_multi_radix_cols
(
__global
const
uchar*
src_ptr,
int
src_step,
int
src_offset,
int
src_rows,
int
src_cols,
__global
uchar*
dst_ptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
__global
uchar*
dst_ptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols,
__global
float2*
twiddles_ptr
,
const
int
t
,
const
int
nz
)
__global
CT*
twiddles_ptr,
int
twiddles_step,
int
twiddles_offset
,
const
int
t
,
const
int
nz
)
{
{
const
int
x
=
get_group_id
(
0
)
;
const
int
x
=
get_group_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
const
int
y
=
get_global_id
(
1
)
;
...
@@ -767,17 +775,17 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
...
@@ -767,17 +775,17 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
#
ifdef
COMPLEX_INPUT
#
ifdef
COMPLEX_INPUT
if
(
x
<
nz
)
if
(
x
<
nz
)
{
{
__local
float2
smem[LOCAL_SIZE]
;
__local
CT
smem[LOCAL_SIZE]
;
__global
const
uchar*
src
=
src_ptr
+
mad24
(
y,
src_step,
mad24
(
x,
(
int
)(
sizeof
(
float
)
*2
)
,
src_offset
))
;
__global
const
uchar*
src
=
src_ptr
+
mad24
(
y,
src_step,
mad24
(
x,
(
int
)(
sizeof
(
CT
)
)
,
src_offset
))
;
__global
uchar*
dst
=
dst_ptr
+
mad24
(
y,
dst_step,
mad24
(
x,
(
int
)(
sizeof
(
float
)
*2
)
,
dst_offset
))
;
__global
uchar*
dst
=
dst_ptr
+
mad24
(
y,
dst_step,
mad24
(
x,
(
int
)(
sizeof
(
CT
)
)
,
dst_offset
))
;
__global
const
float2*
twiddles
=
(
__global
float2*
)
twiddles_ptr
;
__global
const
CT*
twiddles
=
(
__global
const
CT*
)(
twiddles_ptr
+
twiddles_offset
)
;
const
int
ind
=
y
;
const
int
ind
=
y
;
const
int
block_size
=
LOCAL_SIZE/kercn
;
const
int
block_size
=
LOCAL_SIZE/kercn
;
#
pragma
unroll
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
for
(
int
i=0
; i<kercn; i++)
{
{
float2
temp
=
*
((
__global
const
float2
*
)(
src
+
i*block_size*src_step
))
;
CT
temp
=
*
((
__global
const
CT
*
)(
src
+
i*block_size*src_step
))
;
smem[y+i*block_size].x
=
temp.x
;
smem[y+i*block_size].x
=
temp.x
;
smem[y+i*block_size].y
=
-temp.y
;
smem[y+i*block_size].y
=
-temp.y
;
}
}
...
@@ -790,7 +798,7 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
...
@@ -790,7 +798,7 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
#
pragma
unroll
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
for
(
int
i=0
; i<kercn; i++)
{
{
__global
float2*
res
=
(
__global
float2
*
)(
dst
+
i*block_size*dst_step
)
;
__global
CT*
res
=
(
__global
CT
*
)(
dst
+
i*block_size*dst_step
)
;
res[0].x
=
smem[y
+
i*block_size].x
;
res[0].x
=
smem[y
+
i*block_size].x
;
res[0].y
=
-smem[y
+
i*block_size].y
;
res[0].y
=
-smem[y
+
i*block_size].y
;
}
}
...
@@ -798,22 +806,22 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
...
@@ -798,22 +806,22 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
#
else
#
else
if
(
x
<
nz
)
if
(
x
<
nz
)
{
{
__global
const
float2*
twiddles
=
(
__global
float2*
)
twiddles_ptr
;
__global
const
CT*
twiddles
=
(
__global
const
CT*
)(
twiddles_ptr
+
twiddles_offset
)
;
const
int
ind
=
y
;
const
int
ind
=
y
;
const
int
block_size
=
LOCAL_SIZE/kercn
;
const
int
block_size
=
LOCAL_SIZE/kercn
;
__local
float2
smem[LOCAL_SIZE]
;
__local
CT
smem[LOCAL_SIZE]
;
#
ifdef
EVEN
#
ifdef
EVEN
if
(
x!=0
&&
(
x!=
(
nz-1
)))
if
(
x!=0
&&
(
x!=
(
nz-1
)))
#
else
#
else
if
(
x!=0
)
if
(
x!=0
)
#
endif
#
endif
{
{
__global
const
uchar*
src
=
src_ptr
+
mad24
(
y,
src_step,
mad24
(
2*x-1,
(
int
)
sizeof
(
float
)
,
src_offset
))
;
__global
const
uchar*
src
=
src_ptr
+
mad24
(
y,
src_step,
mad24
(
2*x-1,
(
int
)
sizeof
(
FT
)
,
src_offset
))
;
#
pragma
unroll
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
for
(
int
i=0
; i<kercn; i++)
{
{
float2
temp
=
vload2
(
0
,
(
__global
const
float
*
)(
src
+
i*block_size*src_step
))
;
CT
temp
=
vload2
(
0
,
(
__global
const
FT
*
)(
src
+
i*block_size*src_step
))
;
smem[y+i*block_size].x
=
temp.x
;
smem[y+i*block_size].x
=
temp.x
;
smem[y+i*block_size].y
=
-temp.y
;
smem[y+i*block_size].y
=
-temp.y
;
}
}
...
@@ -821,8 +829,8 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
...
@@ -821,8 +829,8 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
else
else
{
{
int
ind
=
x==0
?
0:
2*x-1
;
int
ind
=
x==0
?
0:
2*x-1
;
__global
const
float*
src
=
(
__global
const
float*
)(
src_ptr
+
mad24
(
1
,
src_step,
mad24
(
ind,
(
int
)
sizeof
(
float
)
,
src_offset
)))
;
__global
const
FT*
src
=
(
__global
const
FT*
)(
src_ptr
+
mad24
(
1
,
src_step,
mad24
(
ind,
(
int
)
sizeof
(
FT
)
,
src_offset
)))
;
int
step
=
src_step/
(
int
)
sizeof
(
float
)
;
int
step
=
src_step/
(
int
)
sizeof
(
FT
)
;
#
pragma
unroll
#
pragma
unroll
for
(
int
i=y
; i<(LOCAL_SIZE-1)/2; i+=block_size)
for
(
int
i=y
; i<(LOCAL_SIZE-1)/2; i+=block_size)
...
@@ -835,7 +843,7 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
...
@@ -835,7 +843,7 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
}
}
if
(
y==0
)
if
(
y==0
)
{
{
smem[0].x
=
*
(
__global
const
float*
)(
src_ptr
+
mad24
(
ind,
(
int
)
sizeof
(
float
)
,
src_offset
))
;
smem[0].x
=
*
(
__global
const
FT*
)(
src_ptr
+
mad24
(
ind,
(
int
)
sizeof
(
FT
)
,
src_offset
))
;
smem[0].y
=
0.f
;
smem[0].y
=
0.f
;
if
(
LOCAL_SIZE
%
2
==0
)
if
(
LOCAL_SIZE
%
2
==0
)
...
@@ -850,12 +858,12 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
...
@@ -850,12 +858,12 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
RADIX_PROCESS
;
RADIX_PROCESS
;
//
copy
data
to
dst
//
copy
data
to
dst
__global
uchar*
dst
=
dst_ptr
+
mad24
(
y,
dst_step,
mad24
(
x,
(
int
)(
sizeof
(
float2
))
,
dst_offset
))
;
__global
uchar*
dst
=
dst_ptr
+
mad24
(
y,
dst_step,
mad24
(
x,
(
int
)(
sizeof
(
CT
))
,
dst_offset
))
;
#
pragma
unroll
#
pragma
unroll
for
(
int
i=0
; i<kercn; i++)
for
(
int
i=0
; i<kercn; i++)
{
{
__global
float2*
res
=
(
__global
float2
*
)(
dst
+
i*block_size*dst_step
)
;
__global
CT*
res
=
(
__global
CT
*
)(
dst
+
i*block_size*dst_step
)
;
res[0].x
=
smem[y
+
i*block_size].x
;
res[0].x
=
smem[y
+
i*block_size].x
;
res[0].y
=
-smem[y
+
i*block_size].y
;
res[0].y
=
-smem[y
+
i*block_size].y
;
}
}
...
...
modules/core/test/ocl/test_dft.cpp
浏览文件 @
4d9d7e6d
...
@@ -62,7 +62,7 @@ namespace ocl {
...
@@ -62,7 +62,7 @@ namespace ocl {
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
// Dft
// Dft
PARAM_TEST_CASE
(
Dft
,
cv
::
Size
,
OCL_FFT_TYPE
,
bool
,
bool
,
bool
,
bool
)
PARAM_TEST_CASE
(
Dft
,
cv
::
Size
,
OCL_FFT_TYPE
,
MatDepth
,
bool
,
bool
,
bool
,
bool
)
{
{
cv
::
Size
dft_size
;
cv
::
Size
dft_size
;
int
dft_flags
,
depth
,
cn
,
dft_type
;
int
dft_flags
,
depth
,
cn
,
dft_type
;
...
@@ -76,7 +76,7 @@ PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool, bool)
...
@@ -76,7 +76,7 @@ PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool, bool)
{
{
dft_size
=
GET_PARAM
(
0
);
dft_size
=
GET_PARAM
(
0
);
dft_type
=
GET_PARAM
(
1
);
dft_type
=
GET_PARAM
(
1
);
depth
=
CV_32F
;
depth
=
GET_PARAM
(
2
)
;
dft_flags
=
0
;
dft_flags
=
0
;
switch
(
dft_type
)
switch
(
dft_type
)
...
@@ -87,13 +87,13 @@ PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool, bool)
...
@@ -87,13 +87,13 @@ PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool, bool)
case
C2C
:
dft_flags
|=
cv
::
DFT_COMPLEX_OUTPUT
;
cn
=
2
;
break
;
case
C2C
:
dft_flags
|=
cv
::
DFT_COMPLEX_OUTPUT
;
cn
=
2
;
break
;
}
}
if
(
GET_PARAM
(
2
))
dft_flags
|=
cv
::
DFT_INVERSE
;
if
(
GET_PARAM
(
3
))
if
(
GET_PARAM
(
3
))
dft_flags
|=
cv
::
DFT_
ROWS
;
dft_flags
|=
cv
::
DFT_
INVERSE
;
if
(
GET_PARAM
(
4
))
if
(
GET_PARAM
(
4
))
dft_flags
|=
cv
::
DFT_ROWS
;
if
(
GET_PARAM
(
5
))
dft_flags
|=
cv
::
DFT_SCALE
;
dft_flags
|=
cv
::
DFT_SCALE
;
hint
=
GET_PARAM
(
5
);
hint
=
GET_PARAM
(
6
);
is1d
=
(
dft_flags
&
DFT_ROWS
)
!=
0
||
dft_size
.
height
==
1
;
is1d
=
(
dft_flags
&
DFT_ROWS
)
!=
0
||
dft_size
.
height
==
1
;
}
}
...
@@ -177,6 +177,7 @@ OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(),
...
@@ -177,6 +177,7 @@ OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(),
OCL_INSTANTIATE_TEST_CASE_P
(
Core
,
Dft
,
Combine
(
Values
(
cv
::
Size
(
45
,
72
),
cv
::
Size
(
36
,
36
),
cv
::
Size
(
512
,
1
),
cv
::
Size
(
1280
,
768
)),
OCL_INSTANTIATE_TEST_CASE_P
(
Core
,
Dft
,
Combine
(
Values
(
cv
::
Size
(
45
,
72
),
cv
::
Size
(
36
,
36
),
cv
::
Size
(
512
,
1
),
cv
::
Size
(
1280
,
768
)),
Values
((
OCL_FFT_TYPE
)
R2C
,
(
OCL_FFT_TYPE
)
C2C
,
(
OCL_FFT_TYPE
)
R2R
,
(
OCL_FFT_TYPE
)
C2R
),
Values
((
OCL_FFT_TYPE
)
R2C
,
(
OCL_FFT_TYPE
)
C2C
,
(
OCL_FFT_TYPE
)
R2R
,
(
OCL_FFT_TYPE
)
C2R
),
Values
(
CV_32F
,
CV_64F
),
Bool
(),
// DFT_INVERSE
Bool
(),
// DFT_INVERSE
Bool
(),
// DFT_ROWS
Bool
(),
// DFT_ROWS
Bool
(),
// DFT_SCALE
Bool
(),
// DFT_SCALE
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录