提交 6c8b6bd0 编写于 作者: A Alexander Karsakov

Added packing to CCS format

上级 ed07241f
...@@ -2083,20 +2083,19 @@ struct OCL_FftPlan ...@@ -2083,20 +2083,19 @@ struct OCL_FftPlan
int dft_size; int dft_size;
int flags; int flags;
bool status;
OCL_FftPlan(int _size, int _flags): dft_size(_size), flags(_flags) OCL_FftPlan(int _size, int _flags): dft_size(_size), flags(_flags), status(true)
{ {
int min_radix = INT_MAX; int min_radix = INT_MAX;
std::vector<int> radixes, blocks; std::vector<int> radixes, blocks;
ocl_getRadixes(dft_size, radixes, blocks, min_radix); ocl_getRadixes(dft_size, radixes, blocks, min_radix);
thread_count = (dft_size + min_radix-1) / min_radix; thread_count = (dft_size + min_radix-1) / min_radix;
printf("cols: %d - ", dft_size); if (thread_count > ocl::Device::getDefault().maxWorkGroupSize())
for (int i=0; i<radixes.size(); i++)
{ {
printf("%d ", radixes[i]); status = false;
return;
} }
printf("min radix - %d\n", min_radix);
// generate string with radix calls // generate string with radix calls
String radix_processing; String radix_processing;
...@@ -2142,6 +2141,9 @@ struct OCL_FftPlan ...@@ -2142,6 +2141,9 @@ struct OCL_FftPlan
bool enqueueTransform(InputArray _src, OutputArray _dst, int dft_size, int flags, bool rows = true) const bool enqueueTransform(InputArray _src, OutputArray _dst, int dft_size, int flags, bool rows = true) const
{ {
if (!status)
return false;
UMat src = _src.getUMat(); UMat src = _src.getUMat();
UMat dst = _dst.getUMat(); UMat dst = _dst.getUMat();
...@@ -2162,11 +2164,14 @@ struct OCL_FftPlan ...@@ -2162,11 +2164,14 @@ struct OCL_FftPlan
kernel_name = "fft_multi_radix_cols"; kernel_name = "fft_multi_radix_cols";
} }
bool is1d = (flags & DFT_ROWS) != 0 || dft_size == 1;
String options = buildOptions; String options = buildOptions;
if (src.channels() == 1) if (src.channels() == 1)
options += " -D REAL_INPUT"; options += " -D REAL_INPUT";
if (dst.channels() == 1) if (dst.channels() == 1)
options += " -D CCS_OUTPUT"; options += " -D CCS_OUTPUT";
if ((is1d && src.channels() == 1) || (rows && (flags & DFT_REAL_OUTPUT)))
options += " -D NO_CONJUGATE";
ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, options); ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, options);
if (k.empty()) if (k.empty())
...@@ -2219,61 +2224,16 @@ protected: ...@@ -2219,61 +2224,16 @@ protected:
std::vector<OCL_FftPlan*> planStorage; std::vector<OCL_FftPlan*> planStorage;
}; };
static bool ocl_packToCCS(InputArray _src, OutputArray _dst, int flags)
{
UMat src = _src.getUMat();
_dst.create(src.size(), CV_32F);
UMat dst = _dst.getUMat();
src = src.reshape(1);
if ((flags & DFT_ROWS) == 0 && src.rows > 1)
{
// pack to CCS by rows
if (dst.cols > 2)
src.colRange(2, dst.cols + (dst.cols % 2)).copyTo(dst.colRange(1, dst.cols-1 + (dst.cols % 2)));
Mat dst_mat = dst.getMat(ACCESS_WRITE);
Mat buffer_mat = src.getMat(ACCESS_READ);
dst_mat.at<float>(0,0) = buffer_mat.at<float>(0,0);
dst_mat.at<float>(dst_mat.rows-1,0) = buffer_mat.at<float>(src.rows/2,0);
for (int i=1; i<dst_mat.rows-1; i+=2)
{
dst_mat.at<float>(i,0) = buffer_mat.at<float>((i+1)/2,0);
dst_mat.at<float>(i+1,0) = buffer_mat.at<float>((i+1)/2,1);
}
if (dst_mat.cols % 2 == 0)
{
dst_mat.at<float>(0,dst_mat.cols-1) = buffer_mat.at<float>(0,src.cols/2);
dst_mat.at<float>(dst_mat.rows-1,dst_mat.cols-1) = buffer_mat.at<float>(src.rows/2,src.cols/2);
for (int i=1; i<dst_mat.rows-1; i+=2)
{
dst_mat.at<float>(i,dst_mat.cols-1) = buffer_mat.at<float>((i+1)/2,src.cols/2);
dst_mat.at<float>(i+1,dst_mat.cols-1) = buffer_mat.at<float>((i+1)/2,src.cols/2+1);
}
}
}
else
{
// pack to CCS each row
src.colRange(0,1).copyTo(dst.colRange(0,1));
src.colRange(2, (dst.cols+1)).copyTo(dst.colRange(1, dst.cols));
}
return true;
}
static bool ocl_dft_C2C_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags) static bool ocl_dft_C2C_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags)
{ {
const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols(), flags); const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols(), flags);
return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, true); return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, true);
} }
static bool ocl_dft_C2C_cols(InputArray _src, OutputArray _dst, int flags) static bool ocl_dft_C2C_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags)
{ {
const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows(), flags); const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows(), flags);
return plan->enqueueTransform(_src, _dst, _src.cols(), flags, false); return plan->enqueueTransform(_src, _dst, nonzero_cols, flags, false);
} }
static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows) static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows)
...@@ -2315,6 +2275,8 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro ...@@ -2315,6 +2275,8 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro
// Forward Complex to CCS not supported // Forward Complex to CCS not supported
if (complex_input && real_output && !inv) if (complex_input && real_output && !inv)
{ {
flags ^= DFT_REAL_OUTPUT;
flags |= DFT_COMPLEX_OUTPUT;
real_output = 0; real_output = 0;
complex_output = 1; complex_output = 1;
} }
...@@ -2344,23 +2306,21 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro ...@@ -2344,23 +2306,21 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro
if (complex_output) if (complex_output)
{ {
if (real_input && is1d && !inv) _dst.create(src.size(), CV_32FC2);
output.create(src.size(), CV_32FC2); output = _dst.getUMat();
else }
{ else
_dst.create(src.size(), CV_32FC2);
output = _dst.getUMat();
}
} else
{ {
// CCS
if (is1d) if (is1d)
{ {
_dst.create(src.size(), CV_32FC1); _dst.create(src.size(), CV_32FC1);
output = _dst.getUMat(); output = _dst.getUMat();
} }
else else
{
_dst.create(src.size(), CV_32FC1);
output.create(src.size(), CV_32FC2); output.create(src.size(), CV_32FC2);
}
} }
if (!ocl_dft_C2C_rows(input, output, nonzero_rows, flags)) if (!ocl_dft_C2C_rows(input, output, nonzero_rows, flags))
...@@ -2368,32 +2328,13 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro ...@@ -2368,32 +2328,13 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro
if (!is1d) if (!is1d)
{ {
if (!ocl_dft_C2C_cols(output, output, flags)) int nonzero_cols = real_input && real_output ? output.cols/2 + 1 : output.cols;
if (!ocl_dft_C2C_cols(output, _dst, nonzero_cols, flags))
return false; return false;
} } else
if (complex_output)
{ {
if (real_input && is1d && !inv) _dst.assign(output);
_dst.assign(output.colRange(0, output.cols/2+1));
else
_dst.assign(output);
} }
else
{
if (!inv)
{
if (!is1d)
ocl_packToCCS(output, _dst, flags);
else
_dst.assign(output);
}
else
{
// copy real part to dst
}
}
//printf("OCL!\n");
return true; return true;
} }
...@@ -2435,7 +2376,6 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) ...@@ -2435,7 +2376,6 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
int elem_size = (int)src.elemSize1(), complex_elem_size = elem_size*2; int elem_size = (int)src.elemSize1(), complex_elem_size = elem_size*2;
int factors[34]; int factors[34];
bool inplace_transform = false; bool inplace_transform = false;
bool is1d = (flags & DFT_ROWS) != 0 || src.rows == 1;
#ifdef USE_IPP_DFT #ifdef USE_IPP_DFT
AutoBuffer<uchar> ippbuf; AutoBuffer<uchar> ippbuf;
int ipp_norm_flag = !(flags & DFT_SCALE) ? 8 : inv ? 2 : 1; int ipp_norm_flag = !(flags & DFT_SCALE) ? 8 : inv ? 2 : 1;
...@@ -2444,10 +2384,7 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) ...@@ -2444,10 +2384,7 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 ); CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 );
if( !inv && src.channels() == 1 && (flags & DFT_COMPLEX_OUTPUT) ) if( !inv && src.channels() == 1 && (flags & DFT_COMPLEX_OUTPUT) )
if (!is1d) _dst.create( src.size(), CV_MAKETYPE(depth, 2) );
_dst.create( src.size(), CV_MAKETYPE(depth, 2) );
else
_dst.create( Size(src.cols/2+1, src.rows), CV_MAKETYPE(depth, 2) );
else if( inv && src.channels() == 2 && (flags & DFT_REAL_OUTPUT) ) else if( inv && src.channels() == 2 && (flags & DFT_REAL_OUTPUT) )
_dst.create( src.size(), depth ); _dst.create( src.size(), depth );
else else
......
...@@ -331,10 +331,17 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, ...@@ -331,10 +331,17 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
RADIX_PROCESS; RADIX_PROCESS;
#ifndef CCS_OUTPUT #ifndef CCS_OUTPUT
__global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset))); #ifdef NO_CONJUGATE
// copy result without complex conjugate
const int cols = dst_cols/2 + 1;
#else
const int cols = dst_cols;
#endif
__global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
#pragma unroll #pragma unroll
for (int i=0; i<kercn; i++) for (int i=x; i<cols; i+=block_size)
dst[i*block_size] = smem[x + i*block_size]; dst[i] = smem[i];
#else #else
// pack row to CCS // pack row to CCS
__local float* smem_1cn = (__local float*) smem; __local float* smem_1cn = (__local float*) smem;
...@@ -358,7 +365,6 @@ __kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step, ...@@ -358,7 +365,6 @@ __kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
{ {
__local float2 smem[LOCAL_SIZE]; __local float2 smem[LOCAL_SIZE];
__global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset)); __global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset));
__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset));
__constant const float2* twiddles = (__constant float2*) twiddles_ptr; __constant const float2* twiddles = (__constant float2*) twiddles_ptr;
const int ind = y; const int ind = y;
const int block_size = LOCAL_SIZE/kercn; const int block_size = LOCAL_SIZE/kercn;
...@@ -370,9 +376,39 @@ __kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step, ...@@ -370,9 +376,39 @@ __kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
RADIX_PROCESS; RADIX_PROCESS;
// copy data to dst #ifndef CCS_OUTPUT
__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset));
#pragma unroll #pragma unroll
for (int i=0; i<kercn; i++) for (int i=0; i<kercn; i++)
*((__global float2*)(dst + i*block_size*src_step)) = smem[y + i*block_size]; *((__global float2*)(dst + i*block_size*dst_step)) = smem[y + i*block_size];
#else
if (x == 0)
{
// pack first column to CCS
__local float* smem_1cn = (__local float*) smem;
__global uchar* dst = dst_ptr + mad24(y+1, dst_step, dst_offset);
for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
*((__global float*) dst) = smem_1cn[i+2];
if (y == 0)
*((__global float*) (dst_ptr + dst_offset)) = smem_1cn[0];
}
else if (x == (dst_cols+1)/2)
{
// pack last column to CCS (if needed)
__local float* smem_1cn = (__local float*) smem;
__global uchar* dst = dst_ptr + mad24(dst_cols-1, (int)sizeof(float), mad24(y+1, dst_step, dst_offset));
for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
*((__global float*) dst) = smem_1cn[i+2];
if (y == 0)
*((__global float*) (dst_ptr + mad24(dst_cols-1, (int)sizeof(float), dst_offset))) = smem_1cn[0];
}
else
{
__global uchar* dst = dst_ptr + mad24(x, (int)sizeof(float)*2, mad24(y, dst_step, dst_offset - (int)sizeof(float)));
#pragma unroll
for (int i=y; i<dst_rows; i+=block_size, dst+=block_size*dst_step)
vstore2(smem[i], 0, (__global float*) dst);
}
#endif
} }
} }
\ No newline at end of file
...@@ -67,6 +67,7 @@ PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool) ...@@ -67,6 +67,7 @@ PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool)
cv::Size dft_size; cv::Size dft_size;
int dft_flags, depth, cn, dft_type; int dft_flags, depth, cn, dft_type;
bool inplace; bool inplace;
bool is1d;
TEST_DECLARE_INPUT_PARAMETER(src); TEST_DECLARE_INPUT_PARAMETER(src);
TEST_DECLARE_OUTPUT_PARAMETER(dst); TEST_DECLARE_OUTPUT_PARAMETER(dst);
...@@ -96,6 +97,7 @@ PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool) ...@@ -96,6 +97,7 @@ PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool)
inplace = GET_PARAM(3); inplace = GET_PARAM(3);
if (inplace && dft_type == 0) if (inplace && dft_type == 0)
inplace = 0; inplace = 0;
is1d = (dft_flags & DFT_ROWS) != 0 || dft_size.height == 1;
} }
void generateTestData() void generateTestData()
...@@ -114,17 +116,23 @@ OCL_TEST_P(Dft, Mat) ...@@ -114,17 +116,23 @@ OCL_TEST_P(Dft, Mat)
OCL_OFF(cv::dft(src, dst, dft_flags)); OCL_OFF(cv::dft(src, dst, dft_flags));
OCL_ON(cv::dft(usrc, udst, dft_flags)); OCL_ON(cv::dft(usrc, udst, dft_flags));
if (dft_type == R2C && is1d)
{
dst = dst(cv::Range(0, dst.rows), cv::Range(0, dst.cols/2 + 1));
udst = udst(cv::Range(0, udst.rows), cv::Range(0, udst.cols/2 + 1));
}
//Mat gpu = udst.getMat(ACCESS_READ); Mat gpu = udst.getMat(ACCESS_READ);
//std::cout << src << std::endl; //std::cout << src << std::endl;
//std::cout << dst << std::endl; //std::cout << dst << std::endl;
//std::cout << gpu << std::endl; //std::cout << gpu << std::endl;
//int cn = udst.channels(); //int cn = udst.channels();
//
//Mat df; //Mat df;
//absdiff(dst, gpu, df); //absdiff(dst, gpu, df);
//std::cout << df << std::endl; //std::cout << Mat_<int>(df) << std::endl;
double eps = src.size().area() * 1e-4; double eps = src.size().area() * 1e-4;
EXPECT_MAT_NEAR(dst, udst, eps); EXPECT_MAT_NEAR(dst, udst, eps);
...@@ -181,9 +189,9 @@ OCL_TEST_P(MulSpectrums, Mat) ...@@ -181,9 +189,9 @@ OCL_TEST_P(MulSpectrums, Mat)
OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool())); OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool()));
OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(6, 1), cv::Size(5, 8), cv::Size(30, 20), OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(6, 4), cv::Size(5, 8), cv::Size(6, 6),
cv::Size(512, 1), cv::Size(1280, 768)), cv::Size(512, 1), cv::Size(1280, 768)),
Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R/*, (OCL_FFT_TYPE) C2R*/), Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R),
Bool(), // DFT_ROWS Bool(), // DFT_ROWS
Bool() // inplace Bool() // inplace
) )
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册