提交 27c1bd27 编写于 作者: K krodyush

Improve ocl cvt_color performance for the following conversions: RGB<->BGR,...

Improve ocl cvt_color performance for the following conversions: RGB<->BGR, RGB->Gray, RGB<->XYZ, RGB<->YCrCb, RGB<->YUV, and mRGBA<->RGBA.
The improvement was done basically by processing more than 1 pixel by each work-item and using vector's operations.
new performance tests were added
上级 26f5d2d6
......@@ -57,9 +57,39 @@ CV_ENUM(ConversionTypes, CV_RGB2GRAY, CV_RGB2BGR, CV_RGB2YUV, CV_YUV2RGB, CV_RGB
CV_HLS2RGB, CV_BGR5652BGR, CV_BGR2BGR565, CV_RGBA2mRGBA, CV_mRGBA2RGBA, CV_YUV2RGB_NV12)
typedef tuple<Size, tuple<ConversionTypes, int, int> > cvtColorParams;
typedef TestBaseWithParam<cvtColorParams> cvtColorFixture;
typedef TestBaseWithParam<cvtColorParams> cvtColorU8Fixture;
typedef TestBaseWithParam<cvtColorParams> cvtColorF32Fixture;
typedef TestBaseWithParam<cvtColorParams> cvtColorU16Fixture;
PERF_TEST_P(cvtColorFixture, cvtColor, testing::Combine(
#define RUN_CVT_PERF_TEST \
cvtColorParams params = GetParam();\
const Size srcSize = get<0>(params);\
const tuple<int, int, int> conversionParams = get<1>(params);\
const int code = get<0>(conversionParams), scn = get<1>(conversionParams),\
dcn = get<2>(conversionParams);\
\
Mat src(srcSize, CV_8UC(scn)), dst(srcSize, CV_8UC(scn));\
declare.in(src, WARMUP_RNG).out(dst);\
\
if (RUN_OCL_IMPL)\
{\
ocl::oclMat oclSrc(src), oclDst(src.size(), dst.type());\
\
OCL_TEST_CYCLE() ocl::cvtColor(oclSrc, oclDst, code, dcn);\
oclDst.download(dst);\
\
SANITY_CHECK(dst, 1);\
}\
else if (RUN_PLAIN_IMPL)\
{\
TEST_CYCLE() cv::cvtColor(src, dst, code, dcn);\
\
SANITY_CHECK(dst);\
}\
else\
OCL_PERF_ELSE\
PERF_TEST_P(cvtColorU8Fixture, cvtColor, testing::Combine(
testing::Values(Size(1000, 1002), Size(2000, 2004), Size(4000, 4008)),
testing::Values(
make_tuple(ConversionTypes(CV_RGB2GRAY), 3, 1),
......@@ -81,30 +111,41 @@ PERF_TEST_P(cvtColorFixture, cvtColor, testing::Combine(
make_tuple(ConversionTypes(CV_YUV2RGB_NV12), 1, 3)
)))
{
cvtColorParams params = GetParam();
const Size srcSize = get<0>(params);
const tuple<int, int, int> conversionParams = get<1>(params);
const int code = get<0>(conversionParams), scn = get<1>(conversionParams),
dcn = get<2>(conversionParams);
Mat src(srcSize, CV_8UC(scn)), dst(srcSize, CV_8UC(scn));
declare.in(src, WARMUP_RNG).out(dst);
if (RUN_OCL_IMPL)
{
ocl::oclMat oclSrc(src), oclDst(src.size(), dst.type());
OCL_TEST_CYCLE() ocl::cvtColor(oclSrc, oclDst, code, dcn);
oclDst.download(dst);
RUN_CVT_PERF_TEST
}
SANITY_CHECK(dst, 1);
}
else if (RUN_PLAIN_IMPL)
{
TEST_CYCLE() cv::cvtColor(src, dst, code, dcn);
PERF_TEST_P(cvtColorF32Fixture, cvtColor, testing::Combine(
testing::Values(Size(1000, 1002), Size(2000, 2004), Size(4000, 4008)),
testing::Values(
make_tuple(ConversionTypes(CV_RGB2GRAY), 3, 1),
make_tuple(ConversionTypes(CV_RGB2BGR), 3, 3),
make_tuple(ConversionTypes(CV_RGB2YUV), 3, 3),
make_tuple(ConversionTypes(CV_YUV2RGB), 3, 3),
make_tuple(ConversionTypes(CV_RGB2YCrCb), 3, 3),
make_tuple(ConversionTypes(CV_YCrCb2RGB), 3, 3),
make_tuple(ConversionTypes(CV_RGB2XYZ), 3, 3),
make_tuple(ConversionTypes(CV_XYZ2RGB), 3, 3),
make_tuple(ConversionTypes(CV_RGB2HSV), 3, 3),
make_tuple(ConversionTypes(CV_HSV2RGB), 3, 3),
make_tuple(ConversionTypes(CV_RGB2HLS), 3, 3),
make_tuple(ConversionTypes(CV_HLS2RGB), 3, 3)
)))
{
RUN_CVT_PERF_TEST
}
SANITY_CHECK(dst);
}
else
OCL_PERF_ELSE
PERF_TEST_P(cvtColorU16Fixture, cvtColor, testing::Combine(
testing::Values(Size(1000, 1002), Size(2000, 2004), Size(4000, 4008)),
testing::Values(
make_tuple(ConversionTypes(CV_RGB2GRAY), 3, 1),
make_tuple(ConversionTypes(CV_RGB2BGR), 3, 3),
make_tuple(ConversionTypes(CV_RGB2YUV), 3, 3),
make_tuple(ConversionTypes(CV_YUV2RGB), 3, 3),
make_tuple(ConversionTypes(CV_RGB2YCrCb), 3, 3),
make_tuple(ConversionTypes(CV_YCrCb2RGB), 3, 3),
make_tuple(ConversionTypes(CV_RGB2XYZ), 3, 3),
make_tuple(ConversionTypes(CV_XYZ2RGB), 3, 3)
)))
{
RUN_CVT_PERF_TEST
}
......@@ -56,8 +56,19 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::
{
int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
int pixels_per_work_item = 1;
std::string build_options = format("-D DEPTH_%d", src.depth());
if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE))
{
if ((src.cols % 4 == 0) && (src.depth() == CV_8U))
pixels_per_work_item = 4;
else if (src.cols % 2 == 0)
pixels_per_work_item = 2;
else
pixels_per_work_item = 1;
}
std::string build_options = format("-D DEPTH_%d -D scn=%d -D bidx=%d -D pixels_per_work_item=%d", src.depth(), src.oclchannels(), bidx, pixels_per_work_item);
if (!additionalOptions.empty())
build_options += additionalOptions;
......@@ -66,7 +77,6 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
......@@ -77,6 +87,73 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::
if (!data2.empty())
args.push_back( make_pair( sizeof(cl_mem) , (void *)&data2.data ));
size_t gt[3] = { dst.cols/pixels_per_work_item, dst.rows, 1 };
#ifdef ANDROID
size_t lt[3] = { 16, 10, 1 };
#else
size_t lt[3] = { 16, 16, 1 };
#endif
openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
}
static void toHSV_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
const std::string & additionalOptions = std::string(),
const oclMat & data1 = oclMat(), const oclMat & data2 = oclMat())
{
int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
std::string build_options = format("-D DEPTH_%d -D scn=%d -D bidx=%d", src.depth(), src.oclchannels(), bidx);
if (!additionalOptions.empty())
build_options += additionalOptions;
vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset ));
if (!data1.empty())
args.push_back( make_pair( sizeof(cl_mem) , (void *)&data1.data ));
if (!data2.empty())
args.push_back( make_pair( sizeof(cl_mem) , (void *)&data2.data ));
size_t gt[3] = { dst.cols, dst.rows, 1 };
#ifdef ANDROID
size_t lt[3] = { 16, 10, 1 };
#else
size_t lt[3] = { 16, 16, 1 };
#endif
openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
}
static void fromGray_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
{
std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx);
if (!additionalOptions.empty())
build_options += additionalOptions;
int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset ));
if (!data.empty())
args.push_back( make_pair( sizeof(cl_mem) , (void *)&data.data ));
size_t gt[3] = { dst.cols, dst.rows, 1 };
#ifdef ANDROID
size_t lt[3] = { 16, 10, 1 };
......@@ -89,7 +166,50 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::
static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
{
std::string build_options = format("-D DEPTH_%d -D dcn=%d", src.depth(), dst.channels());
int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
int pixels_per_work_item = 1;
if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE))
{
if ((src.cols % 4 == 0) && (src.depth() == CV_8U))
pixels_per_work_item = 4;
else if (src.cols % 2 == 0)
pixels_per_work_item = 2;
else
pixels_per_work_item = 1;
}
std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d -D pixels_per_work_item=%d", src.depth(), dst.channels(), bidx, pixels_per_work_item);
if (!additionalOptions.empty())
build_options += additionalOptions;
vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset ));
if (!data.empty())
args.push_back( make_pair( sizeof(cl_mem) , (void *)&data.data ));
size_t gt[3] = { dst.cols/pixels_per_work_item, dst.rows, 1 };
#ifdef ANDROID
size_t lt[3] = { 16, 10, 1 };
#else
size_t lt[3] = { 16, 16, 1 };
#endif
openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
}
static void toRGB_NV12_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
{
std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx);
if (!additionalOptions.empty())
build_options += additionalOptions;
......@@ -101,7 +221,6 @@ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::st
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
......@@ -119,13 +238,46 @@ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::st
openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
}
static void fromHSV_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
{
std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx);
if (!additionalOptions.empty())
build_options += additionalOptions;
int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset ));
if (!data.empty())
args.push_back( make_pair( sizeof(cl_mem) , (void *)&data.data ));
size_t gt[3] = { dst.cols, dst.rows, 1 };
#ifdef ANDROID
size_t lt[3] = { 16, 10, 1 };
#else
size_t lt[3] = { 16, 16, 1 };
#endif
openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
}
static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse)
{
std::string build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s", src.depth(),
dst.channels(), src.channels(), reverse ? "REVERSE" : "ORDER");
int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
std::string build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s",
src.depth(), dst.channels(), src.channels(), reverse ? "REVERSE" : "ORDER");
vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
......@@ -147,8 +299,8 @@ static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse)
static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName)
{
std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d",
src.depth(), greenbits, dst.channels());
std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d -D bidx=%d",
src.depth(), greenbits, dst.channels(), bidx);
int src_offset = src.offset >> 1, src_step = src.step >> 1;
int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step / dst.elemSize1();
......@@ -157,7 +309,6 @@ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int gree
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
......@@ -174,8 +325,8 @@ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int gree
static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName)
{
std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d",
src.depth(), greenbits, src.channels());
std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d -D bidx=%d",
src.depth(), greenbits, src.channels(), bidx);
int src_offset = (int)src.offset, src_step = (int)src.step;
int dst_offset = dst.offset >> 1, dst_step = dst.step >> 1;
......@@ -184,7 +335,6 @@ static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenb
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
......@@ -272,7 +422,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
CV_Assert(scn == 1);
dcn = code == CV_GRAY2BGRA ? 4 : 3;
dst.create(sz, CV_MAKETYPE(depth, dcn));
toRGB_caller(src, dst, 0, "Gray2RGB");
fromGray_caller(src, dst, 0, "Gray2RGB");
break;
}
case CV_BGR2YUV: case CV_RGB2YUV:
......@@ -303,7 +453,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
Size dstSz(sz.width, sz.height * 2 / 3);
dst.create(dstSz, CV_MAKETYPE(depth, dcn));
toRGB_caller(src, dst, bidx, "YUV2RGBA_NV12");
toRGB_NV12_caller(src, dst, bidx, "YUV2RGBA_NV12");
break;
}
case CV_BGR2YCrCb: case CV_RGB2YCrCb:
......@@ -460,11 +610,11 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
initialized = true;
}
fromRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180);
toHSV_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180);
return;
}
fromRGB_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f)));
toHSV_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f)));
break;
}
case CV_HSV2BGR: case CV_HSV2RGB: case CV_HSV2BGR_FULL: case CV_HSV2RGB_FULL:
......@@ -483,7 +633,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
dst.create(sz, CV_MAKETYPE(depth, dcn));
std::string kernelName = std::string(is_hsv ? "HSV" : "HLS") + "2RGB";
toRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange));
fromHSV_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange));
break;
}
case CV_RGBA2mRGBA: case CV_mRGBA2RGBA:
......
......@@ -56,35 +56,59 @@
#ifdef DEPTH_0
#define DATA_TYPE uchar
#define VECTOR2 uchar2
#define VECTOR4 uchar4
#define VECTOR8 uchar8
#define VECTOR16 uchar16
#define COEFF_TYPE int
#define MAX_NUM 255
#define HALF_MAX 128
#define SAT_CAST(num) convert_uchar_sat_rte(num)
#define SAT_CAST2(num) convert_uchar2_sat(num)
#define SAT_CAST4(num) convert_uchar4_sat(num)
#endif
#ifdef DEPTH_2
#define DATA_TYPE ushort
#define VECTOR2 ushort2
#define VECTOR4 ushort4
#define VECTOR8 ushort8
#define VECTOR16 ushort16
#define COEFF_TYPE int
#define MAX_NUM 65535
#define HALF_MAX 32768
#define SAT_CAST(num) convert_ushort_sat_rte(num)
#define SAT_CAST2(num) convert_ushort2_sat(num)
#define SAT_CAST4(num) convert_ushort4_sat(num)
#endif
#ifdef DEPTH_5
#define DATA_TYPE float
#define VECTOR2 float2
#define VECTOR4 float4
#define VECTOR8 float8
#define VECTOR16 float16
#define COEFF_TYPE float
#define MAX_NUM 1.0f
#define HALF_MAX 0.5f
#define SAT_CAST(num) (num)
#endif
#ifndef bidx
#define bidx 0
#endif
#ifndef pixels_per_work_item
#define pixels_per_work_item 1
#endif
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
enum
{
yuv_shift = 14,
xyz_shift = 12,
hsv_shift = 12,
hsv_shift = 12,
R2Y = 4899,
G2Y = 9617,
B2Y = 1868,
......@@ -93,26 +117,84 @@ enum
///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
__constant float c_RGB2GrayCoeffs_f[3] = { 0.114f, 0.587f, 0.299f };
__constant int c_RGB2GrayCoeffs_i[3] = { B2Y, G2Y, R2Y };
__kernel void RGB2Gray(int cols, int rows, int src_step, int dst_step,
int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
__global const DATA_TYPE* src, __global DATA_TYPE* dst,
int src_offset, int dst_offset)
{
int x = get_global_id(0);
int x = get_global_id(0) * pixels_per_work_item;
int y = get_global_id(1);
if (y < rows && x < cols)
{
int src_idx = mad24(y, src_step, src_offset + (x << 2));
int dst_idx = mad24(y, dst_step, dst_offset + x);
#ifndef INTEL_DEVICE
#ifdef DEPTH_5
dst[dst_idx] = src[src_idx + bidx] * 0.114f + src[src_idx + 1] * 0.587f + src[src_idx + (bidx^2)] * 0.299f;
#else
dst[dst_idx] = (DATA_TYPE)CV_DESCALE((src[src_idx + bidx] * B2Y + src[src_idx + 1] * G2Y + src[src_idx + (bidx^2)] * R2Y), yuv_shift);
#endif
#else
global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
#ifdef DEPTH_5
__constant float * coeffs = c_RGB2GrayCoeffs_f;
#else
__constant int * coeffs = c_RGB2GrayCoeffs_i;
#endif
if (1 == pixels_per_work_item)
{
#ifdef DEPTH_5
*dst_ptr = src_ptr[bidx] * coeffs[0] + src_ptr[1] * coeffs[1] + src_ptr[(bidx^2)] *coeffs[2];
#else
*dst_ptr = (DATA_TYPE)CV_DESCALE((src_ptr[bidx] * coeffs[0] + src_ptr[1] * coeffs[1] + src_ptr[(bidx^2)] * coeffs[2]), yuv_shift);
#endif
}
else if (2 == pixels_per_work_item)
{
const VECTOR8 r0 = vload8(0, src_ptr);
#ifdef DEPTH_5
const float2 c0 = r0.s04;
const float2 c1 = r0.s15;
const float2 c2 = r0.s26;
const float2 Y = c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2];
#else
const int2 c0 = convert_int2(r0.s04);
const int2 c1 = convert_int2(r0.s15);
const int2 c2 = convert_int2(r0.s26);
const int2 yi = CV_DESCALE(c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2], yuv_shift);
const VECTOR2 Y = SAT_CAST2(yi);
#endif
vstore2(Y, 0, dst_ptr);
}
else if (4 == pixels_per_work_item)
{
#ifndef DEPTH_5
const VECTOR16 r0 = vload16(0, src_ptr);
const int4 c0 = convert_int4(r0.s048c);
const int4 c1 = convert_int4(r0.s159d);
const int4 c2 = convert_int4(r0.s26ae);
const int4 Y = CV_DESCALE(c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2], yuv_shift);
vstore4(SAT_CAST4(Y), 0, dst_ptr);
#endif
}
#endif //INTEL_DEVICE
}
}
__kernel void Gray2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
__kernel void Gray2RGB(int cols, int rows, int src_step, int dst_step,
__global const DATA_TYPE* src, __global DATA_TYPE* dst,
int src_offset, int dst_offset)
{
......@@ -140,10 +222,10 @@ __constant float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877
__constant int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 };
__kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step,
int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
__global const DATA_TYPE* src, __global DATA_TYPE* dst,
int src_offset, int dst_offset)
{
int x = get_global_id(0);
int x = get_global_id(0) * pixels_per_work_item;
int y = get_global_id(1);
if (y < rows && x < cols)
......@@ -151,24 +233,85 @@ __kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step,
x <<= 2;
int src_idx = mad24(y, src_step, src_offset + x);
int dst_idx = mad24(y, dst_step, dst_offset + x);
DATA_TYPE rgb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] };
global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
#ifdef DEPTH_5
__constant float * coeffs = c_RGB2YUVCoeffs_f;
DATA_TYPE Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx];
DATA_TYPE Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX;
DATA_TYPE Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX;
#else
__constant int * coeffs = c_RGB2YUVCoeffs_i;
int delta = HALF_MAX * (1 << yuv_shift);
int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift);
int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift);
int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift);
const int delta = HALF_MAX * (1 << yuv_shift);
#endif
if (1 == pixels_per_work_item)
{
const DATA_TYPE rgb[] = {src_ptr[0], src_ptr[1], src_ptr[2]};
#ifdef DEPTH_5
float Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx];
float U = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX;
float V = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX;
#else
int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift);
int U = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift);
int V = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift);
#endif
dst[dst_idx] = SAT_CAST( Y );
dst[dst_idx + 1] = SAT_CAST( Cr );
dst[dst_idx + 2] = SAT_CAST( Cb );
dst_ptr[0] = SAT_CAST( Y );
dst_ptr[1] = SAT_CAST( U );
dst_ptr[2] = SAT_CAST( V );
}
#ifdef INTEL_DEVICE
else if (2 == pixels_per_work_item)
{
const VECTOR8 r0 = vload8(0, src_ptr);
#ifdef DEPTH_5
const float2 c0 = r0.s04;
const float2 c1 = r0.s15;
const float2 c2 = r0.s26;
const float2 Y = (bidx == 0) ? (c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0]) : (c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2]);
const float2 U = (bidx == 0) ? ((c2 - Y) * coeffs[3] + HALF_MAX) : ((c0 - Y) * coeffs[3] + HALF_MAX);
const float2 V = (bidx == 0) ? ((c0 - Y) * coeffs[4] + HALF_MAX) : ((c2 - Y) * coeffs[4] + HALF_MAX);
#else
const int2 c0 = convert_int2(r0.s04);
const int2 c1 = convert_int2(r0.s15);
const int2 c2 = convert_int2(r0.s26);
const int2 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift);
const int2 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift);
const int2 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift);
const VECTOR2 Y = SAT_CAST2(yi);
const VECTOR2 U = SAT_CAST2(ui);
const VECTOR2 V = SAT_CAST2(vi);
#endif
vstore8((VECTOR8)(Y.s0, U.s0, V.s0, 0, Y.s1, U.s1, V.s1, 0), 0, dst_ptr);
}
else if (4 == pixels_per_work_item)
{
#ifndef DEPTH_5
const VECTOR16 r0 = vload16(0, src_ptr);
const int4 c0 = convert_int4(r0.s048c);
const int4 c1 = convert_int4(r0.s159d);
const int4 c2 = convert_int4(r0.s26ae);
const int4 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift);
const int4 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift);
const int4 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift);
const VECTOR4 Y = SAT_CAST4(yi);
const VECTOR4 U = SAT_CAST4(ui);
const VECTOR4 V = SAT_CAST4(vi);
vstore16((VECTOR16)(Y.s0, U.s0, V.s0, 0, Y.s1, U.s1, V.s1, 0, Y.s2, U.s2, V.s2, 0, Y.s3, U.s3, V.s3, 0), 0, dst_ptr);
#endif
}
#endif //INTEL_DEVICE
}
}
......@@ -176,10 +319,10 @@ __constant float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f };
__constant int c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 };
__kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step,
int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
__global const DATA_TYPE* src, __global DATA_TYPE* dst,
int src_offset, int dst_offset)
{
int x = get_global_id(0);
int x = get_global_id(0) * pixels_per_work_item;
int y = get_global_id(1);
if (y < rows && x < cols)
......@@ -187,26 +330,95 @@ __kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step,
x <<= 2;
int src_idx = mad24(y, src_step, src_offset + x);
int dst_idx = mad24(y, dst_step, dst_offset + x);
DATA_TYPE yuv[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] };
global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
#ifdef DEPTH_5
__constant float * coeffs = c_YUV2RGBCoeffs_f;
float b = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[3];
float g = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1];
float r = yuv[0] + (yuv[1] - HALF_MAX) * coeffs[0];
#else
__constant int * coeffs = c_YUV2RGBCoeffs_i;
int b = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[3], yuv_shift);
int g = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1], yuv_shift);
int r = yuv[0] + CV_DESCALE((yuv[1] - HALF_MAX) * coeffs[0], yuv_shift);
#endif
dst[dst_idx + bidx] = SAT_CAST( b );
dst[dst_idx + 1] = SAT_CAST( g );
dst[dst_idx + (bidx^2)] = SAT_CAST( r );
if (1 == pixels_per_work_item)
{
const DATA_TYPE yuv[] = {src_ptr[0], src_ptr[1], src_ptr[2]};
#ifdef DEPTH_5
float B = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[3];
float G = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1];
float R = yuv[0] + (yuv[1] - HALF_MAX) * coeffs[0];
#else
int B = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[3], yuv_shift);
int G = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1], yuv_shift);
int R = yuv[0] + CV_DESCALE((yuv[1] - HALF_MAX) * coeffs[0], yuv_shift);
#endif
dst_ptr[bidx] = SAT_CAST( B );
dst_ptr[1] = SAT_CAST( G );
dst_ptr[(bidx^2)] = SAT_CAST( R );
#if dcn == 4
dst[dst_idx + 3] = MAX_NUM;
dst_ptr[3] = MAX_NUM;
#endif
}
#ifdef INTEL_DEVICE
else if (2 == pixels_per_work_item)
{
const VECTOR8 r0 = vload8(0, src_ptr);
#ifdef DEPTH_5
const float2 Y = r0.s04;
const float2 U = r0.s15;
const float2 V = r0.s26;
const float2 c0 = (bidx == 0) ? (Y + (V - HALF_MAX) * coeffs[3]) : (Y + (U - HALF_MAX) * coeffs[0]);
const float2 c1 = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1];
const float2 c2 = (bidx == 0) ? (Y + (U - HALF_MAX) * coeffs[0]) : (Y + (V - HALF_MAX) * coeffs[3]);
#else
const int2 Y = convert_int2(r0.s04);
const int2 U = convert_int2(r0.s15);
const int2 V = convert_int2(r0.s26);
const int2 c0i = (bidx == 0) ? (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift));
const int2 c1i = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift);
const int2 c2i = (bidx == 0) ? (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift));
const VECTOR2 c0 = SAT_CAST2(c0i);
const VECTOR2 c1 = SAT_CAST2(c1i);
const VECTOR2 c2 = SAT_CAST2(c2i);
#endif
#if dcn == 4
vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM), 0, dst_ptr);
#else
vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0), 0, dst_ptr);
#endif
}
else if (4 == pixels_per_work_item)
{
#ifndef DEPTH_5
const VECTOR16 r0 = vload16(0, src_ptr);
const int4 Y = convert_int4(r0.s048c);
const int4 U = convert_int4(r0.s159d);
const int4 V = convert_int4(r0.s26ae);
const int4 c0i = (bidx == 0) ? (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift));
const int4 c1i = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift);
const int4 c2i = (bidx == 0) ? (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift));
const VECTOR4 c0 = SAT_CAST4(c0i);
const VECTOR4 c1 = SAT_CAST4(c1i);
const VECTOR4 c2 = SAT_CAST4(c2i);
#if dcn == 4
vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM, c0.s2, c1.s2, c2.s2, MAX_NUM, c0.s3, c1.s3, c2.s3, MAX_NUM), 0, dst_ptr);
#else
vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0, c0.s2, c1.s2, c2.s2, 0, c0.s3, c1.s3, c2.s3, 0), 0, dst_ptr);
#endif
#endif
}
#endif //INTEL_DEVICE
}
}
......@@ -218,7 +430,7 @@ __constant int ITUR_BT_601_CVR = 1673527;
__constant int ITUR_BT_601_SHIFT = 20;
__kernel void YUV2RGBA_NV12(int cols, int rows, int src_step, int dst_step,
int bidx, __global const uchar* src, __global uchar* dst,
__global const uchar* src, __global uchar* dst,
int src_offset, int dst_offset)
{
const int x = get_global_id(0);
......@@ -275,10 +487,10 @@ __constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564
__constant int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};
__kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step,
int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
int src_offset, int dst_offset)
__global const DATA_TYPE* src, __global DATA_TYPE* dst,
int src_offset, int dst_offset)
{
int x = get_global_id(0);
int x = get_global_id(0) * pixels_per_work_item;
int y = get_global_id(1);
if (y < rows && x < cols)
......@@ -287,24 +499,83 @@ __kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step,
int src_idx = mad24(y, src_step, src_offset + x);
int dst_idx = mad24(y, dst_step, dst_offset + x);
DATA_TYPE rgb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] };
global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
#ifdef DEPTH_5
__constant float * coeffs = c_RGB2YCrCbCoeffs_f;
DATA_TYPE Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx];
DATA_TYPE Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX;
DATA_TYPE Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX;
#else
__constant int * coeffs = c_RGB2YCrCbCoeffs_i;
int delta = HALF_MAX * (1 << yuv_shift);
int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift);
int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift);
int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift);
const int delta = HALF_MAX * (1 << yuv_shift);
#endif
dst[dst_idx] = SAT_CAST( Y );
dst[dst_idx + 1] = SAT_CAST( Cr );
dst[dst_idx + 2] = SAT_CAST( Cb );
if (1 == pixels_per_work_item)
{
const DATA_TYPE rgb[] = {src_ptr[0], src_ptr[1], src_ptr[2]};
#ifdef DEPTH_5
float Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx];
float Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX;
float Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX;
#else
int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift);
int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift);
int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift);
#endif
dst_ptr[0] = SAT_CAST( Y );
dst_ptr[1] = SAT_CAST( Cr );
dst_ptr[2] = SAT_CAST( Cb );
}
#ifdef INTEL_DEVICE
else if (2 == pixels_per_work_item)
{
const VECTOR8 r0 = vload8(0, src_ptr);
#ifdef DEPTH_5
const float2 c0 = r0.s04;
const float2 c1 = r0.s15;
const float2 c2 = r0.s26;
const float2 Y = (bidx == 0) ? (c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0]) : (c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2]);
const float2 Cr = (bidx == 0) ? ((c2 - Y) * coeffs[3] + HALF_MAX) : ((c0 - Y) * coeffs[3] + HALF_MAX);
const float2 Cb = (bidx == 0) ? ((c0 - Y) * coeffs[4] + HALF_MAX) : ((c2 - Y) * coeffs[4] + HALF_MAX);
#else
const int2 c0 = convert_int2(r0.s04);
const int2 c1 = convert_int2(r0.s15);
const int2 c2 = convert_int2(r0.s26);
const int2 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift);
const int2 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift);
const int2 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift);
const VECTOR2 Y = SAT_CAST2(yi);
const VECTOR2 Cr = SAT_CAST2(ui);
const VECTOR2 Cb = SAT_CAST2(vi);
#endif
vstore8((VECTOR8)(Y.s0, Cr.s0, Cb.s0, 0, Y.s1, Cr.s1, Cb.s1, 0), 0, dst_ptr);
}
else if (4 == pixels_per_work_item)
{
#ifndef DEPTH_5
const VECTOR16 r0 = vload16(0, src_ptr);
const int4 c0 = convert_int4(r0.s048c);
const int4 c1 = convert_int4(r0.s159d);
const int4 c2 = convert_int4(r0.s26ae);
const int4 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift);
const int4 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift);
const int4 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift);
const VECTOR4 Y = SAT_CAST4(yi);
const VECTOR4 Cr = SAT_CAST4(ui);
const VECTOR4 Cb = SAT_CAST4(vi);
vstore16((VECTOR16)(Y.s0, Cr.s0, Cb.s0, 0, Y.s1, Cr.s1, Cb.s1, 0, Y.s2, Cr.s2, Cb.s2, 0, Y.s3, Cr.s3, Cb.s3, 0), 0, dst_ptr);
#endif
}
#endif //INTEL_DEVICE
}
}
......@@ -312,10 +583,10 @@ __constant float c_YCrCb2RGBCoeffs_f[4] = { 1.403f, -0.714f, -0.344f, 1.773f };
__constant int c_YCrCb2RGBCoeffs_i[4] = { 22987, -11698, -5636, 29049 };
__kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step,
int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
int src_offset, int dst_offset)
__global const DATA_TYPE* src, __global DATA_TYPE* dst,
int src_offset, int dst_offset)
{
int x = get_global_id(0);
int x = get_global_id(0) * pixels_per_work_item;
int y = get_global_id(1);
if (y < rows && x < cols)
......@@ -324,36 +595,104 @@ __kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step,
int src_idx = mad24(y, src_step, src_offset + x);
int dst_idx = mad24(y, dst_step, dst_offset + x);
DATA_TYPE ycrcb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] };
global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
#ifdef DEPTH_5
__constant float * coeffs = c_YCrCb2RGBCoeffs_f;
#else
__constant int * coeffs = c_YCrCb2RGBCoeffs_i;
#endif
if (1 == pixels_per_work_item)
{
const DATA_TYPE ycrcb[] = {src_ptr[0], src_ptr[1], src_ptr[2]};
#ifdef DEPTH_5
__constant float * coeff = c_YCrCb2RGBCoeffs_f;
float r = ycrcb[0] + coeff[0] * (ycrcb[1] - HALF_MAX);
float g = ycrcb[0] + coeff[1] * (ycrcb[1] - HALF_MAX) + coeff[2] * (ycrcb[2] - HALF_MAX);
float b = ycrcb[0] + coeff[3] * (ycrcb[2] - HALF_MAX);
float B = ycrcb[0] + (ycrcb[2] - HALF_MAX) * coeffs[3];
float G = ycrcb[0] + (ycrcb[2] - HALF_MAX) * coeffs[2] + (ycrcb[1] - HALF_MAX) * coeffs[1];
float R = ycrcb[0] + (ycrcb[1] - HALF_MAX) * coeffs[0];
#else
__constant int * coeff = c_YCrCb2RGBCoeffs_i;
int r = ycrcb[0] + CV_DESCALE(coeff[0] * (ycrcb[1] - HALF_MAX), yuv_shift);
int g = ycrcb[0] + CV_DESCALE(coeff[1] * (ycrcb[1] - HALF_MAX) + coeff[2] * (ycrcb[2] - HALF_MAX), yuv_shift);
int b = ycrcb[0] + CV_DESCALE(coeff[3] * (ycrcb[2] - HALF_MAX), yuv_shift);
int B = ycrcb[0] + CV_DESCALE((ycrcb[2] - HALF_MAX) * coeffs[3], yuv_shift);
int G = ycrcb[0] + CV_DESCALE((ycrcb[2] - HALF_MAX) * coeffs[2] + (ycrcb[1] - HALF_MAX) * coeffs[1], yuv_shift);
int R = ycrcb[0] + CV_DESCALE((ycrcb[1] - HALF_MAX) * coeffs[0], yuv_shift);
#endif
dst[dst_idx + (bidx^2)] = SAT_CAST(r);
dst[dst_idx + 1] = SAT_CAST(g);
dst[dst_idx + bidx] = SAT_CAST(b);
dst_ptr[bidx] = SAT_CAST( B );
dst_ptr[1] = SAT_CAST( G );
dst_ptr[(bidx^2)] = SAT_CAST( R );
#if dcn == 4
dst[dst_idx + 3] = MAX_NUM;
dst_ptr[3] = MAX_NUM;
#endif
}
#ifdef INTEL_DEVICE
else if (2 == pixels_per_work_item)
{
const VECTOR8 r0 = vload8(0, src_ptr);
#ifdef DEPTH_5
const float2 Y = r0.s04;
const float2 Cr = r0.s15;
const float2 Cb = r0.s26;
const float2 c0 = (bidx == 0) ? (Y + (Cb - HALF_MAX) * coeffs[3]) : (Y + (Cr - HALF_MAX) * coeffs[0]);
const float2 c1 = Y + (Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1];
const float2 c2 = (bidx == 0) ? (Y + (Cr - HALF_MAX) * coeffs[0]) : (Y + (Cb - HALF_MAX) * coeffs[3]);
#else
const int2 Y = convert_int2(r0.s04);
const int2 Cr = convert_int2(r0.s15);
const int2 Cb = convert_int2(r0.s26);
const int2 c0i = (bidx == 0) ? (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift));
const int2 c1i = Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1], yuv_shift);
const int2 c2i = (bidx == 0) ? (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift));
const VECTOR2 c0 = SAT_CAST2(c0i);
const VECTOR2 c1 = SAT_CAST2(c1i);
const VECTOR2 c2 = SAT_CAST2(c2i);
#endif
#if dcn == 4
vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM), 0, dst_ptr);
#else
vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0), 0, dst_ptr);
#endif
}
else if (4 == pixels_per_work_item)
{
#ifndef DEPTH_5
const VECTOR16 r0 = vload16(0, src_ptr);
const int4 Y = convert_int4(r0.s048c);
const int4 Cr = convert_int4(r0.s159d);
const int4 Cb = convert_int4(r0.s26ae);
const int4 c0i = (bidx == 0) ? (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift));
const int4 c1i = Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1], yuv_shift);
const int4 c2i = (bidx == 0) ? (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift));
const VECTOR4 c0 = SAT_CAST4(c0i);
const VECTOR4 c1 = SAT_CAST4(c1i);
const VECTOR4 c2 = SAT_CAST4(c2i);
#if dcn == 4
vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM, c0.s2, c1.s2, c2.s2, MAX_NUM, c0.s3, c1.s3, c2.s3, MAX_NUM), 0, dst_ptr);
#else
vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0, c0.s2, c1.s2, c2.s2, 0, c0.s3, c1.s3, c2.s3, 0), 0, dst_ptr);
#endif
#endif
}
#endif //INTEL_DEVICE
}
}
///////////////////////////////////// RGB <-> XYZ //////////////////////////////////////
__kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step,
int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
__global const DATA_TYPE* src, __global DATA_TYPE* dst,
int src_offset, int dst_offset, __constant COEFF_TYPE * coeffs)
{
int dx = get_global_id(0);
int dx = get_global_id(0) * pixels_per_work_item;
int dy = get_global_id(1);
if (dy < rows && dx < cols)
......@@ -362,28 +701,85 @@ __kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step,
int src_idx = mad24(dy, src_step, src_offset + dx);
int dst_idx = mad24(dy, dst_step, dst_offset + dx);
DATA_TYPE r = src[src_idx], g = src[src_idx + 1], b = src[src_idx + 2];
global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
if (1 == pixels_per_work_item)
{
DATA_TYPE R = src_ptr[0], G = src_ptr[1], B = src_ptr[2];
#ifdef DEPTH_5
float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2];
float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5];
float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8];
float X = R * coeffs[0] + G * coeffs[1] + B * coeffs[2];
float Y = R * coeffs[3] + G * coeffs[4] + B * coeffs[5];
float Z = R * coeffs[6] + G * coeffs[7] + B * coeffs[8];
#else
int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift);
int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], xyz_shift);
int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift);
int X = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift);
int Y = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift);
int Z = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift);
#endif
dst[dst_idx] = SAT_CAST(x);
dst[dst_idx + 1] = SAT_CAST(y);
dst[dst_idx + 2] = SAT_CAST(z);
dst_ptr[0] = SAT_CAST( X );
dst_ptr[1] = SAT_CAST( Y );
dst_ptr[2] = SAT_CAST( Z );
}
#ifdef INTEL_DEVICE
else if (2 == pixels_per_work_item)
{
const VECTOR8 r0 = vload8(0, src_ptr);
#ifdef DEPTH_5
const float2 R = r0.s04;
const float2 G = r0.s15;
const float2 B = r0.s26;
const float2 X = R * coeffs[0] + G * coeffs[1] + B * coeffs[2];
const float2 Y = R * coeffs[3] + G * coeffs[4] + B * coeffs[5];
const float2 Z = R * coeffs[6] + G * coeffs[7] + B * coeffs[8];
#else
const int2 R = convert_int2(r0.s04);
const int2 G = convert_int2(r0.s15);
const int2 B = convert_int2(r0.s26);
const int2 xi = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift);
const int2 yi = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift);
const int2 zi = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift);
const VECTOR2 X = SAT_CAST2(xi);
const VECTOR2 Y = SAT_CAST2(yi);
const VECTOR2 Z = SAT_CAST2(zi);
#endif
vstore8((VECTOR8)(X.s0, Y.s0, Z.s0, 0, X.s1, Y.s1, Z.s1, 0), 0, dst_ptr);
}
else if (4 == pixels_per_work_item)
{
#ifndef DEPTH_5
const VECTOR16 r0 = vload16(0, src_ptr);
const int4 R = convert_int4(r0.s048c);
const int4 G = convert_int4(r0.s159d);
const int4 B = convert_int4(r0.s26ae);
const int4 xi = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift);
const int4 yi = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift);
const int4 zi = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift);
const VECTOR4 X = SAT_CAST4(xi);
const VECTOR4 Y = SAT_CAST4(yi);
const VECTOR4 Z = SAT_CAST4(zi);
vstore16((VECTOR16)(X.s0, Y.s0, Z.s0, 0, X.s1, Y.s1, Z.s1, 0, X.s2, Y.s2, Z.s2, 0, X.s3, Y.s3, Z.s3, 0), 0, dst_ptr);
#endif
}
#endif //INTEL_DEVICE
}
}
__kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step,
int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
__global const DATA_TYPE* src, __global DATA_TYPE* dst,
int src_offset, int dst_offset, __constant COEFF_TYPE * coeffs)
{
int dx = get_global_id(0);
int dx = get_global_id(0) * pixels_per_work_item;
int dy = get_global_id(1);
if (dy < rows && dx < cols)
......@@ -392,23 +788,88 @@ __kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step,
int src_idx = mad24(dy, src_step, src_offset + dx);
int dst_idx = mad24(dy, dst_step, dst_offset + dx);
DATA_TYPE x = src[src_idx], y = src[src_idx + 1], z = src[src_idx + 2];
global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
if (1 == pixels_per_work_item)
{
const DATA_TYPE X = src_ptr[0], Y = src_ptr[1], Z = src_ptr[2];
#ifdef DEPTH_5
float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2];
float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5];
float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8];
float B = X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2];
float G = X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5];
float R = X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8];
#else
int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift);
int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift);
int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift);
int B = CV_DESCALE(X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2], xyz_shift);
int G = CV_DESCALE(X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5], xyz_shift);
int R = CV_DESCALE(X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8], xyz_shift);
#endif
dst[dst_idx] = SAT_CAST(b);
dst[dst_idx + 1] = SAT_CAST(g);
dst[dst_idx + 2] = SAT_CAST(r);
dst_ptr[0] = SAT_CAST( B );
dst_ptr[1] = SAT_CAST( G );
dst_ptr[2] = SAT_CAST( R );
#if dcn == 4
dst[dst_idx + 3] = MAX_NUM;
dst_ptr[3] = MAX_NUM;
#endif
}
#ifdef INTEL_DEVICE
else if (2 == pixels_per_work_item)
{
const VECTOR8 r0 = vload8(0, src_ptr);
#ifdef DEPTH_5
const float2 X = r0.s04;
const float2 Y = r0.s15;
const float2 Z = r0.s26;
float2 B = X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2];
float2 G = X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5];
float2 R = X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8];
#else
const int2 xi = convert_int2(r0.s04);
const int2 yi = convert_int2(r0.s15);
const int2 zi = convert_int2(r0.s26);
const int2 bi = CV_DESCALE(xi * coeffs[0] + yi * coeffs[1] + zi * coeffs[2], xyz_shift);
const int2 gi = CV_DESCALE(xi * coeffs[3] + yi * coeffs[4] + zi * coeffs[5], xyz_shift);
const int2 ri = CV_DESCALE(xi * coeffs[6] + yi * coeffs[7] + zi * coeffs[8], xyz_shift);
const VECTOR2 R = SAT_CAST2(ri);
const VECTOR2 G = SAT_CAST2(gi);
const VECTOR2 B = SAT_CAST2(bi);
#endif
#if dcn == 4
vstore8((VECTOR8)(B.s0, G.s0, R.s0, MAX_NUM, B.s1, G.s1, R.s1, MAX_NUM), 0, dst_ptr);
#else
vstore8((VECTOR8)(B.s0, G.s0, R.s0, 0, B.s1, G.s1, R.s1, 0), 0, dst_ptr);
#endif
}
else if (4 == pixels_per_work_item)
{
#ifndef DEPTH_5
const VECTOR16 r0 = vload16(0, src_ptr);
const int4 xi = convert_int4(r0.s048c);
const int4 yi = convert_int4(r0.s159d);
const int4 zi = convert_int4(r0.s26ae);
const int4 bi = CV_DESCALE(xi * coeffs[0] + yi * coeffs[1] + zi * coeffs[2], xyz_shift);
const int4 gi = CV_DESCALE(xi * coeffs[3] + yi * coeffs[4] + zi * coeffs[5], xyz_shift);
const int4 ri = CV_DESCALE(xi * coeffs[6] + yi * coeffs[7] + zi * coeffs[8], xyz_shift);
const VECTOR4 R = SAT_CAST4(ri);
const VECTOR4 G = SAT_CAST4(gi);
const VECTOR4 B = SAT_CAST4(bi);
#if dcn == 4
vstore16((VECTOR16)(B.s0, G.s0, R.s0, MAX_NUM, B.s1, G.s1, R.s1, MAX_NUM, B.s2, G.s2, R.s2, MAX_NUM, B.s3, G.s3, R.s3, MAX_NUM), 0, dst_ptr);
#else
vstore16((VECTOR16)(B.s0, G.s0, R.s0, 0, B.s1, G.s1, R.s1, 0, B.s2, G.s2, R.s2, 0, B.s3, G.s3, R.s3, 0), 0, dst_ptr);
#endif
#endif
}
#endif //INTEL_DEVICE
}
}
......@@ -427,6 +888,7 @@ __kernel void RGB(int cols, int rows, int src_step, int dst_step,
int src_idx = mad24(y, src_step, src_offset + x);
int dst_idx = mad24(y, dst_step, dst_offset + x);
#ifndef INTEL_DEVICE
#ifdef REVERSE
dst[dst_idx] = src[src_idx + 2];
dst[dst_idx + 1] = src[src_idx + 1];
......@@ -443,13 +905,44 @@ __kernel void RGB(int cols, int rows, int src_step, int dst_step,
#else
dst[dst_idx + 3] = src[src_idx + 3];
#endif
#endif
#else
global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
const VECTOR4 r0 = vload4(0, src_ptr);
#ifdef REVERSE
if (3 == dcn)
{
vstore4((VECTOR4)(r0.s210, 0), 0, dst_ptr);
}
else if (3 == scn)
{
vstore4((VECTOR4)(r0.s210, MAX_NUM), 0, dst_ptr);
}
else {
vstore4((VECTOR4)(r0.s2103), 0, dst_ptr);
}
#elif defined ORDER
if (3 == dcn)
{
vstore4((VECTOR4)(r0.s012, 0), 0, dst_ptr);
}
else if (3 == scn)
{
vstore4((VECTOR4)(r0.s012, MAX_NUM), 0, dst_ptr);
}
else {
vstore4(r0, 0, dst_ptr);
}
#endif
#endif
}
}
///////////////////////////////////// RGB5x5 <-> RGB //////////////////////////////////////
__kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, int bidx,
__kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step,
__global const ushort * src, __global uchar * dst,
int src_offset, int dst_offset)
{
......@@ -482,7 +975,7 @@ __kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, int bid
}
}
__kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, int bidx,
__kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step,
__global const uchar * src, __global ushort * dst,
int src_offset, int dst_offset)
{
......@@ -507,7 +1000,7 @@ __kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, int bid
///////////////////////////////////// RGB5x5 <-> RGB //////////////////////////////////////
__kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, int bidx,
__kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step,
__global const ushort * src, __global uchar * dst,
int src_offset, int dst_offset)
{
......@@ -532,7 +1025,7 @@ __kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, int bi
}
}
__kernel void Gray2BGR5x5(int cols, int rows, int src_step, int dst_step, int bidx,
__kernel void Gray2BGR5x5(int cols, int rows, int src_step, int dst_step,
__global const uchar * src, __global ushort * dst,
int src_offset, int dst_offset)
{
......@@ -560,7 +1053,7 @@ __constant int sector_data[][3] = { {1, 3, 0}, { 1, 0, 2 }, { 3, 0, 1 }, { 0, 2,
#ifdef DEPTH_0
__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx,
__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step,
__global const uchar * src, __global uchar * dst,
int src_offset, int dst_offset,
__constant int * sdiv_table, __constant int * hdiv_table)
......@@ -600,7 +1093,7 @@ __kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx,
}
}
__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step,
__global const uchar * src, __global uchar * dst,
int src_offset, int dst_offset)
{
......@@ -656,7 +1149,7 @@ __kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
#elif defined DEPTH_5
__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx,
__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step,
__global const float * src, __global float * dst,
int src_offset, int dst_offset)
{
......@@ -698,7 +1191,7 @@ __kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx,
}
}
__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step,
__global const float * src, __global float * dst,
int src_offset, int dst_offset)
{
......@@ -758,7 +1251,7 @@ __kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
#ifdef DEPTH_0
__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx,
__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step,
__global const uchar * src, __global uchar * dst,
int src_offset, int dst_offset)
{
......@@ -805,7 +1298,7 @@ __kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx,
}
}
__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step,
__global const uchar * src, __global uchar * dst,
int src_offset, int dst_offset)
{
......@@ -860,7 +1353,7 @@ __kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
#elif defined DEPTH_5
__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx,
__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step,
__global const float * src, __global float * dst,
int src_offset, int dst_offset)
{
......@@ -907,7 +1400,7 @@ __kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx,
}
}
__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step,
__global const float * src, __global float * dst,
int src_offset, int dst_offset)
{
......@@ -968,10 +1461,10 @@ __kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
#ifdef DEPTH_0
__kernel void RGBA2mRGBA(int cols, int rows, int src_step, int dst_step,
int bidx, __global const uchar * src, __global uchar * dst,
__global const uchar * src, __global uchar * dst,
int src_offset, int dst_offset)
{
int x = get_global_id(0);
int x = get_global_id(0) * pixels_per_work_item;
int y = get_global_id(1);
if (y < rows && x < cols)
......@@ -980,21 +1473,66 @@ __kernel void RGBA2mRGBA(int cols, int rows, int src_step, int dst_step,
int src_idx = mad24(y, src_step, src_offset + x);
int dst_idx = mad24(y, dst_step, dst_offset + x);
uchar v0 = src[src_idx], v1 = src[src_idx + 1];
uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3];
global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
if (1 == pixels_per_work_item)
{
const uchar4 r0 = vload4(0, src_ptr);
dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM;
dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM;
dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM;
dst[dst_idx + 3] = v3;
dst_ptr[0] = (r0.s0 * r0.s3 + HALF_MAX) / MAX_NUM;
dst_ptr[1] = (r0.s1 * r0.s3 + HALF_MAX) / MAX_NUM;
dst_ptr[2] = (r0.s2 * r0.s3 + HALF_MAX) / MAX_NUM;
dst_ptr[3] = r0.s3;
}
#ifdef INTEL_DEVICE
else if (2 == pixels_per_work_item)
{
const uchar8 r0 = vload8(0, src_ptr);
const int2 v0 = convert_int2(r0.s04);
const int2 v1 = convert_int2(r0.s15);
const int2 v2 = convert_int2(r0.s26);
const int2 v3 = convert_int2(r0.s37);
const int2 ri = (v0 * v3 + HALF_MAX) / MAX_NUM;
const int2 gi = (v1 * v3 + HALF_MAX) / MAX_NUM;
const int2 bi = (v2 * v3 + HALF_MAX) / MAX_NUM;
const uchar2 r = convert_uchar2(ri);
const uchar2 g = convert_uchar2(gi);
const uchar2 b = convert_uchar2(bi);
vstore8((uchar8)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1), 0, dst_ptr);
}
else if (4 == pixels_per_work_item)
{
const uchar16 r0 = vload16(0, src_ptr);
const int4 v0 = convert_int4(r0.s048c);
const int4 v1 = convert_int4(r0.s159d);
const int4 v2 = convert_int4(r0.s26ae);
const int4 v3 = convert_int4(r0.s37bf);
const int4 ri = (v0 * v3 + HALF_MAX) / MAX_NUM;
const int4 gi = (v1 * v3 + HALF_MAX) / MAX_NUM;
const int4 bi = (v2 * v3 + HALF_MAX) / MAX_NUM;
const uchar4 r = convert_uchar4(ri);
const uchar4 g = convert_uchar4(gi);
const uchar4 b = convert_uchar4(bi);
vstore16((uchar16)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1, r.s2, g.s2, b.s2, v3.s2, r.s3, g.s3, b.s3, v3.s3), 0, dst_ptr);
}
#endif //INTEL_DEVICE
}
}
__kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, int bidx,
__kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step,
__global const uchar * src, __global uchar * dst,
int src_offset, int dst_offset)
{
int x = get_global_id(0);
int x = get_global_id(0) * pixels_per_work_item;
int y = get_global_id(1);
if (y < rows && x < cols)
......@@ -1003,14 +1541,63 @@ __kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, int bid
int src_idx = mad24(y, src_step, src_offset + x);
int dst_idx = mad24(y, dst_step, dst_offset + x);
uchar v0 = src[src_idx], v1 = src[src_idx + 1];
uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3];
uchar v3_half = v3 / 2;
global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
if (1 == pixels_per_work_item)
{
const uchar4 r0 = vload4(0, src_ptr);
const uchar v3_half = r0.s3 / 2;
dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3;
dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3;
dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3;
dst[dst_idx + 3] = v3;
const uchar r = (r0.s3 == 0) ? 0 : (r0.s0 * MAX_NUM + v3_half) / r0.s3;
const uchar g = (r0.s3 == 0) ? 0 : (r0.s1 * MAX_NUM + v3_half) / r0.s3;
const uchar b = (r0.s3 == 0) ? 0 : (r0.s2 * MAX_NUM + v3_half) / r0.s3;
vstore4((uchar4)(r, g, b, r0.s3), 0, dst_ptr);
}
#ifdef INTEL_DEVICE
else if (2 == pixels_per_work_item)
{
const uchar8 r0 = vload8(0, src_ptr);
const int2 v0 = convert_int2(r0.s04);
const int2 v1 = convert_int2(r0.s15);
const int2 v2 = convert_int2(r0.s26);
const int2 v3 = convert_int2(r0.s37);
const int2 v3_half = v3 / 2;
const int2 ri = (v3 == 0) ? 0 : (v0 * MAX_NUM + v3_half) / v3;
const int2 gi = (v3 == 0) ? 0 : (v1 * MAX_NUM + v3_half) / v3;
const int2 bi = (v3 == 0) ? 0 : (v2 * MAX_NUM + v3_half) / v3;
const uchar2 r = convert_uchar2(ri);
const uchar2 g = convert_uchar2(gi);
const uchar2 b = convert_uchar2(bi);
vstore8((uchar8)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1), 0, dst_ptr);
}
else if (4 == pixels_per_work_item)
{
const uchar16 r0 = vload16(0, src_ptr);
const int4 v0 = convert_int4(r0.s048c);
const int4 v1 = convert_int4(r0.s159d);
const int4 v2 = convert_int4(r0.s26ae);
const int4 v3 = convert_int4(r0.s37bf);
const int4 v3_half = v3 / 2;
const int4 ri = (v3 == 0) ? 0 : (v0 * MAX_NUM + v3_half) / v3;
const int4 gi = (v3 == 0) ? 0 : (v1 * MAX_NUM + v3_half) / v3;
const int4 bi = (v3 == 0) ? 0 : (v2 * MAX_NUM + v3_half) / v3;
const uchar4 r = convert_uchar4(ri);
const uchar4 g = convert_uchar4(gi);
const uchar4 b = convert_uchar4(bi);
vstore16((uchar16)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1, r.s2, g.s2, b.s2, v3.s2, r.s3, g.s3, b.s3, v3.s3), 0, dst_ptr);
}
#endif //INTEL_DEVICE
}
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册