Improve ocl cvt_color performance for the following conversions: RGB<->BGR,...

Improve ocl cvt_color performance for the following conversions: RGB<->BGR, RGB->Gray, RGB<->XYZ, RGB<->YCrCb, RGB<->YUV, and mRGBA<->RGBA. The improvement was done basically by processing more than 1 pixel by each work-item and using vector's operations. new performance tests were added

Improve ocl cvt_color performance for the following conversions: RGB<->BGR,...
Improve ocl cvt_color performance for the following conversions: RGB<->BGR, RGB->Gray, RGB<->XYZ, RGB<->YCrCb, RGB<->YUV, and mRGBA<->RGBA. The improvement was done basically by processing more than 1 pixel by each work-item and using vector's operations. new performance tests were added
27c1bd27 · krodyush · 26f5d2d6 · 27c1bd27 · 27c1bd27 · 27c1bd27
3 changed file
--- a/modules/ocl/perf/perf_color.cpp
+++ b/modules/ocl/perf/perf_color.cpp
@@ -57,9 +57,39 @@ CV_ENUM(ConversionTypes, CV_RGB2GRAY, CV_RGB2BGR, CV_RGB2YUV, CV_YUV2RGB, CV_RGB
        CV_HLS2RGB, CV_BGR5652BGR, CV_BGR2BGR565, CV_RGBA2mRGBA, CV_mRGBA2RGBA, CV_YUV2RGB_NV12)

 typedef tuple<Size, tuple<ConversionTypes, int, int> > cvtColorParams;
-typedef TestBaseWithParam<cvtColorParams> cvtColorFixture;
+typedef TestBaseWithParam<cvtColorParams> cvtColorU8Fixture;
+typedef TestBaseWithParam<cvtColorParams> cvtColorF32Fixture;
+typedef TestBaseWithParam<cvtColorParams> cvtColorU16Fixture;

-PERF_TEST_P(cvtColorFixture, cvtColor, testing::Combine(
+#define RUN_CVT_PERF_TEST \
+    cvtColorParams params = GetParam();\
+    const Size srcSize = get<0>(params);\
+    const tuple<int, int, int> conversionParams = get<1>(params);\
+    const int code = get<0>(conversionParams), scn = get<1>(conversionParams),\
+            dcn = get<2>(conversionParams);\
+\
+    Mat src(srcSize, CV_8UC(scn)), dst(srcSize, CV_8UC(scn));\
+    declare.in(src, WARMUP_RNG).out(dst);\
+\
+    if (RUN_OCL_IMPL)\
+    {\
+        ocl::oclMat oclSrc(src), oclDst(src.size(), dst.type());\
+\
+        OCL_TEST_CYCLE() ocl::cvtColor(oclSrc, oclDst, code, dcn);\
+        oclDst.download(dst);\
+\
+        SANITY_CHECK(dst, 1);\
+    }\
+    else if (RUN_PLAIN_IMPL)\
+    {\
+        TEST_CYCLE() cv::cvtColor(src, dst, code, dcn);\
+\
+        SANITY_CHECK(dst);\
+    }\
+    else\
+        OCL_PERF_ELSE\
+
+PERF_TEST_P(cvtColorU8Fixture, cvtColor, testing::Combine(
                testing::Values(Size(1000, 1002), Size(2000, 2004), Size(4000, 4008)),
                testing::Values(
                    make_tuple(ConversionTypes(CV_RGB2GRAY), 3, 1),
@@ -81,30 +111,41 @@ PERF_TEST_P(cvtColorFixture, cvtColor, testing::Combine(
                    make_tuple(ConversionTypes(CV_YUV2RGB_NV12), 1, 3)
                    )))
 {
-    cvtColorParams params = GetParam();
-    const Size srcSize = get<0>(params);
-    const tuple<int, int, int> conversionParams = get<1>(params);
-    const int code = get<0>(conversionParams), scn = get<1>(conversionParams),
-            dcn = get<2>(conversionParams);
-
-    Mat src(srcSize, CV_8UC(scn)), dst(srcSize, CV_8UC(scn));
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(src.size(), dst.type());
-
-        OCL_TEST_CYCLE() ocl::cvtColor(oclSrc, oclDst, code, dcn);
-        oclDst.download(dst);
+    RUN_CVT_PERF_TEST
+}

-        SANITY_CHECK(dst, 1);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::cvtColor(src, dst, code, dcn);
+PERF_TEST_P(cvtColorF32Fixture, cvtColor, testing::Combine(
+                testing::Values(Size(1000, 1002), Size(2000, 2004), Size(4000, 4008)),
+                testing::Values(
+                    make_tuple(ConversionTypes(CV_RGB2GRAY), 3, 1),
+                    make_tuple(ConversionTypes(CV_RGB2BGR), 3, 3),
+                    make_tuple(ConversionTypes(CV_RGB2YUV), 3, 3),
+                    make_tuple(ConversionTypes(CV_YUV2RGB), 3, 3),
+                    make_tuple(ConversionTypes(CV_RGB2YCrCb), 3, 3),
+                    make_tuple(ConversionTypes(CV_YCrCb2RGB), 3, 3),
+                    make_tuple(ConversionTypes(CV_RGB2XYZ), 3, 3),
+                    make_tuple(ConversionTypes(CV_XYZ2RGB), 3, 3),
+                    make_tuple(ConversionTypes(CV_RGB2HSV), 3, 3),
+                    make_tuple(ConversionTypes(CV_HSV2RGB), 3, 3),
+                    make_tuple(ConversionTypes(CV_RGB2HLS), 3, 3),
+                    make_tuple(ConversionTypes(CV_HLS2RGB), 3, 3)
+                    )))
+{
+    RUN_CVT_PERF_TEST
+}

-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
+PERF_TEST_P(cvtColorU16Fixture, cvtColor, testing::Combine(
+                testing::Values(Size(1000, 1002), Size(2000, 2004), Size(4000, 4008)),
+                testing::Values(
+                    make_tuple(ConversionTypes(CV_RGB2GRAY), 3, 1),
+                    make_tuple(ConversionTypes(CV_RGB2BGR), 3, 3),
+                    make_tuple(ConversionTypes(CV_RGB2YUV), 3, 3),
+                    make_tuple(ConversionTypes(CV_YUV2RGB), 3, 3),
+                    make_tuple(ConversionTypes(CV_RGB2YCrCb), 3, 3),
+                    make_tuple(ConversionTypes(CV_YCrCb2RGB), 3, 3),
+                    make_tuple(ConversionTypes(CV_RGB2XYZ), 3, 3),
+                    make_tuple(ConversionTypes(CV_XYZ2RGB), 3, 3)
+                    )))
+{
+    RUN_CVT_PERF_TEST
 }
--- a/modules/ocl/src/color.cpp
+++ b/modules/ocl/src/color.cpp
@@ -56,8 +56,19 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::
 {
    int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
+    int pixels_per_work_item = 1;

-    std::string build_options = format("-D DEPTH_%d", src.depth());
+    if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE))
+    {
+        if ((src.cols % 4 == 0) && (src.depth() == CV_8U))
+            pixels_per_work_item =  4;
+        else if (src.cols % 2 == 0)
+            pixels_per_work_item =  2;
+        else
+            pixels_per_work_item =  1;
+    }
+
+    std::string build_options = format("-D DEPTH_%d -D scn=%d -D bidx=%d -D pixels_per_work_item=%d", src.depth(), src.oclchannels(), bidx, pixels_per_work_item);
    if (!additionalOptions.empty())
        build_options += additionalOptions;

@@ -66,7 +77,6 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::
    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx));
    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
@@ -77,6 +87,73 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::
    if (!data2.empty())
        args.push_back( make_pair( sizeof(cl_mem) , (void *)&data2.data ));

+    size_t gt[3] = { dst.cols/pixels_per_work_item, dst.rows, 1 };
+#ifdef ANDROID
+    size_t lt[3] = { 16, 10, 1 };
+#else
+    size_t lt[3] = { 16, 16, 1 };
+#endif
+    openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
+}
+
+static void toHSV_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
+                           const std::string & additionalOptions = std::string(),
+                           const oclMat & data1 = oclMat(), const oclMat & data2 = oclMat())
+{
+    int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
+    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
+
+    std::string build_options = format("-D DEPTH_%d -D scn=%d -D bidx=%d", src.depth(), src.oclchannels(), bidx);
+    if (!additionalOptions.empty())
+        build_options += additionalOptions;
+
+    vector<pair<size_t , const void *> > args;
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
+    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
+    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset ));
+
+    if (!data1.empty())
+        args.push_back( make_pair( sizeof(cl_mem) , (void *)&data1.data ));
+    if (!data2.empty())
+        args.push_back( make_pair( sizeof(cl_mem) , (void *)&data2.data ));
+
+   size_t gt[3] = { dst.cols, dst.rows, 1 };
+#ifdef ANDROID
+    size_t lt[3] = { 16, 10, 1 };
+#else
+    size_t lt[3] = { 16, 16, 1 };
+#endif
+    openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
+}
+
+static void fromGray_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
+                         const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
+{
+    std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx);
+    if (!additionalOptions.empty())
+        build_options += additionalOptions;
+
+    int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
+    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
+
+    vector<pair<size_t , const void *> > args;
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
+    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
+    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset ));
+
+    if (!data.empty())
+        args.push_back( make_pair( sizeof(cl_mem) , (void *)&data.data ));
+
    size_t gt[3] = { dst.cols, dst.rows, 1 };
 #ifdef ANDROID
    size_t lt[3] = { 16, 10, 1 };
@@ -89,7 +166,50 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::
 static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
                         const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
 {
-    std::string build_options = format("-D DEPTH_%d -D dcn=%d", src.depth(), dst.channels());
+    int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
+    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
+    int pixels_per_work_item = 1;
+
+    if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE))
+    {
+        if ((src.cols % 4 == 0) && (src.depth() == CV_8U))
+            pixels_per_work_item =  4;
+        else if (src.cols % 2 == 0)
+            pixels_per_work_item =  2;
+        else
+            pixels_per_work_item =  1;
+    }
+
+    std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d -D pixels_per_work_item=%d", src.depth(), dst.channels(), bidx, pixels_per_work_item);
+    if (!additionalOptions.empty())
+        build_options += additionalOptions;
+
+    vector<pair<size_t , const void *> > args;
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
+    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
+    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset ));
+
+    if (!data.empty())
+        args.push_back( make_pair( sizeof(cl_mem) , (void *)&data.data ));
+
+    size_t gt[3] = { dst.cols/pixels_per_work_item, dst.rows, 1 };
+#ifdef ANDROID
+    size_t lt[3] = { 16, 10, 1 };
+#else
+    size_t lt[3] = { 16, 16, 1 };
+#endif
+    openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
+}
+
+static void toRGB_NV12_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
+                         const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
+{
+    std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx);
    if (!additionalOptions.empty())
        build_options += additionalOptions;

@@ -101,7 +221,6 @@ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::st
    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx));
    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
@@ -119,13 +238,46 @@ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::st
    openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
 }

+static void fromHSV_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
+                         const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
+{
+    std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx);
+    if (!additionalOptions.empty())
+        build_options += additionalOptions;
+
+    int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
+    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
+
+    vector<pair<size_t , const void *> > args;
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
+    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
+    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset ));
+
+    if (!data.empty())
+        args.push_back( make_pair( sizeof(cl_mem) , (void *)&data.data ));
+
+    size_t gt[3] = { dst.cols, dst.rows, 1 };
+#ifdef ANDROID
+    size_t lt[3] = { 16, 10, 1 };
+#else
+    size_t lt[3] = { 16, 16, 1 };
+#endif
+    openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
+}
+
 static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse)
 {
-    std::string build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s", src.depth(),
-                                       dst.channels(), src.channels(), reverse ? "REVERSE" : "ORDER");
    int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();

+    std::string build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s",
+                                        src.depth(), dst.channels(), src.channels(), reverse ? "REVERSE" : "ORDER");
+
    vector<pair<size_t , const void *> > args;
    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
@@ -147,8 +299,8 @@ static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse)

 static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName)
 {
-    std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d",
-                                       src.depth(), greenbits, dst.channels());
+    std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d -D bidx=%d",
+                                       src.depth(), greenbits, dst.channels(), bidx);
    int src_offset = src.offset >> 1, src_step = src.step >> 1;
    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step / dst.elemSize1();

@@ -157,7 +309,6 @@ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int gree
    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx));
    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
@@ -174,8 +325,8 @@ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int gree

 static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName)
 {
-    std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d",
-                                       src.depth(), greenbits, src.channels());
+    std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d -D bidx=%d",
+                                       src.depth(), greenbits, src.channels(), bidx);
    int src_offset = (int)src.offset, src_step = (int)src.step;
    int dst_offset = dst.offset >> 1, dst_step = dst.step >> 1;

@@ -184,7 +335,6 @@ static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenb
    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx));
    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
@@ -272,7 +422,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
        CV_Assert(scn == 1);
        dcn  = code == CV_GRAY2BGRA ? 4 : 3;
        dst.create(sz, CV_MAKETYPE(depth, dcn));
-        toRGB_caller(src, dst, 0, "Gray2RGB");
+        fromGray_caller(src, dst, 0, "Gray2RGB");
        break;
    }
    case CV_BGR2YUV: case CV_RGB2YUV:
@@ -303,7 +453,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)

        Size dstSz(sz.width, sz.height * 2 / 3);
        dst.create(dstSz, CV_MAKETYPE(depth, dcn));
-        toRGB_caller(src, dst, bidx, "YUV2RGBA_NV12");
+        toRGB_NV12_caller(src, dst, bidx, "YUV2RGBA_NV12");
        break;
    }
    case CV_BGR2YCrCb: case CV_RGB2YCrCb:
@@ -460,11 +610,11 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
                initialized = true;
            }

-            fromRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180);
+            toHSV_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180);
            return;
        }

-        fromRGB_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f)));
+        toHSV_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f)));
        break;
    }
    case CV_HSV2BGR: case CV_HSV2RGB: case CV_HSV2BGR_FULL: case CV_HSV2RGB_FULL:
@@ -483,7 +633,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
        dst.create(sz, CV_MAKETYPE(depth, dcn));

        std::string kernelName = std::string(is_hsv ? "HSV" : "HLS") + "2RGB";
-        toRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange));
+        fromHSV_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange));
        break;
    }
    case CV_RGBA2mRGBA: case CV_mRGBA2RGBA:

--- a/modules/ocl/src/opencl/cvt_color.cl
+++ b/modules/ocl/src/opencl/cvt_color.cl
@@ -56,35 +56,59 @@

 #ifdef DEPTH_0
 #define DATA_TYPE uchar
+#define VECTOR2 uchar2
+#define VECTOR4 uchar4
+#define VECTOR8 uchar8
+#define VECTOR16 uchar16
 #define COEFF_TYPE int
 #define MAX_NUM  255
 #define HALF_MAX 128
 #define SAT_CAST(num) convert_uchar_sat_rte(num)
+#define SAT_CAST2(num) convert_uchar2_sat(num)
+#define SAT_CAST4(num) convert_uchar4_sat(num)
 #endif

 #ifdef DEPTH_2
 #define DATA_TYPE ushort
+#define VECTOR2 ushort2
+#define VECTOR4 ushort4
+#define VECTOR8 ushort8
+#define VECTOR16 ushort16
 #define COEFF_TYPE int
 #define MAX_NUM  65535
 #define HALF_MAX 32768
 #define SAT_CAST(num) convert_ushort_sat_rte(num)
+#define SAT_CAST2(num) convert_ushort2_sat(num)
+#define SAT_CAST4(num) convert_ushort4_sat(num)
 #endif

 #ifdef DEPTH_5
 #define DATA_TYPE float
+#define VECTOR2 float2
+#define VECTOR4 float4
+#define VECTOR8 float8
+#define VECTOR16 float16
 #define COEFF_TYPE float
 #define MAX_NUM  1.0f
 #define HALF_MAX 0.5f
 #define SAT_CAST(num) (num)
 #endif

+#ifndef bidx
+    #define bidx 0
+#endif
+
+#ifndef pixels_per_work_item
+    #define pixels_per_work_item 1
+#endif
+
 #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))

 enum
 {
    yuv_shift  = 14,
    xyz_shift  = 12,
-    hsv_shift = 12,
+    hsv_shift  = 12,
    R2Y        = 4899,
    G2Y        = 9617,
    B2Y        = 1868,
@@ -93,26 +117,84 @@ enum

 ///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////

+__constant float c_RGB2GrayCoeffs_f[3]  = { 0.114f, 0.587f, 0.299f };
+__constant int   c_RGB2GrayCoeffs_i[3]  = { B2Y, G2Y, R2Y };
+
 __kernel void RGB2Gray(int cols, int rows, int src_step, int dst_step,
-                       int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
+                       __global const DATA_TYPE* src, __global DATA_TYPE* dst,
                       int src_offset, int dst_offset)
 {
-    int x = get_global_id(0);
+    int x = get_global_id(0) * pixels_per_work_item;
    int y = get_global_id(1);

    if (y < rows && x < cols)
    {
        int src_idx = mad24(y, src_step, src_offset + (x << 2));
        int dst_idx = mad24(y, dst_step, dst_offset + x);
+
+#ifndef INTEL_DEVICE
 #ifdef DEPTH_5
        dst[dst_idx] = src[src_idx + bidx] * 0.114f + src[src_idx + 1] * 0.587f + src[src_idx + (bidx^2)] * 0.299f;
 #else
        dst[dst_idx] = (DATA_TYPE)CV_DESCALE((src[src_idx + bidx] * B2Y + src[src_idx + 1] * G2Y + src[src_idx + (bidx^2)] * R2Y), yuv_shift);
 #endif
+#else
+        global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
+        global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
+
+#ifdef DEPTH_5
+        __constant float * coeffs = c_RGB2GrayCoeffs_f;
+#else
+        __constant int * coeffs = c_RGB2GrayCoeffs_i;
+#endif
+
+        if (1 == pixels_per_work_item)
+        {
+#ifdef DEPTH_5
+            *dst_ptr = src_ptr[bidx] * coeffs[0] + src_ptr[1] * coeffs[1] + src_ptr[(bidx^2)] *coeffs[2];
+#else
+            *dst_ptr = (DATA_TYPE)CV_DESCALE((src_ptr[bidx] * coeffs[0] + src_ptr[1] * coeffs[1] + src_ptr[(bidx^2)] * coeffs[2]), yuv_shift);
+#endif
+        }
+        else if (2 == pixels_per_work_item)
+        {
+            const VECTOR8 r0 = vload8(0, src_ptr);
+
+#ifdef DEPTH_5
+            const float2 c0 = r0.s04;
+            const float2 c1 = r0.s15;
+            const float2 c2 = r0.s26;
+
+            const float2 Y = c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2];
+#else
+            const int2 c0 = convert_int2(r0.s04);
+            const int2 c1 = convert_int2(r0.s15);
+            const int2 c2 = convert_int2(r0.s26);
+
+            const int2 yi = CV_DESCALE(c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2], yuv_shift);
+            const VECTOR2 Y = SAT_CAST2(yi);
+#endif
+
+            vstore2(Y, 0, dst_ptr);
+        }
+        else if (4 == pixels_per_work_item)
+        {
+#ifndef DEPTH_5
+            const VECTOR16 r0 = vload16(0, src_ptr);
+
+            const int4 c0 = convert_int4(r0.s048c);
+            const int4 c1 = convert_int4(r0.s159d);
+            const int4 c2 = convert_int4(r0.s26ae);
+            const int4 Y = CV_DESCALE(c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2], yuv_shift);
+
+            vstore4(SAT_CAST4(Y), 0, dst_ptr);
+#endif
+        }
+#endif //INTEL_DEVICE
    }
 }

-__kernel void Gray2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
+__kernel void Gray2RGB(int cols, int rows, int src_step, int dst_step,
                       __global const DATA_TYPE* src, __global DATA_TYPE* dst,
                       int src_offset, int dst_offset)
 {
@@ -140,10 +222,10 @@ __constant float c_RGB2YUVCoeffs_f[5]  = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877
 __constant int   c_RGB2YUVCoeffs_i[5]  = { B2Y, G2Y, R2Y, 8061, 14369 };

 __kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step,
-                      int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
+                      __global const DATA_TYPE* src, __global DATA_TYPE* dst,
                      int src_offset, int dst_offset)
 {
-    int x = get_global_id(0);
+    int x = get_global_id(0) * pixels_per_work_item;
    int y = get_global_id(1);

    if (y < rows && x < cols)
@@ -151,24 +233,85 @@ __kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step,
        x <<= 2;
        int src_idx = mad24(y, src_step, src_offset + x);
        int dst_idx = mad24(y, dst_step, dst_offset + x);
-        DATA_TYPE rgb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] };
+
+        global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
+        global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);

 #ifdef DEPTH_5
        __constant float * coeffs = c_RGB2YUVCoeffs_f;
-        DATA_TYPE Y  = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx];
-        DATA_TYPE Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX;
-        DATA_TYPE Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX;
 #else
        __constant int * coeffs = c_RGB2YUVCoeffs_i;
-        int delta = HALF_MAX * (1 << yuv_shift);
-        int Y =  CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift);
-        int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift);
-        int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift);
+        const int delta = HALF_MAX * (1 << yuv_shift);
+#endif
+
+        if (1 == pixels_per_work_item)
+        {
+            const DATA_TYPE rgb[] = {src_ptr[0], src_ptr[1], src_ptr[2]};
+
+#ifdef DEPTH_5
+            float Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx];
+            float U = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX;
+            float V = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX;
+#else
+            int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift);
+            int U = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift);
+            int V = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift);
 #endif

-        dst[dst_idx] = SAT_CAST( Y );
-        dst[dst_idx + 1] = SAT_CAST( Cr );
-        dst[dst_idx + 2] = SAT_CAST( Cb );
+            dst_ptr[0] = SAT_CAST( Y );
+            dst_ptr[1] = SAT_CAST( U );
+            dst_ptr[2] = SAT_CAST( V );
+        }
+#ifdef INTEL_DEVICE
+        else if (2 == pixels_per_work_item)
+        {
+            const VECTOR8 r0 = vload8(0, src_ptr);
+
+#ifdef DEPTH_5
+            const float2 c0 = r0.s04;
+            const float2 c1 = r0.s15;
+            const float2 c2 = r0.s26;
+
+            const float2 Y = (bidx == 0) ? (c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0]) : (c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2]);
+            const float2 U = (bidx == 0) ? ((c2 - Y) * coeffs[3] + HALF_MAX) : ((c0 - Y) * coeffs[3] + HALF_MAX);
+            const float2 V = (bidx == 0) ? ((c0 - Y) * coeffs[4] + HALF_MAX) : ((c2 - Y) * coeffs[4] + HALF_MAX);
+#else
+            const int2 c0 = convert_int2(r0.s04);
+            const int2 c1 = convert_int2(r0.s15);
+            const int2 c2 = convert_int2(r0.s26);
+
+            const int2 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift);
+            const int2 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift);
+            const int2 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift);
+
+            const VECTOR2 Y = SAT_CAST2(yi);
+            const VECTOR2 U = SAT_CAST2(ui);
+            const VECTOR2 V = SAT_CAST2(vi);
+#endif
+
+            vstore8((VECTOR8)(Y.s0, U.s0, V.s0, 0, Y.s1, U.s1, V.s1, 0), 0, dst_ptr);
+        }
+        else if (4 == pixels_per_work_item)
+        {
+#ifndef DEPTH_5
+            const VECTOR16 r0 = vload16(0, src_ptr);
+
+            const int4 c0 = convert_int4(r0.s048c);
+            const int4 c1 = convert_int4(r0.s159d);
+            const int4 c2 = convert_int4(r0.s26ae);
+
+            const int4 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift);
+            const int4 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift);
+            const int4 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift);
+
+            const VECTOR4 Y = SAT_CAST4(yi);
+            const VECTOR4 U = SAT_CAST4(ui);
+            const VECTOR4 V = SAT_CAST4(vi);
+
+            vstore16((VECTOR16)(Y.s0, U.s0, V.s0, 0, Y.s1, U.s1, V.s1, 0, Y.s2, U.s2, V.s2, 0, Y.s3, U.s3, V.s3, 0), 0, dst_ptr);
+#endif
+        }
+#endif //INTEL_DEVICE
    }
 }

@@ -176,10 +319,10 @@ __constant float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f };
 __constant int   c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 };

 __kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step,
-                      int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
+                      __global const DATA_TYPE* src, __global DATA_TYPE* dst,
                      int src_offset, int dst_offset)
 {
-    int x = get_global_id(0);
+    int x = get_global_id(0) * pixels_per_work_item;
    int y = get_global_id(1);

    if (y < rows && x < cols)
@@ -187,26 +330,95 @@ __kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step,
        x <<= 2;
        int src_idx = mad24(y, src_step, src_offset + x);
        int dst_idx = mad24(y, dst_step, dst_offset + x);
-        DATA_TYPE yuv[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] };
+
+        global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
+        global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);

 #ifdef DEPTH_5
        __constant float * coeffs = c_YUV2RGBCoeffs_f;
-        float b = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[3];
-        float g = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1];
-        float r = yuv[0] + (yuv[1] - HALF_MAX) * coeffs[0];
 #else
        __constant int * coeffs = c_YUV2RGBCoeffs_i;
-        int b = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[3], yuv_shift);
-        int g = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1], yuv_shift);
-        int r = yuv[0] + CV_DESCALE((yuv[1] - HALF_MAX) * coeffs[0], yuv_shift);
 #endif

-        dst[dst_idx + bidx] = SAT_CAST( b );
-        dst[dst_idx + 1]      = SAT_CAST( g );
-        dst[dst_idx + (bidx^2)]   = SAT_CAST( r );
+        if (1 == pixels_per_work_item)
+        {
+            const DATA_TYPE yuv[] = {src_ptr[0], src_ptr[1], src_ptr[2]};
+
+#ifdef DEPTH_5
+            float B = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[3];
+            float G = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1];
+            float R = yuv[0] + (yuv[1] - HALF_MAX) * coeffs[0];
+#else
+            int B = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[3], yuv_shift);
+            int G = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1], yuv_shift);
+            int R = yuv[0] + CV_DESCALE((yuv[1] - HALF_MAX) * coeffs[0], yuv_shift);
+#endif
+
+            dst_ptr[bidx]     = SAT_CAST( B );
+            dst_ptr[1]        = SAT_CAST( G );
+            dst_ptr[(bidx^2)] = SAT_CAST( R );
 #if dcn == 4
-        dst[dst_idx + 3] = MAX_NUM;
+            dst_ptr[3]         = MAX_NUM;
+#endif
+        }
+#ifdef INTEL_DEVICE
+        else if (2 == pixels_per_work_item)
+        {
+            const VECTOR8 r0 = vload8(0, src_ptr);
+
+#ifdef DEPTH_5
+            const float2 Y = r0.s04;
+            const float2 U = r0.s15;
+            const float2 V = r0.s26;
+
+            const float2 c0 = (bidx == 0) ? (Y + (V - HALF_MAX) * coeffs[3]) : (Y + (U - HALF_MAX) * coeffs[0]);
+            const float2 c1 = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1];
+            const float2 c2 = (bidx == 0) ? (Y + (U - HALF_MAX) * coeffs[0]) : (Y + (V - HALF_MAX) * coeffs[3]);
+#else
+            const int2 Y = convert_int2(r0.s04);
+            const int2 U = convert_int2(r0.s15);
+            const int2 V = convert_int2(r0.s26);
+
+            const int2 c0i = (bidx == 0) ? (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift));
+            const int2 c1i = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift);
+            const int2 c2i = (bidx == 0) ? (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift));
+
+            const VECTOR2 c0 = SAT_CAST2(c0i);
+            const VECTOR2 c1 = SAT_CAST2(c1i);
+            const VECTOR2 c2 = SAT_CAST2(c2i);
+#endif
+
+#if dcn == 4
+            vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM), 0, dst_ptr);
+#else
+            vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0), 0, dst_ptr);
+#endif
+        }
+        else if (4 == pixels_per_work_item)
+        {
+#ifndef DEPTH_5
+            const VECTOR16 r0 = vload16(0, src_ptr);
+
+            const int4 Y = convert_int4(r0.s048c);
+            const int4 U = convert_int4(r0.s159d);
+            const int4 V = convert_int4(r0.s26ae);
+
+            const int4 c0i = (bidx == 0) ? (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift));
+            const int4 c1i = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift);
+            const int4 c2i = (bidx == 0) ? (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift));
+
+            const VECTOR4 c0 = SAT_CAST4(c0i);
+            const VECTOR4 c1 = SAT_CAST4(c1i);
+            const VECTOR4 c2 = SAT_CAST4(c2i);
+
+#if dcn == 4
+            vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM, c0.s2, c1.s2, c2.s2, MAX_NUM, c0.s3, c1.s3, c2.s3, MAX_NUM), 0, dst_ptr);
+#else
+            vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0, c0.s2, c1.s2, c2.s2, 0, c0.s3, c1.s3, c2.s3, 0), 0, dst_ptr);
+#endif
 #endif
+        }
+#endif //INTEL_DEVICE
    }
 }

@@ -218,7 +430,7 @@ __constant int ITUR_BT_601_CVR = 1673527;
 __constant int ITUR_BT_601_SHIFT = 20;

 __kernel void YUV2RGBA_NV12(int cols, int rows, int src_step, int dst_step,
-                            int bidx, __global const uchar* src, __global uchar* dst,
+                            __global const uchar* src, __global uchar* dst,
                            int src_offset, int dst_offset)
 {
    const int x = get_global_id(0);
@@ -275,10 +487,10 @@ __constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564
 __constant int   c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};

 __kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step,
-                        int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
-                        int src_offset, int dst_offset)
+                      __global const DATA_TYPE* src, __global DATA_TYPE* dst,
+                      int src_offset, int dst_offset)
 {
-    int x = get_global_id(0);
+    int x = get_global_id(0) * pixels_per_work_item;
    int y = get_global_id(1);

    if (y < rows && x < cols)
@@ -287,24 +499,83 @@ __kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step,
        int src_idx = mad24(y, src_step, src_offset + x);
        int dst_idx = mad24(y, dst_step, dst_offset + x);

-        DATA_TYPE rgb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] };
+        global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
+        global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);

 #ifdef DEPTH_5
        __constant float * coeffs = c_RGB2YCrCbCoeffs_f;
-        DATA_TYPE Y  = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx];
-        DATA_TYPE Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX;
-        DATA_TYPE Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX;
 #else
        __constant int * coeffs = c_RGB2YCrCbCoeffs_i;
-        int delta = HALF_MAX * (1 << yuv_shift);
-        int Y =  CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift);
-        int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift);
-        int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift);
+        const int delta = HALF_MAX * (1 << yuv_shift);
 #endif

-        dst[dst_idx] = SAT_CAST( Y );
-        dst[dst_idx + 1] = SAT_CAST( Cr );
-        dst[dst_idx + 2] = SAT_CAST( Cb );
+        if (1 == pixels_per_work_item)
+        {
+            const DATA_TYPE rgb[] = {src_ptr[0], src_ptr[1], src_ptr[2]};
+
+#ifdef DEPTH_5
+            float Y  = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx];
+            float Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX;
+            float Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX;
+#else
+            int Y =  CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift);
+            int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift);
+            int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift);
+#endif
+
+            dst_ptr[0] = SAT_CAST( Y );
+            dst_ptr[1] = SAT_CAST( Cr );
+            dst_ptr[2] = SAT_CAST( Cb );
+        }
+#ifdef INTEL_DEVICE
+        else if (2 == pixels_per_work_item)
+        {
+            const VECTOR8 r0 = vload8(0, src_ptr);
+
+#ifdef DEPTH_5
+            const float2 c0 = r0.s04;
+            const float2 c1 = r0.s15;
+            const float2 c2 = r0.s26;
+
+            const float2 Y  = (bidx == 0) ? (c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0]) : (c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2]);
+            const float2 Cr = (bidx == 0) ? ((c2 - Y) * coeffs[3] + HALF_MAX) : ((c0 - Y) * coeffs[3] + HALF_MAX);
+            const float2 Cb = (bidx == 0) ? ((c0 - Y) * coeffs[4] + HALF_MAX) : ((c2 - Y) * coeffs[4] + HALF_MAX);
+#else
+            const int2 c0 = convert_int2(r0.s04);
+            const int2 c1 = convert_int2(r0.s15);
+            const int2 c2 = convert_int2(r0.s26);
+
+            const int2 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift);
+            const int2 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift);
+            const int2 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift);
+
+            const VECTOR2 Y  = SAT_CAST2(yi);
+            const VECTOR2 Cr = SAT_CAST2(ui);
+            const VECTOR2 Cb = SAT_CAST2(vi);
+#endif
+
+            vstore8((VECTOR8)(Y.s0, Cr.s0, Cb.s0, 0, Y.s1, Cr.s1, Cb.s1, 0), 0, dst_ptr);
+        }
+        else if (4 == pixels_per_work_item)
+        {
+#ifndef DEPTH_5
+            const VECTOR16 r0 = vload16(0, src_ptr);
+            const int4 c0 = convert_int4(r0.s048c);
+            const int4 c1 = convert_int4(r0.s159d);
+            const int4 c2 = convert_int4(r0.s26ae);
+
+            const int4 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift);
+            const int4 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift);
+            const int4 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift);
+
+            const VECTOR4 Y  = SAT_CAST4(yi);
+            const VECTOR4 Cr = SAT_CAST4(ui);
+            const VECTOR4 Cb = SAT_CAST4(vi);
+
+            vstore16((VECTOR16)(Y.s0, Cr.s0, Cb.s0, 0, Y.s1, Cr.s1, Cb.s1, 0, Y.s2, Cr.s2, Cb.s2, 0, Y.s3, Cr.s3, Cb.s3, 0), 0, dst_ptr);
+#endif
+        }
+#endif //INTEL_DEVICE
    }
 }

@@ -312,10 +583,10 @@ __constant float c_YCrCb2RGBCoeffs_f[4] = { 1.403f, -0.714f, -0.344f, 1.773f };
 __constant int   c_YCrCb2RGBCoeffs_i[4] = { 22987, -11698, -5636, 29049 };

 __kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step,
-                        int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
-                        int src_offset, int dst_offset)
+                      __global const DATA_TYPE* src, __global DATA_TYPE* dst,
+                      int src_offset, int dst_offset)
 {
-    int x = get_global_id(0);
+    int x = get_global_id(0) * pixels_per_work_item;
    int y = get_global_id(1);

    if (y < rows && x < cols)
@@ -324,36 +595,104 @@ __kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step,
        int src_idx = mad24(y, src_step, src_offset + x);
        int dst_idx = mad24(y, dst_step, dst_offset + x);

-        DATA_TYPE ycrcb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] };
+        global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
+        global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
+
+#ifdef DEPTH_5
+        __constant float * coeffs = c_YCrCb2RGBCoeffs_f;
+#else
+        __constant int * coeffs = c_YCrCb2RGBCoeffs_i;
+#endif
+
+        if (1 == pixels_per_work_item)
+        {
+            const DATA_TYPE ycrcb[] = {src_ptr[0], src_ptr[1], src_ptr[2]};

 #ifdef DEPTH_5
-        __constant float * coeff = c_YCrCb2RGBCoeffs_f;
-        float r = ycrcb[0] + coeff[0] * (ycrcb[1] - HALF_MAX);
-        float g = ycrcb[0] + coeff[1] * (ycrcb[1] - HALF_MAX) + coeff[2] * (ycrcb[2] - HALF_MAX);
-        float b = ycrcb[0] + coeff[3] * (ycrcb[2] - HALF_MAX);
+            float B = ycrcb[0] + (ycrcb[2] - HALF_MAX) * coeffs[3];
+            float G = ycrcb[0] + (ycrcb[2] - HALF_MAX) * coeffs[2] + (ycrcb[1] - HALF_MAX) * coeffs[1];
+            float R = ycrcb[0] + (ycrcb[1] - HALF_MAX) * coeffs[0];
 #else
-        __constant int * coeff = c_YCrCb2RGBCoeffs_i;
-        int r = ycrcb[0] + CV_DESCALE(coeff[0] * (ycrcb[1] - HALF_MAX), yuv_shift);
-        int g = ycrcb[0] + CV_DESCALE(coeff[1] * (ycrcb[1] - HALF_MAX) + coeff[2] * (ycrcb[2] - HALF_MAX), yuv_shift);
-        int b = ycrcb[0] + CV_DESCALE(coeff[3] * (ycrcb[2] - HALF_MAX), yuv_shift);
+            int B = ycrcb[0] + CV_DESCALE((ycrcb[2] - HALF_MAX) * coeffs[3], yuv_shift);
+            int G = ycrcb[0] + CV_DESCALE((ycrcb[2] - HALF_MAX) * coeffs[2] + (ycrcb[1] - HALF_MAX) * coeffs[1], yuv_shift);
+            int R = ycrcb[0] + CV_DESCALE((ycrcb[1] - HALF_MAX) * coeffs[0], yuv_shift);
 #endif

-        dst[dst_idx + (bidx^2)] = SAT_CAST(r);
-        dst[dst_idx + 1] = SAT_CAST(g);
-        dst[dst_idx + bidx] = SAT_CAST(b);
+            dst_ptr[bidx]     = SAT_CAST( B );
+            dst_ptr[1]        = SAT_CAST( G );
+            dst_ptr[(bidx^2)] = SAT_CAST( R );
 #if dcn == 4
-        dst[dst_idx + 3] = MAX_NUM;
+            dst_ptr[3]         = MAX_NUM;
+#endif
+        }
+#ifdef INTEL_DEVICE
+        else if (2 == pixels_per_work_item)
+        {
+            const VECTOR8 r0 = vload8(0, src_ptr);
+
+#ifdef DEPTH_5
+            const float2 Y  = r0.s04;
+            const float2 Cr = r0.s15;
+            const float2 Cb = r0.s26;
+
+            const float2 c0 = (bidx == 0) ? (Y + (Cb - HALF_MAX) * coeffs[3]) : (Y + (Cr - HALF_MAX) * coeffs[0]);
+            const float2 c1 = Y + (Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1];
+            const float2 c2 = (bidx == 0) ? (Y + (Cr - HALF_MAX) * coeffs[0]) : (Y + (Cb - HALF_MAX) * coeffs[3]);
+#else
+            const int2 Y  = convert_int2(r0.s04);
+            const int2 Cr = convert_int2(r0.s15);
+            const int2 Cb = convert_int2(r0.s26);
+
+            const int2 c0i = (bidx == 0) ? (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift));
+            const int2 c1i = Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1], yuv_shift);
+            const int2 c2i = (bidx == 0) ? (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift));
+
+            const VECTOR2 c0 = SAT_CAST2(c0i);
+            const VECTOR2 c1 = SAT_CAST2(c1i);
+            const VECTOR2 c2 = SAT_CAST2(c2i);
+#endif
+
+#if dcn == 4
+            vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM), 0, dst_ptr);
+#else
+            vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0), 0, dst_ptr);
+#endif
+        }
+        else if (4 == pixels_per_work_item)
+        {
+#ifndef DEPTH_5
+            const VECTOR16 r0 = vload16(0, src_ptr);
+
+            const int4 Y  = convert_int4(r0.s048c);
+            const int4 Cr = convert_int4(r0.s159d);
+            const int4 Cb = convert_int4(r0.s26ae);
+
+            const int4 c0i = (bidx == 0) ? (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift));
+            const int4 c1i = Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1], yuv_shift);
+            const int4 c2i = (bidx == 0) ? (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift));
+
+            const VECTOR4 c0 = SAT_CAST4(c0i);
+            const VECTOR4 c1 = SAT_CAST4(c1i);
+            const VECTOR4 c2 = SAT_CAST4(c2i);
+
+#if dcn == 4
+            vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM, c0.s2, c1.s2, c2.s2, MAX_NUM, c0.s3, c1.s3, c2.s3, MAX_NUM), 0, dst_ptr);
+#else
+            vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0, c0.s2, c1.s2, c2.s2, 0, c0.s3, c1.s3, c2.s3, 0), 0, dst_ptr);
 #endif
+#endif
+        }
+#endif //INTEL_DEVICE
    }
 }

 ///////////////////////////////////// RGB <-> XYZ //////////////////////////////////////

 __kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step,
-                      int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
+                      __global const DATA_TYPE* src, __global DATA_TYPE* dst,
                      int src_offset, int dst_offset, __constant COEFF_TYPE * coeffs)
 {
-    int dx = get_global_id(0);
+    int dx = get_global_id(0) * pixels_per_work_item;
    int dy = get_global_id(1);

    if (dy < rows && dx < cols)
@@ -362,28 +701,85 @@ __kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step,
        int src_idx = mad24(dy, src_step, src_offset + dx);
        int dst_idx = mad24(dy, dst_step, dst_offset + dx);

-        DATA_TYPE r = src[src_idx], g = src[src_idx + 1], b = src[src_idx + 2];
+        global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
+        global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
+
+        if (1 == pixels_per_work_item)
+        {
+            DATA_TYPE R = src_ptr[0], G = src_ptr[1], B = src_ptr[2];

 #ifdef DEPTH_5
-        float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2];
-        float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5];
-        float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8];
+            float X = R * coeffs[0] + G * coeffs[1] + B * coeffs[2];
+            float Y = R * coeffs[3] + G * coeffs[4] + B * coeffs[5];
+            float Z = R * coeffs[6] + G * coeffs[7] + B * coeffs[8];
 #else
-        int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift);
-        int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], xyz_shift);
-        int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift);
+            int X = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift);
+            int Y = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift);
+            int Z = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift);
 #endif
-        dst[dst_idx] = SAT_CAST(x);
-        dst[dst_idx + 1] = SAT_CAST(y);
-        dst[dst_idx + 2] = SAT_CAST(z);
+
+            dst_ptr[0] = SAT_CAST( X );
+            dst_ptr[1] = SAT_CAST( Y );
+            dst_ptr[2] = SAT_CAST( Z );
+        }
+#ifdef INTEL_DEVICE
+        else if (2 == pixels_per_work_item)
+        {
+            const VECTOR8 r0 = vload8(0, src_ptr);
+
+#ifdef DEPTH_5
+            const float2 R = r0.s04;
+            const float2 G = r0.s15;
+            const float2 B = r0.s26;
+
+            const float2 X = R * coeffs[0] + G * coeffs[1] + B * coeffs[2];
+            const float2 Y = R * coeffs[3] + G * coeffs[4] + B * coeffs[5];
+            const float2 Z = R * coeffs[6] + G * coeffs[7] + B * coeffs[8];
+#else
+            const int2 R = convert_int2(r0.s04);
+            const int2 G = convert_int2(r0.s15);
+            const int2 B = convert_int2(r0.s26);
+
+            const int2 xi = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift);
+            const int2 yi = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift);
+            const int2 zi = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift);
+
+            const VECTOR2 X = SAT_CAST2(xi);
+            const VECTOR2 Y = SAT_CAST2(yi);
+            const VECTOR2 Z = SAT_CAST2(zi);
+#endif
+
+            vstore8((VECTOR8)(X.s0, Y.s0, Z.s0, 0, X.s1, Y.s1, Z.s1, 0), 0, dst_ptr);
+        }
+        else if (4 == pixels_per_work_item)
+        {
+#ifndef DEPTH_5
+            const VECTOR16 r0 = vload16(0, src_ptr);
+
+            const int4 R = convert_int4(r0.s048c);
+            const int4 G = convert_int4(r0.s159d);
+            const int4 B = convert_int4(r0.s26ae);
+
+            const int4 xi = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift);
+            const int4 yi = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift);
+            const int4 zi = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift);
+
+            const VECTOR4 X = SAT_CAST4(xi);
+            const VECTOR4 Y = SAT_CAST4(yi);
+            const VECTOR4 Z = SAT_CAST4(zi);
+
+            vstore16((VECTOR16)(X.s0, Y.s0, Z.s0, 0, X.s1, Y.s1, Z.s1, 0, X.s2, Y.s2, Z.s2, 0, X.s3, Y.s3, Z.s3, 0), 0, dst_ptr);
+#endif
+        }
+#endif //INTEL_DEVICE
    }
 }

 __kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step,
-                      int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
+                      __global const DATA_TYPE* src, __global DATA_TYPE* dst,
                      int src_offset, int dst_offset, __constant COEFF_TYPE * coeffs)
 {
-    int dx = get_global_id(0);
+    int dx = get_global_id(0) * pixels_per_work_item;
    int dy = get_global_id(1);

    if (dy < rows && dx < cols)
@@ -392,23 +788,88 @@ __kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step,
        int src_idx = mad24(dy, src_step, src_offset + dx);
        int dst_idx = mad24(dy, dst_step, dst_offset + dx);

-        DATA_TYPE x = src[src_idx], y = src[src_idx + 1], z = src[src_idx + 2];
+        global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
+        global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
+
+        if (1 == pixels_per_work_item)
+        {
+            const DATA_TYPE X = src_ptr[0], Y = src_ptr[1], Z = src_ptr[2];

 #ifdef DEPTH_5
-        float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2];
-        float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5];
-        float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8];
+            float B = X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2];
+            float G = X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5];
+            float R = X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8];
 #else
-        int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift);
-        int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift);
-        int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift);
+            int B = CV_DESCALE(X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2], xyz_shift);
+            int G = CV_DESCALE(X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5], xyz_shift);
+            int R = CV_DESCALE(X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8], xyz_shift);
 #endif
-        dst[dst_idx] = SAT_CAST(b);
-        dst[dst_idx + 1] = SAT_CAST(g);
-        dst[dst_idx + 2] = SAT_CAST(r);
+
+            dst_ptr[0] = SAT_CAST( B );
+            dst_ptr[1] = SAT_CAST( G );
+            dst_ptr[2] = SAT_CAST( R );
 #if dcn == 4
-        dst[dst_idx + 3] = MAX_NUM;
+            dst_ptr[3] = MAX_NUM;
+#endif
+        }
+#ifdef INTEL_DEVICE
+        else if (2 == pixels_per_work_item)
+        {
+            const VECTOR8 r0 = vload8(0, src_ptr);
+
+#ifdef DEPTH_5
+            const float2 X = r0.s04;
+            const float2 Y = r0.s15;
+            const float2 Z = r0.s26;
+
+            float2 B = X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2];
+            float2 G = X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5];
+            float2 R = X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8];
+#else
+            const int2 xi = convert_int2(r0.s04);
+            const int2 yi = convert_int2(r0.s15);
+            const int2 zi = convert_int2(r0.s26);
+
+            const int2 bi = CV_DESCALE(xi * coeffs[0] + yi * coeffs[1] + zi * coeffs[2], xyz_shift);
+            const int2 gi = CV_DESCALE(xi * coeffs[3] + yi * coeffs[4] + zi * coeffs[5], xyz_shift);
+            const int2 ri = CV_DESCALE(xi * coeffs[6] + yi * coeffs[7] + zi * coeffs[8], xyz_shift);
+
+            const VECTOR2 R = SAT_CAST2(ri);
+            const VECTOR2 G = SAT_CAST2(gi);
+            const VECTOR2 B = SAT_CAST2(bi);
+#endif
+
+#if dcn == 4
+            vstore8((VECTOR8)(B.s0, G.s0, R.s0, MAX_NUM, B.s1, G.s1, R.s1, MAX_NUM), 0, dst_ptr);
+#else
+            vstore8((VECTOR8)(B.s0, G.s0, R.s0, 0, B.s1, G.s1, R.s1, 0), 0, dst_ptr);
+#endif
+        }
+        else if (4 == pixels_per_work_item)
+        {
+#ifndef DEPTH_5
+            const VECTOR16 r0 = vload16(0, src_ptr);
+
+            const int4 xi = convert_int4(r0.s048c);
+            const int4 yi = convert_int4(r0.s159d);
+            const int4 zi = convert_int4(r0.s26ae);
+
+            const int4 bi = CV_DESCALE(xi * coeffs[0] + yi * coeffs[1] + zi * coeffs[2], xyz_shift);
+            const int4 gi = CV_DESCALE(xi * coeffs[3] + yi * coeffs[4] + zi * coeffs[5], xyz_shift);
+            const int4 ri = CV_DESCALE(xi * coeffs[6] + yi * coeffs[7] + zi * coeffs[8], xyz_shift);
+
+            const VECTOR4 R = SAT_CAST4(ri);
+            const VECTOR4 G = SAT_CAST4(gi);
+            const VECTOR4 B = SAT_CAST4(bi);
+
+#if dcn == 4
+            vstore16((VECTOR16)(B.s0, G.s0, R.s0, MAX_NUM, B.s1, G.s1, R.s1, MAX_NUM, B.s2, G.s2, R.s2, MAX_NUM, B.s3, G.s3, R.s3, MAX_NUM), 0, dst_ptr);
+#else
+            vstore16((VECTOR16)(B.s0, G.s0, R.s0, 0, B.s1, G.s1, R.s1, 0, B.s2, G.s2, R.s2, 0, B.s3, G.s3, R.s3, 0), 0, dst_ptr);
 #endif
+#endif
+        }
+#endif //INTEL_DEVICE
    }
 }

@@ -427,6 +888,7 @@ __kernel void RGB(int cols, int rows, int src_step, int dst_step,
        int src_idx = mad24(y, src_step, src_offset + x);
        int dst_idx = mad24(y, dst_step, dst_offset + x);

+#ifndef INTEL_DEVICE
 #ifdef REVERSE
        dst[dst_idx] = src[src_idx + 2];
        dst[dst_idx + 1] = src[src_idx + 1];
@@ -443,13 +905,44 @@ __kernel void RGB(int cols, int rows, int src_step, int dst_step,
 #else
        dst[dst_idx + 3] = src[src_idx + 3];
 #endif
+#endif
+#else
+        global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
+        global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
+
+        const VECTOR4 r0 = vload4(0, src_ptr);
+#ifdef REVERSE
+        if (3 == dcn)
+        {
+            vstore4((VECTOR4)(r0.s210, 0), 0, dst_ptr);
+        }
+        else if (3 == scn)
+        {
+            vstore4((VECTOR4)(r0.s210, MAX_NUM), 0, dst_ptr);
+        }
+        else {
+            vstore4((VECTOR4)(r0.s2103), 0, dst_ptr);
+        }
+#elif defined ORDER
+        if (3 == dcn)
+        {
+            vstore4((VECTOR4)(r0.s012, 0), 0, dst_ptr);
+        }
+        else if (3 == scn)
+        {
+            vstore4((VECTOR4)(r0.s012, MAX_NUM), 0, dst_ptr);
+        }
+        else {
+            vstore4(r0, 0, dst_ptr);
+        }
+#endif
 #endif
    }
 }

 ///////////////////////////////////// RGB5x5 <-> RGB //////////////////////////////////////

-__kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, int bidx,
+__kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step,
                         __global const ushort * src, __global uchar * dst,
                         int src_offset, int dst_offset)
 {
@@ -482,7 +975,7 @@ __kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, int bid
    }
 }

-__kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, int bidx,
+__kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step,
                         __global const uchar * src, __global ushort * dst,
                         int src_offset, int dst_offset)
 {
@@ -507,7 +1000,7 @@ __kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, int bid

 ///////////////////////////////////// RGB5x5 <-> RGB //////////////////////////////////////

-__kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, int bidx,
+__kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step,
                          __global const ushort * src, __global uchar * dst,
                          int src_offset, int dst_offset)
 {
@@ -532,7 +1025,7 @@ __kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, int bi
    }
 }

-__kernel void Gray2BGR5x5(int cols, int rows, int src_step, int dst_step, int bidx,
+__kernel void Gray2BGR5x5(int cols, int rows, int src_step, int dst_step,
                          __global const uchar * src, __global ushort * dst,
                          int src_offset, int dst_offset)
 {
@@ -560,7 +1053,7 @@ __constant int sector_data[][3] = { {1, 3, 0}, { 1, 0, 2 }, { 3, 0, 1 }, { 0, 2,

 #ifdef DEPTH_0

-__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx,
+__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step,
                      __global const uchar * src, __global uchar * dst,
                      int src_offset, int dst_offset,
                      __constant int * sdiv_table, __constant int * hdiv_table)
@@ -600,7 +1093,7 @@ __kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx,
    }
 }

-__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
+__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step,
                      __global const uchar * src, __global uchar * dst,
                      int src_offset, int dst_offset)
 {
@@ -656,7 +1149,7 @@ __kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx,

 #elif defined DEPTH_5

-__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx,
+__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step,
                      __global const float * src, __global float * dst,
                      int src_offset, int dst_offset)
 {
@@ -698,7 +1191,7 @@ __kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx,
    }
 }

-__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
+__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step,
                      __global const float * src, __global float * dst,
                      int src_offset, int dst_offset)
 {
@@ -758,7 +1251,7 @@ __kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx,

 #ifdef DEPTH_0

-__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx,
+__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step,
                      __global const uchar * src, __global uchar * dst,
                      int src_offset, int dst_offset)
 {
@@ -805,7 +1298,7 @@ __kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx,
    }
 }

-__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
+__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step,
                      __global const uchar * src, __global uchar * dst,
                      int src_offset, int dst_offset)
 {
@@ -860,7 +1353,7 @@ __kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx,

 #elif defined DEPTH_5

-__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx,
+__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step,
                      __global const float * src, __global float * dst,
                      int src_offset, int dst_offset)
 {
@@ -907,7 +1400,7 @@ __kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx,
    }
 }

-__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
+__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step,
                      __global const float * src, __global float * dst,
                      int src_offset, int dst_offset)
 {
@@ -968,10 +1461,10 @@ __kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
 #ifdef DEPTH_0

 __kernel void RGBA2mRGBA(int cols, int rows, int src_step, int dst_step,
-                        int bidx, __global const uchar * src, __global uchar * dst,
+                        __global const uchar * src, __global uchar * dst,
                        int src_offset, int dst_offset)
 {
-    int x = get_global_id(0);
+    int x = get_global_id(0) * pixels_per_work_item;
    int y = get_global_id(1);

    if (y < rows && x < cols)
@@ -980,21 +1473,66 @@ __kernel void RGBA2mRGBA(int cols, int rows, int src_step, int dst_step,
        int src_idx = mad24(y, src_step, src_offset + x);
        int dst_idx = mad24(y, dst_step, dst_offset + x);

-        uchar v0 = src[src_idx], v1 = src[src_idx + 1];
-        uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3];
+        global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
+        global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
+
+        if (1 == pixels_per_work_item)
+        {
+            const uchar4 r0 = vload4(0, src_ptr);

-        dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM;
-        dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM;
-        dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM;
-        dst[dst_idx + 3] = v3;
+            dst_ptr[0] = (r0.s0 * r0.s3 + HALF_MAX) / MAX_NUM;
+            dst_ptr[1] = (r0.s1 * r0.s3 + HALF_MAX) / MAX_NUM;
+            dst_ptr[2] = (r0.s2 * r0.s3 + HALF_MAX) / MAX_NUM;
+            dst_ptr[3] = r0.s3;
+        }
+#ifdef INTEL_DEVICE
+        else if (2 == pixels_per_work_item)
+        {
+            const uchar8 r0 = vload8(0, src_ptr);
+
+            const int2 v0 = convert_int2(r0.s04);
+            const int2 v1 = convert_int2(r0.s15);
+            const int2 v2 = convert_int2(r0.s26);
+            const int2 v3 = convert_int2(r0.s37);
+
+            const int2 ri = (v0 * v3 + HALF_MAX) / MAX_NUM;
+            const int2 gi = (v1 * v3 + HALF_MAX) / MAX_NUM;
+            const int2 bi = (v2 * v3 + HALF_MAX) / MAX_NUM;
+
+            const uchar2 r = convert_uchar2(ri);
+            const uchar2 g = convert_uchar2(gi);
+            const uchar2 b = convert_uchar2(bi);
+
+            vstore8((uchar8)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1), 0, dst_ptr);
+        }
+        else if (4 == pixels_per_work_item)
+        {
+            const uchar16 r0 = vload16(0, src_ptr);
+
+            const int4 v0 = convert_int4(r0.s048c);
+            const int4 v1 = convert_int4(r0.s159d);
+            const int4 v2 = convert_int4(r0.s26ae);
+            const int4 v3 = convert_int4(r0.s37bf);
+
+            const int4 ri = (v0 * v3 + HALF_MAX) / MAX_NUM;
+            const int4 gi = (v1 * v3 + HALF_MAX) / MAX_NUM;
+            const int4 bi = (v2 * v3 + HALF_MAX) / MAX_NUM;
+
+            const uchar4 r = convert_uchar4(ri);
+            const uchar4 g = convert_uchar4(gi);
+            const uchar4 b = convert_uchar4(bi);
+
+            vstore16((uchar16)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1, r.s2, g.s2, b.s2, v3.s2, r.s3, g.s3, b.s3, v3.s3), 0, dst_ptr);
+        }
+#endif //INTEL_DEVICE
    }
 }

-__kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, int bidx,
+__kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step,
                        __global const uchar * src, __global uchar * dst,
                        int src_offset, int dst_offset)
 {
-    int x = get_global_id(0);
+    int x = get_global_id(0) * pixels_per_work_item;
    int y = get_global_id(1);

    if (y < rows && x < cols)
@@ -1003,14 +1541,63 @@ __kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, int bid
        int src_idx = mad24(y, src_step, src_offset + x);
        int dst_idx = mad24(y, dst_step, dst_offset + x);

-        uchar v0 = src[src_idx], v1 = src[src_idx + 1];
-        uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3];
-        uchar v3_half = v3 / 2;
+        global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx);
+        global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx);
+
+        if (1 == pixels_per_work_item)
+        {
+            const uchar4 r0 = vload4(0, src_ptr);
+            const uchar v3_half = r0.s3 / 2;

-        dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3;
-        dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3;
-        dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3;
-        dst[dst_idx + 3] = v3;
+            const uchar r = (r0.s3 == 0) ? 0 : (r0.s0 * MAX_NUM + v3_half) / r0.s3;
+            const uchar g = (r0.s3 == 0) ? 0 : (r0.s1 * MAX_NUM + v3_half) / r0.s3;
+            const uchar b = (r0.s3 == 0) ? 0 : (r0.s2 * MAX_NUM + v3_half) / r0.s3;
+
+            vstore4((uchar4)(r, g, b, r0.s3), 0, dst_ptr);
+        }
+#ifdef INTEL_DEVICE
+        else if (2 == pixels_per_work_item)
+        {
+            const uchar8 r0 = vload8(0, src_ptr);
+
+            const int2 v0 = convert_int2(r0.s04);
+            const int2 v1 = convert_int2(r0.s15);
+            const int2 v2 = convert_int2(r0.s26);
+            const int2 v3 = convert_int2(r0.s37);
+            const int2 v3_half = v3 / 2;
+
+            const int2 ri = (v3 == 0) ? 0 : (v0 * MAX_NUM + v3_half) / v3;
+            const int2 gi = (v3 == 0) ? 0 : (v1 * MAX_NUM + v3_half) / v3;
+            const int2 bi = (v3 == 0) ? 0 : (v2 * MAX_NUM + v3_half) / v3;
+
+            const uchar2 r = convert_uchar2(ri);
+            const uchar2 g = convert_uchar2(gi);
+            const uchar2 b = convert_uchar2(bi);
+
+            vstore8((uchar8)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1), 0, dst_ptr);
+        }
+        else if (4 == pixels_per_work_item)
+        {
+            const uchar16 r0 = vload16(0, src_ptr);
+
+            const int4 v0 = convert_int4(r0.s048c);
+            const int4 v1 = convert_int4(r0.s159d);
+            const int4 v2 = convert_int4(r0.s26ae);
+            const int4 v3 = convert_int4(r0.s37bf);
+            const int4 v3_half = v3 / 2;
+
+
+            const int4 ri = (v3 == 0) ? 0 : (v0 * MAX_NUM + v3_half) / v3;
+            const int4 gi = (v3 == 0) ? 0 : (v1 * MAX_NUM + v3_half) / v3;
+            const int4 bi = (v3 == 0) ? 0 : (v2 * MAX_NUM + v3_half) / v3;
+
+            const uchar4 r = convert_uchar4(ri);
+            const uchar4 g = convert_uchar4(gi);
+            const uchar4 b = convert_uchar4(bi);
+
+            vstore16((uchar16)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1, r.s2, g.s2, b.s2, v3.s2, r.s3, g.s3, b.s3, v3.s3), 0, dst_ptr);
+        }
+#endif //INTEL_DEVICE
    }
 }