use perf test replace performance sample

5539e85a · yao · 55c9a7c8 · 5539e85a · 5539e85a · 5539e85a
25 changed file
--- a/modules/ocl/perf/main.cpp
+++ b/modules/ocl/perf/main.cpp
@@ -7,12 +7,13 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
-//
+
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -21,12 +22,12 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other oclMaterials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -41,129 +42,118 @@

 #include "precomp.hpp"

-#ifdef HAVE_OPENCL
-
-using namespace std;
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-
-void print_info()
-{
-    printf("\n");
-#if defined _WIN32
-#   if defined _WIN64
-    puts("OS: Windows 64");
-#   else
-    puts("OS: Windows 32");
-#   endif
-#elif defined linux
-#   if defined _LP64
-    puts("OS: Linux 64");
-#   else
-    puts("OS: Linux 32");
-#   endif
-#elif defined __APPLE__
-#   if defined _LP64
-    puts("OS: Apple 64");
-#   else
-    puts("OS: Apple 32");
-#   endif
-#endif
-
-}
-std::string workdir;
-int main(int argc, char **argv)
+int main(int argc, const char *argv[]) // Entry point: lists OpenCL devices, parses options, configures TestSystem, then runs it.
 {
-    TS::ptr()->init("ocl");
-    InitGoogleTest(&argc, argv);
-    const char *keys =
-
-        "{ h | help     | false              | print help message }"
-
-        "{ w | workdir  | ../../../samples/c/| set working directory }"
+    vector<ocl::Info> oclinfo;
+    int num_devices = getDevice(oclinfo); // return value is used below as the number of selectable devices

-        "{ t | type     | gpu                | set device type:cpu or gpu}"
+    if (num_devices < 1)
+    {
+        cerr << "no device found\n";
+        return -1;
+    }

-        "{ p | platform | 0                  | set platform id }"
+    int devidx = 0; // running index over every DeviceName of every oclinfo entry

-        "{ d | device   | 0                  | set device id }";
+    for (size_t i = 0; i < oclinfo.size(); i++)
+    {
+        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
+        {
+            printf("device %d: %s\n", devidx++, oclinfo[i].DeviceName[j].c_str()); // the printed index is what --device expects
+        }
+    }

+    redirectError(cvErrorCallback); // cvErrorCallback is defined outside this file

+    const char *keys =
+        "{ h | help    | false | print help message }"
+        "{ f | filter  |       | filter for test }"
+        "{ w | workdir |       | set working directory }"
+        "{ l | list    | false | show all tests }"
+        "{ d | device  | 0     | device id }"
+        "{ i | iters   | 10    | iteration count }"
+        "{ m | warmup  | 1     | gpu warm up iteration count}"
+        "{ t | xtop    | 1.1	  | xfactor top boundary}"
+        "{ b | xbottom | 0.9	  | xfactor bottom boundary}"
+        "{ v | verify  | false | only run gpu once to verify if problems occur}";

    CommandLineParser cmd(argc, argv, keys);

    if (cmd.get<bool>("help"))
-
    {
-
-        cout << "Avaible options besides goole test option:" << endl;
-
+        cout << "Avaible options:" << endl;
        cmd.printParams();
+        return 0; // help only: exit before any device is selected or test is run
    }

-    workdir = cmd.get<string>("workdir");
-
-    string type = cmd.get<string>("type");
-
-    unsigned int pid = cmd.get<unsigned int>("platform");
-
    int device = cmd.get<int>("device");

-
-    print_info();
-    // int flag = CVCL_DEVICE_TYPE_GPU;
-
-    // if(type == "cpu")
-
-    // {
-
-    //     flag = CVCL_DEVICE_TYPE_CPU;
-
-    // }
-    std::vector<cv::ocl::Info> oclinfo;
-    int devnums = getDevice(oclinfo);
-    if(devnums <= device || device < 0)
-
+    if (device < 0 || device >= num_devices) // reject indices outside [0, num_devices)
    {
-
-        std::cout << "device invalid\n";
-
+        cerr << "Invalid device ID" << endl;
        return -1;
+    }

+    if (cmd.get<bool>("verify")) // NOTE(review): setNumIters/setGPUWarmupIters are called again unconditionally before run() below, which appears to override the two values set here - confirm intended
+    {
+        TestSystem::instance().setNumIters(1);
+        TestSystem::instance().setGPUWarmupIters(0);
+        TestSystem::instance().setCPUIters(0);
    }

-    if(pid >= oclinfo.size())
+    devidx = 0; // restart the same flattened numbering used when listing devices

+    for (size_t i = 0; i < oclinfo.size(); i++)
+    {
+        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++, devidx++)
        {
+            if (device == devidx)
+            {
+                ocl::setDevice(oclinfo[i], (int)j); // j is the index inside oclinfo[i], not the flattened index
+                TestSystem::instance().setRecordName(oclinfo[i].DeviceName[j]);
+                printf("\nuse %d: %s\n", devidx, oclinfo[i].DeviceName[j].c_str());
+                goto END_DEV; // leave both loops once the requested device is set
+            }
+        }
+    }

-        std::cout << "platform invalid\n";
+END_DEV:

-        return -1;
+    string filter = cmd.get<string>("filter");
+    string workdir = cmd.get<string>("workdir");
+    bool list = cmd.get<bool>("list");
+    int iters = cmd.get<int>("iters");
+    int wu_iters = cmd.get<int>("warmup");
+    double x_top = cmd.get<double>("xtop");
+    double x_bottom = cmd.get<double>("xbottom");

-    }
+    TestSystem::instance().setTopThreshold(x_top);
+    TestSystem::instance().setBottomThreshold(x_bottom);

-    if(pid != 0 || device != 0)
+    if (!filter.empty()) // an empty filter leaves TestSystem's filter setting untouched
+    {
+        TestSystem::instance().setTestFilter(filter);
+    }

+    if (!workdir.empty())
+    {
+        if (workdir[workdir.size() - 1] != '/' && workdir[workdir.size() - 1] != '\\') // ensure the directory ends with a path separator
        {
+            workdir += '/';
+        }

-        setDevice(oclinfo[pid], device);
+        TestSystem::instance().setWorkingDir(workdir);
+    }

+    if (list)
+    {
+        TestSystem::instance().setListMode(true);
    }

-    cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl;
-    setBinpath(CLBINPATH);
-    return RUN_ALL_TESTS();
-}
+    TestSystem::instance().setNumIters(iters); // applied unconditionally, also when --verify was given (see note above)
+    TestSystem::instance().setGPUWarmupIters(wu_iters);

-#else // DON'T HAVE_OPENCL
+    TestSystem::instance().run();

-int main()
-{
-    printf("OpenCV was built without OpenCL support\n");
    return 0;
 }
\ No newline at end of file
-
-
-#endif // HAVE_OPENCL
--- a/modules/ocl/perf/perf_arithm.cpp
+++ b/modules/ocl/perf/perf_arithm.cpp
--- a/modules/ocl/perf/perf_blend.cpp
+++ b/modules/ocl/perf/perf_blend.cpp
@@ -44,79 +44,77 @@
 //M*/

 #include "precomp.hpp"
-#include <iomanip>
-
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-PARAM_TEST_CASE(Blend, MatType, int)
+// blend: CPU reference ("gold") weighted blend of img1 and img2 into result_gold; T is the element type read from the images.
+template <typename T>
+void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &weights1, const cv::Mat &weights2, cv::Mat &result_gold)
 {
-    int type;
-    int channels;
-    std::vector<cv::ocl::Info> oclinfo;
+    result_gold.create(img1.size(), img1.type()); // output takes the size and type of img1
+
+    int cn = img1.channels();

-    virtual void SetUp()
+    for (int y = 0; y < img1.rows; ++y)
    {
+        const float *weights1_row = weights1.ptr<float>(y); // weights are read as float rows
+        const float *weights2_row = weights2.ptr<float>(y);
+        const T *img1_row = img1.ptr<T>(y);
+        const T *img2_row = img2.ptr<T>(y);
+        T *result_gold_row = result_gold.ptr<T>(y);

-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        //cv::ocl::setBinpath(CLBINPATH);
+        for (int x = 0; x < img1.cols * cn; ++x) // x walks interleaved channel elements, not pixels
+        {
+            float w1 = weights1_row[x / cn]; // x / cn is the pixel index: one weight is shared by all channels of a pixel
+            float w2 = weights2_row[x / cn];
+            result_gold_row[x] = static_cast<T>((img1_row[x] * w1 + img2_row[x] * w2) / (w1 + w2 + 1e-5f)); // weighted mean; the 1e-5f presumably guards against w1 + w2 == 0
        }
-};
-
-TEST_P(Blend, Performance)
+    }
+}
+TEST(blend) // Times blendLinearGold (CPU) against ocl::blendLinear for each size and 8-bit type; SUBTEST/CPU_ON/GPU_ON/... macros are defined outside this file.
 {
-    cv::Size size(MWIDTH, MHEIGHT);
-    cv::Mat img1_host = randomMat(size, CV_MAKETYPE(type, channels), 0, type == CV_8U ? 255.0 : 1.0);
-    cv::Mat img2_host = randomMat(size, CV_MAKETYPE(type, channels), 0, type == CV_8U ? 255.0 : 1.0);
-    cv::Mat weights1 = randomMat(size, CV_32F, 0, 1);
-    cv::Mat weights2 = randomMat(size, CV_32F, 0, 1);
-    cv::ocl::oclMat gimg1(size, CV_MAKETYPE(type, channels)), gimg2(size, CV_MAKETYPE(type, channels)), gweights1(size, CV_32F), gweights2(size, CV_32F);
-    cv::ocl::oclMat gdst(size, CV_MAKETYPE(type, channels));
+    Mat src1, src2, weights1, weights2, dst;
+    ocl::oclMat d_src1, d_src2, d_weights1, d_weights2, d_dst;

+    int all_type[] = {CV_8UC1, CV_8UC4}; // both entries are 8-bit, which is why blendLinearGold<uchar> is used for every subtest
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"}; // labels, kept index-aligned with all_type

-    double totalgputick_all = 0;
-    double totalgputick_kernel = 0;
-    double t1 = 0;
-    double t2 = 0;
-
-    for (int j = 0; j < LOOP_TIMES + 1; j ++) //LOOP_TIMES=100
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple) // Min_Size/Max_Size/Multiple are defined outside this file
    {
-        t1 = (double)cvGetTickCount();
-        cv::ocl::oclMat gimg1 = cv::ocl::oclMat(img1_host);
-        cv::ocl::oclMat gimg2 = cv::ocl::oclMat(img2_host);
-        cv::ocl::oclMat gweights1 = cv::ocl::oclMat(weights1);
-        cv::ocl::oclMat gweights2 = cv::ocl::oclMat(weights1);
-
-        t2 = (double)cvGetTickCount();
-        cv::ocl::blendLinear(gimg1, gimg2, gweights1, gweights2, gdst);
-        t2 = (double)cvGetTickCount() - t2;
-
-        cv::Mat m;
-        gdst.download(m);
-        t1 = (double)cvGetTickCount() - t1;
-
-        if (j == 0)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
        {
-            continue;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " and CV_32FC1";
+
+            gen(src1, size, size, all_type[j], 0, 256); // gen is defined elsewhere; presumably fills the matrix with values between the last two arguments - confirm
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(weights1, size, size, CV_32FC1, 0, 1);
+            gen(weights2, size, size, CV_32FC1, 0, 1);
+
+            blendLinearGold<uchar>(src1, src2, weights1, weights2, dst); // call placed outside the CPU_ON/CPU_OFF pair (presumably a warm-up - confirm)
+
+            CPU_ON;
+            blendLinearGold<uchar>(src1, src2, weights1, weights2, dst);
+            CPU_OFF;
+
+            d_src1.upload(src1); // inputs are uploaded before the GPU_ON section, so that section excludes transfers
+            d_src2.upload(src2);
+            d_weights1.upload(weights1);
+            d_weights2.upload(weights2);
+
+            WARMUP_ON;
+            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
+             ; // empty statement, no effect
+            GPU_OFF;
+
+            GPU_FULL_ON; // this section also covers the uploads and the download of the result
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            d_weights1.upload(weights1);
+            d_weights2.upload(weights2);
+            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
    }
-
-        totalgputick_all = t1 + totalgputick_all;
-        totalgputick_kernel = t2 + totalgputick_kernel;
-    };
-
-    cout << "average gpu total  runtime is  " << totalgputick_all / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-    cout << "average gpu runtime without data transfering  is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
 }
\ No newline at end of file
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, Combine(
-                            Values(CV_8U, CV_32F), Values(1, 4)));
-#endif
\ No newline at end of file
--- a/modules/ocl/perf/perf_brute_force_matcher.cpp
+++ b/modules/ocl/perf/perf_brute_force_matcher.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+// BruteForceMatch: for each size, times CPU BFMatcher against ocl::BruteForceMatcher_OCL_base for match, knnMatch (k = 2) and radiusMatch.
+TEST(BruteForceMatcher) // SUBTEST/CPU_ON/WARMUP_ON/GPU_ON/GPU_FULL_ON macros are defined outside this file
+{
+    Mat trainIdx_cpu;
+    Mat distance_cpu;
+    Mat allDist_cpu;
+    Mat nMatches_cpu;
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        // Init CPU matcher
+        int desc_len = 64; // passed to gen() after size; presumably the descriptor length (columns) - confirm against gen
+
+        BFMatcher matcher(NORM_L2);
+
+        Mat query;
+        gen(query, size, desc_len, CV_32F, 0, 1);
+
+        Mat train;
+        gen(train, size, desc_len, CV_32F, 0, 1);
+        // Output
+        vector< vector<DMatch> > matches(2); // matches[0] receives match() results; the whole vector is the output of knnMatch/radiusMatch
+        // Init GPU matcher
+        ocl::BruteForceMatcher_OCL_base d_matcher(ocl::BruteForceMatcher_OCL_base::L2Dist);
+
+        ocl::oclMat d_query(query);
+        ocl::oclMat d_train(train);
+
+        ocl::oclMat d_trainIdx, d_distance, d_allDist, d_nMatches; // device-side outputs of the *Single calls
+
+        SUBTEST << size << "; match";
+
+        matcher.match(query, train, matches[0]); // call placed outside the CPU_ON/CPU_OFF pair (presumably a warm-up - confirm)
+
+        CPU_ON;
+        matcher.match(query, train, matches[0]);
+        CPU_OFF;
+
+        WARMUP_ON;
+        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
+        WARMUP_OFF;
+
+        GPU_ON;
+        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance); // results stay in oclMat outputs; no DMatch vector is produced here
+         ; // empty statement, no effect
+        GPU_OFF;
+
+        GPU_FULL_ON; // this section re-uploads the inputs and uses match(), which writes into the host-side matches[0]
+        d_query.upload(query);
+        d_train.upload(train);
+        d_matcher.match(d_query, d_train, matches[0]);
+        GPU_FULL_OFF;
+
+        SUBTEST << size << "; knnMatch";
+
+        matcher.knnMatch(query, train, matches, 2); // call placed outside the CPU_ON/CPU_OFF pair, as for match above
+
+        CPU_ON;
+        matcher.knnMatch(query, train, matches, 2);
+        CPU_OFF;
+
+        WARMUP_ON;
+        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
+        WARMUP_OFF;
+
+        GPU_ON;
+        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
+         ; // empty statement, no effect
+        GPU_OFF;
+
+        GPU_FULL_ON; // this section re-uploads the inputs and uses knnMatch(), which writes into the host-side matches
+        d_query.upload(query);
+        d_train.upload(train);
+        d_matcher.knnMatch(d_query, d_train, matches, 2);
+        GPU_FULL_OFF;
+
+        SUBTEST << size << "; radiusMatch";
+
+        float max_distance = 2.0f;
+
+        matcher.radiusMatch(query, train, matches, max_distance); // call placed outside the CPU_ON/CPU_OFF pair, as for match above
+
+        CPU_ON;
+        matcher.radiusMatch(query, train, matches, max_distance);
+        CPU_OFF;
+
+        d_trainIdx.release(); // NOTE(review): drops the buffer left by knnMatchSingle, presumably so radiusMatchSingle allocates its own - confirm
+
+        WARMUP_ON;
+        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
+        WARMUP_OFF;
+
+        GPU_ON;
+        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
+         ; // empty statement, no effect
+        GPU_OFF;
+
+        GPU_FULL_ON; // this section re-uploads the inputs and uses radiusMatch(), which writes into the host-side matches
+        d_query.upload(query);
+        d_train.upload(train);
+        d_matcher.radiusMatch(d_query, d_train, matches, max_distance);
+        GPU_FULL_OFF;
+    }
+}
\ No newline at end of file
--- a/modules/ocl/perf/perf_canny.cpp
+++ b/modules/ocl/perf/perf_canny.cpp
@@ -42,112 +42,42 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-#ifndef MWC_TEST_UTILITY
-#define MWC_TEST_UTILITY
-
-// Param class
-#ifndef IMPLEMENT_PARAM_CLASS
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-class name \
-    { \
-    public: \
-    name ( type arg = type ()) : val_(arg) {} \
-    operator type () const {return val_;} \
-    private: \
-    type val_; \
-    }; \
-    inline void PrintTo( name param, std::ostream* os) \
-    { \
-    *os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
-    }
-
-IMPLEMENT_PARAM_CLASS(Channels, int)
-#endif // IMPLEMENT_PARAM_CLASS
-#endif // MWC_TEST_UTILITY
-
-////////////////////////////////////////////////////////
-// Canny1
-extern std::string workdir;
-IMPLEMENT_PARAM_CLASS(AppertureSize, int);
-IMPLEMENT_PARAM_CLASS(L2gradient, bool);

-PARAM_TEST_CASE(Canny1, AppertureSize, L2gradient)
+// Canny: times cv::Canny against ocl::Canny on the grayscale image aloeL.jpg with thresholds 50.0 and 100.0.
+TEST(Canny) // SUBTEST/CPU_ON/WARMUP_ON/GPU_ON/GPU_FULL_ON macros and abspath are defined outside this file
 {
-    int apperture_size;
-    bool useL2gradient;
-    //std::vector<cv::ocl::Info> oclinfo;
+    Mat img = imread(abspath("aloeL.jpg"), CV_LOAD_IMAGE_GRAYSCALE); // abspath presumably resolves the name against the configured working directory - confirm

-    virtual void SetUp()
+    if (img.empty()) // imread yields an empty Mat when the file cannot be loaded
    {
-        apperture_size = GET_PARAM(0);
-        useL2gradient = GET_PARAM(1);
-
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
+        throw runtime_error("can't open aloeL.jpg");
    }
-};
-
-TEST_P(Canny1, Performance)
-{
-    cv::Mat img = readImage(workdir + "fruits.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    double low_thresh = 100.0;
-    double high_thresh = 150.0;
-
-    cv::Mat edges_gold;
-    cv::ocl::oclMat edges;
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;

-    double t1 = 0;
-    double t2 = 0;
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat ocl_img = cv::ocl::oclMat(img);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::Canny(ocl_img, edges, low_thresh, high_thresh, apperture_size, useL2gradient);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        edges.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
+    SUBTEST << img.cols << 'x' << img.rows << "; aloeL.jpg" << "; edges" << "; CV_8UC1";

-        if(j == 0)
-            continue;
+    Mat edges(img.size(), CV_8UC1);

-        totalgputick = t1 + totalgputick;
+    CPU_ON; // unlike the other perf tests in this commit, no CPU call precedes this section
+    Canny(img, edges, 50.0, 100.0);
+    CPU_OFF;

-        totalgputick_kernel = t2 + totalgputick_kernel;
+    ocl::oclMat d_img(img); // image is on the device before the GPU_ON section, so that section excludes the upload
+    ocl::oclMat d_edges;
+    ocl::CannyBuf d_buf; // one buffer object is passed to every ocl::Canny call below

-    }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+    WARMUP_ON;
+    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
+    WARMUP_OFF;

+    GPU_ON;
+    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
+     ; // empty statement, no effect
+    GPU_OFF;

+    GPU_FULL_ON; // this section also covers the upload of the image and the download of the edges
+    d_img.upload(img);
+    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
+    d_edges.download(edges);
+    GPU_FULL_OFF;
 }
\ No newline at end of file
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny1, testing::Combine(
-                            testing::Values(AppertureSize(3), AppertureSize(5)),
-                            testing::Values(L2gradient(false), L2gradient(true))));
-
-
-
-#endif  //Have opencl
\ No newline at end of file
--- a/modules/ocl/perf/perf_color.cpp
+++ b/modules/ocl/perf/perf_color.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+// cvtColor: times cv::cvtColor against ocl::cvtColor for CV_RGBA2GRAY on CV_8UC4 inputs of each size.
+TEST(cvtColor) // SUBTEST/CPU_ON/WARMUP_ON/GPU_ON/GPU_FULL_ON macros are defined outside this file
+{
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
+
+    int all_type[] = {CV_8UC4}; // single-entry list; the inner loop below still iterates over it
+    std::string type_name[] = {"CV_8UC4"}; // label, kept index-aligned with all_type
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            gen(src, size, size, all_type[j], 0, 256); // gen is defined elsewhere; presumably fills src with values between the last two arguments - confirm
+            SUBTEST << size << "x" << size << "; " << type_name[j] << " ; CV_RGBA2GRAY";
+
+            cvtColor(src, dst, CV_RGBA2GRAY, 4); // call placed outside the CPU_ON/CPU_OFF pair (presumably a warm-up - confirm)
+
+            CPU_ON;
+            cvtColor(src, dst, CV_RGBA2GRAY, 4);
+            CPU_OFF;
+
+            d_src.upload(src); // uploaded before the GPU_ON section, so that section excludes the transfer
+
+            WARMUP_ON;
+            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
+             ; // empty statement, no effect
+            GPU_OFF;
+
+            GPU_FULL_ON; // this section also covers the upload of src and the download of the result
+            d_src.upload(src);
+            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
+
+
+    }
+
+
+}
\ No newline at end of file
--- a/modules/ocl/perf/perf_columnsum.cpp
+++ b/modules/ocl/perf/perf_columnsum.cpp
@@ -15,8 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//	   Fangfang Bai fangfang@multicorewareinc.com
-//
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -31,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -43,78 +42,47 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-///////////////////////////////////////////////////////////////////////////////
-/// ColumnSum
-
-#ifdef HAVE_OPENCL
-
-////////////////////////////////////////////////////////////////////////
-// ColumnSum

-PARAM_TEST_CASE(ColumnSum)
+// columnSum: times a hand-written CPU running column sum against ocl::columnSum on CV_32FC1 matrices of each size.
+TEST(columnSum) // SUBTEST/CPU_ON/WARMUP_ON/GPU_ON/GPU_FULL_ON macros are defined outside this file
 {
-    cv::Mat src;
-    //std::vector<cv::ocl::Info> oclinfo;
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;

-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-    }
-};
+        SUBTEST << size << 'x' << size << "; CV_32FC1";

-TEST_F(ColumnSum, Performance)
-{
-    cv::Size size(MWIDTH, MHEIGHT);
-    cv::Mat src = randomMat(size, CV_32FC1);
-    cv::ocl::oclMat d_dst;
+        gen(src, size, size, CV_32FC1, 0, 256);

-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t1 = 0;
-    double t2 = 0;
+        CPU_ON; // the section covers dst.create and the nested loops below
+        dst.create(src.size(), src.type());

-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (int i = 1; i < src.rows; ++i) // starts at row 1; NOTE(review): row 0 of dst is never written by this loop
        {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat d_src(src);
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::columnSum(d_src, d_dst);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        d_dst.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
+            for (int j = 0; j < src.cols; ++j)
+            {
+                dst.at<float>(i, j) = src.at<float>(i, j) += src.at<float>(i - 1, j); // NOTE(review): += modifies src in place, so any repeat of this section and the later upload see the already-summed src - confirm intended
+            }
        }

-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+        CPU_OFF;

+        d_src.upload(src); // uploads src as left by the CPU loop above
+        WARMUP_ON;
+        ocl::columnSum(d_src, d_dst);
+        WARMUP_OFF;

+        GPU_ON;
+        ocl::columnSum(d_src, d_dst);
+         ; // empty statement, no effect
+        GPU_OFF;

+        GPU_FULL_ON; // this section also covers the upload of src and the download of the result
+        d_src.upload(src);
+        ocl::columnSum(d_src, d_dst);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
+    }
 }
\ No newline at end of file
-
-
-
-#endif
\ No newline at end of file
--- a/modules/ocl/perf/perf_fft.cpp
+++ b/modules/ocl/perf/perf_fft.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Fangfangbai, fangfang@multicorewareinc.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -42,85 +42,48 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-using namespace std;
-#ifdef HAVE_CLAMDFFT
-////////////////////////////////////////////////////////////////////////////
-// Dft
-PARAM_TEST_CASE(Dft, cv::Size, bool)
-{
-    cv::Size dft_size;
-    bool	 dft_rows;
-    vector<cv::ocl::Info> info;
-    virtual void SetUp()
-    {
-        dft_size = GET_PARAM(0);
-        dft_rows = GET_PARAM(1);
-        cv::ocl::getDevice(info);
-    }
-};

-TEST_P(Dft, C2C)
+// dft: times cv::dft against ocl::dft for CV_32FC1 and CV_32FC2 square inputs of each size.
+TEST(dft) // SUBTEST/CPU_ON/WARMUP_ON/GPU_ON/GPU_FULL_ON macros are defined outside this file
 {
-    cv::Mat a = randomMat(dft_size, CV_32FC2, 0.0, 10.0);
-    int flags = 0;
-    flags |= dft_rows ? cv::DFT_ROWS : 0;
-
-    cv::ocl::oclMat d_b;
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;

-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t1 = 0;
-    double t2 = 0;
+    int all_type[] = {CV_32FC1, CV_32FC2};
+    std::string type_name[] = {"CV_32FC1", "CV_32FC2"}; // labels, kept index-aligned with all_type

-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; complex-to-complex"; // NOTE(review): the same label is printed for the single-channel CV_32FC1 case - confirm wording

-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat ga = cv::ocl::oclMat(a); //upload
+            gen(src, size, size, all_type[j], Scalar::all(0), Scalar::all(1)); // bounds are passed as Scalars here, unlike the plain numbers used by the other perf tests

-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::dft(ga, d_b, a.size(), flags);
-        t2 = (double)cvGetTickCount() - t2;//kernel
+            dft(src, dst); // call placed outside the CPU_ON/CPU_OFF pair (presumably a warm-up - confirm)

-        cv::Mat cpu_dst;
-        d_b.download (cpu_dst);//download
+            CPU_ON;
+            dft(src, dst);
+            CPU_OFF;

-        t1 = (double)cvGetTickCount() - t1;//gpu end1
+            d_src.upload(src); // uploaded before the GPU_ON section, so that section excludes the transfer

-        if(j == 0)
-            continue;
+            WARMUP_ON;
+            ocl::dft(d_src, d_dst, Size(size, size));
+            WARMUP_OFF;

-        totalgputick = t1 + totalgputick;
-        totalgputick_kernel = t2 + totalgputick_kernel;
+            GPU_ON;
+            ocl::dft(d_src, d_dst, Size(size, size));
+             ; // empty statement, no effect
+            GPU_OFF;

+            GPU_FULL_ON; // this section also covers the upload of src and the download of the result
+            d_src.upload(src);
+            ocl::dft(d_src, d_dst, Size(size, size));
+            d_dst.download(dst);
+            GPU_FULL_OFF;
        }

-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-}
-
-
-
-TEST_P(Dft, R2CthenC2R)
-{
-    cv::Mat a = randomMat(dft_size, CV_32FC1, 0.0, 10.0);
-
-    int flags = 0;
-    //flags |= dft_rows ? cv::DFT_ROWS : 0; // not supported yet
-
-    cv::ocl::oclMat d_b, d_c;
-
-    cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), flags);
-    cv::ocl::dft(d_b, d_c, a.size(), flags + cv::DFT_INVERSE + cv::DFT_REAL_OUTPUT);
-
-    EXPECT_MAT_NEAR(a, d_c, a.size().area() * 1e-4, "");
+    }
 }
\ No newline at end of file
-
-//INSTANTIATE_TEST_CASE_P(ocl_DFT, Dft, testing::Combine(
-//						testing::Values(cv::Size(1280, 1024), cv::Size(1920, 1080),cv::Size(1800, 1500)),
-//						testing::Values(false, true)));
-
-#endif // HAVE_CLAMDFFT
--- a/modules/ocl/perf/perf_filters.cpp
+++ b/modules/ocl/perf/perf_filters.cpp
--- a/modules/ocl/perf/perf_gemm.cpp
+++ b/modules/ocl/perf/perf_gemm.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -41,73 +42,47 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
-
 #include "precomp.hpp"
-using namespace std;
-#ifdef HAVE_CLAMDBLAS
-////////////////////////////////////////////////////////////////////////////
-// GEMM
-PARAM_TEST_CASE(Gemm, int, cv::Size, int)
-{
-    int      type;
-    cv::Size mat_size;
-    int		 flags;
-    vector<cv::ocl::Info> info;
-    virtual void SetUp()
-    {
-        type     = GET_PARAM(0);
-        mat_size = GET_PARAM(1);
-        flags    = GET_PARAM(2);

-        cv::ocl::getDevice(info);
-    }
-};
-
-TEST_P(Gemm, Performance)
+///////////// gemm ////////////////////////
+TEST(gemm)
 {
-    cv::Mat a = randomMat(mat_size, type, 0.0, 10.0);
-    cv::Mat b = randomMat(mat_size, type, 0.0, 10.0);
-    cv::Mat c = randomMat(mat_size, type, 0.0, 10.0);
-    cv::ocl::oclMat ocl_dst;
+    Mat src1, src2, src3, dst;
+    ocl::oclMat d_src1, d_src2, d_src3, d_dst;

-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t1 = 0;
-    double t2 = 0;
-
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat ga = cv::ocl::oclMat(a);//upload
-        cv::ocl::oclMat gb = cv::ocl::oclMat(b);//upload
-        cv::ocl::oclMat gc = cv::ocl::oclMat(c);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::gemm(ga, gb, 1.0, gc, 1.0, ocl_dst, flags);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        ocl_dst.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
+        SUBTEST << size << 'x' << size;
+
+        gen(src1, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
+        gen(src2, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
+        gen(src3, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
+
+        gemm(src1, src2, 1.0, src3, 1.0, dst);
+
+        CPU_ON;
+        gemm(src1, src2, 1.0, src3, 1.0, dst);
+        CPU_OFF;
+
+        d_src1.upload(src1);
+        d_src2.upload(src2);
+        d_src3.upload(src3);
+
+        WARMUP_ON;
+        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
+         ;
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_src1.upload(src1);
+        d_src2.upload(src2);
+        d_src3.upload(src3);
+        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
    }
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
 }
\ No newline at end of file
-
-
-INSTANTIATE_TEST_CASE_P(ocl_gemm, Gemm, testing::Combine(
-                            testing::Values(CV_32FC1, CV_32FC2/* , CV_64FC1, CV_64FC2*/),
-                            testing::Values(cv::Size(512, 512), cv::Size(1024, 1024)),
-                            testing::Values(0, (int)cv::GEMM_1_T, (int)cv::GEMM_2_T, (int)(cv::GEMM_1_T + cv::GEMM_2_T))));
-#endif
\ No newline at end of file
--- a/modules/ocl/perf/perf_haar.cpp
+++ b/modules/ocl/perf/perf_haar.cpp
@@ -10,12 +10,12 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,133 +42,97 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
-#include "opencv2/objdetect/objdetect.hpp"
 #include "precomp.hpp"

-#ifdef HAVE_OPENCL
+///////////// Haar ////////////////////////
+namespace cv
+{
+namespace ocl
+{

-using namespace cvtest;
-using namespace testing;
-using namespace std;
-using namespace cv;
-extern std::string workdir;
 struct getRect
 {
-    Rect operator ()(const CvAvgComp &e) const
+    Rect operator()(const CvAvgComp &e) const
    {
        return e.rect;
    }
 };

-PARAM_TEST_CASE(HaarTestBase, int, int)
+class CascadeClassifier_GPU : public OclCascadeClassifier
 {
-    //std::vector<cv::ocl::Info> oclinfo;
-    cv::ocl::OclCascadeClassifier cascade, nestedCascade;
-    cv::CascadeClassifier cpucascade, cpunestedCascade;
-    //    Mat img;
-
-    double scale;
-    int index;
-
-    virtual void SetUp()
-    {
-        scale = 1.0;
-        index = 0;
-        string cascadeName = "../../../data/haarcascades/haarcascade_frontalface_alt.xml";
-
-        if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)))
+public:
+    void detectMultiScale(oclMat &image,
+                          CV_OUT std::vector<cv::Rect>& faces,
+                          double scaleFactor = 1.1,
+                          int minNeighbors = 3, int flags = 0,
+                          Size minSize = Size(),
+                          Size maxSize = Size())
    {
-            cout << "ERROR: Could not load classifier cascade" << endl;
-            return;
-        }
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums>0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath("E:\\");
+        (void)maxSize;
+        MemStorage storage(cvCreateMemStorage(0));
+        //CvMat img=image;
+        CvSeq *objs = oclHaarDetectObjects(image, storage, scaleFactor, minNeighbors, flags, minSize);
+        vector<CvAvgComp> vecAvgComp;
+        Seq<CvAvgComp>(objs).copyTo(vecAvgComp);
+        faces.resize(vecAvgComp.size());
+        std::transform(vecAvgComp.begin(), vecAvgComp.end(), faces.begin(), getRect());
    }
-};
-
-////////////////////////////////faceDetect/////////////////////////////////////////////////

-struct Haar : HaarTestBase {};
+};

-TEST_F(Haar, FaceDetect)
+}
+}
+TEST(Haar)
 {
-    string imgName = workdir + "lena.jpg";
-    Mat img = imread( imgName, 1 );
+    Mat img = imread(abspath("basketball1.png"), CV_LOAD_IMAGE_GRAYSCALE);

-    if(img.empty())
+    if (img.empty())
    {
-        std::cout << imgName << std::endl;
-        return ;
+        throw runtime_error("can't open basketball1.png");
    }

-    //int i = 0;
-    double t = 0;
-    vector<Rect> faces, oclfaces;
-
-    // const static Scalar colors[] =  { CV_RGB(0, 0, 255),
-    //                                   CV_RGB(0, 128, 255),
-    //                                   CV_RGB(0, 255, 255),
-    //                                   CV_RGB(0, 255, 0),
-    //                                   CV_RGB(255, 128, 0),
-    //                                   CV_RGB(255, 255, 0),
-    //                                   CV_RGB(255, 0, 0),
-    //                                   CV_RGB(255, 0, 255)
-    //                                 } ;
-
-    Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
-    MemStorage storage(cvCreateMemStorage(0));
-    cvtColor( img, gray, CV_BGR2GRAY );
-    resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
-    equalizeHist( smallImg, smallImg );
+    CascadeClassifier faceCascadeCPU;

-    t = (double)cvGetTickCount();
-    for(int k = 0; k < LOOP_TIMES; k++)
+    if (!faceCascadeCPU.load(abspath("haarcascade_frontalface_alt.xml")))
    {
-        cpucascade.detectMultiScale( smallImg, faces,  1.1,
-                                     3, 0
-                                     | CV_HAAR_SCALE_IMAGE
-                                     , Size(30, 30), Size(0, 0) );
+        throw runtime_error("can't load haarcascade_frontalface_alt.xml");
    }
-    t = (double)cvGetTickCount() - t ;
-    printf( "cpudetection time = %g ms\n", t / (LOOP_TIMES * (double)cvGetTickFrequency() * 1000.) );

-    cv::ocl::oclMat image;
-    CvSeq *_objects=NULL;
-    t = (double)cvGetTickCount();
-    for(int k = 0; k < LOOP_TIMES; k++)
+    vector<Rect> faces;
+
+    SUBTEST << img.cols << "x" << img.rows << "; scale image";
+    CPU_ON;
+    faceCascadeCPU.detectMultiScale(img, faces,
+                                    1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
+    CPU_OFF;
+
+    ocl::CascadeClassifier_GPU faceCascade;
+
+    if (!faceCascade.load(abspath("haarcascade_frontalface_alt.xml")))
    {
-        image.upload(smallImg);
-        _objects = cascade.oclHaarDetectObjects( image, storage, 1.1,
-                   3, 0
-                   | CV_HAAR_SCALE_IMAGE
-                   , Size(30, 30), Size(0, 0) );
+        throw runtime_error("can't load haarcascade_frontalface_alt.xml");
    }
-    t = (double)cvGetTickCount() - t ;
-    printf( "ocldetection time = %g ms\n", t / (LOOP_TIMES * (double)cvGetTickFrequency() * 1000.) );
-    vector<CvAvgComp> vecAvgComp;
-    Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
-    oclfaces.resize(vecAvgComp.size());
-    std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect());
-
-    //for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
-    //{
-    //	Mat smallImgROI;
-    //	Point center;
-    //	Scalar color = colors[i%8];
-    //	int radius;
-    //	center.x = cvRound((r->x + r->width*0.5)*scale);
-    //	center.y = cvRound((r->y + r->height*0.5)*scale);
-    //	radius = cvRound((r->width + r->height)*0.25*scale);
-    //	circle( img, center, radius, color, 3, 8, 0 );
-    //}
-    //namedWindow("result");
-    //imshow("result",img);
-    //waitKey(0);
-    //destroyAllWindows();

+    ocl::oclMat d_img(img);
+
+    faces.clear();
+
+    WARMUP_ON;
+    faceCascade.detectMultiScale(d_img, faces,
+                                 1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
+    WARMUP_OFF;
+
+    faces.clear();
+
+    GPU_ON;
+    faceCascade.detectMultiScale(d_img, faces,
+                                 1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
+     ;
+    GPU_OFF;
+
+    GPU_FULL_ON;
+    d_img.upload(img);
+    faceCascade.detectMultiScale(d_img, faces,
+                                 1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
+    GPU_FULL_OFF;
 }
\ No newline at end of file
-#endif // HAVE_OPENCL
--- a/modules/ocl/perf/perf_hog.cpp
+++ b/modules/ocl/perf/perf_hog.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -42,125 +42,47 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
-
-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-extern std::string workdir;
-
-#ifndef MWC_TEST_UTILITY
-#define MWC_TEST_UTILITY
-
-// Param class
-#ifndef IMPLEMENT_PARAM_CLASS
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-class name \
-    { \
-    public: \
-    name ( type arg = type ()) : val_(arg) {} \
-    operator type () const {return val_;} \
-    private: \
-    type val_; \
-    }; \
-    inline void PrintTo( name param, std::ostream* os) \
-    { \
-    *os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
-    }
-
-#endif // IMPLEMENT_PARAM_CLASS
-#endif // MWC_TEST_UTILITY
-
-IMPLEMENT_PARAM_CLASS(WinSizw48, bool);
-
-PARAM_TEST_CASE(HOG, WinSizw48, bool)
-{
-    bool is48;
-    vector<float> detector;
-    virtual void SetUp()
-    {
-        is48 = GET_PARAM(0);
-        if(is48)
-        {
-            detector = cv::ocl::HOGDescriptor::getPeopleDetector48x96();
-        }
-        else
-        {
-            detector = cv::ocl::HOGDescriptor::getPeopleDetector64x128();
-        }
-    }
-};

-TEST_P(HOG, Performance)
+///////////// HOG////////////////////////
+TEST(HOG)
 {
-    cv::Mat img = readImage(workdir + "lena.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    // define HOG related arguments
-    float scale = 1.05f;
-    //int nlevels = 13;
-    int gr_threshold = 8;
-    float hit_threshold = 1.4f;
-    //bool hit_threshold_auto = true;
-
-    int win_width = is48 ? 48 : 64;
-    int win_stride_width = 8;
-    int win_stride_height = 8;
-
-    bool gamma_corr = true;
-
-    Size win_size(win_width, win_width * 2); //(64, 128) or (48, 96)
-    Size win_stride(win_stride_width, win_stride_height);
-
-    cv::ocl::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
-                                   cv::ocl::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
-                                   cv::ocl::HOGDescriptor::DEFAULT_NLEVELS);
-
-    gpu_hog.setSVMDetector(detector);
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
+    Mat src = imread(abspath("road.png"), cv::IMREAD_GRAYSCALE);

-    double t1 = 0;
-    double t2 = 0;
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
+    if (src.empty())
    {
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        ocl::oclMat d_src(img);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
+        throw runtime_error("can't open road.png");
+    }

-        vector<Rect> found;
-        gpu_hog.detectMultiScale(d_src, found, hit_threshold, win_stride,
-                                 Size(0, 0), scale, gr_threshold);

-        t2 = (double)cvGetTickCount() - t2;//kernel
+    cv::HOGDescriptor hog;
+    hog.setSVMDetector(hog.getDefaultPeopleDetector());
+    std::vector<cv::Rect> found_locations;

-        // no download time for HOG
+    SUBTEST << 768 << 'x' << 576 << "; road.png";

-        t1 = (double)cvGetTickCount() - t1;//gpu end1
+    hog.detectMultiScale(src, found_locations);

-        if(j == 0)
-            continue;
+    CPU_ON;
+    hog.detectMultiScale(src, found_locations);
+    CPU_OFF;

-        totalgputick = t1 + totalgputick;
+    cv::ocl::HOGDescriptor ocl_hog;
+    ocl_hog.setSVMDetector(ocl_hog.getDefaultPeopleDetector());
+    ocl::oclMat d_src;
+    d_src.upload(src);

-        totalgputick_kernel = t2 + totalgputick_kernel;
+    WARMUP_ON;
+    ocl_hog.detectMultiScale(d_src, found_locations);
+    WARMUP_OFF;

-    }
+    GPU_ON;
+    ocl_hog.detectMultiScale(d_src, found_locations);
+     ;
+    GPU_OFF;

-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+    GPU_FULL_ON;
+    d_src.upload(src);
+    ocl_hog.detectMultiScale(d_src, found_locations);
+    GPU_FULL_OFF;
 }
\ No newline at end of file
-
-
-INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, HOG, testing::Combine(testing::Values(WinSizw48(false), WinSizw48(true)), testing::Values(false)));
-
-#endif  //Have opencl
\ No newline at end of file
--- a/modules/ocl/perf/perf_imgproc.cpp
+++ b/modules/ocl/perf/perf_imgproc.cpp
--- a/modules/ocl/perf/perf_match_template.cpp
+++ b/modules/ocl/perf/perf_match_template.cpp
@@ -42,191 +42,105 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-#ifndef MWC_TEST_UTILITY
-#define MWC_TEST_UTILITY
-//////// Utility
-#ifndef DIFFERENT_SIZES
-#else
-#undef DIFFERENT_SIZES
-#endif
-#define DIFFERENT_SIZES testing::Values(cv::Size(256, 256), cv::Size(3000, 3000))
-
-// Param class
-#ifndef IMPLEMENT_PARAM_CLASS
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-class name \
-{ \
-public: \
-    name ( type arg = type ()) : val_(arg) {} \
-    operator type () const {return val_;} \
-private: \
-    type val_; \
-}; \
-    inline void PrintTo( name param, std::ostream* os) \
-{ \
-    *os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
-}
-
-IMPLEMENT_PARAM_CLASS(Channels, int)
-#endif // IMPLEMENT_PARAM_CLASS
-#endif // MWC_TEST_UTILITY
-
-////////////////////////////////////////////////////////////////////////////////
-// MatchTemplate
-#define ALL_TEMPLATE_METHODS testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR), TemplateMethod(cv::TM_CCOEFF), TemplateMethod(cv::TM_SQDIFF_NORMED), TemplateMethod(cv::TM_CCORR_NORMED), TemplateMethod(cv::TM_CCOEFF_NORMED))
-
-IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size);
-
-const char *TEMPLATE_METHOD_NAMES[6] = {"TM_SQDIFF", "TM_SQDIFF_NORMED", "TM_CCORR", "TM_CCORR_NORMED", "TM_CCOEFF", "TM_CCOEFF_NORMED"};
-
-PARAM_TEST_CASE(MatchTemplate, cv::Size, TemplateSize, Channels, TemplateMethod)
-{
-    cv::Size size;
-    cv::Size templ_size;
-    int cn;
-    int method;
-    //vector<cv::ocl::Info> oclinfo;
-
-    virtual void SetUp()
-    {
-        size = GET_PARAM(0);
-        templ_size = GET_PARAM(1);
-        cn = GET_PARAM(2);
-        method = GET_PARAM(3);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-    }
-};
-struct MatchTemplate8U : MatchTemplate {};

-TEST_P(MatchTemplate8U, Performance)
+/////////// matchTemplate ////////////////////////
+//void InitMatchTemplate()
+//{
+//	Mat src; gen(src, 500, 500, CV_32F, 0, 1);
+//	Mat templ; gen(templ, 500, 500, CV_32F, 0, 1);
+//	ocl::oclMat d_src(src), d_templ(templ), d_dst;
+//	ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
+//}
+TEST(matchTemplate)
 {
-    std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
-    std::cout << "Image Size: (" << size.width << ", " << size.height << ")" << std::endl;
-    std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")" << std::endl;
-    std::cout << "Channels: " << cn << std::endl;
-
-    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_8U, cn));
-    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_8U, cn));
-    cv::Mat dst_gold;
-    cv::ocl::oclMat dst;
-
-
-
+    //InitMatchTemplate();

+    Mat src, templ, dst;
+    int templ_size = 5;

-    double totalgputick = 0;
-    double totalgputick_kernel = 0;

-    double t1 = 0;
-    double t2 = 0;
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
+        int all_type[] = {CV_32FC1, CV_32FC4};
+        std::string type_name[] = {"CV_32FC1", "CV_32FC4"};

-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat ocl_image = cv::ocl::oclMat(image);//upload
-        cv::ocl::oclMat ocl_templ = cv::ocl::oclMat(templ);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        dst.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
-    }
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            for(templ_size = 5; templ_size <= 5; templ_size *= 5)
+            {
+                gen(src, size, size, all_type[j], 0, 1);

-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+                SUBTEST << src.cols << 'x' << src.rows << "; " << type_name[j] << "; templ " << templ_size << 'x' << templ_size << "; CCORR";

+                gen(templ, templ_size, templ_size, all_type[j], 0, 1);

-}
+                matchTemplate(src, templ, dst, CV_TM_CCORR);

+                CPU_ON;
+                matchTemplate(src, templ, dst, CV_TM_CCORR);
+                CPU_OFF;

-struct MatchTemplate32F : MatchTemplate {};
-TEST_P(MatchTemplate32F, Performance)
-{
-    std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
-    std::cout << "Image Size: (" << size.width << ", " << size.height << ")" << std::endl;
-    std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")" << std::endl;
-    std::cout << "Channels: " << cn << std::endl;
-    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_32F, cn));
-    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_32F, cn));
+                ocl::oclMat d_src(src), d_templ, d_dst;

-    cv::Mat dst_gold;
-    cv::ocl::oclMat dst;
+                d_templ.upload(templ);

+                WARMUP_ON;
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
+                WARMUP_OFF;

+                GPU_ON;
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
+                 ;
+                GPU_OFF;

+                GPU_FULL_ON;
+                d_src.upload(src);
+                d_templ.upload(templ);
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
+                d_dst.download(dst);
+                GPU_FULL_OFF;
+            }
+        }

-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
+        int all_type_8U[] = {CV_8UC1};
+        std::string type_name_8U[] = {"CV_8UC1"};

-    double t1 = 0;
-    double t2 = 0;
-    for(int j = 0; j < LOOP_TIMES; j ++)
+        for (size_t j = 0; j < sizeof(all_type_8U) / sizeof(int); j++)
+        {
+            for(templ_size = 5; templ_size <= 5; templ_size *= 5)
            {
+                SUBTEST << src.cols << 'x' << src.rows << "; " << type_name_8U[j] << "; templ " << templ_size << 'x' << templ_size << "; CCORR_NORMED";

-        t1 = (double)cvGetTickCount();//gpu start1
+                gen(src, size, size, all_type_8U[j], 0, 255);

-        cv::ocl::oclMat ocl_image = cv::ocl::oclMat(image);//upload
-        cv::ocl::oclMat ocl_templ = cv::ocl::oclMat(templ);//upload
+                gen(templ, templ_size, templ_size, all_type_8U[j], 0, 255);

-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
-        t2 = (double)cvGetTickCount() - t2;//kernel
+                matchTemplate(src, templ, dst, CV_TM_CCORR_NORMED);

-        cv::Mat cpu_dst;
-        dst.download (cpu_dst);//download
+                CPU_ON;
+                matchTemplate(src, templ, dst, CV_TM_CCORR_NORMED);
+                CPU_OFF;

-        t1 = (double)cvGetTickCount() - t1;//gpu end1
+                ocl::oclMat d_src(src);
+                ocl::oclMat d_templ(templ), d_dst;

-        totalgputick = t1 + totalgputick;
+                WARMUP_ON;
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
+                WARMUP_OFF;

-        totalgputick_kernel = t2 + totalgputick_kernel;
+                GPU_ON;
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
+                 ;
+                GPU_OFF;

+                GPU_FULL_ON;
+                d_src.upload(src);
+                d_templ.upload(templ);
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
+                d_dst.download(dst);
+                GPU_FULL_OFF;
+            }
+        }
    }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-
-
 }
\ No newline at end of file
-
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U,
-                        testing::Combine(
-                            testing::Values(cv::Size(1280, 1024), cv::Size(MWIDTH, MHEIGHT), cv::Size(1800, 1500)),
-                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
-                            testing::Values(Channels(1), Channels(4)/*, Channels(3)*/),
-                            ALL_TEMPLATE_METHODS
-                        )
-                       );
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine(
-                            testing::Values(cv::Size(1280, 1024), cv::Size(MWIDTH, MHEIGHT), cv::Size(1800, 1500)),
-                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
-                            testing::Values(Channels(1), Channels(4) /*, Channels(3)*/),
-                            testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
-
-#endif //HAVE_OPENCL
\ No newline at end of file
--- a/modules/ocl/perf/perf_matrix_operation.cpp
+++ b/modules/ocl/perf/perf_matrix_operation.cpp
--- a/modules/ocl/perf/perf_norm.cpp
+++ b/modules/ocl/perf/perf_norm.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+///////////// norm ////////////////////////
+// Benchmark of the NORM_INF norm between two CV_8UC1 matrices, run on the CPU
+// (cv::norm) and through the OpenCL module (ocl::norm) for square sizes from
+// Min_Size up to Max_Size, multiplying the edge length by Multiple each step.
+//
+// The call ocl::norm(d_src, d_buf, NORM_INF) takes two matrices, i.e. it is
+// the "norm of the difference" overload, so the CPU reference uses the
+// matching two-array overload of cv::norm. Timing the single-array
+// norm(src, NORM_INF) on the CPU would compare two different operations.
+TEST(norm)
+{
+    Mat src, buf;
+    ocl::oclMat d_src, d_buf;
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        SUBTEST << size << 'x' << size << "; CV_8UC1; NORM_INF";
+
+        gen(src, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
+        gen(buf, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
+
+        // One run outside the CPU_ON/CPU_OFF bracket before the bracketed one.
+        norm(src, buf, NORM_INF);
+
+        CPU_ON;
+        norm(src, buf, NORM_INF);
+        CPU_OFF;
+
+        d_src.upload(src);
+        d_buf.upload(buf);
+
+        WARMUP_ON;
+        ocl::norm(d_src, d_buf, NORM_INF);
+        WARMUP_OFF;
+
+        // Only the ocl::norm call is bracketed here; both operands were
+        // uploaded above.
+        GPU_ON;
+        ocl::norm(d_src, d_buf, NORM_INF);
+        GPU_OFF;
+
+        // Bracket the upload of every input operand together with the call,
+        // so the transfer cost of both matrices is included.
+        GPU_FULL_ON;
+        d_src.upload(src);
+        d_buf.upload(buf);
+        ocl::norm(d_src, d_buf, NORM_INF);
+        GPU_FULL_OFF;
+    }
+}
\ No newline at end of file
--- a/modules/ocl/perf/perf_pyrdown.cpp
+++ b/modules/ocl/perf/perf_pyrdown.cpp
-///////////////////////////////////////////////////////////////////////////////////////
+/*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    fangfang bai, fangfang@multicorewareinc.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,96 +42,46 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
-
-#ifdef HAVE_OPENCL

-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-PARAM_TEST_CASE(PyrDown, MatType, int)
+///////////// pyrDown //////////////////////
+// Benchmarks pyrDown on the CPU against ocl::pyrDown for square images whose
+// side runs from Min_Size to Max_Size (multiplied by Multiple each step), once
+// for CV_8UC1 and once for CV_8UC4 input.
+TEST(pyrDown)
 {
-    int type;
-    int channels;
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
-
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
+    Mat src, dst;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};

-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-    }
-
-
-};
-
-#define VARNAME(A) string(#A);
-
-////////////////////////////////PyrDown/////////////////////////////////////////////////
-TEST_P(PyrDown, Mat)
-{
-    cv::Size size(MWIDTH, MHEIGHT);
-    cv::RNG &rng = TS::ptr()->get_rng();
-    mat1 = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false);
-
-
-    cv::ocl::oclMat gdst;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-
-    for (int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(all_type[0]); j++)
         {
+            SUBTEST << size << 'x' << size << "; " << type_name[j];

-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat gmat1(mat1);
+            // Scalar::all() sets the bounds for every channel; a bare 0 / 256
+            // converts to a Scalar whose channels 1..3 are zero, which leaves
+            // three of the four CV_8UC4 channels without random data.
+            gen(src, size, size, all_type[j], Scalar::all(0), Scalar::all(256));

-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::pyrDown(gmat1, gdst);
-        t2 = (double)cvGetTickCount() - t2;//kernel
+            // Untimed CPU run before the measured one
+            pyrDown(src, dst);

-        cv::Mat cpu_dst;
-        gdst.download(cpu_dst);
+            CPU_ON;
+            pyrDown(src, dst);
+            CPU_OFF;

-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if (j == 0)
-        {
-            continue;
-        }
+            ocl::oclMat d_src(src);
+            ocl::oclMat d_dst;

-        totalgputick = t1 + totalgputick;
+            WARMUP_ON;
+            ocl::pyrDown(d_src, d_dst);
+            WARMUP_OFF;

-        totalgputick_kernel = t2 + totalgputick_kernel;
+            // Device-side call only
+            GPU_ON;
+            ocl::pyrDown(d_src, d_dst);
+            GPU_OFF;

+            // Upload + device call + download
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::pyrDown(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
     }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
 }
\ No newline at end of file
-
-//********test****************
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, Combine(
-                            Values(CV_8U, CV_32F), Values(1, 4)));
-
-
-#endif // HAVE_OPENCL
--- a/modules/ocl/perf/interpolation.hpp
+++ b/modules/ocl/perf/interpolation.hpp
@@ -7,12 +7,16 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -21,12 +25,12 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other materials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -38,83 +42,102 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+#include "precomp.hpp"

-#ifndef __OPENCV_TEST_INTERPOLATION_HPP__
-#define __OPENCV_TEST_INTERPOLATION_HPP__
-
-template <typename T> T readVal(const cv::Mat &src, int y, int x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
+///////////// PyrLKOpticalFlow ////////////////////////
+// Benchmarks calcOpticalFlowPyrLK on the CPU against
+// ocl::PyrLKOpticalFlow::sparse for two image pairs (the first loaded in
+// colour, the second in grayscale) and for feature counts running from
+// Min_Size to Max_Size (multiplied by Multiple each step).
+// Throws runtime_error when an image cannot be loaded or no features are found.
+TEST(PyrLKOpticalFlow)
 {
-    if (border_type == cv::BORDER_CONSTANT)
-        return (y >= 0 && y < src.rows && x >= 0 && x < src.cols) ? src.at<T>(y, x * src.channels() + c) : cv::saturate_cast<T>(borderVal.val[c]);
-
-    return src.at<T>(cv::borderInterpolate(y, src.rows, border_type), cv::borderInterpolate(x, src.cols, border_type) * src.channels() + c);
-}
+    std::string images1[] = {"rubberwhale1.png", "aloeL.jpg"};
+    std::string images2[] = {"rubberwhale2.png", "aloeR.jpg"};

-template <typename T> struct NearestInterpolator
-{
-    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
+    for (size_t i = 0; i < sizeof(images1) / sizeof(std::string); i++)
     {
-        return readVal<T>(src, cvFloor(y), cvFloor(x), c, border_type, borderVal);
-    }
-};
+        Mat frame0 = imread(abspath(images1[i]), i == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE);

-template <typename T> struct LinearInterpolator
-{
-    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
+        if (frame0.empty())
         {
-        x -= 0.5f;
-        y -= 0.5f;
-
-        int x1 = cvFloor(x);
-        int y1 = cvFloor(y);
-        int x2 = x1 + 1;
-        int y2 = y1 + 1;
-
-        float res = 0;
+            std::string errstr = "can't open " + images1[i];
+            throw runtime_error(errstr);
+        }

-        res += readVal<T>(src, y1, x1, c, border_type, borderVal) * ((x2 - x) * (y2 - y));
-        res += readVal<T>(src, y1, x2, c, border_type, borderVal) * ((x - x1) * (y2 - y));
-        res += readVal<T>(src, y2, x1, c, border_type, borderVal) * ((x2 - x) * (y - y1));
-        res += readVal<T>(src, y2, x2, c, border_type, borderVal) * ((x - x1) * (y - y1));
+        Mat frame1 = imread(abspath(images2[i]), i == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE);

-        return cv::saturate_cast<T>(res);
+        if (frame1.empty())
+        {
+            std::string errstr = "can't open " + images2[i];
+            throw runtime_error(errstr);
         }
-};

-template <typename T> struct CubicInterpolator
-{
-    static float getValue(float p[4], float x)
+        // Feature detection below runs on a single-channel image, so the
+        // colour pair gets a grayscale copy of its first frame.
+        Mat gray_frame;
+
+        if (i == 0)
         {
-        return p[1] + 0.5 * x * (p[2] - p[0] + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
+            cvtColor(frame0, gray_frame, COLOR_BGR2GRAY);
         }

-    static float getValue(float p[4][4], float x, float y)
+        for (int points = Min_Size; points <= Max_Size; points *= Multiple)
         {
-        float arr[4];
+            if (i == 0)
+                SUBTEST << frame0.cols << "x" << frame0.rows << "; color; " << points << " points";
+            else
+                SUBTEST << frame0.cols << "x" << frame0.rows << "; gray; " << points << " points";
+            Mat nextPts_cpu;
+            Mat status_cpu;

-        arr[0] = getValue(p[0], x);
-        arr[1] = getValue(p[1], x);
-        arr[2] = getValue(p[2], x);
-        arr[3] = getValue(p[3], x);
+            vector<Point2f> pts;
+            goodFeaturesToTrack(i == 0 ? gray_frame : frame0, pts, points, 0.01, 0.0);
+
+            // &pts[0] below is undefined for an empty vector, so report the
+            // problem explicitly when no features were detected.
+            if (pts.empty())
+            {
+                throw runtime_error("no features found in " + images1[i]);
+            }

-        return getValue(arr, y);
-    }
+            vector<Point2f> nextPts;
+            vector<unsigned char> status;

-    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
+            vector<float> err;
+
+            // Untimed CPU run before the measured one
+            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
+
+            CPU_ON;
+            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
+            CPU_OFF;
+
+            ocl::PyrLKOpticalFlow d_pyrLK;
+
+            ocl::oclMat d_frame0(frame0);
+            ocl::oclMat d_frame1(frame1);
+
+            // Wrap the detected points in a 1 x N CV_32FC2 header (no copy)
+            // so they can be uploaded to the device.
+            ocl::oclMat d_pts;
+            Mat pts_mat(1, (int)pts.size(), CV_32FC2, (void *)&pts[0]);
+            d_pts.upload(pts_mat);
+
+            ocl::oclMat d_nextPts;
+            ocl::oclMat d_status;
+            ocl::oclMat d_err;
+
+            WARMUP_ON;
+            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
+            WARMUP_OFF;
+
+            // Device-side call only
+            GPU_ON;
+            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
+            GPU_OFF;
+
+            // Uploads + device call + downloads
+            GPU_FULL_ON;
+            d_frame0.upload(frame0);
+            d_frame1.upload(frame1);
+            d_pts.upload(pts_mat);
+            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
+
+            if (!d_nextPts.empty())
             {
-        int ix = cvRound(x);
-        int iy = cvRound(y);
+                d_nextPts.download(nextPts_cpu);
+            }

-        float vals[4][4] =
+            if (!d_status.empty())
             {
-            {readVal<T>(src, iy - 2, ix - 2, c, border_type, borderVal), readVal<T>(src, iy - 2, ix - 1, c, border_type, borderVal), readVal<T>(src, iy - 2, ix, c, border_type, borderVal), readVal<T>(src, iy - 2, ix + 1, c, border_type, borderVal)},
-            {readVal<T>(src, iy - 1, ix - 2, c, border_type, borderVal), readVal<T>(src, iy - 1, ix - 1, c, border_type, borderVal), readVal<T>(src, iy - 1, ix, c, border_type, borderVal), readVal<T>(src, iy - 1, ix + 1, c, border_type, borderVal)},
-            {readVal<T>(src, iy    , ix - 2, c, border_type, borderVal), readVal<T>(src, iy    , ix - 1, c, border_type, borderVal), readVal<T>(src, iy    , ix, c, border_type, borderVal), readVal<T>(src, iy    , ix + 1, c, border_type, borderVal)},
-            {readVal<T>(src, iy + 1, ix - 2, c, border_type, borderVal), readVal<T>(src, iy + 1, ix - 1, c, border_type, borderVal), readVal<T>(src, iy + 1, ix, c, border_type, borderVal), readVal<T>(src, iy + 1, ix + 1, c, border_type, borderVal)},
-        };
+                d_status.download(status_cpu);
+            }

-        return cv::saturate_cast<T>(getValue(vals, (x - ix + 2.0) / 4.0, (y - iy + 2.0) / 4.0));
+            GPU_FULL_OFF;
         }
-};

-#endif // __OPENCV_TEST_INTERPOLATION_HPP__
+    }
+}
--- a/modules/ocl/perf/perf_pyrup.cpp
+++ b/modules/ocl/perf/perf_pyrup.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    fangfang bai fangfang@multicorewareinc.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,81 +42,46 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
-#include "opencv2/core/core.hpp"
 #include "precomp.hpp"
-#include <iomanip>
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-

-PARAM_TEST_CASE(PyrUp, MatType, int)
+///////////// pyrUp ////////////////////////
+// Benchmarks pyrUp on the CPU against ocl::pyrUp for square images with a
+// side of 500, 1000 and 2000 pixels, once for CV_8UC1 and once for CV_8UC4.
+TEST(pyrUp)
 {
-    int type;
-    int channels;
-    //std::vector<cv::ocl::Info> oclinfo;
+    Mat src, dst;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};

-    virtual void SetUp()
+    for (int size = 500; size <= 2000; size *= 2)
     {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-    }
-};
-
-TEST_P(PyrUp, Performance)
-{
-    cv::Size size(MWIDTH, MHEIGHT);
-    cv::Mat src = randomMat(size, CV_MAKETYPE(type, channels));
-    cv::Mat dst_gold;
-    cv::ocl::oclMat dst;
-
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-
-    for (int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(all_type[0]); j++)
         {
-        t1 = (double)cvGetTickCount();//gpu start1
+            SUBTEST << size << 'x' << size << "; " << type_name[j];

-        cv::ocl::oclMat srcMat = cv::ocl::oclMat(src);//upload
+            // Scalar::all() sets the bounds for every channel; a bare 0 / 256
+            // converts to a Scalar whose channels 1..3 are zero, which leaves
+            // three of the four CV_8UC4 channels without random data.
+            gen(src, size, size, all_type[j], Scalar::all(0), Scalar::all(256));

-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::pyrUp(srcMat, dst);
-        t2 = (double)cvGetTickCount() - t2;//kernel
+            // Untimed CPU run before the measured one
+            pyrUp(src, dst);

-        cv::Mat cpu_dst;
-        dst.download(cpu_dst); //download
+            CPU_ON;
+            pyrUp(src, dst);
+            CPU_OFF;

-        t1 = (double)cvGetTickCount() - t1;//gpu end1
+            ocl::oclMat d_src(src);
+            ocl::oclMat d_dst;

-        if (j == 0)
-        {
-            continue;
-        }
-
-        totalgputick = t1 + totalgputick;
+            WARMUP_ON;
+            ocl::pyrUp(d_src, d_dst);
+            WARMUP_OFF;

-        totalgputick_kernel = t2 + totalgputick_kernel;
+            // Device-side call only
+            GPU_ON;
+            ocl::pyrUp(d_src, d_dst);
+            GPU_OFF;

+            // Upload + device call + download
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::pyrUp(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
     }
-
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-
 }
\ No newline at end of file
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, Combine(
-                            Values(CV_8U, CV_32F), Values(1, 4)));
-
-#endif // HAVE_OPENCL
\ No newline at end of file
--- a/modules/ocl/perf/perf_split_merge.cpp
+++ b/modules/ocl/perf/perf_split_merge.cpp
--- a/modules/ocl/perf/precomp.cpp
+++ b/modules/ocl/perf/precomp.cpp
@@ -7,12 +7,13 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
-//
+
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -21,12 +22,12 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other materials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -41,4 +42,321 @@

 #include "precomp.hpp"

+// This program tests most of the functions in the ocl module and generates a data matrix of x-factors in .csv files.
+// All images needed in this test are in the samples/gpu folder.
+// For the haar template, haarcascade_frontalface_alt.xml should be in the working directory.
+// Entry point of the benchmark harness.
+// In list mode only the test names are printed. Otherwise every initializer
+// and every test whose name contains test_filter_ is run, with the heading
+// written before the tests and the summary after them.
+void TestSystem::run()
+{
+    typedef vector<Runnable *>::iterator RunnableIt;
+
+    if (is_list_mode_)
+    {
+        for (RunnableIt t = tests_.begin(); t != tests_.end(); ++t)
+            cout << (*t)->name() << endl;
+
+        return;
+    }
+
+    // Initializers are filtered by name exactly like the tests
+    for (RunnableIt init = inits_.begin(); init != inits_.end(); ++init)
+    {
+        if ((*init)->name().find(test_filter_, 0) == string::npos)
+            continue;
+
+        (*init)->run();
+    }
+
+    printHeading();
+    writeHeading();
+
+    for (RunnableIt t = tests_.begin(); t != tests_.end(); ++t)
+    {
+        Runnable *test = *t;
+
+        try
+        {
+            if (test->name().find(test_filter_, 0) == string::npos)
+                continue;
+
+            cout << endl << test->name() << ":\n";
+            setCurrentTest(test->name());
+
+            test->run();
+            finishCurrentSubtest();
+        }
+        catch (const Exception &)
+        {
+            // The message has already been printed via the error callback
+            resetCurrentSubtest();
+        }
+        catch (const runtime_error &e)
+        {
+            printError(e.what());
+            resetCurrentSubtest();
+        }
+    }
+
+    printSummary();
+    writeSummary();
+}
+
+
+// Closes the current subtest: converts the accumulated tick counts to
+// milliseconds, updates the speedup totals and the faster/equal/slower
+// counters, then prints and records the metrics and resets the subtest state.
+// Returns immediately when the subtest is flagged as empty.
+void TestSystem::finishCurrentSubtest()
+{
+    if (cur_subtest_is_empty_)
+        // There is no need to print subtest statistics
+    {
+        return;
+    }
+
+    double cpu_time = cpu_elapsed_ / getTickFrequency() * 1000.0;
+    double gpu_time = gpu_elapsed_ / getTickFrequency() * 1000.0;
+    double gpu_full_time = gpu_full_elapsed_ / getTickFrequency() * 1000.0;
+
+    // The denominator is clamped to one tick so a zero GPU time cannot
+    // produce a division by zero.
+    double speedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_elapsed_);
+    speedup_total_ += speedup;
+
+    double fullspeedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_full_elapsed_);
+    speedup_full_total_ += fullspeedup;
+
+    // Classify the kernel-only speedup against the top_/bottom_ thresholds
+    if (speedup > top_)
+    {
+        speedup_faster_count_++;
+    }
+    else if (speedup < bottom_)
+    {
+        speedup_slower_count_++;
+    }
+    else
+    {
+        speedup_equal_count_++;
+    }
+
+    // Same classification for the speedup that includes the GPU_FULL timing
+    if (fullspeedup > top_)
+    {
+        speedup_full_faster_count_++;
+    }
+    else if (fullspeedup < bottom_)
+    {
+        speedup_full_slower_count_++;
+    }
+    else
+    {
+        speedup_full_equal_count_++;
+    }
+
+    // compute min, max and standard deviation of the GPU samples
+    double gpu_min = 0;
+    double gpu_max = 0;
+    double deviation = 0;
+
+    // front()/back() are undefined on an empty vector, so the statistics
+    // stay at zero when no GPU sample was recorded.
+    if (!gpu_times_.empty())
+    {
+        std::sort(gpu_times_.begin(), gpu_times_.end());
+        gpu_min = gpu_times_.front() / getTickFrequency() * 1000.0;
+        gpu_max = gpu_times_.back() / getTickFrequency() * 1000.0;
+    }
+
+    if (gpu_times_.size() > 1)
+    {
+        double sum = 0;
+
+        // Squared distance (in ms) of every sample from gpu_elapsed_
+        for (size_t i = 0; i < gpu_times_.size(); i++)
+        {
+            int64 diff = gpu_times_[i] - static_cast<int64>(gpu_elapsed_);
+            double diff_time = diff * 1000 / getTickFrequency();
+            sum += diff_time * diff_time;
+        }
+
+        deviation = std::sqrt(sum / gpu_times_.size());
+    }
+
+    printMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup);
+    writeMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, gpu_min, gpu_max, deviation);
+
+    num_subtests_called_++;
+    resetCurrentSubtest();
+}
+
+
+// Returns the arithmetic mean of the given samples as a double.
+// An empty sample set yields 0 instead of dividing by zero.
+double TestSystem::meanTime(const vector<int64> &samples)
+{
+    if (samples.empty())
+    {
+        return 0.0;
+    }
+
+    // The 0. seed makes accumulate sum in double rather than int64
+    double sum = accumulate(samples.begin(), samples.end(), 0.);
+    return sum / samples.size();
+}
+
+
+// Prints the left-aligned column titles of the console report.
+void TestSystem::printHeading()
+{
+    cout << endl
+         << setiosflags(ios_base::left)
+         << TAB
+         << setw(10) << "CPU, ms"
+         << setw(10) << "GPU, ms"
+         << setw(14) << "SPEEDUP"
+         << setw(14) << "GPUTOTAL, ms"
+         << setw(14) << "TOTALSPEEDUP"
+         << "DESCRIPTION\n"
+         << resetiosflags(ios_base::left);
+}
+
+// Writes the CSV column titles to the record file.
+// When no record file is open yet, "<recordname_>_OCL.csv" is opened for
+// writing first. If that fails, an error is printed and nothing is written.
+void TestSystem::writeHeading()
+{
+    if (!record_)
+    {
+        const string path = recordname_ + "_OCL.csv";
+        record_ = fopen(path.c_str(), "w");
+
+        // fprintf/fflush on a null FILE* is undefined behaviour
+        if (!record_)
+        {
+            cout << TAB << "[error: can't open " << path << "]" << endl;
+            return;
+        }
+
+        // Only remember the suffixed name once the file really exists
+        recordname_ = path;
+    }
+
+    fprintf(record_, "NAME,DESCRIPTION,CPU (ms),GPU (ms),SPEEDUP,GPUTOTAL (ms),TOTALSPEEDUP,GPU Min (ms),GPU Max (ms), Standard deviation (ms)\n");
+
+    fflush(record_);
+}
+
+// Prints the overall results to the console: the average speedup, the
+// exceeded/passed/failed counts and their rates, first for the GPU timings
+// and then for the GPUTOTAL timings.
+void TestSystem::printSummary()
+{
+    // Clamped to 1 so the averages and rates never divide by zero
+    const int subtests = std::max(1, num_subtests_called_);
+
+    cout << setiosflags(ios_base::fixed) << setprecision(3);
+
+    cout << "\naverage GPU speedup: x" << speedup_total_ / subtests << endl;
+
+    cout << "\nGPU exceeded: " << speedup_faster_count_
+         << "\nGPU passed: " << speedup_equal_count_
+         << "\nGPU failed: " << speedup_slower_count_
+         << endl;
+
+    cout << "\nGPU exceeded rate: " << (float)speedup_faster_count_ / subtests * 100 << "%"
+         << "\nGPU passed rate: " << (float)speedup_equal_count_ / subtests * 100 << "%"
+         << "\nGPU failed rate: " << (float)speedup_slower_count_ / subtests * 100 << "%"
+         << endl;
+
+    cout << "\naverage GPUTOTAL speedup: x" << speedup_full_total_ / subtests << endl;
+
+    cout << "\nGPUTOTAL exceeded: " << speedup_full_faster_count_
+         << "\nGPUTOTAL passed: " << speedup_full_equal_count_
+         << "\nGPUTOTAL failed: " << speedup_full_slower_count_
+         << endl;
+
+    cout << "\nGPUTOTAL exceeded rate: " << (float)speedup_full_faster_count_ / subtests * 100 << "%"
+         << "\nGPUTOTAL passed rate: " << (float)speedup_full_equal_count_ / subtests * 100 << "%"
+         << "\nGPUTOTAL failed rate: " << (float)speedup_full_slower_count_ / subtests * 100 << "%"
+         << endl;
+
+    cout << resetiosflags(ios_base::fixed);
+}
+
+
+// Prints one left-aligned console row for the current subtest: CPU time,
+// GPU time, speedup, GPUTOTAL time, total speedup and the subtest description.
+void TestSystem::printMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup)
+{
+    // Every column is formatted into its own string so setw pads whole cells
+    ostringstream cpu_cell;
+    cpu_cell << cpu_time;
+
+    ostringstream gpu_cell;
+    gpu_cell << gpu_time;
+
+    ostringstream speedup_cell;
+    speedup_cell << "x" << setprecision(3) << speedup;
+
+    // The GPUTOTAL time is shown with 3 significant digits, like the speedups
+    ostringstream gpu_full_cell;
+    gpu_full_cell << setprecision(3) << gpu_full_time;
+
+    ostringstream fullspeedup_cell;
+    fullspeedup_cell << "x" << setprecision(3) << fullspeedup;
+
+    cout << TAB << setiosflags(ios_base::left)
+         << setw(10) << cpu_cell.str()
+         << setw(10) << gpu_cell.str()
+         << setw(14) << speedup_cell.str()
+         << setw(14) << gpu_full_cell.str()
+         << setw(14) << fullspeedup_cell.str()
+         << cur_subtest_description_.str()
+         << resetiosflags(ios_base::left) << endl;
+}
+
+// Appends one CSV row for the current subtest to the record file. The test
+// name is written only on the first row after it changed; later rows of the
+// same test leave the NAME column empty.
+// If the record file cannot be opened, an error is printed and nothing is written.
+void TestSystem::writeMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, double gpu_min, double gpu_max, double std_dev)
+{
+    if (!record_)
+    {
+        // NOTE(review): writeHeading() opens "<name>_OCL.csv" while this
+        // fallback opens "<name>.csv" — confirm which suffix is intended.
+        const string path = recordname_ + ".csv";
+        record_ = fopen(path.c_str(), "w");
+
+        // fprintf/fflush on a null FILE* is undefined behaviour
+        if (!record_)
+        {
+            cout << TAB << "[error: can't open " << path << "]" << endl;
+            return;
+        }
+
+        recordname_ = path;
+    }
+
+    fprintf(record_, "%s,%s,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n", itname_changed_ ? itname_.c_str() : "",
+            cur_subtest_description_.str().c_str(),
+            cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
+            gpu_min, gpu_max, std_dev);
+
+    // The name has been emitted; following rows of this test omit it
+    itname_changed_ = false;
+
+    fflush(record_);
+}
+
+// Appends the overall results to the record file: the average speedup and the
+// exceeded/passed/failed counts with their percentages, for the GPU timings
+// and for the GPUTOTAL timings.
+// If the record file cannot be opened, an error is printed and nothing is written.
+void TestSystem::writeSummary()
+{
+    if (!record_)
+    {
+        // NOTE(review): writeHeading() opens "<name>_OCL.csv" while this
+        // fallback opens "<name>.csv" — confirm which suffix is intended.
+        const string path = recordname_ + ".csv";
+        record_ = fopen(path.c_str(), "w");
+
+        // fprintf/fflush on a null FILE* is undefined behaviour
+        if (!record_)
+        {
+            cout << TAB << "[error: can't open " << path << "]" << endl;
+            return;
+        }
+
+        recordname_ = path;
+    }
+
+    // Clamped to 1 so the averages and rates never divide by zero
+    const int subtests = std::max(1, num_subtests_called_);
+
+    fprintf(record_, "\nAverage GPU speedup: %.3f\n"
+            "exceeded: %d (%.3f%%)\n"
+            "passed: %d (%.3f%%)\n"
+            "failed: %d (%.3f%%)\n"
+            "\nAverage GPUTOTAL speedup: %.3f\n"
+            "exceeded: %d (%.3f%%)\n"
+            "passed: %d (%.3f%%)\n"
+            "failed: %d (%.3f%%)\n",
+            speedup_total_ / subtests,
+            speedup_faster_count_, (float)speedup_faster_count_ / subtests * 100,
+            speedup_equal_count_, (float)speedup_equal_count_ / subtests * 100,
+            speedup_slower_count_, (float)speedup_slower_count_ / subtests * 100,
+            speedup_full_total_ / subtests,
+            speedup_full_faster_count_, (float)speedup_full_faster_count_ / subtests * 100,
+            speedup_full_equal_count_, (float)speedup_full_equal_count_ / subtests * 100,
+            speedup_full_slower_count_, (float)speedup_full_slower_count_ / subtests * 100
+           );
+    fflush(record_);
+}
+
+// Prints an error message together with the current subtest description.
+// Messages equal to "CL_INVALID_BUFFER_SIZE" are suppressed.
+void TestSystem::printError(const std::string &msg)
+{
+    if (msg == "CL_INVALID_BUFFER_SIZE")
+    {
+        return;
+    }
+
+    cout << TAB << "[error: " << msg << "] " << cur_subtest_description_.str() << endl;
+}
+
+// Allocates mat as rows x cols of the given type and fills it with uniformly
+// distributed values between low and high. The generator is always seeded
+// with 0, so the same arguments give the same data on every call.
+void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high)
+{
+    RNG generator(0);
+
+    mat.create(rows, cols, type);
+    generator.fill(mat, RNG::UNIFORM, low, high);
+}
+
+
+// Prefixes relpath with the working directory of the TestSystem instance.
+// The two strings are concatenated as they are; no separator is inserted.
+string abspath(const string &relpath)
+{
+    string path = TestSystem::instance().workingDir();
+    path += relpath;
+    return path;
+}
+
+
+// Error callback that forwards the error message to TestSystem::printError
+// and ignores every other argument. Always returns 0.
+int CV_CDECL cvErrorCallback(int /*status*/, const char * /*func_name*/,
+                             const char *err_msg, const char * /*file_name*/,
+                             int /*line*/, void * /*userdata*/)
+{
+    const std::string message(err_msg);
+    TestSystem::instance().printError(message);
+    return 0;
+}
+

--- a/modules/ocl/perf/precomp.hpp
+++ b/modules/ocl/perf/precomp.hpp
--- a/modules/ocl/perf/utility.cpp
+++ b/modules/ocl/perf/utility.cpp
--- a/modules/ocl/perf/utility.hpp
+++ b/modules/ocl/perf/utility.hpp
--- a/samples/ocl/performance.cpp
+++ b/samples/ocl/performance.cpp