diff --git a/modules/features2d/src/matchers.cpp b/modules/features2d/src/matchers.cpp
index 17697768c32626e318c388ca83553ca03616f42f..f4a0c8f6a4a4940a87c0b361ac18b7db89517810 100644
--- a/modules/features2d/src/matchers.cpp
+++ b/modules/features2d/src/matchers.cpp
@@ -60,113 +60,58 @@ static void ensureSizeIsEnough(int rows, int cols, int type, UMat &m)
         m.create(rows, cols, type);
 }
 
-
-template < int BLOCK_SIZE, int MAX_DESC_LEN >
-static bool ocl_matchUnrolledCached(InputArray _query, InputArray _train,
-                                    const UMat &trainIdx, const UMat &distance, int distType)
+static bool ocl_matchSingle(InputArray query, InputArray train,
+                            UMat &trainIdx, UMat &distance, int distType)
 {
-    int depth = _query.depth();
-    cv::String opts;
-    opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
-                      ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN );
-    ocl::Kernel k("BruteForceMatch_UnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts);
-    if(k.empty())
+    if (query.empty() || train.empty())
         return false;
 
-    size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
-    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
-    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-    if(globalSize[0] != 0)
-    {
-        UMat query = _query.getUMat(), train = _train.getUMat();
-
-        int idx = 0;
-        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
-        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
-        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
-        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
-        idx = k.set(idx, (void *)NULL, smemSize);
-        idx = k.set(idx, query.rows);
-        idx = k.set(idx, query.cols);
-        idx = k.set(idx, train.rows);
-        idx = k.set(idx, train.cols);
-        idx = k.set(idx, (int)query.step);
-
-        return k.run(2, globalSize, localSize, false);
-    }
-    return true;
-}
+    const int query_rows = query.rows();
+    const int query_cols = query.cols();
 
-template < int BLOCK_SIZE >
-static bool ocl_match(InputArray _query, InputArray _train,
-                      const UMat &trainIdx, const UMat &distance, int distType)
-{
-    int depth = _query.depth();
-    cv::String opts;
-    opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
-                      ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE);
-    ocl::Kernel k("BruteForceMatch_Match", ocl::features2d::brute_force_match_oclsrc, opts);
-    if(k.empty())
-        return false;
+    ensureSizeIsEnough(1, query_rows, CV_32S, trainIdx);
+    ensureSizeIsEnough(1, query_rows, CV_32F, distance);
 
-    size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
-    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
-    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+    ocl::Device devDef = ocl::Device::getDefault();
 
-    if(globalSize[0] != 0)
-    {
-        UMat query = _query.getUMat(), train = _train.getUMat();
-
-        int idx = 0;
-        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
-        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
-        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
-        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
-        idx = k.set(idx, (void *)NULL, smemSize);
-        idx = k.set(idx, query.rows);
-        idx = k.set(idx, query.cols);
-        idx = k.set(idx, train.rows);
-        idx = k.set(idx, train.cols);
-        idx = k.set(idx, (int)query.step);
-
-        return k.run(2, globalSize, localSize, false);
-    }
-    return true;
-}
+    UMat uquery = query.getUMat(), utrain = train.getUMat();
+    int kercn = 1;
+    if (devDef.isIntel() &&
+        (0 == (uquery.step % 4)) && (0 == (uquery.cols % 4)) && (0 == (uquery.offset % 4)) &&
+        (0 == (utrain.step % 4)) && (0 == (utrain.cols % 4)) && (0 == (utrain.offset % 4)))
+        kercn = 4;
 
-static bool ocl_matchDispatcher(InputArray query, InputArray train,
-                                const UMat &trainIdx, const UMat &distance, int distType)
-{
-    int query_cols = query.size().width;
-    bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU;
+    int block_size = 16;
+    int max_desc_len = 0;
+    bool is_cpu = devDef.type() == ocl::Device::TYPE_CPU;
     if (query_cols <= 64)
-    {
-        if(!ocl_matchUnrolledCached<16, 64>(query, train, trainIdx, distance, distType)) return false;
-    }
+        max_desc_len = 64 / kercn;
     else if (query_cols <= 128 && !is_cpu)
-    {
-        if(!ocl_matchUnrolledCached<16, 128>(query, train, trainIdx, distance, distType)) return false;
-    }
-    else
-    {
-        if(!ocl_match<16>(query, train, trainIdx, distance, distType)) return false;
-    }
-    return true;
-}
+        max_desc_len = 128 / kercn;
 
-static bool ocl_matchSingle(InputArray query, InputArray train,
-                            UMat &trainIdx, UMat &distance, int dstType)
-{
-    if (query.empty() || train.empty())
+    int depth = query.depth();
+    cv::String opts;
+    opts = cv::format("-D T=%s -D TN=%s -D kercn=%d %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
+                      ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), kercn, depth == CV_32F ? "-D T_FLOAT" : "", distType, block_size, max_desc_len);
+    ocl::Kernel k("BruteForceMatch_Match", ocl::features2d::brute_force_match_oclsrc, opts);
+    if(k.empty())
         return false;
-    int query_rows = query.size().height;
-
-    ensureSizeIsEnough(1, query_rows, CV_32S, trainIdx);
-    ensureSizeIsEnough(1, query_rows, CV_32F, distance);
-
-    return ocl_matchDispatcher(query, train, trainIdx, distance, dstType);
+
+    size_t globalSize[] = {(query.size().height + block_size - 1) / block_size * block_size, block_size};
+    size_t localSize[] = {block_size, block_size};
+
+    int idx = 0;
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(uquery));
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(utrain));
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+    idx = k.set(idx, uquery.rows);
+    idx = k.set(idx, uquery.cols);
+    idx = k.set(idx, utrain.rows);
+    idx = k.set(idx, utrain.cols);
+    idx = k.set(idx, (int)(uquery.step / sizeof(float)));
+
+    return k.run(2, globalSize, localSize, false);
 }
 
 static bool ocl_matchConvert(const Mat &trainIdx, const Mat &distance, std::vector< std::vector<DMatch> > &matches)
@@ -213,121 +158,60 @@ static bool ocl_matchDownload(const UMat &trainIdx, const UMat &distance, std::v
     return ocl_matchConvert(trainIdxCPU, distanceCPU, matches);
 }
 
-template < int BLOCK_SIZE, int MAX_DESC_LEN >
-static bool ocl_knn_matchUnrolledCached(InputArray _query, InputArray _train,
-                                        const UMat &trainIdx, const UMat &distance, int distType)
+static bool ocl_knnMatchSingle(InputArray query, InputArray train, UMat &trainIdx,
+                               UMat &distance, int distType)
 {
-    int depth = _query.depth();
-    cv::String opts;
-    opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
-                      ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN );
-    ocl::Kernel k("BruteForceMatch_knnUnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts);
-    if(k.empty())
+    if (query.empty() || train.empty())
        return false;
 
-    size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
-    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
-    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-    if(globalSize[0] != 0)
-    {
-        UMat query = _query.getUMat(), train = _train.getUMat();
-
-        int idx = 0;
-        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
-        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
-        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
-        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
-        idx = k.set(idx, (void *)NULL, smemSize);
-        idx = k.set(idx, query.rows);
-        idx = k.set(idx, query.cols);
-        idx = k.set(idx, train.rows);
-        idx = k.set(idx, train.cols);
-        idx = k.set(idx, (int)query.step);
-
-        return k.run(2, globalSize, localSize, false);
-    }
-    return true;
-}
+    const int query_rows = query.rows();
+    const int query_cols = query.cols();
 
-template < int BLOCK_SIZE >
-static bool ocl_knn_match(InputArray _query, InputArray _train,
-                          const UMat &trainIdx, const UMat &distance, int distType)
-{
-    int depth = _query.depth();
-    cv::String opts;
-    opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
-                  ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE);
-    ocl::Kernel k("BruteForceMatch_knnMatch", ocl::features2d::brute_force_match_oclsrc, opts);
-    if(k.empty())
-        return false;
+    ensureSizeIsEnough(1, query_rows, CV_32SC2, trainIdx);
+    ensureSizeIsEnough(1, query_rows, CV_32FC2, distance);
 
-    size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
-    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
-    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+    trainIdx.setTo(Scalar::all(-1));
 
-    if(globalSize[0] != 0)
-    {
-        UMat query = _query.getUMat(), train = _train.getUMat();
-
-        int idx = 0;
-        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
-        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
-        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
-        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
-        idx = k.set(idx, (void*)NULL, smemSize);
-        idx = k.set(idx, query.rows);
-        idx = k.set(idx, query.cols);
-        idx = k.set(idx, train.rows);
-        idx = k.set(idx, train.cols);
-        idx = k.set(idx, (int)query.step);
-
-        return k.run(2, globalSize, localSize, false);
-    }
-    return true;
-}
+    ocl::Device devDef = ocl::Device::getDefault();
 
-static bool ocl_match2Dispatcher(InputArray query, InputArray train, const UMat &trainIdx, const UMat &distance, int distType)
-{
-    bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU;
-    if (query.size().width <= 64)
-    {
-        if(!ocl_knn_matchUnrolledCached<16, 64>(query, train, trainIdx, distance, distType))
-            return false;
-    }
-    else if (query.size().width <= 128 && !is_cpu)
-    {
-        if(!ocl_knn_matchUnrolledCached<16, 128>(query, train, trainIdx, distance, distType))
-            return false;
-    }
-    else
-    {
-        if(!ocl_knn_match<16>(query, train, trainIdx, distance, distType))
-            return false;
-    }
-    return true;
-}
+    UMat uquery = query.getUMat(), utrain = train.getUMat();
+    int kercn = 1;
+    if (devDef.isIntel() &&
+        (0 == (uquery.step % 4)) && (0 == (uquery.cols % 4)) && (0 == (uquery.offset % 4)) &&
+        (0 == (utrain.step % 4)) && (0 == (utrain.cols % 4)) && (0 == (utrain.offset % 4)))
+        kercn = 4;
 
-static bool ocl_kmatchDispatcher(InputArray query, InputArray train, const UMat &trainIdx,
-                                 const UMat &distance, int distType)
-{
-    return ocl_match2Dispatcher(query, train, trainIdx, distance, distType);
-}
+    int block_size = 16;
+    int max_desc_len = 0;
+    bool is_cpu = devDef.type() == ocl::Device::TYPE_CPU;
+    if (query_cols <= 64)
+        max_desc_len = 64 / kercn;
+    else if (query_cols <= 128 && !is_cpu)
+        max_desc_len = 128 / kercn;
 
-static bool ocl_knnMatchSingle(InputArray query, InputArray train, UMat &trainIdx,
-                               UMat &distance, int dstType)
-{
-    if (query.empty() || train.empty())
+    int depth = query.depth();
+    cv::String opts;
+    opts = cv::format("-D T=%s -D TN=%s -D kercn=%d %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
+                      ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), kercn, depth == CV_32F ? "-D T_FLOAT" : "", distType, block_size, max_desc_len);
+    ocl::Kernel k("BruteForceMatch_knnMatch", ocl::features2d::brute_force_match_oclsrc, opts);
+    if(k.empty())
         return false;
-    const int nQuery = query.size().height;
-
-    ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
-    ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
-
-    trainIdx.setTo(Scalar::all(-1));
-
-    return ocl_kmatchDispatcher(query, train, trainIdx, distance, dstType);
+
+    size_t globalSize[] = {(query_rows + block_size - 1) / block_size * block_size, block_size};
+    size_t localSize[] = {block_size, block_size};
+
+    int idx = 0;
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(uquery));
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(utrain));
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+    idx = k.set(idx, uquery.rows);
+    idx = k.set(idx, uquery.cols);
+    idx = k.set(idx, utrain.rows);
+    idx = k.set(idx, utrain.cols);
+    idx = k.set(idx, (int)(uquery.step / sizeof(float)));
+
+    return k.run(2, globalSize, localSize, false);
 }
 
 static bool ocl_knnMatchConvert(const Mat &trainIdx, const Mat &distance, std::vector< std::vector<DMatch> > &matches, bool compactResult)
@@ -383,134 +267,64 @@ static bool ocl_knnMatchDownload(const UMat &trainIdx, const UMat &distance, std
     Mat trainIdxCPU = trainIdx.getMat(ACCESS_READ);
     Mat distanceCPU = distance.getMat(ACCESS_READ);
 
-    if (ocl_knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult) )
-        return true;
-    return false;
-}
-
-template < int BLOCK_SIZE, int MAX_DESC_LEN >
-static bool ocl_matchUnrolledCached(InputArray _query, InputArray _train, float maxDistance,
-                                    const UMat &trainIdx, const UMat &distance, const UMat &nMatches, int distType)
-{
-    int depth = _query.depth();
-    cv::String opts;
-    opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
-                  ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN);
-    ocl::Kernel k("BruteForceMatch_RadiusUnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts);
-    if(k.empty())
-        return false;
-
-    size_t globalSize[] = {(_train.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
-    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
-    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-    if(globalSize[0] != 0)
-    {
-        UMat query = _query.getUMat(), train = _train.getUMat();
-
-        int idx = 0;
-        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
-        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
-        idx = k.set(idx, maxDistance);
-        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
-        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
-        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(nMatches));
-        idx = k.set(idx, (void*)NULL, smemSize);
-        idx = k.set(idx, query.rows);
-        idx = k.set(idx, query.cols);
-        idx = k.set(idx, train.rows);
-        idx = k.set(idx, train.cols);
-        idx = k.set(idx, trainIdx.cols);
-        idx = k.set(idx, (int)query.step);
-        idx = k.set(idx, (int)trainIdx.step);
-
-        return k.run(2, globalSize, localSize, false);
-    }
-    return true;
-}
-
-//radius_match
-template < int BLOCK_SIZE >
-static bool ocl_radius_match(InputArray _query, InputArray _train, float maxDistance,
-                             const UMat &trainIdx, const UMat &distance, const UMat &nMatches, int distType)
-{
-    int depth = _query.depth();
-    cv::String opts;
-    opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d", ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE);
-    ocl::Kernel k("BruteForceMatch_RadiusMatch", ocl::features2d::brute_force_match_oclsrc, opts);
-    if(k.empty())
-        return false;
-
-    size_t globalSize[] = {(_train.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
-    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
-    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-    if(globalSize[0] != 0)
-    {
-        UMat query = _query.getUMat(), train = _train.getUMat();
-
-        int idx = 0;
-        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
-        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
-        idx = k.set(idx, maxDistance);
-        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
-        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
-        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(nMatches));
-        idx = k.set(idx, (void*)NULL, smemSize);
-        idx = k.set(idx, query.rows);
-        idx = k.set(idx, query.cols);
-        idx = k.set(idx, train.rows);
-        idx = k.set(idx, train.cols);
-        idx = k.set(idx, trainIdx.cols);
-        idx = k.set(idx, (int)query.step);
-        idx = k.set(idx, (int)trainIdx.step);
-
-        return k.run(2, globalSize, localSize, false);
-    }
-    return true;
-}
-
-static bool ocl_rmatchDispatcher(InputArray query, InputArray train,
-                                 UMat &trainIdx, UMat &distance, UMat &nMatches, float maxDistance, int distType)
-{
-    bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU;
-    int query_cols = query.size().width;
-    if (query_cols <= 64)
-    {
-        if(!ocl_matchUnrolledCached<16, 64>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false;
-    }
-    else if (query_cols <= 128 && !is_cpu)
-    {
-        if(!ocl_matchUnrolledCached<16, 128>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false;
-    }
-    else
-    {
-        if(!ocl_radius_match<16>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false;
-    }
-    return true;
+    return ocl_knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult);
 }
 
-
 static bool ocl_radiusMatchSingle(InputArray query, InputArray train, UMat &trainIdx,
                                   UMat &distance, UMat &nMatches, float maxDistance, int distType)
 {
     if (query.empty() || train.empty())
         return false;
 
-    const int nQuery = query.size().height;
-    const int nTrain = train.size().height;
+    const int query_rows = query.rows();
+    const int train_rows = train.rows();
 
-    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
+    ensureSizeIsEnough(1, query_rows, CV_32SC1, nMatches);
 
     if (trainIdx.empty())
     {
-        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx);
-        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance);
+        ensureSizeIsEnough(query_rows, std::max((train_rows / 100), 10), CV_32SC1, trainIdx);
+        ensureSizeIsEnough(query_rows, std::max((train_rows / 100), 10), CV_32FC1, distance);
     }
 
     nMatches.setTo(Scalar::all(0));
 
-    return ocl_rmatchDispatcher(query, train, trainIdx, distance, nMatches, maxDistance, distType);
+    ocl::Device devDef = ocl::Device::getDefault();
+    UMat uquery = query.getUMat(), utrain = train.getUMat();
+    int kercn = 1;
+    if (devDef.isIntel() &&
+        (0 == (uquery.step % 4)) && (0 == (uquery.cols % 4)) && (0 == (uquery.offset % 4)) &&
+        (0 == (utrain.step % 4)) && (0 == (utrain.cols % 4)) && (0 == (utrain.offset % 4)))
+        kercn = 4;
+
+    int block_size = 16;
+    int depth = query.depth();
+    cv::String opts;
+    opts = cv::format("-D T=%s -D TN=%s -D kercn=%d %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
+                      ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), kercn, depth == CV_32F ? "-D T_FLOAT" : "", distType, block_size);
+    ocl::Kernel k("BruteForceMatch_RadiusMatch", ocl::features2d::brute_force_match_oclsrc, opts);
+    if (k.empty())
+        return false;
+
+    size_t globalSize[] = {(train_rows + block_size - 1) / block_size * block_size, (query_rows + block_size - 1) / block_size * block_size, 1};
+    size_t localSize[] = {block_size, block_size, 1};
+
+    int idx = 0;
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(uquery));
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(utrain));
+    idx = k.set(idx, maxDistance);
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(nMatches));
+    idx = k.set(idx, uquery.rows);
+    idx = k.set(idx, uquery.cols);
+    idx = k.set(idx, utrain.rows);
+    idx = k.set(idx, utrain.cols);
+    idx = k.set(idx, trainIdx.cols);
+    idx = k.set(idx, (int)(uquery.step / sizeof(float)));
+    idx = k.set(idx, (int)(trainIdx.step / sizeof(int)));
+
+    return k.run(2, globalSize, localSize, false);
 }
 
 static bool ocl_radiusMatchConvert(const Mat &trainIdx, const Mat &distance, const Mat &_nMatches,
diff --git a/modules/features2d/src/opencl/brute_force_match.cl b/modules/features2d/src/opencl/brute_force_match.cl
index e2757e172db702c962578fb92a4365590374f0f1..7805e4767b1ee34cdb517455600a227c23da2453 100644
--- a/modules/features2d/src/opencl/brute_force_match.cl
+++ b/modules/features2d/src/opencl/brute_force_match.cl
@@ -59,39 +59,71 @@
 #define MAX_DESC_LEN 64
 #endif
 
+#define BLOCK_SIZE_ODD          (BLOCK_SIZE + 1)
+#ifndef SHARED_MEM_SZ
+#  if (BLOCK_SIZE < MAX_DESC_LEN)
+#    define SHARED_MEM_SZ (kercn * (BLOCK_SIZE * MAX_DESC_LEN + BLOCK_SIZE * BLOCK_SIZE))
+#  else
+#    define SHARED_MEM_SZ (kercn * 2 * BLOCK_SIZE_ODD * BLOCK_SIZE)
+#  endif
+#endif
+
 #ifndef DIST_TYPE
 #define DIST_TYPE 2
 #endif
 
 // dirty fix for non-template support
-#if (DIST_TYPE == 2)  // L1Dist
+#if (DIST_TYPE == 2) // L1Dist
 #   ifdef T_FLOAT
-#       define DIST(x, y) fabs((x) - (y))
-        typedef float value_type;
         typedef float result_type;
+#       if (8 == kercn)
+            typedef float8 value_type;
+#           define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}
+#       elif (4 == kercn)
+            typedef float4 value_type;
+#           define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}
+#       else
+            typedef float value_type;
+#           define DIST(x, y) result += fabs((x) - (y))
+#       endif
 #   else
-#       define DIST(x, y) abs((x) - (y))
-        typedef int value_type;
         typedef int result_type;
+#       if (8 == kercn)
+            typedef int8 value_type;
+#           define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}
+#       elif (4 == kercn)
+            typedef int4 value_type;
+#           define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}
+#       else
+            typedef int value_type;
+#           define DIST(x, y) result += abs((x) - (y))
+#       endif
 #   endif
-#define DIST_RES(x) (x)
+#   define DIST_RES(x) (x)
 #elif (DIST_TYPE == 4) // L2Dist
-#define DIST(x, y) (((x) - (y)) * ((x) - (y)))
-typedef float value_type;
-typedef float result_type;
-#define DIST_RES(x) sqrt(x)
+    typedef float result_type;
+#   if (8 == kercn)
+        typedef float8 value_type;
+#       define DIST(x, y) {value_type d = ((x) - (y)); result += dot(d.s0123, d.s0123) + dot(d.s4567, d.s4567);}
+#   elif (4 == kercn)
+        typedef float4 value_type;
+#       define DIST(x, y) {value_type d = ((x) - (y)); result += dot(d, d);}
+#   else
+        typedef float value_type;
+#       define DIST(x, y) {value_type d = ((x) - (y)); result = mad(d, d, result);}
+#   endif
+#   define DIST_RES(x) sqrt(x)
 #elif (DIST_TYPE == 6) // Hamming
-//http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-inline int bit1Count(int v)
-{
-    v = v - ((v >> 1) & 0x55555555); // reuse input as temporary
-    v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // temp
-    return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; // count
-}
-#define DIST(x, y) bit1Count( (x) ^ (y) )
-typedef int value_type;
-typedef int result_type;
-#define DIST_RES(x) (x)
+#   if (8 == kercn)
+        typedef int8 value_type;
+#   elif (4 == kercn)
+        typedef int4 value_type;
+#   else
+        typedef int value_type;
+#   endif
+    typedef int result_type;
+#   define DIST(x, y) result += popcount( (x) ^ (y) )
+#   define DIST_RES(x) (x)
 #endif
 
@@ -105,9 +137,7 @@ inline result_type reduce_block(
     #pragma unroll
     for (int j = 0 ; j < BLOCK_SIZE ; j++)
     {
-        result += DIST(
-            s_query[lidy * BLOCK_SIZE + j],
-            s_train[j * BLOCK_SIZE + lidx]);
+        DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);
     }
     return DIST_RES(result);
 }
@@ -123,11 +153,9 @@ inline result_type reduce_block_match(
     #pragma unroll
     for (int j = 0 ; j < BLOCK_SIZE ; j++)
     {
-        result += DIST(
-            s_query[lidy * BLOCK_SIZE + j],
-            s_train[j * BLOCK_SIZE + lidx]);
+        DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);
     }
-    return (result);
+    return result;
 }
@@ -142,23 +170,16 @@ inline result_type reduce_multi_block(
     #pragma unroll
     for (int j = 0 ; j < BLOCK_SIZE ; j++)
     {
-        result += DIST(
-            s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j],
-            s_train[j * BLOCK_SIZE + lidx]);
+        DIST(s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);
     }
     return result;
 }
 
-/* 2dim launch, global size: dim0 is (query rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, dim1 is BLOCK_SIZE
-local size: dim0 is BLOCK_SIZE, dim1 is BLOCK_SIZE.
-*/
-__kernel void BruteForceMatch_UnrollMatch(
+__kernel void BruteForceMatch_Match(
     __global T *query,
     __global T *train,
-    //__global float *mask,
     __global int *bestTrainIdx,
     __global float *bestDistance,
-    __local float *sharebuffer,
     int query_rows,
     int query_cols,
     int train_rows,
@@ -170,17 +191,26 @@ __kernel void BruteForceMatch_UnrollMatch(
     const int lidx = get_local_id(0);
     const int lidy = get_local_id(1);
     const int groupidx = get_group_id(0);
 
+    const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);
+    const int queryOffset = min(queryIdx, query_rows - 1) * step;
+    __global TN *query_vec = (__global TN *)(query + queryOffset);
+    query_cols /= kercn;
+
+    __local float sharebuffer[SHARED_MEM_SZ];
     __local value_type *s_query = (__local value_type *)sharebuffer;
-    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
-    int queryIdx = groupidx * BLOCK_SIZE + lidy;
+#if 0 < MAX_DESC_LEN
+    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
 
     // load the query into local memory.
     #pragma unroll
-    for (int i = 0 ; i <  MAX_DESC_LEN / BLOCK_SIZE; i ++)
+    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)
     {
-        int loadx = lidx + i * BLOCK_SIZE;
-        s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+        const int loadx = mad24(BLOCK_SIZE, i, lidx);
+        s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;
     }
+#else
+    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
+#endif
 
     float myBestDistance = MAX_FLOAT;
     int myBestTrainIdx = -1;
@@ -189,12 +219,16 @@ __kernel void BruteForceMatch_UnrollMatch(
     for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)
     {
         result_type result = 0;
+
+        const int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;
+        __global TN *train_vec = (__global TN *)(train + trainOffset);
+#if 0 < MAX_DESC_LEN
         #pragma unroll
-        for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
+        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)
         {
             //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
-            const int loadx = lidx + i * BLOCK_SIZE;
-            s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+            const int loadx = mad24(BLOCK_SIZE, i, lidx);
+            s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;
 
             //synchronize to make sure each elem for reduceIteration in share memory is written already.
             barrier(CLK_LOCAL_MEM_FENCE);
@@ -203,89 +237,18 @@ __kernel void BruteForceMatch_UnrollMatch(
 
             barrier(CLK_LOCAL_MEM_FENCE);
         }
-
-        result = DIST_RES(result);
-
-        int trainIdx = t * BLOCK_SIZE + lidx;
-
-        if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
-        {
-            myBestDistance = result;
-            myBestTrainIdx = trainIdx;
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-    __local float *s_distance = (__local float*)(sharebuffer);
-    __local int* s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
-
-    //find BestMatch
-    s_distance += lidy * BLOCK_SIZE;
-    s_trainIdx += lidy * BLOCK_SIZE;
-    s_distance[lidx] = myBestDistance;
-    s_trainIdx[lidx] = myBestTrainIdx;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    //reduce -- now all reduce implement in each threads.
-    #pragma unroll
-    for (int k = 0 ; k < BLOCK_SIZE; k++)
-    {
-        if (myBestDistance > s_distance[k])
+#else
+        for (int i = 0, endq = (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endq; i++)
         {
-            myBestDistance = s_distance[k];
-            myBestTrainIdx = s_trainIdx[k];
-        }
-    }
-
-    if (queryIdx < query_rows && lidx == 0)
-    {
-        bestTrainIdx[queryIdx] = myBestTrainIdx;
-        bestDistance[queryIdx] = myBestDistance;
-    }
-}
-
-__kernel void BruteForceMatch_Match(
-    __global T *query,
-    __global T *train,
-    //__global float *mask,
-    __global int *bestTrainIdx,
-    __global float *bestDistance,
-    __local float *sharebuffer,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int step
-)
-{
-    const int lidx = get_local_id(0);
-    const int lidy = get_local_id(1);
-    const int groupidx = get_group_id(0);
-
-    const int queryIdx = groupidx * BLOCK_SIZE + lidy;
-
-    float myBestDistance = MAX_FLOAT;
-    int myBestTrainIdx = -1;
-
-    __local value_type *s_query = (__local value_type *)sharebuffer;
-    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
-
-    // loop
-    for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
-    {
-        result_type result = 0;
-        for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; i++)
-        {
-            const int loadx = lidx + i * BLOCK_SIZE;
+            const int loadx = mad24(i, BLOCK_SIZE, lidx);
             //load query and train into local memory
-            s_query[lidy * BLOCK_SIZE + lidx] = 0;
-            s_train[lidx * BLOCK_SIZE + lidy] = 0;
+            s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = 0;
+            s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = 0;
 
             if (loadx < query_cols)
             {
-                s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
-                s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+                s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = query_vec[loadx];
+                s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = train_vec[loadx];
             }
 
             barrier(CLK_LOCAL_MEM_FENCE);
@@ -294,10 +257,10 @@ __kernel void BruteForceMatch_Match(
 
             barrier(CLK_LOCAL_MEM_FENCE);
         }
-
+#endif
         result = DIST_RES(result);
 
-        const int trainIdx = t * BLOCK_SIZE + lidx;
+        const int trainIdx = mad24(BLOCK_SIZE, t, lidx);
 
         if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
        {
@@ -309,17 +272,18 @@ __kernel void BruteForceMatch_Match(
     barrier(CLK_LOCAL_MEM_FENCE);
 
     __local float *s_distance = (__local float *)sharebuffer;
-    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
+    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);
 
     //findBestMatch
-    s_distance += lidy * BLOCK_SIZE;
-    s_trainIdx += lidy * BLOCK_SIZE;
+    s_distance += lidy * BLOCK_SIZE_ODD;
+    s_trainIdx += lidy * BLOCK_SIZE_ODD;
     s_distance[lidx] = myBestDistance;
     s_trainIdx[lidx] = myBestTrainIdx;
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
     //reduce -- now all reduce implement in each threads.
+    #pragma unroll
     for (int k = 0 ; k < BLOCK_SIZE; k++)
     {
         if (myBestDistance > s_distance[k])
@@ -336,76 +300,14 @@ __kernel void BruteForceMatch_Match(
     }
 }
 
-//radius_unrollmatch
-__kernel void BruteForceMatch_RadiusUnrollMatch(
-    __global T *query,
-    __global T *train,
-    float maxDistance,
-    //__global float *mask,
-    __global int *bestTrainIdx,
-    __global float *bestDistance,
-    __global int *nMatches,
-    __local float *sharebuffer,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int bestTrainIdx_cols,
-    int step,
-    int ostep
-)
-{
-    const int lidx = get_local_id(0);
-    const int lidy = get_local_id(1);
-    const int groupidx = get_group_id(0);
-    const int groupidy = get_group_id(1);
-
-    const int queryIdx = groupidy * BLOCK_SIZE + lidy;
-    const int trainIdx = groupidx * BLOCK_SIZE + lidx;
-
-    __local value_type *s_query = (__local value_type *)sharebuffer;
-    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
-
-    result_type result = 0;
-    for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; ++i)
-    {
-        //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
-        const int loadx = lidx + i * BLOCK_SIZE;
-
-        s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
-        s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
-
-        //synchronize to make sure each elem for reduceIteration in share memory is written already.
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        result += reduce_block(s_query, s_train, lidx, lidy);
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if (queryIdx < query_rows && trainIdx < train_rows &&
-        convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
-    {
-        int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
-
-        if(ind < bestTrainIdx_cols)
-        {
-            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
-            bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
-        }
-    }
-}
-
 //radius_match
 __kernel void BruteForceMatch_RadiusMatch(
     __global T *query,
     __global T *train,
     float maxDistance,
-    //__global float *mask,
     __global int *bestTrainIdx,
     __global float *bestDistance,
     __global int *nMatches,
-    __local float *sharebuffer,
     int query_rows,
     int query_cols,
     int train_rows,
@@ -420,20 +322,34 @@ __kernel void BruteForceMatch_RadiusMatch(
     const int groupidx = get_group_id(0);
     const int groupidy = get_group_id(1);
 
-    const int queryIdx = groupidy * BLOCK_SIZE + lidy;
-    const int trainIdx = groupidx * BLOCK_SIZE + lidx;
+    const int queryIdx = mad24(BLOCK_SIZE, groupidy, lidy);
+    const int queryOffset = min(queryIdx, query_rows - 1) * step;
+    __global TN *query_vec = (__global TN *)(query + queryOffset);
+
+    const int trainIdx = mad24(BLOCK_SIZE, groupidx, lidx);
+    const int trainOffset = min(mad24(BLOCK_SIZE, groupidx, lidy), train_rows - 1) * step;
+    __global TN *train_vec = (__global TN *)(train + trainOffset);
+    query_cols /= kercn;
+
+    __local float sharebuffer[SHARED_MEM_SZ];
     __local value_type *s_query = (__local value_type *)sharebuffer;
-    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
+    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
 
     result_type result = 0;
     for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)
     {
         //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
-        const int loadx = lidx + i * BLOCK_SIZE;
+        const int loadx = mad24(BLOCK_SIZE, i, lidx);
 
-        s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
-        s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+        s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = 0;
+        s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = 0;
+
+        if (loadx < query_cols)
+        {
+            s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = query_vec[loadx];
+            s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = train_vec[loadx];
+        }
 
         //synchronize to make sure each elem for reduceIteration in share memory is written already.
         barrier(CLK_LOCAL_MEM_FENCE);
@@ -442,28 +358,23 @@ __kernel void BruteForceMatch_RadiusMatch(
 
         barrier(CLK_LOCAL_MEM_FENCE);
     }
-
-    if (queryIdx < query_rows && trainIdx < train_rows &&
-        convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
+    if (queryIdx < query_rows && trainIdx < train_rows && convert_float(result) < maxDistance)
     {
         int ind = atom_inc(nMatches + queryIdx);
 
         if(ind < bestTrainIdx_cols)
         {
-            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
-            bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
+            bestTrainIdx[mad24(queryIdx, ostep, ind)] = trainIdx;
+            bestDistance[mad24(queryIdx, ostep, ind)] = result;
         }
     }
 }
 
-
-__kernel void BruteForceMatch_knnUnrollMatch(
+__kernel void BruteForceMatch_knnMatch(
     __global T *query,
     __global T *train,
-    //__global float *mask,
     __global int2 *bestTrainIdx,
     __global float2 *bestDistance,
-    __local float *sharebuffer,
     int query_rows,
     int query_cols,
     int train_rows,
@@ -475,31 +386,45 @@ __kernel void BruteForceMatch_knnUnrollMatch(
     const int lidy = get_local_id(1);
     const int groupidx = get_group_id(0);
 
-    const int queryIdx = groupidx * BLOCK_SIZE + lidy;
+    const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);
+    const int queryOffset = min(queryIdx, query_rows - 1) * step;
+    __global TN *query_vec = (__global TN *)(query + queryOffset);
+    query_cols /= kercn;
+
+    __local float sharebuffer[SHARED_MEM_SZ];
     __local value_type *s_query = (__local value_type *)sharebuffer;
-    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
+#if 0 < MAX_DESC_LEN
+    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
 
     // load the query into local memory.
+    #pragma unroll
     for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
     {
-        int loadx = lidx + i * BLOCK_SIZE;
-        s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+        int loadx = mad24(BLOCK_SIZE, i, lidx);
+        s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;
     }
+#else
+    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
+#endif
 
     float myBestDistance1 = MAX_FLOAT;
     float myBestDistance2 = MAX_FLOAT;
     int myBestTrainIdx1 = -1;
     int myBestTrainIdx2 = -1;
 
-    //loopUnrolledCached
-    for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
+    for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt ; t++)
     {
         result_type result = 0;
+
+        int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;
+        __global TN *train_vec = (__global TN *)(train + trainOffset);
+#if 0 < MAX_DESC_LEN
+        #pragma unroll
         for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
         {
             //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
-            const int loadx = lidx + i * BLOCK_SIZE;
-            s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+            const int loadx = mad24(BLOCK_SIZE, i, lidx);
+            s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;
 
             //synchronize to make sure each elem for reduceIteration in share memory is written already.
             barrier(CLK_LOCAL_MEM_FENCE);
@@ -508,143 +433,18 @@ __kernel void BruteForceMatch_knnUnrollMatch(
 
             barrier(CLK_LOCAL_MEM_FENCE);
         }
-
-        result = DIST_RES(result);
-
-        const int trainIdx = t * BLOCK_SIZE + lidx;
-
-        if (queryIdx < query_rows && trainIdx < train_rows)
-        {
-            if (result < myBestDistance1)
-            {
-                myBestDistance2 = myBestDistance1;
-                myBestTrainIdx2 = myBestTrainIdx1;
-                myBestDistance1 = result;
-                myBestTrainIdx1 = trainIdx;
-            }
-            else if (result < myBestDistance2)
-            {
-                myBestDistance2 = result;
-                myBestTrainIdx2 = trainIdx;
-            }
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    __local float *s_distance = (local float *)sharebuffer;
-    __local int *s_trainIdx = (local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
-
-    // find BestMatch
-    s_distance += lidy * BLOCK_SIZE;
-    s_trainIdx += lidy * BLOCK_SIZE;
-
-    s_distance[lidx] = myBestDistance1;
-    s_trainIdx[lidx] = myBestTrainIdx1;
-
-    float bestDistance1 = MAX_FLOAT;
-    float bestDistance2 = MAX_FLOAT;
-    int bestTrainIdx1 = -1;
-    int bestTrainIdx2 = -1;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (lidx == 0)
-    {
-        for (int i = 0 ; i < BLOCK_SIZE ; i++)
-        {
-            float val = s_distance[i];
-            if (val < bestDistance1)
-            {
-                bestDistance2 = bestDistance1;
-                bestTrainIdx2 = bestTrainIdx1;
-
-                bestDistance1 = val;
-                bestTrainIdx1 = s_trainIdx[i];
-            }
-            else if (val < bestDistance2)
-            {
-                bestDistance2 = val;
-                bestTrainIdx2 = s_trainIdx[i];
-            }
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    s_distance[lidx] = myBestDistance2;
-    s_trainIdx[lidx] = myBestTrainIdx2;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (lidx == 0)
-    {
-        for (int i = 0 ; i < BLOCK_SIZE ; i++)
-        {
-            float val = s_distance[i];
-
-            if (val < bestDistance2)
-            {
-                bestDistance2 = val;
-                bestTrainIdx2 = s_trainIdx[i];
-            }
-        }
-    }
-
-    myBestDistance1 = bestDistance1;
-    myBestDistance2 = bestDistance2;
-
-    myBestTrainIdx1 = bestTrainIdx1;
-    myBestTrainIdx2 = bestTrainIdx2;
-
-    if (queryIdx < query_rows && lidx == 0)
-    {
-        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
-        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
-    }
-}
-
-__kernel void BruteForceMatch_knnMatch(
-    __global T *query,
-    __global T *train,
-    //__global float *mask,
-    __global int2 *bestTrainIdx,
-    __global float2 *bestDistance,
-    __local float *sharebuffer,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int step
-)
-{
-    const int lidx = get_local_id(0);
-    const int lidy = get_local_id(1);
-    const int groupidx = get_group_id(0);
-
-    const int queryIdx = groupidx * BLOCK_SIZE + lidy;
-    __local value_type *s_query = (__local value_type *)sharebuffer;
-    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
-
-    float myBestDistance1 = MAX_FLOAT;
-    float myBestDistance2 = MAX_FLOAT;
-    int myBestTrainIdx1 = -1;
-    int myBestTrainIdx2 = -1;
-
-    //loop
-    for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
-    {
-        result_type result = 0.0f;
-        for (int i = 0 ; i < (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE ; i++)
+#else
+        for (int i = 0, endq = (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE; i < endq ; i++)
         {
-            const int loadx = lidx + i * BLOCK_SIZE;
+            const int loadx = mad24(BLOCK_SIZE, i, lidx);
             //load query and train into local memory
-            s_query[lidy * BLOCK_SIZE + lidx] = 0;
-            s_train[lidx * BLOCK_SIZE + lidy] = 0;
+            s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = 0;
+            s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = 0;
 
             if (loadx < query_cols)
             {
-                s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
-                s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+                s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = query_vec[loadx];
+                s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = train_vec[loadx];
             }
 
             barrier(CLK_LOCAL_MEM_FENCE);
@@ -653,12 +453,12 @@ __kernel void BruteForceMatch_knnMatch(
 
             barrier(CLK_LOCAL_MEM_FENCE);
         }
-
+#endif
         result = DIST_RES(result);
 
-        const int trainIdx = t * BLOCK_SIZE + lidx;
+        const int trainIdx = mad24(BLOCK_SIZE, t, lidx);
 
-        if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
+        if (queryIdx < query_rows && trainIdx < train_rows)
         {
             if (result < myBestDistance1)
            {
@@ -678,12 +478,11 @@ __kernel void BruteForceMatch_knnMatch(
     barrier(CLK_LOCAL_MEM_FENCE);
 
     __local float *s_distance = (__local float *)sharebuffer;
-    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
-
-    //findBestMatch
-    s_distance += lidy * BLOCK_SIZE;
-    s_trainIdx += lidy * BLOCK_SIZE;
+    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);
 
+    // find BestMatch
+    s_distance += lidy * BLOCK_SIZE_ODD;
+    s_trainIdx += lidy * BLOCK_SIZE_ODD;
     s_distance[lidx] = myBestDistance1;
     s_trainIdx[lidx] = myBestTrainIdx1;
 
@@ -746,44 +545,4 @@ __kernel void BruteForceMatch_knnMatch(
         bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
         bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
     }
-}
-
-kernel void BruteForceMatch_calcDistanceUnrolled(
-    __global T *query,
-    __global T *train,
-    //__global float *mask,
-    __global float *allDist,
-    __local float *sharebuffer,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int step)
-{
-    /* Todo */
-}
-
-kernel void BruteForceMatch_calcDistance(
-    __global T *query,
-    __global T *train,
-    //__global float *mask,
-    __global float *allDist,
-    __local float *sharebuffer,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int step)
-{
-    /* Todo */
-}
-
-kernel void BruteForceMatch_findBestMatch(
-    __global float *allDist,
-    __global int *bestTrainIdx,
-    __global float *bestDistance,
-    int k
-)
-{
-    /* Todo */
-}
+}
\ No newline at end of file
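
Not part of the patch — a minimal usage sketch, under assumptions, showing how the rewritten OpenCL path is typically exercised. It assumes the standard cv::BFMatcher / transparent API of OpenCV 3.x: descriptors are passed as UMat with OpenCL enabled, so DescriptorMatcher::match can dispatch to ocl_matchSingle and the BruteForceMatch_Match kernel built with the -D T / -D TN / -D kercn options above. The descriptor sizes and random data here are placeholders chosen so that query_cols <= 64 and the MAX_DESC_LEN specialization is selected.

```cpp
// Illustrative only; not part of the patch. Assumes OpenCV 3.x built with OpenCL support.
#include <opencv2/core.hpp>
#include <opencv2/core/ocl.hpp>
#include <opencv2/features2d.hpp>
#include <vector>

int main()
{
    cv::ocl::setUseOpenCL(true);                     // enable the T-API OpenCL path

    // Placeholder descriptors: 500 query / 1000 train rows of 64-float descriptors.
    cv::Mat queryCpu(500, 64, CV_32F), trainCpu(1000, 64, CV_32F);
    cv::randu(queryCpu, cv::Scalar::all(0), cv::Scalar::all(1));
    cv::randu(trainCpu, cv::Scalar::all(0), cv::Scalar::all(1));

    // Passing UMat keeps the data on the device side, which is what lets the
    // brute-force matcher take the ocl_* branch instead of the CPU fallback.
    cv::UMat query = queryCpu.getUMat(cv::ACCESS_READ);
    cv::UMat train = trainCpu.getUMat(cv::ACCESS_READ);

    cv::BFMatcher matcher(cv::NORM_L2);              // NORM_L2 corresponds to DIST_TYPE == 4 (L2Dist) in the kernel
    std::vector<cv::DMatch> matches;
    matcher.match(query, train, matches);            // may dispatch to the BruteForceMatch_Match OpenCL kernel

    return 0;
}
```

Whether the OpenCL kernel actually runs still depends on the usual CV_OCL_RUN conditions (a usable OpenCL device, supported descriptor type, and so on); the sketch only shows the calling pattern, not a guarantee of GPU execution.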