Commit f24e2ecf authored by Marius Muja

Merging branch with multithreading search. Thanks Nick!

Conflicts:
	CMakeLists.txt
	src/cpp/CMakeLists.txt
......@@ -12,6 +12,8 @@ set(FLANN_VERSION 1.7.0)
DISSECT_VERSION()
GET_OS_INFO()
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
# Add an "uninstall" target
CONFIGURE_FILE ("${PROJECT_SOURCE_DIR}/cmake/uninstall_target.cmake.in"
"${PROJECT_BINARY_DIR}/uninstall_target.cmake" IMMEDIATE @ONLY)
......@@ -101,6 +103,27 @@ else(NOT CUDA_FOUND)
include_directories(${CUDA_INCLUDE_DIRS})
endif(NOT CUDA_FOUND)
# find Intel TBB
find_package(TBB)
if(TBB_FOUND AND TBB_DEBUG_FOUND)
message(STATUS "Intel TBB include dir: " ${TBB_INCLUDE_DIRS})
message(STATUS "Intel TBB libs: " ${TBB_LIBRARIES})
message(STATUS "Intel TBB libs (debug): " ${TBB_DEBUG_LIBRARIES})
include_directories(${TBB_INCLUDE_DIRS})
endif(TBB_FOUND AND TBB_DEBUG_FOUND)
# print additional info
if(TBB_FOUND AND NOT TBB_DEBUG_FOUND)
message(WARNING "Only the Intel TBB (release) libraries were found")
endif()
if(TBB_DEBUG_FOUND AND NOT TBB_FOUND)
message(WARNING "Only the Intel TBB (debug) libraries were found")
endif()
if(NOT TBB_FOUND AND NOT TBB_DEBUG_FOUND)
message(WARNING "No intel TBB libraries were found")
endif()
#set the C/C++ include path to the "include" directory
......
......@@ -31,6 +31,40 @@ If you want to try out the latest changes or contribute to FLANN, then it's reco
If you just want to browse the repository, you can do so by going [here](https://github.com/mariusmuja/flann).
Compiling FLANN with multithreading support
------------------------------------------
Make sure you have Intel Threading Building Blocks installed. You can get the latest version from www.threadingbuildingblocks.org. Alternatively, you can install it from your package manager (this will probably not be the latest version; e.g. on Ubuntu 10.04 LTS only version 2.2 is available this way).
For CMake to be able to detect the Intel TBB installation, you need a tbb.pc file in one of the directories on your PKG_CONFIG_PATH. It should look something like this:
_________________________________________________
tbb.pc
_________________________________________________
prefix=/usr
exec_prefix=${prefix}
libdir=${exec_prefix}/lib
includedir=${prefix}/include
Name: Threading Building Blocks
Description: Intel's parallelism library for C++
URL: http://www.threadingbuildingblocks.org/
Version: 3.0update7
Libs: -L${libdir} -ltbb
Cflags: -I${includedir}
_________________________________________________
Using multithreaded FLANN in your project
-----------------------------------------
Once multithreaded FLANN is compiled and installed, all you need to do is compile your project with the -DTBB compiler flag to enable multithreading support. For example, in CMake you can achieve this by adding "ADD_DEFINITIONS(-DTBB)" for your target.
Have a look at the "flann::Index::knnSearch" section in the manual for how to specify the number of cores for knn and radius search; a minimal usage sketch follows below.
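As a quick illustration (a minimal sketch, not part of the FLANN sources; the dataset, query and result matrices are placeholders you allocate yourself), a knn search can be spread over all available cores by passing cores = -1 through flann::SearchParams:
#include <flann/flann.hpp>
// Build a kd-tree index over `dataset` and search it on all available cores.
// `indices` and `dists` must be pre-allocated with queries.rows rows and knn columns.
void knn_on_all_cores(const flann::Matrix<float>& dataset,
                      const flann::Matrix<float>& queries,
                      flann::Matrix<int>& indices,
                      flann::Matrix<float>& dists,
                      size_t knn)
{
    flann::Index<flann::L2<float> > index(dataset, flann::KDTreeIndexParams(4));
    index.buildIndex();
    // checks = 128, eps = 0, sorted = true, cores = -1 (automatic core selection)
    flann::SearchParams params(128, 0, true, -1);
    index.knnSearch(queries, indices, dists, knn, params);
}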
Conditions of use
-----------------
......
###############################################################################
# Find Intel Threading Building Blocks
#
# This sets the following variables:
#
# TBB_INCLUDE_DIRS - Directories containing the TBB include files.
# TBB_LIBRARY_DIRS - Directories containing the TBB libs.
#
# (release libs)
# TBB_FOUND - True if TBB was found.
# TBB_LIBRARIES - Libraries needed to use TBB.
#
# (debug libs)
# TBB_DEBUG_FOUND - True if TBB was found.
# TBB_DEBUG_LIBRARIES - Libraries needed to use TBB.
find_package(PkgConfig)
pkg_check_modules(PC_TBB tbb)
# Find include directory
find_path(TBB_INCLUDE_DIR tbb/task_scheduler_init.h
HINTS ${PC_TBB_INCLUDEDIR} ${PC_TBB_INCLUDE_DIRS})
# Find libraries
find_library(TBB_LIBRARY tbb
HINTS ${PC_TBB_LIBDIR} ${PC_TBB_LIBRARY_DIRS})
find_library(TBB_DEBUG_LIBRARY tbb_debug
HINTS ${PC_TBB_LIBDIR} ${PC_TBB_LIBRARY_DIRS})
#find_library(TBB_MALLOC_LIBRARY tbbmalloc
# HINTS ${PC_TBB_LIBDIR} ${PC_TBB_LIBRARY_DIRS})
#find_library(TBB_MALLOC_LIBRARY tbbmalloc_debug
# HINTS ${PC_TBB_LIBDIR} ${PC_TBB_LIBRARY_DIRS})
#find_library(TBB_MALLOC_PROXY_LIBRARY tbbmalloc_proxy
# HINTS ${PC_TBB_LIBDIR} ${PC_TBB_LIBRARY_DIRS})
#find_library(TBB_MALLOC_PROXY_LIBRARY tbbmalloc_proxy_debug
# HINTS ${PC_TBB_LIBDIR} ${PC_TBB_LIBRARY_DIRS})
# Set the appropriate CMake variables and mark them as advanced
set(TBB_INCLUDE_DIRS ${PC_TBB_INCLUDEDIR})
set(TBB_LIBRARY_DIRS ${PC_TBB_LIBDIR})
#set(TBB_LIBRARIES ${TBB_LIBRARY};${TBB_MALLOC_LIBRARY};${TBB_MALLOC_PROXY_LIBRARY})
set(TBB_LIBRARIES ${TBB_LIBRARY})
#set(TBB_DEBUG_LIBRARIES ${TBB_DEBUG_LIBRARY};${TBB_MALLOC_DEBUG_LIBRARY};${TBB_MALLOC_PROXY_DEBUG_LIBRARY})
set(TBB_DEBUG_LIBRARIES ${TBB_DEBUG_LIBRARY})
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(tbb DEFAULT_MSG TBB_LIBRARY TBB_INCLUDE_DIR)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(tbb_debug DEFAULT_MSG TBB_DEBUG_LIBRARY TBB_INCLUDE_DIR)
mark_as_advanced(TBB_LIBRARY TBB_DEBUG_LIBRARY TBB_INCLUDE_DIR)
#mark_as_advanced(TBB_LIBRARY TBB_DEBUG_LIBRARY TBB_MALLOC_LIBRARY TBB_DEBUG_MALLOC_LIBRARY TBB_MALLOC_PROXY_LIBRARY TBB_MALLOC_PROXY_DEBUG_LIBRARY TBB_INCLUDE_DIR)
......@@ -247,8 +247,15 @@ options use the \texttt{cmake-gui} application after \texttt{cmake} has finished
\end{center}
\end{figure}
\subsection{Compiling FLANN with multithreading support}
To make use of FLANN's multithreading support, Intel Threading Building Blocks must be installed correctly
on your system. You can either get it from your package manager or from:
\begin{center}
\texttt{http://threadingbuildingblocks.org/}
\end{center}
You also need a pkg-config file (tbb.pc) for locating the Intel TBB installation, and you need to specify the -DTBB compiler flag
when compiling your project. Have a look at the README file in the root folder of FLANN for more detailed instructions.
\section{Using FLANN}
......@@ -491,7 +498,8 @@ struct SearchParams
{
SearchParams(int checks = 32,
float eps = 0,
bool sorted = true);
bool sorted = true,
int cores = 1);
};
\end{Verbatim}
\begin{description}
......@@ -502,6 +510,7 @@ If automatic configuration was used when the index was created, the number of ch
required to achieve the specified precision was also computed; to use that value, specify \texttt{CHECKS\_AUTOTUNED}.
\item[eps] Search for eps-approximate neighbors (only used by KDTreeSingleIndex and KDTreeCuda3dIndex).
\item[sorted] Used only by radius search, specifies if the neighbors returned should be sorted by distance.
\item[cores] How many cores to assign to the search; specify -1 for automatic core selection (see the example below).
\end{description}
\end{description}
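As an illustration (a minimal sketch, not taken from this manual; \texttt{index}, \texttt{queries}, \texttt{indices}, \texttt{dists} and \texttt{knn} are assumed to be set up as in the preceding examples), the core count is simply passed through \texttt{SearchParams}:
\begin{Verbatim}
// checks = 128, eps = 0, sorted = true, cores = -1 (automatic core selection)
flann::SearchParams params(128, 0, true, -1);
index.knnSearch(queries, indices, dists, knn, params);
\end{Verbatim}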
......
......@@ -50,6 +50,9 @@ else()
cuda_add_library(flann_cpp SHARED ${CPP_SOURCES})
endif()
endif()
if(TBB_FOUND AND TBB_DEBUG_FOUND)
target_link_libraries(flann_cpp ${TBB_LIBRARIES})
endif(TBB_FOUND AND TBB_DEBUG_FOUND)
set_target_properties(flann_cpp PROPERTIES
VERSION ${FLANN_VERSION}
......@@ -68,6 +71,9 @@ endif()
#debug libraries
add_library(flann_cpp-gd SHARED ${CPP_SOURCES})
if(TBB_FOUND AND TBB_DEBUG_FOUND)
target_link_libraries(flann_cpp-gd ${TBB_DEBUG_LIBRARIES})
endif(TBB_FOUND AND TBB_DEBUG_FOUND)
set_target_properties(flann_cpp-gd PROPERTIES
COMPILE_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}
DEFINE_SYMBOL FLANN_EXPORTS
......
......@@ -60,7 +60,7 @@ struct KDTreeSingleIndexParams : public IndexParams
/**
* Randomized kd-tree index
* Single kd-tree index
*
* Contains the k-d trees and other information for indexing a set of points
* for nearest-neighbor matching.
......
......@@ -211,7 +211,6 @@ FLANN_EXPORT flann_index_t flann_load_index_int(char* filename,
indices = pointer to matrix for the indices of the nearest neighbors of the testset features in the dataset
(must have trows number of rows and nn number of columns)
nn = how many nearest neighbors to return
index_params = index related parameters
flann_params = generic flann parameters
Returns: zero or -1 for error
......@@ -276,6 +275,8 @@ FLANN_EXPORT int flann_find_nearest_neighbors_int(int* dataset,
trows = number of rows (features) in the query dataset (same dimensionality as features in the dataset)
indices = pointer to matrix for the indices of the nearest neighbors of the testset features in the dataset
(must have trows number of rows and nn number of columns)
dists = pointer to matrix for the distances of the nearest neighbors of the testset features in the dataset
(must have trows number of rows and nn number of columns)
nn = how many nearest neighbors to return
flann_params = generic flann parameters
......@@ -330,11 +331,16 @@ FLANN_EXPORT int flann_find_nearest_neighbors_index_int(flann_index_t index_id,
* search will return all the neighbours found within a search radius
* of the query point.
*
* The check parameter in the function below sets the level of approximation
* The check parameter in the FLANNParameters below sets the level of approximation
* for the search by only visiting "checks" number of features in the index
* (the same way as for the KNN search). A lower value for checks will give
* a higher search speedup at the cost of potentially not returning all the
* neighbours in the specified radius.
*
* The cores parameter in the FLANNParameters below sets the number of cores
* that will be used for the radius search, provided Intel TBB is present on
* the system and FLANN was built with multicore support. Automatic core selection
* can be achieved by setting the number of cores to -1.
*/
FLANN_EXPORT int flann_radius_search(flann_index_t index_ptr, /* the index */
float* query, /* query point */
......
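For reference, here is a sketch of how the cores field can be used through this C interface. It is illustrative only: apart from the index and query parameters shown above, the remaining parameters of flann_radius_search, as well as flann_build_index, flann_free_index and DEFAULT_FLANN_PARAMETERS, are assumed to follow the standard single-precision FLANN C API.
#include <flann/flann.h>
/* Radius search around one query point, using automatic core selection.
   `dataset` is rows x cols, row-major; `query` holds cols values;
   `indices` and `dists` must each have room for max_nn results. */
int radius_search_all_cores(float* dataset, int rows, int cols,
                            float* query, int* indices, float* dists,
                            int max_nn, float radius)
{
    struct FLANNParameters p = DEFAULT_FLANN_PARAMETERS;
    p.checks = 32;
    p.cores = -1;   /* automatic core selection (needs a TBB-enabled build) */
    float speedup;
    flann_index_t index = flann_build_index(dataset, rows, cols, &speedup, &p);
    int found = flann_radius_search(index, query, indices, dists,
                                    max_nn, radius, &p);
    flann_free_index(index, &p);
    return found;
}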
......@@ -36,6 +36,14 @@
#include <cassert>
#include <cstdio>
#ifdef TBB
#include <tbb/parallel_for.h>
#include <tbb/blocked_range.h>
#include <tbb/atomic.h>
#include <tbb/task_scheduler_init.h>
#endif
#include "flann/general.h"
#include "flann/util/matrix.h"
#include "flann/util/params.h"
......@@ -43,6 +51,10 @@
#include "flann/algorithms/all_indices.h"
#ifdef TBB
#include "flann/tbb/bodies.hpp"
#endif
namespace flann
{
......@@ -104,8 +116,13 @@ public:
typedef typename Distance::ElementType ElementType;
typedef typename Distance::ResultType DistanceType;
#ifdef TBB
Index(const Matrix<ElementType>& features, const IndexParams& params, Distance distance = Distance() )
: index_params_(params), atomic_count_()
#else
Index(const Matrix<ElementType>& features, const IndexParams& params, Distance distance = Distance() )
: index_params_(params)
#endif
{
flann_algorithm_t index_type = get_param<flann_algorithm_t>(params,"algorithm");
loaded_ = false;
......@@ -213,12 +230,69 @@ public:
* \param[in] params Search parameters
*/
int knnSearch(const Matrix<ElementType>& queries,
Matrix<int>& indices,
Matrix<DistanceType>& dists,
size_t knn,
const SearchParams& params)
Matrix<int>& indices,
Matrix<DistanceType>& dists,
size_t knn,
const SearchParams& params)
{
return nnIndex_->knnSearch(queries, indices, dists, knn, params);
assert(queries.cols == veclen());
assert(indices.rows >= queries.rows);
assert(dists.rows >= queries.rows);
assert(indices.cols >= knn);
assert(dists.cols >= knn);
bool sorted = get_param(params,"sorted",true);
bool use_heap = get_param(params,"use_heap",false);
#ifdef TBB
int cores = get_param(params,"cores",1);
assert(cores >= 1 || cores == -1);
#endif
int count = 0;
#ifdef TBB
// Check if we need to do multicore search or stick with singlecore FLANN (less overhead)
if(cores == 1)
{
#endif
if (use_heap) {
KNNResultSet2<DistanceType> resultSet(knn);
for (size_t i = 0; i < queries.rows; i++) {
resultSet.clear();
nnIndex_->findNeighbors(resultSet, queries[i], params);
resultSet.copy(indices[i], dists[i], knn, sorted);
count += resultSet.size();
}
}
else {
KNNSimpleResultSet<DistanceType> resultSet(knn);
for (size_t i = 0; i < queries.rows; i++) {
resultSet.clear();
nnIndex_->findNeighbors(resultSet, queries[i], params);
resultSet.copy(indices[i], dists[i], knn, sorted);
count += resultSet.size();
}
}
#ifdef TBB
}
else
{
// Initialise the task scheduler for the use of Intel TBB parallel constructs
tbb::task_scheduler_init task_sched(cores);
// Make an atomic integer count, so that we can keep track of the number of neighbors found
atomic_count_ = 0;
// Use the auto partitioner to choose an optimal grain size for dividing the query points
flann::parallel_knnSearch<Distance> parallel_knn(queries, indices, dists, knn, params, nnIndex_, atomic_count_);
tbb::parallel_for(tbb::blocked_range<size_t>(0,queries.rows),
parallel_knn,
tbb::auto_partitioner());
count = atomic_count_;
}
#endif
return count;
}
......@@ -231,12 +305,74 @@ public:
* \param[in] params Search parameters
*/
int knnSearch(const Matrix<ElementType>& queries,
std::vector< std::vector<int> >& indices,
std::vector<std::vector<DistanceType> >& dists,
size_t knn,
const SearchParams& params)
std::vector< std::vector<int> >& indices,
std::vector<std::vector<DistanceType> >& dists,
size_t knn,
const SearchParams& params)
{
return nnIndex_->knnSearch(queries, indices, dists, knn, params);
assert(queries.cols == veclen());
bool sorted = get_param(params,"sorted",true);
bool use_heap = get_param(params,"use_heap",false);
#ifdef TBB
int cores = get_param(params,"cores",1);
assert(cores >= 1 || cores == -1);
#endif
if (indices.size() < queries.rows ) indices.resize(queries.rows);
if (dists.size() < queries.rows ) dists.resize(queries.rows);
int count = 0;
#ifdef TBB
// Check if we need to do multicore search or stick with singlecore FLANN (less overhead)
if(cores == 1)
{
#endif
if (use_heap) {
KNNResultSet2<DistanceType> resultSet(knn);
for (size_t i = 0; i < queries.rows; i++) {
resultSet.clear();
nnIndex_->findNeighbors(resultSet, queries[i], params);
size_t n = std::min(resultSet.size(), knn);
indices[i].resize(n);
dists[i].resize(n);
resultSet.copy(&indices[i][0], &dists[i][0], n, sorted);
count += n;
}
}
else {
KNNSimpleResultSet<DistanceType> resultSet(knn);
for (size_t i = 0; i < queries.rows; i++) {
resultSet.clear();
nnIndex_->findNeighbors(resultSet, queries[i], params);
size_t n = std::min(resultSet.size(), knn);
indices[i].resize(n);
dists[i].resize(n);
resultSet.copy(&indices[i][0], &dists[i][0], n, sorted);
count += n;
}
}
#ifdef TBB
}
else
{
// Initialise the task scheduler for the use of Intel TBB parallel constructs
tbb::task_scheduler_init task_sched(cores);
// Make an atomic integer count, so that we can keep track of the number of neighbors found
atomic_count_ = 0;
// Use the auto partitioner to choose an optimal grain size for dividing the query points
flann::parallel_knnSearch2<Distance> parallel_knn(queries, indices, dists, knn, params, nnIndex_, atomic_count_);
tbb::parallel_for(tbb::blocked_range<size_t>(0,queries.rows),
parallel_knn,
tbb::auto_partitioner());
count = atomic_count_;
}
#endif
return count;
}
......@@ -250,12 +386,97 @@ public:
* \returns Number of neighbors found
*/
int radiusSearch(const Matrix<ElementType>& queries,
Matrix<int>& indices,
Matrix<DistanceType>& dists,
float radius,
const SearchParams& params)
Matrix<int>& indices,
Matrix<DistanceType>& dists,
float radius,
const SearchParams& params)
{
return nnIndex_->radiusSearch(queries, indices, dists, radius, params);
assert(queries.cols == veclen());
#ifdef TBB
int cores = get_param(params,"cores",1);
assert(cores >= 1 || cores == -1);
#endif
int count = 0;
#ifdef TBB
// Check if we need to do multicore search or stick with singlecore FLANN (less overhead)
if(cores == 1)
{
#endif
int max_neighbors = get_param(params, "max_neighbors", -1);
// just count neighbors
if (max_neighbors==0) {
CountRadiusResultSet<DistanceType> resultSet(radius);
for (size_t i = 0; i < queries.rows; i++) {
resultSet.clear();
findNeighbors(resultSet, queries[i], params);
count += resultSet.size();
}
}
else {
size_t num_neighbors = std::min(indices.cols, dists.cols);
bool sorted = get_param(params, "sorted", true);
bool has_max_neighbors = has_param(params,"max_neighbors");
// explicitly indicated to use unbounded radius result set
// or we know there'll be enough room for resulting indices and dists
if (max_neighbors<0 && (has_max_neighbors || num_neighbors>=size())) {
RadiusResultSet<DistanceType> resultSet(radius);
for (size_t i = 0; i < queries.rows; i++) {
resultSet.clear();
nnIndex_->findNeighbors(resultSet, queries[i], params);
size_t n = resultSet.size();
count += n;
if (n>num_neighbors) n = num_neighbors;
resultSet.copy(indices[i], dists[i], n, sorted);
// mark the next element in the output buffers as unused
if (n<indices.cols) indices[i][n] = -1;
if (n<dists.cols) dists[i][n] = std::numeric_limits<DistanceType>::infinity();
}
}
else {
if (max_neighbors<0) max_neighbors = num_neighbors;
else max_neighbors = std::min(max_neighbors,(int)num_neighbors);
// number of neighbors limited to max_neighbors
KNNRadiusResultSet<DistanceType> resultSet(radius, max_neighbors);
for (size_t i = 0; i < queries.rows; i++) {
resultSet.clear();
nnIndex_->findNeighbors(resultSet, queries[i], params);
size_t n = resultSet.size();
count += n;
if ((int)n>max_neighbors) n = max_neighbors;
resultSet.copy(indices[i], dists[i], n, sorted);
// mark the next element in the output buffers as unused
if (n<indices.cols) indices[i][n] = -1;
if (n<dists.cols) dists[i][n] = std::numeric_limits<DistanceType>::infinity();
}
}
}
#ifdef TBB
}
else
{
// Initialise the task scheduler for the use of Intel TBB parallel constructs
tbb::task_scheduler_init task_sched(cores);
// Make an atomic integer count, so that we can keep track of the number of neighbors found
atomic_count_ = 0;
// Use the auto partitioner to choose an optimal grain size for dividing the query points
flann::parallel_radiusSearch<Distance> parallel_radius(queries, indices, dists, radius, params, nnIndex_, atomic_count_);
tbb::parallel_for(tbb::blocked_range<size_t>(0,queries.rows),
parallel_radius,
tbb::auto_partitioner());
count = atomic_count_;
}
#endif
return count;
}
......@@ -269,12 +490,89 @@ public:
* \returns Number of neighbors found
*/
int radiusSearch(const Matrix<ElementType>& queries,
std::vector< std::vector<int> >& indices,
std::vector<std::vector<DistanceType> >& dists,
float radius,
const SearchParams& params)
std::vector< std::vector<int> >& indices,
std::vector<std::vector<DistanceType> >& dists,
float radius,
const SearchParams& params)
{
return nnIndex_->radiusSearch(queries, indices, dists, radius, params);
assert(queries.cols == veclen());
#ifdef TBB
int cores = get_param(params,"cores",1);
assert(cores >= 1 || cores == -1);
#endif
int count = 0;
#ifdef TBB
// Check if we need to do multicore search or stick with singlecore FLANN (less overhead)
if(cores == 1)
{
#endif
int max_neighbors = get_param(params, "max_neighbors", -1);
// just count neighbors
if (max_neighbors==0) {
CountRadiusResultSet<DistanceType> resultSet(radius);
for (size_t i = 0; i < queries.rows; i++) {
resultSet.clear();
findNeighbors(resultSet, queries[i], params);
count += resultSet.size();
}
}
else {
bool sorted = get_param(params, "sorted", true);
if (indices.size() < queries.rows ) indices.resize(queries.rows);
if (dists.size() < queries.rows ) dists.resize(queries.rows);
if (max_neighbors<0) {
// search for all neighbors
RadiusResultSet<DistanceType> resultSet(radius);
for (size_t i = 0; i < queries.rows; i++) {
resultSet.clear();
findNeighbors(resultSet, queries[i], params);
size_t n = resultSet.size();
count += n;
indices[i].resize(n);
dists[i].resize(n);
resultSet.copy(&indices[i][0], &dists[i][0], n, sorted);
}
}
else {
// number of neighbors limited to max_neighbors
KNNRadiusResultSet<DistanceType> resultSet(radius, max_neighbors);
for (size_t i = 0; i < queries.rows; i++) {
resultSet.clear();
findNeighbors(resultSet, queries[i], params);
size_t n = resultSet.size();
count += n;
if ((int)n>max_neighbors) n = max_neighbors;
indices[i].resize(n);
dists[i].resize(n);
resultSet.copy(&indices[i][0], &dists[i][0], n, sorted);
}
}
}
#ifdef TBB
}
else
{
// Initialise the task scheduler for the use of Intel TBB parallel constructs
tbb::task_scheduler_init task_sched(cores);
// Reset the atomic count before passing it to the threads, so that we can keep track of the number of neighbors found
atomic_count_ = 0;
// Use the auto partitioner to choose an optimal grain size for dividing the query points
flann::parallel_radiusSearch2<Distance> parallel_radius(queries, indices, dists, radius, params, nnIndex_, atomic_count_);
tbb::parallel_for(tbb::blocked_range<size_t>(0,queries.rows),
parallel_radius,
tbb::auto_partitioner());
count = atomic_count_;
}
#endif
return count;
}
/**
......@@ -309,6 +607,11 @@ private:
bool loaded_;
/** Parameters passed to the index */
IndexParams index_params_;
#ifdef TBB
/** Atomic count variable, passed to the different threads for keeping track of the number of neighbors found.
\note Intel TBB 'catch': this must be a data member for correct initialization, since tbb::atomic<T> has no declared constructors! */
tbb::atomic<int> atomic_count_;
#endif
};
/**
......
/***********************************************************************
* Software License Agreement (BSD License)
*
* Copyright 2010-2011 Nick Vanbaelen (nickon@acm.org). All rights reserved.
*
* THE BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************************/
#ifndef FLANN_TBB_BODIES_H
#define FLANN_TBB_BODIES_H
#include <tbb/blocked_range.h>
#include <tbb/atomic.h>
#include "flann/util/matrix.h"
#include "flann/util/params.h"
#include "flann/util/result_set.h"
namespace flann
{
template<typename Distance>
class parallel_knnSearch
{
public:
typedef typename Distance::ElementType ElementType;
typedef typename Distance::ResultType DistanceType;
parallel_knnSearch(const Matrix<ElementType>& queries,
Matrix<int>& indices,
Matrix<DistanceType>& distances,
size_t knn,
const SearchParams& params,
NNIndex<Distance>* nnIndex,
tbb::atomic<int>& count)
: queries_(queries),
indices_(indices),
distances_(distances),
knn_(knn),
params_(params),
nnIndex_(nnIndex),
count_(count)
{}
/* default destructor will do */
/* default copy constructor will do,
parallel for will use this to create a separate parallel_knnSearch object
for each worker thread (pointers will be copied, which is OK) */
/**
* Perform knnSearch for the query points assigned to this worker thread
* \param r query point range assigned for this worker thread to operate on
*/
void operator()( const tbb::blocked_range<size_t>& r ) const
{
bool sorted = get_param(params_,"sorted",true);
bool use_heap = get_param(params_,"use_heap",false);
if (use_heap)
{
KNNResultSet2<DistanceType> resultSet(knn_);
for (size_t i=r.begin(); i!=r.end(); ++i)
{
resultSet.clear();
nnIndex_->findNeighbors(resultSet, queries_[i], params_);
resultSet.copy(indices_[i], distances_[i], knn_, sorted);
count_ += resultSet.size();
}
}
else
{
KNNSimpleResultSet<DistanceType> resultSet(knn_);
for (size_t i=r.begin(); i!=r.end(); ++i)
{
resultSet.clear();
nnIndex_->findNeighbors(resultSet, queries_[i], params_);
resultSet.copy(indices_[i], distances_[i], knn_, sorted);
count_ += resultSet.size();
}
}
}
private:
//! All query points to perform search on
//! \note each worker thread only operates on a specified range
const Matrix<ElementType>& queries_;
//! Matrix for storing the indices of the nearest neighbors
//! \note no need for this to be a parallel container, each worker thread
//! solely operates on its specified range!
Matrix<int>& indices_;
//! Matrix for storing the distances to the nearest neighbors
//! \note no need for this to be a parallel container, each worker thread
//! solely operates on its specified range!
Matrix<DistanceType>& distances_;
//! Number of nearest neighbors to search for
size_t knn_;
//! The search parameters to take into account
const SearchParams& params_;
//! The nearest neighbor index to perform the search with
NNIndex<Distance>* nnIndex_;
//! Atomic count variable to keep track of the number of neighbors found
//! \note must be mutable because the body is cast to const in parallel_for
mutable tbb::atomic<int>& count_;
};
template<typename Distance>
class parallel_knnSearch2
{
public:
typedef typename Distance::ElementType ElementType;
typedef typename Distance::ResultType DistanceType;
parallel_knnSearch2(const Matrix<ElementType>& queries,
std::vector< std::vector<int> >& indices,
std::vector<std::vector<DistanceType> >& distances,
size_t knn,
const SearchParams& params,
NNIndex<Distance>* nnIndex,
tbb::atomic<int>& count)
: queries_(queries),
indices_(indices),
distances_(distances),
knn_(knn),
params_(params),
nnIndex_(nnIndex),
count_(count)
{}
/* default destructor will do */
/* default copy constructor will do,
parallel for will use this to create a separate parallel_knnSearch2 object
for each worker thread (pointers will be copied, which is OK) */
/**
* Perform knnSearch for the query points assigned to this worker thread
* (specified by the blocked_range parameter)
*/
void operator()( const tbb::blocked_range<size_t>& r ) const
{
bool sorted = get_param(params_,"sorted",true);
bool use_heap = get_param(params_,"use_heap",false);
if (use_heap) {
KNNResultSet2<DistanceType> resultSet(knn_);
for (size_t i=r.begin(); i!=r.end(); ++i)
{
resultSet.clear();
nnIndex_->findNeighbors(resultSet, queries_[i], params_);
size_t n = std::min(resultSet.size(), knn_);
indices_[i].resize(n);
distances_[i].resize(n);
resultSet.copy(&indices_[i][0], &distances_[i][0], n, sorted);
count_ += n;
}
}
else {
KNNSimpleResultSet<DistanceType> resultSet(knn_);
for (size_t i=r.begin(); i!=r.end(); ++i)
{
resultSet.clear();
nnIndex_->findNeighbors(resultSet, queries_[i], params_);
size_t n = std::min(resultSet.size(), knn_);
indices_[i].resize(n);
distances_[i].resize(n);
resultSet.copy(&indices_[i][0], &distances_[i][0], n, sorted);
count_ += n;
}
}
}
private:
//! All query points to perform search on
//! \note each worker thread only operates on a specified range
const Matrix<ElementType>& queries_;
//! Vector for storing the indices of the nearest neighbors
//! \note no need for this to be a parallel container, each worker thread
//! solely operates on its specified range!
std::vector< std::vector<int> >& indices_;
//! Vector for storing the distances to the nearest neighbors
//! \note no need for this to be a parallel container, each worker thread
//! solely operates on its specified range!
std::vector< std::vector<DistanceType> >& distances_;
//! Number of nearest neighbors to search for
size_t knn_;
//! The search parameters to take into account
const SearchParams& params_;
//! The nearest neighbor index to perform the search with
NNIndex<Distance>* nnIndex_;
//! Atomic count variable to keep track of the number of neighbors found
//! \note must be mutable because the body is cast to const in parallel_for
mutable tbb::atomic<int>& count_;
};
template<typename Distance>
class parallel_radiusSearch
{
public:
typedef typename Distance::ElementType ElementType;
typedef typename Distance::ResultType DistanceType;
/* default destructor will do */
/* default copy constructor will do,
parallel for will use this to create a separate parallel_radiusSearch object
for each worker thread (pointers will be copied, which is OK) */
/**
* Perform radiusSearch for the query points assigned to this worker thread
* (specified by the blocked_range parameter)
*/
parallel_radiusSearch(const Matrix<ElementType>& queries,
Matrix<int>& indices,
Matrix<DistanceType>& distances,
float radius,
const SearchParams& params,
NNIndex<Distance>* nnIndex,
tbb::atomic<int>& count)
: queries_(queries),
indices_(indices),
distances_(distances),
radius_(radius),
params_(params),
nnIndex_(nnIndex),
count_(count)
{}
void operator()( const tbb::blocked_range<size_t>& r ) const
{
int max_neighbors = get_param(params_, "max_neighbors", -1);
if (max_neighbors==0) {
CountRadiusResultSet<DistanceType> resultSet(radius_);
for (size_t i=r.begin(); i!=r.end(); ++i)
{
resultSet.clear();
nnIndex_->findNeighbors(resultSet, queries_[i], params_);
count_ += resultSet.size();
}
}
else {
size_t num_neighbors = std::min(indices_.cols, distances_.cols);
bool sorted = get_param(params_, "sorted", true);
bool has_max_neighbors = has_param(params_,"max_neighbors");
// explicitly indicated to use unbounded radius result set
// or we know there'll be enough room for resulting indices and dists
if (max_neighbors<0 && (has_max_neighbors || num_neighbors>=nnIndex_->size())) {
RadiusResultSet<DistanceType> resultSet(radius_);
for (size_t i=r.begin(); i!=r.end(); ++i)
{
resultSet.clear();
nnIndex_->findNeighbors(resultSet, queries_[i], params_);
size_t n = resultSet.size();
count_ += n;
if (n>num_neighbors) n = num_neighbors;
resultSet.copy(indices_[i], distances_[i], n, sorted);
// mark the next element in the output buffers as unused
if (n<indices_.cols) indices_[i][n] = -1;
if (n<distances_.cols) distances_[i][n] = std::numeric_limits<DistanceType>::infinity();
}
}
else {
if (max_neighbors<0) max_neighbors = num_neighbors;
else max_neighbors = std::min(max_neighbors,(int)num_neighbors);
// number of neighbors limited to max_neighbors
KNNRadiusResultSet<DistanceType> resultSet(radius_, max_neighbors);
for (size_t i=r.begin(); i!=r.end(); ++i)
{
resultSet.clear();
nnIndex_->findNeighbors(resultSet, queries_[i], params_);
size_t n = resultSet.size();
count_ += n ;
if ((int)n>max_neighbors) n = max_neighbors;
resultSet.copy(indices_[i], distances_[i], n, sorted);
// mark the next element in the output buffers as unused
if (n<indices_.cols) indices_[i][n] = -1;
if (n<distances_.cols) distances_[i][n] = std::numeric_limits<DistanceType>::infinity();
}
}
}
}
private:
//! All query points to perform search on
//! \note each worker thread only operates on a specified range
const Matrix<ElementType>& queries_;
//! Matrix for storing the indices of the nearest neighbors
//! \note no need for this to be a parallel container, each worker thread
//! solely operates on its specified range!
Matrix<int>& indices_;
//! Matrix for storing the distances to the nearest neighbors
//! \note no need for this to be a parallel container, each worker thread
//! solely operates on its specified range!
Matrix<DistanceType>& distances_;
//! Radius size bound on the search for nearest neighbors
float radius_;
//! The search parameters to take into account
const SearchParams& params_;
//! The nearest neighbor index to perform the search with
NNIndex<Distance>* nnIndex_;
//! Atomic count variable to keep track of the number of neighbors found
//! \note must be mutable because the body is cast to const in parallel_for
mutable tbb::atomic<int>& count_;
};
template<typename Distance>
class parallel_radiusSearch2
{
public:
typedef typename Distance::ElementType ElementType;
typedef typename Distance::ResultType DistanceType;
/* default destructor will do */
/* default copy constructor will do,
parallel for will use this to create a separate parallel_radiusSearch2 object
for each worker thread (pointers will be copied, which is OK) */
/**
* Perform radiusSearch for the query points assigned to this worker thread
* (specified by the blocked_range parameter)
*/
parallel_radiusSearch2(const Matrix<ElementType>& queries,
std::vector< std::vector<int> >& indices,
std::vector<std::vector<DistanceType> >& distances,
float radius,
const SearchParams& params,
NNIndex<Distance>* nnIndex,
tbb::atomic<int>& count)
: queries_(queries),
indices_(indices),
distances_(distances),
radius_(radius),
params_(params),
nnIndex_(nnIndex),
count_(count)
{}
void operator()( const tbb::blocked_range<size_t>& r ) const
{
int max_neighbors = get_param(params_, "max_neighbors", -1);
// just count neighbors
if (max_neighbors==0) {
CountRadiusResultSet<DistanceType> resultSet(radius_);
for (size_t i=r.begin(); i!=r.end(); ++i)
{
resultSet.clear();
nnIndex_->findNeighbors(resultSet, queries_[i], params_);
count_ += resultSet.size();
}
}
else {
bool sorted = get_param(params_, "sorted", true);
if (indices_.size() < queries_.rows ) indices_.resize(queries_.rows);
if (distances_.size() < queries_.rows ) distances_.resize(queries_.rows);
if (max_neighbors<0) {
// search for all neighbors
RadiusResultSet<DistanceType> resultSet(radius_);
for (size_t i=r.begin(); i!=r.end(); ++i)
{
resultSet.clear();
nnIndex_->findNeighbors(resultSet, queries_[i], params_);
size_t n = resultSet.size();
count_ += n;
indices_[i].resize(n);
distances_[i].resize(n);
resultSet.copy(&indices_[i][0], &distances_[i][0], n, sorted);
}
}
else {
// number of neighbors limited to max_neighbors
KNNRadiusResultSet<DistanceType> resultSet(radius_, max_neighbors);
for (size_t i=r.begin(); i!=r.end(); ++i)
{
resultSet.clear();
nnIndex_->findNeighbors(resultSet, queries_[i], params_);
size_t n = resultSet.size();
count_ += n;
if ((int)n>max_neighbors) n = max_neighbors;
indices_[i].resize(n);
distances_[i].resize(n);
resultSet.copy(&indices_[i][0], &distances_[i][0], n, sorted);
}
}
}
}
private:
//! All query points to perform search on
//! \note each worker thread only operates on a specified range
const Matrix<ElementType>& queries_;
//! Vector for storing the indices of the nearest neighbors
//! \note no need for this to be a parallel container, each worker thread
//! solely operates on its specified range!
std::vector< std::vector<int> >& indices_;
//! Vector for storing the distances to the nearest neighbors
//! \note no need for this to be a parallel container, each worker thread
//! solely operates on its specified range!
std::vector< std::vector<DistanceType> >& distances_;
//! Radius size bound on the search for nearest neighbors
float radius_;
//! The search parameters to take into account
const SearchParams& params_;
//! The nearest neighbor index to perform the search with
NNIndex<Distance>* nnIndex_;
//! Atomic count variable to keep track of the number of neighbors found
//! \note must be mutable because the body is cast to const in parallel_for
mutable tbb::atomic<int>& count_;
};
}
#endif //FLANN_TBB_BODIES_H
......@@ -44,7 +44,8 @@ typedef std::map<std::string, any> IndexParams;
struct SearchParams : public IndexParams
{
SearchParams(int checks = 32, float eps = 0, bool sorted = true )
SearchParams(int checks = 32, float eps = 0, bool sorted = true, int cores = 1 )
{
// how many leaves to visit when searching for neighbours (-1 for unlimited)
(*this)["checks"] = checks;
......@@ -52,6 +53,9 @@ struct SearchParams : public IndexParams
(*this)["eps"] = eps;
// only for radius search, require neighbours sorted by distance (default: true)
(*this)["sorted"] = sorted;
// how many cores to assign to the search
// this parameter will be ignored if Intel TBB isn't available on the system or no "TBB" macro is defined
(*this)["cores"] = cores;
}
};
......
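Since SearchParams derives from a string-keyed parameter map, the core count can also be adjusted after construction. A small sketch (the value 4 is just an illustration; with a build that lacks TBB or the -DTBB flag the "cores" entry is simply ignored):
#include <flann/util/params.h>
flann::SearchParams make_params()
{
    // Start from the defaults: checks = 32, eps = 0, sorted = true, cores = 1
    flann::SearchParams params;
    // Request four cores; equivalent to constructing SearchParams(32, 0, true, 4)
    params["cores"] = 4;
    return params;
}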
......@@ -21,9 +21,15 @@ endif()
if (GTEST_FOUND AND HDF5_FOUND)
include_directories(${HDF5_INCLUDE_DIR})
flann_add_gtest(flann_simple_test flann_simple_test.cpp)
if(TBB_FOUND)
add_definitions(-DTBB)
flann_add_gtest(flann_multithreaded_test flann_multithreaded_test.cpp)
endif()
target_link_libraries(flann_simple_test flann_cpp ${HDF5_LIBRARIES})
target_link_libraries(flann_multithreaded_test flann_cpp ${HDF5_LIBRARIES})
if (HDF5_IS_PARALLEL)
target_link_libraries(flann_simple_test ${MPI_LIBRARIES})
target_link_libraries(flann_multithreaded_test ${MPI_LIBRARIES})
endif()
endif()
......
#include <gtest/gtest.h>
#include <time.h>
#include <flann/flann.h>
#include <flann/io/hdf5.h>
#include <flann/nn/ground_truth.h>
using namespace flann;
float compute_precision(const flann::Matrix<int>& match, const flann::Matrix<int>& indices)
{
int count = 0;
assert(match.rows == indices.rows);
size_t nn = std::min(match.cols, indices.cols);
for (size_t i=0; i<match.rows; ++i) {
for (size_t j=0;j<nn;++j) {
for (size_t k=0;k<nn;++k) {
if (match[i][j]==indices[i][k]) {
count ++;
}
}
}
}
return float(count)/(nn*match.rows);
}
class FLANNTestFixture : public ::testing::Test {
protected:
clock_t start_time_;
void start_timer(const std::string& message = "")
{
if (!message.empty()) {
printf("%s", message.c_str());
fflush(stdout);
}
start_time_ = clock();
}
double stop_timer()
{
return double(clock()-start_time_)/CLOCKS_PER_SEC;
}
};
/* Test Fixture which loads the cloud.h5 cloud as data and query matrix */
class FlannTest : public FLANNTestFixture {
protected:
flann::Matrix<float> data;
flann::Matrix<float> query;
flann::Matrix<int> match;
flann::Matrix<float> dists;
flann::Matrix<int> indices;
int nn;
void SetUp()
{
nn = 5;
printf("Reading test data...");
fflush(stdout);
flann::load_from_file(data, "cloud.h5","dataset");
flann::load_from_file(query,"cloud.h5","query");
flann::load_from_file(match,"cloud.h5","match");
dists = flann::Matrix<float>(new float[query.rows*nn], query.rows, nn);
indices = flann::Matrix<int>(new int[query.rows*nn], query.rows, nn);
printf("done\n");
}
void TearDown()
{
delete[] data.data;
delete[] query.data;
delete[] match.data;
delete[] dists.data;
delete[] indices.data;
}
int GetNN() { return nn; }
};
TEST_F(FlannTest, HandlesSingleCoreSearch)
{
flann::Index<L2_Simple<float> > index(data, flann::KDTreeSingleIndexParams(50, false));
start_timer("Building kd-tree index...");
index.buildIndex();
printf("done (%g seconds)\n", stop_timer());
int checks = -1;
float eps = 0.0f;
bool sorted = true;
int cores = 1;
start_timer("Searching KNN...");
index.knnSearch(query, indices, dists, GetNN(), flann::SearchParams(checks,eps,sorted,cores));
printf("done (%g seconds)\n", stop_timer());
float precision = compute_precision(match, indices);
EXPECT_GE(precision, 0.99);
printf("Precision: %g\n", precision);
}
TEST_F(FlannTest, HandlesMultiCoreSearch)
{
flann::Index<L2_Simple<float> > index(data, flann::KDTreeSingleIndexParams(50, false));
start_timer("Building kd-tree index...");
index.buildIndex();
printf("done (%g seconds)\n", stop_timer());
int checks = -1;
float eps = 0.0f;
bool sorted = true;
int cores = -1;
start_timer("Searching KNN...");
index.knnSearch(query, indices, dists, GetNN(), flann::SearchParams(checks,eps,sorted,cores));
printf("done (%g seconds)\n", stop_timer());
float precision = compute_precision(match, indices);
EXPECT_GE(precision, 0.99);
printf("Precision: %g\n", precision);
}
/* Test Fixture which loads the cloud.h5 cloud as data and query matrix and holds two dists
and indices matrices for comparing single and multi core KNN search */
class FlannCompareKnnTest : public FLANNTestFixture {
protected:
flann::Matrix<float> data;
flann::Matrix<float> query;
flann::Matrix<float> dists_single;
flann::Matrix<int> indices_single;
flann::Matrix<float> dists_multi;
flann::Matrix<int> indices_multi;
int nn;
void SetUp()
{
nn = 5;
printf("Reading test data...");
fflush(stdout);
flann::load_from_file(data, "cloud.h5","dataset");
flann::load_from_file(query,"cloud.h5","query");
dists_single = flann::Matrix<float>(new float[query.rows*nn], query.rows, nn);
indices_single = flann::Matrix<int>(new int[query.rows*nn], query.rows, nn);
dists_multi = flann::Matrix<float>(new float[query.rows*nn], query.rows, nn);
indices_multi = flann::Matrix<int>(new int[query.rows*nn], query.rows, nn);
printf("done\n");
}
void TearDown()
{
delete[] data.data;
delete[] query.data;
delete[] dists_single.data;
delete[] indices_single.data;
delete[] dists_multi.data;
delete[] indices_multi.data;
}
int GetNN() { return nn; }
};
TEST_F(FlannCompareKnnTest, CompareMultiSingleCoreKnnSearchSorted)
{
flann::Index<L2_Simple<float> > index(data, flann::KDTreeSingleIndexParams(50, false));
start_timer("Building kd-tree index...");
index.buildIndex();
printf("done (%g seconds)\n", stop_timer());
int checks = -1;
float eps = 0.0f;
bool sorted = true;
int single_core = 1;
int multi_core = -1;
start_timer("Searching KNN (single core)...");
int single_neighbor_count = index.knnSearch(query, indices_single, dists_single, GetNN(), flann::SearchParams(checks,eps,sorted,single_core));
printf("done (%g seconds)\n", stop_timer());
start_timer("Searching KNN (multi core)...");
int multi_neighbor_count = index.knnSearch(query, indices_multi, dists_multi, GetNN(), flann::SearchParams(checks,eps,sorted,multi_core));
printf("done (%g seconds)\n", stop_timer());
EXPECT_EQ(single_neighbor_count, multi_neighbor_count);
float precision = compute_precision(indices_single, indices_multi);
EXPECT_GE(precision, 0.99);
printf("Precision: %g\n", precision);
}
TEST_F(FlannCompareKnnTest, CompareMultiSingleCoreKnnSearchUnsorted)
{
flann::Index<L2_Simple<float> > index(data, flann::KDTreeSingleIndexParams(50, false));
start_timer("Building kd-tree index...");
index.buildIndex();
printf("done (%g seconds)\n", stop_timer());
int checks = -1;
float eps = 0.0f;
bool sorted = false;
int single_core = 1;
int multi_core = -1;
start_timer("Searching KNN (single core)...");
int single_neighbor_count = index.knnSearch(query, indices_single, dists_single, GetNN(), flann::SearchParams(checks,eps,sorted,single_core));
printf("done (%g seconds)\n", stop_timer());
start_timer("Searching KNN (multi core)...");
int multi_neighbor_count = index.knnSearch(query, indices_multi, dists_multi, GetNN(), flann::SearchParams(checks,eps,sorted,multi_core));
printf("done (%g seconds)\n", stop_timer());
EXPECT_EQ(single_neighbor_count, multi_neighbor_count);
float precision = compute_precision(indices_single, indices_multi);
EXPECT_GE(precision, 0.99);
printf("Precision: %g\n", precision);
}
/* Test Fixture which loads the cloud.h5 cloud as data and query matrix and holds two dists
and indices matrices for comparing single and multi core radius search */
class FlannCompareRadiusTest : public FLANNTestFixture {
protected:
flann::Matrix<float> data;
flann::Matrix<float> query;
flann::Matrix<float> dists_single;
flann::Matrix<int> indices_single;
flann::Matrix<float> dists_multi;
flann::Matrix<int> indices_multi;
float radius;
void SetUp()
{
radius = 0.1f;
printf("Reading test data...");
fflush(stdout);
flann::load_from_file(data, "cloud.h5","dataset");
flann::load_from_file(query,"cloud.h5","query");
// If the indices / dists matrix cannot contain all points found in the radius, only the points
// that can be stored in the matrix will be returned and search is stopped. For each query point
// we reserve as much space as we think is needed. For large point clouds, reserving 'cloudsize'
// space for each query point might cause memory errors.
int reserve_size = data.rows / 1000;
dists_single = flann::Matrix<float>(new float[query.rows*reserve_size], query.rows, reserve_size);
indices_single = flann::Matrix<int>(new int[query.rows*reserve_size], query.rows, reserve_size);
dists_multi = flann::Matrix<float>(new float[query.rows*reserve_size], query.rows, reserve_size);
indices_multi = flann::Matrix<int>(new int[query.rows*reserve_size], query.rows, reserve_size);
printf("done\n");
}
void TearDown()
{
delete[] data.data;
delete[] query.data;
delete[] dists_single.data;
delete[] indices_single.data;
delete[] dists_multi.data;
delete[] indices_multi.data;
}
float GetRadius() { return radius; }
};
TEST_F(FlannCompareRadiusTest, CompareMultiSingleCoreRadiusSearchSorted)
{
flann::Index<L2_Simple<float> > index(data, flann::KDTreeSingleIndexParams(50, false));
start_timer("Building kd-tree index...");
index.buildIndex();
printf("done (%g seconds)\n", stop_timer());
int checks = -1;
float eps = 0.0f;
bool sorted = true;
int single_core = 1;
int multi_core = -1;
start_timer("Searching Radius (single core)...");
int single_neighbor_count = index.radiusSearch(query, indices_single, dists_single, GetRadius(), flann::SearchParams(checks,eps,sorted,single_core));
printf("done (%g seconds)\n", stop_timer());
start_timer("Searching Radius (multi core)...");
int multi_neighbor_count = index.radiusSearch(query, indices_multi, dists_multi, GetRadius(), flann::SearchParams(checks,eps,sorted,multi_core));
printf("done (%g seconds)\n", stop_timer());
EXPECT_EQ(single_neighbor_count, multi_neighbor_count);
float precision = compute_precision(indices_single, indices_multi);
EXPECT_GE(precision, 0.99);
printf("Precision: %g\n", precision);
}
TEST_F(FlannCompareRadiusTest, CompareMultiSingleCoreRadiusSearchUnsorted)
{
flann::Index<L2_Simple<float> > index(data, flann::KDTreeSingleIndexParams(50, false));
start_timer("Building kd-tree index...");
index.buildIndex();
printf("done (%g seconds)\n", stop_timer());
int checks = -1;
float eps = 0.0f;
bool sorted = false;
int single_core = 1;
int multi_core = -1;
start_timer("Searching Radius (single core)...");
int single_neighbor_count = index.radiusSearch(query, indices_single, dists_single, GetRadius(), flann::SearchParams(checks,eps,sorted,single_core));
printf("done (%g seconds)\n", stop_timer());
start_timer("Searching Radius (multi core)...");
int multi_neighbor_count = index.radiusSearch(query, indices_multi, dists_multi, GetRadius(), flann::SearchParams(checks,eps,sorted,multi_core));
printf("done (%g seconds)\n", stop_timer());
EXPECT_EQ(single_neighbor_count, multi_neighbor_count);
float precision = compute_precision(indices_single, indices_multi);
EXPECT_GE(precision, 0.99);
printf("Precision: %g\n", precision);
}
int main(int argc, char** argv)
{
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}